Scientific Programming: Data Structures - NumPy, Pandas & Beyond

Federica Lionetto (federica.lionetto@gmail.com)

The content of this lecture may be reused, in whole or in part, under the CC BY-SA 4.0 licence.

NumPy

NumPy vs. standard Python: the need for speed

In [1]:
# Creating a standard Python list
L = list(range(1000))
In [2]:
# How long does it take to calculate the element-wise square?
%timeit [i**2 for i in L]
258 µs ± 9.61 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
In [3]:
# Now do the same with a NumPy array
import numpy as np
a = np.arange(1000)
In [4]:
%timeit a**2
1.05 µs ± 23.5 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
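
The gap comes from where the loop runs: the list comprehension executes Python bytecode for every element, while a**2 dispatches once into compiled code operating on a contiguous, homogeneously typed buffer. A minimal sketch of the same contrast with a reduction:

%timeit sum(L)    # interpreted Python loop over the list elements
%timeit a.sum()   # single call into NumPy's compiled loop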

Details about NumPy

In [5]:
np.__version__
Out[5]:
'1.18.1'
In [6]:
np.show_config()
blas_mkl_info:
    libraries = ['mkl_rt', 'pthread']
    library_dirs = ['/Applications/anaconda3/lib']
    define_macros = [('SCIPY_MKL_H', None), ('HAVE_CBLAS', None)]
    include_dirs = ['/Applications/anaconda3/include']
blas_opt_info:
    libraries = ['mkl_rt', 'pthread']
    library_dirs = ['/Applications/anaconda3/lib']
    define_macros = [('SCIPY_MKL_H', None), ('HAVE_CBLAS', None)]
    include_dirs = ['/Applications/anaconda3/include']
lapack_mkl_info:
    libraries = ['mkl_rt', 'pthread']
    library_dirs = ['/Applications/anaconda3/lib']
    define_macros = [('SCIPY_MKL_H', None), ('HAVE_CBLAS', None)]
    include_dirs = ['/Applications/anaconda3/include']
lapack_opt_info:
    libraries = ['mkl_rt', 'pthread']
    library_dirs = ['/Applications/anaconda3/lib']
    define_macros = [('SCIPY_MKL_H', None), ('HAVE_CBLAS', None)]
    include_dirs = ['/Applications/anaconda3/include']

Creating NumPy arrays

In [7]:
a = np.array([1,2,4])
print(a)
[1 2 4]
In [8]:
b = np.arange(1,15,2)
print(b)
[ 1  3  5  7  9 11 13]
In [9]:
c = np.linspace(0,1,6)
print(c)
[0.  0.2 0.4 0.6 0.8 1. ]
In [10]:
d = np.empty((1,3))  # uninitialized: contents are whatever was already in memory
print(d)
[[-2.68156159e+154  1.29073722e-231  2.24788568e-314]]
In [11]:
e = np.zeros((2,5,3))
print(e)
[[[0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]]

 [[0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]]]
In [12]:
f = np.ones((3,3))
print(f)
[[1. 1. 1.]
 [1. 1. 1.]
 [1. 1. 1.]]
In [13]:
g = np.eye(4)
print(g)
[[1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]]
In [14]:
h = np.identity(4)
print(h)
[[1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]]
In [15]:
i = np.diag(np.array([1,2,3,4]))
print(i)
[[1 0 0 0]
 [0 2 0 0]
 [0 0 3 0]
 [0 0 0 4]]
In [16]:
l = np.diag(np.array([1,2,3,4]),k=-1)  # k=-1: first subdiagonal
print(l)
[[0 0 0 0 0]
 [1 0 0 0 0]
 [0 2 0 0 0]
 [0 0 3 0 0]
 [0 0 0 4 0]]
In [17]:
m = np.diag(np.array([1,2,3,4]),k=2)  # k=2: second superdiagonal
print(m)
[[0 0 1 0 0 0]
 [0 0 0 2 0 0]
 [0 0 0 0 3 0]
 [0 0 0 0 0 4]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]]
In [18]:
# arrays = [a,b,c,d,e,f,g,h,i,l,m]
# for array in arrays :
#     print(array)
#     print('')

NumPy arrays of random numbers

In [19]:
a = np.random.rand(4)             # uniform on [0,1), shape (4,)
b = np.random.rand(4,3)           # uniform on [0,1), shape (4,3)
c = np.random.randint(1,3,(2,3))  # integers in [1,3), shape (2,3)
d = np.random.randn(4,5)          # standard normal, shape (4,5)
e = np.random.poisson(3,5)        # Poisson with lambda=3, 5 samples

arrays = [a,b,c,d,e]
for array in arrays :
    print(array)
    print('')
[0.12487955 0.32540027 0.55050082 0.82159694]

[[0.03779511 0.84623009 0.21351958]
 [0.71318591 0.19478024 0.09046315]
 [0.50086882 0.72788469 0.02673014]
 [0.58718376 0.83127449 0.51603429]]

[[2 1 2]
 [1 1 1]]

[[ 0.25616476  1.36766857  0.65396252 -1.19638717 -0.03445382]
 [-0.39290307 -0.31627675 -1.07589886 -0.46325619  0.1533276 ]
 [-0.89151165  2.8337644  -0.02512282  0.21594233  0.2650443 ]
 [ 0.68145115  2.13745762 -0.09847417 -0.85500224 -0.1632327 ]]

[3 4 3 4 4]

In [20]:
# Random seed
np.random.seed(10)
arr1 = np.random.rand(5)
print('Array with 5 elements, random seed 10:')
print(arr1)
Array with 5 elements, random seed 10:
[0.77132064 0.02075195 0.63364823 0.74880388 0.49850701]
In [21]:
arr2 = np.random.rand(10)
print('Array with 10 elements, random seed not set:')
print(arr2)
Array with 10 elements, random seed not set:
[0.22479665 0.19806286 0.76053071 0.16911084 0.08833981 0.68535982
 0.95339335 0.00394827 0.51219226 0.81262096]
In [22]:
np.random.seed(10)
arr3 = np.random.rand(10)
print('Array with 10 elements, random seed 10:')
print(arr3)
Array with 10 elements, random seed 10:
[0.77132064 0.02075195 0.63364823 0.74880388 0.49850701 0.22479665
 0.19806286 0.76053071 0.16911084 0.08833981]
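
Note that np.random.seed sets a single hidden global state, which is why arr3 reproduces arr1's five values and then continues with arr2's first five. Since NumPy 1.17 a self-contained alternative is a local Generator object; a short sketch:

rng = np.random.default_rng(10)  # local generator with its own state
print(rng.random(5))             # reproducible, unaffected by np.random.seed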

Basic operations

In [23]:
a = np.random.rand(3,4)
b = np.random.rand(3,4)
print(a)
print('')
print(b)
[[0.68535982 0.95339335 0.00394827 0.51219226]
 [0.81262096 0.61252607 0.72175532 0.29187607]
 [0.91777412 0.71457578 0.54254437 0.14217005]]

[[0.37334076 0.67413362 0.44183317 0.43401399]
 [0.61776698 0.51313824 0.65039718 0.60103895]
 [0.8052232  0.52164715 0.90864888 0.31923609]]
In [24]:
a+b
Out[24]:
array([[1.05870058, 1.62752696, 0.44578144, 0.94620626],
       [1.43038794, 1.12566431, 1.3721525 , 0.89291502],
       [1.72299732, 1.23622294, 1.45119325, 0.46140614]])
In [25]:
a-b
Out[25]:
array([[ 0.31201906,  0.27925973, -0.43788491,  0.07817827],
       [ 0.19485398,  0.09938782,  0.07135814, -0.30916289],
       [ 0.11255093,  0.19292863, -0.36610451, -0.17706604]])
In [26]:
a*b
Out[26]:
array([[0.25587276, 0.6427145 , 0.00174448, 0.22229861],
       [0.5020104 , 0.31431055, 0.46942762, 0.17542889],
       [0.73901301, 0.37275642, 0.49298233, 0.04538581]])
In [27]:
a/b
Out[27]:
array([[1.83574871, 1.41424982, 0.0089361 , 1.18012845],
       [1.31541664, 1.19368625, 1.10971471, 0.48561922],
       [1.13977606, 1.36984508, 0.59708913, 0.44534453]])
In [28]:
# Add 3.0 to every element
a+3.0
Out[28]:
array([[3.68535982, 3.95339335, 3.00394827, 3.51219226],
       [3.81262096, 3.61252607, 3.72175532, 3.29187607],
       [3.91777412, 3.71457578, 3.54254437, 3.14217005]])
In [29]:
# Element-wise comparisons produce boolean arrays
a>b
Out[29]:
array([[ True,  True, False,  True],
       [ True,  True,  True, False],
       [ True,  True, False, False]])
In [30]:
a.min()
Out[30]:
0.003948266327914451
In [31]:
a.min(axis=0)
Out[31]:
array([0.68535982, 0.61252607, 0.00394827, 0.14217005])
In [32]:
a.min(axis=1)
Out[32]:
array([0.00394827, 0.29187607, 0.14217005])
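
The axis argument names the dimension that is collapsed: axis=0 reduces over the rows and yields one value per column, axis=1 reduces over the columns and yields one value per row. The same convention holds for sum, mean, max and the other reductions; a quick sketch:

m = np.arange(6).reshape(2,3)
print(m.sum(axis=0))  # [3 5 7]  -> one value per column
print(m.sum(axis=1))  # [ 3 12]  -> one value per row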
In [33]:
# NumPy provides its own vectorized mathematical functions (ufuncs)
np.exp(b)
Out[33]:
array([[1.45257924, 1.9623321 , 1.55555621, 1.54344047],
       [1.85478165, 1.67052549, 1.9163018 , 1.82401288],
       [2.23719578, 1.68480049, 2.48096818, 1.37607616]])
In [34]:
# Another ufunc: the element-wise cosine
np.cos(b)
Out[34]:
array([[0.93111407, 0.78124806, 0.90396932, 0.90728511],
       [0.81517389, 0.87120819, 0.79584337, 0.82474853],
       [0.69295033, 0.86699957, 0.6148119 , 0.94947544]])
In [35]:
# Functions in the math module accept only scalars and cannot handle arrays
import math
math.exp(b)
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-35-a89c724cb502> in <module>
      1 # Functions in the math library are not able to handle multi-element data
      2 import math
----> 3 math.exp(b)

TypeError: only size-1 arrays can be converted to Python scalars

Data representation

In [36]:
a = np.array([1,0,-2],dtype=np.int64)
print(a)
[ 1  0 -2]
In [37]:
b = np.array(a,dtype=np.int8)
print(b)
[ 1  0 -2]
In [38]:
c = np.array(a,dtype=np.int)
print(c)
[ 1  0 -2]
In [39]:
d = np.array(a,dtype=np.float64)
print(d)
[ 1.  0. -2.]
In [40]:
e = np.array(a,dtype=np.bool)
print(e)
[ True False  True]
In [41]:
e.dtype
Out[41]:
dtype('bool')
In [42]:
print('3 elements np.int64 correspond to', a.nbytes, 'bytes')
print('3 elements np.int8 correspond to', b.nbytes, 'bytes')
print('3 elements np.int correspond to', c.nbytes, 'bytes')
print('3 elements np.float64 correspond to', d.nbytes, 'bytes')
print('3 elements np.bool correspond to', e.nbytes, 'bytes')
3 elements np.int64 correspond to 24 bytes
3 elements np.int8 correspond to 3 bytes
3 elements np.int correspond to 24 bytes
3 elements np.float64 correspond to 24 bytes
3 elements np.bool correspond to 3 bytes
In [43]:
a = np.ones((3,4),dtype=np.int8)
b = np.ones((3,4),dtype=np.int64)
In [44]:
print('np.int8 bytes:')
print(a.tobytes())
print('')
print('np.int64 bytes:')
print(b.tobytes())
np.int8 bytes:
b'\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01'

np.int64 bytes:
b'\x01\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00'
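
Each np.int64 element occupies 8 bytes, and on a little-endian machine the value 1 is stored as \x01 followed by seven zero bytes, which is exactly the pattern above. A small sketch to inspect this:

print(b.dtype.byteorder)            # '=' means native order (little-endian on x86)
print(np.dtype(np.int64).itemsize)  # 8 bytes per element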

Data structure

In [45]:
print(a.ndim)
print(a.shape)
print(a.size)
print(a.itemsize)
2
(3, 4)
12
1
In [46]:
print(b.ndim)
print(b.shape)
print(b.size)
print(b.itemsize)
2
(3, 4)
12
8
In [47]:
print(a.nbytes)
print(b.nbytes)
12
96
In [48]:
print(a.data)
<memory at 0x11ee0f050>
In [49]:
print(a.data.tobytes())
print(a.tobytes())
b'\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01'
b'\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01'
In [50]:
print(a.flags)
print('')
print(a.T.flags)
  C_CONTIGUOUS : True
  F_CONTIGUOUS : False
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False
  UPDATEIFCOPY : False


  C_CONTIGUOUS : False
  F_CONTIGUOUS : True
  OWNDATA : False
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False
  UPDATEIFCOPY : False

In [51]:
print(a.strides)
print(b.strides)
print(a.T.strides)
(4, 1)
(32, 8)
(1, 4)
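
Strides are the number of bytes to step along each axis: in the (3,4) int8 array, moving down one row advances 4 bytes and moving right one column advances 1 byte, and the transpose simply reuses the same buffer with the strides swapped. A sketch of how an element's byte offset follows from the strides:

i, j = 2, 3
offset = i*a.strides[0] + j*a.strides[1]  # 2*4 + 3*1 = 11 for the int8 array
print(offset)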

Shape manipulation

In [52]:
# Define an array of values drawn from a standard normal distribution
a = np.random.randn(3,4)
In [53]:
print(a.reshape(1,12))
print('')
print(a)
[[-0.26731719 -0.54930901  0.1327083  -0.47614201  1.30847308  0.19501328
   0.40020999 -0.33763234  1.25647226 -0.7319695   0.66023155 -0.35087189]]

[[-0.26731719 -0.54930901  0.1327083  -0.47614201]
 [ 1.30847308  0.19501328  0.40020999 -0.33763234]
 [ 1.25647226 -0.7319695   0.66023155 -0.35087189]]
In [54]:
print(a.resize(1,12))
print('')
print(a)
None

[[-0.26731719 -0.54930901  0.1327083  -0.47614201  1.30847308  0.19501328
   0.40020999 -0.33763234  1.25647226 -0.7319695   0.66023155 -0.35087189]]
In [55]:
# resize modified a in place, so define it again
a = np.random.randn(3,4)
In [56]:
print(a.ravel())
print('')
print(a)
[-0.93943336 -0.48933722 -0.80459114 -0.21269764 -0.33914025  0.31216994
  0.56515267 -0.14742026 -0.02590534  0.2890942  -0.53987907  0.70816002]

[[-0.93943336 -0.48933722 -0.80459114 -0.21269764]
 [-0.33914025  0.31216994  0.56515267 -0.14742026]
 [-0.02590534  0.2890942  -0.53987907  0.70816002]]
In [57]:
# Define a fresh 3x4 array for the transpose example (ravel did not modify a)
a = np.random.randn(3,4)
In [58]:
print(a.T)
[[ 0.84222474 -0.11227247  1.12878515]
 [ 0.2035808  -0.36218045 -0.69781003]
 [ 2.39470366 -0.23218226 -0.08112218]
 [ 0.91745894 -0.5017289  -0.52929608]]
In [59]:
# Pitfall: transposing a 1-D array is a no-op
b = np.random.randn(4)
print(b.shape)
print(b.T.shape)
(4,)
(4,)
In [60]:
# Better: make the column vector an explicit 2-D (4,1) array
c = np.random.randn(4,1)
print(c.shape)
print(c.T.shape)
(4, 1)
(1, 4)
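
If a 1-D array really is meant as a column vector, np.newaxis (or reshape(-1,1)) makes the missing dimension explicit; a sketch using b from above:

b2 = b[:,np.newaxis]   # shape (4, 1): an explicit column vector
print(b2.shape)        # (4, 1)
print(b2.T.shape)      # (1, 4)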

Accessing array elements

In [61]:
a = np.ones((3,4),dtype=np.int64)
print(a)
[[1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]]
In [62]:
b = a
In [63]:
a[0,0]=0
In [64]:
print(b)
[[0 1 1 1]
 [1 1 1 1]
 [1 1 1 1]]
In [65]:
c = a.copy()
In [66]:
a[1,1]=0
In [67]:
print(a)
[[0 1 1 1]
 [1 0 1 1]
 [1 1 1 1]]
In [68]:
print(c)
[[0 1 1 1]
 [1 1 1 1]
 [1 1 1 1]]
In [69]:
print(b)
[[0 1 1 1]
 [1 0 1 1]
 [1 1 1 1]]
In [70]:
d = 1*a
In [71]:
a[2,2] = 0
In [72]:
print(a)
[[0 1 1 1]
 [1 0 1 1]
 [1 1 0 1]]
In [73]:
print(d)
[[0 1 1 1]
 [1 0 1 1]
 [1 1 1 1]]
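
Whether two names alias the same buffer can be checked directly with np.shares_memory; a sketch using the arrays above:

print(np.shares_memory(a,b))  # True:  b is just another name for a
print(np.shares_memory(a,c))  # False: c is an independent copy
print(np.shares_memory(a,d))  # False: 1*a allocated a new array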

Get the data

In [74]:
# Let's have a look at the loadEx.txt file
!head loadEx.txt
94.820 76.280 33.020 29.660 25.460
91.610 71.480 31.710 29.610 25.460
94.820 68.130 32.630 31.460 25.910
93.190 71.050 35.250 31.780 27.020
92.780 71.320 35.950 32.700 27.490
96.460 72.880 35.780 32.240 26.750
97.280 73.260 35.160 31.640 26.500
97.690 72.880 35.250 31.780 26.410
96.660 72.990 35.250 31.270 26.650
97.790 73.960 34.750 32.470 26.480
In [75]:
data = np.loadtxt('loadEx.txt',delimiter=' ',comments="#")
In [76]:
print(data)
[[ 94.82  76.28  33.02  29.66  25.46]
 [ 91.61  71.48  31.71  29.61  25.46]
 [ 94.82  68.13  32.63  31.46  25.91]
 ...
 [160.51 196.19 166.71 132.29 113.32]
 [160.05 195.71 165.84 131.83 113.11]
 [160.44 193.83 164.32 129.98 112.18]]
In [77]:
print(data.shape)
(3773, 5)
In [78]:
data[0,1]
Out[78]:
76.28
In [79]:
# A more complex example
dt = np.dtype([('name','S7'),('mass',np.float),
 ('position',[('x',np.float),('y',np.float),('z',np.float)]),
 ('velocity',[('x',np.float),('y',np.float),('z',np.float)])])
In [80]:
solarData = np.loadtxt('Solar.txt',dtype=dt)
In [81]:
print(solarData)
[(b'Sun', 3.32946000e+05, ( 2.13e-03, -1.60e-03, -1.20e-04), ( 5.01e-06,  4.08e-06, -1.24e-07))
 (b'Mercury', 5.52735260e-02, ( 1.62e-01,  2.64e-01,  6.94e-03), (-2.97e-02,  1.56e-02,  4.00e-03))
 (b'Venus', 8.14997513e-01, ( 3.02e-01,  6.54e-01, -8.44e-03), (-1.85e-02,  8.32e-03,  1.18e-03))
 (b'Earth', 1.00000000e+00, ( 5.66e-01, -8.46e-01, -9.12e-05), ( 1.40e-02,  9.49e-03, -5.81e-07))
 (b'Mars', 1.07446849e-01, (-4.34e-01, -1.43e+00, -1.93e-02), ( 1.39e-02, -2.88e-03, -4.02e-04))
 (b'Jupiter', 3.17828133e+02, (-2.78e+00,  4.47e+00,  4.35e-02), (-6.50e-03, -3.62e-03,  1.61e-04))
 (b'Saturn', 9.51609041e+01, (-6.08e+00, -7.84e+00,  3.78e-01), ( 4.10e-03, -3.43e-03, -1.04e-04))
 (b'Uranus', 1.45357566e+01, ( 1.95e+01,  4.68e+00, -2.35e-01), (-9.48e-04,  3.64e-03,  2.58e-05))
 (b'Neptune', 1.71470000e+01, ( 2.73e+01, -1.23e+01, -3.77e-01), ( 1.27e-03,  2.88e-03, -8.85e-05))
 (b'Pluto', 2.19100000e-03, ( 6.91e+00, -3.19e+01,  1.42e+00), ( 3.14e-03,  3.08e-05, -9.18e-04))
 (b'Halley', 3.68000000e-11, (-2.05e+01,  2.51e+01, -9.76e+00), (-7.71e-05,  9.54e-04, -1.79e-04))
 (b'Moon', 1.23031000e-02, ( 5.64e-01, -8.44e-01, -3.23e-04), ( 1.36e-02,  9.18e-03,  8.97e-06))]
In [82]:
solarData['name']
Out[82]:
array([b'Sun', b'Mercury', b'Venus', b'Earth', b'Mars', b'Jupiter',
       b'Saturn', b'Uranus', b'Neptune', b'Pluto', b'Halley', b'Moon'],
      dtype='|S7')
In [83]:
solarData['position']['x']
Out[83]:
array([ 2.13e-03,  1.62e-01,  3.02e-01,  5.66e-01, -4.34e-01, -2.78e+00,
       -6.08e+00,  1.95e+01,  2.73e+01,  6.91e+00, -2.05e+01,  5.64e-01])
In [84]:
solarData['position']['x'][np.where(solarData['name']==b'Sun')]
Out[84]:
array([0.00213])
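
Structured arrays also accept boolean masks directly, so np.where is optional; a sketch using the fields defined above:

heavy = solarData[solarData['mass'] > 1.0]  # boolean-mask selection
print(heavy['name'])                        # the bodies heavier than Earth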

Broadcasting

In [85]:
a = np.random.rand(3,5)
b = np.random.rand(8)
In [86]:
c = a[...,np.newaxis]*b
print(c.shape)
(3, 5, 8)
In [87]:
d = np.random.rand(1,10)
e = np.random.rand(10,1)
print(d.shape)
print(d)
print('')
print(e.shape)
print(e)
(1, 10)
[[0.46453081 0.78194912 0.71860281 0.58602198 0.03709441 0.35065639
  0.56319068 0.29972987 0.51233415 0.67346693]]

(10, 1)
[[0.15919373]
 [0.05047767]
 [0.33781589]
 [0.10806377]
 [0.17890281]
 [0.8858271 ]
 [0.36536497]
 [0.21876935]
 [0.75249617]
 [0.10687958]]
In [88]:
# Explicit broadcasting.
dd,ee = np.broadcast_arrays(d,e)
print(dd.shape)
print(ee.shape)
(10, 10)
(10, 10)
In [89]:
d[0,0]=-1.0
In [90]:
dd
Out[90]:
array([[-1.        ,  0.78194912,  0.71860281,  0.58602198,  0.03709441,
         0.35065639,  0.56319068,  0.29972987,  0.51233415,  0.67346693],
       [-1.        ,  0.78194912,  0.71860281,  0.58602198,  0.03709441,
         0.35065639,  0.56319068,  0.29972987,  0.51233415,  0.67346693],
       [-1.        ,  0.78194912,  0.71860281,  0.58602198,  0.03709441,
         0.35065639,  0.56319068,  0.29972987,  0.51233415,  0.67346693],
       [-1.        ,  0.78194912,  0.71860281,  0.58602198,  0.03709441,
         0.35065639,  0.56319068,  0.29972987,  0.51233415,  0.67346693],
       [-1.        ,  0.78194912,  0.71860281,  0.58602198,  0.03709441,
         0.35065639,  0.56319068,  0.29972987,  0.51233415,  0.67346693],
       [-1.        ,  0.78194912,  0.71860281,  0.58602198,  0.03709441,
         0.35065639,  0.56319068,  0.29972987,  0.51233415,  0.67346693],
       [-1.        ,  0.78194912,  0.71860281,  0.58602198,  0.03709441,
         0.35065639,  0.56319068,  0.29972987,  0.51233415,  0.67346693],
       [-1.        ,  0.78194912,  0.71860281,  0.58602198,  0.03709441,
         0.35065639,  0.56319068,  0.29972987,  0.51233415,  0.67346693],
       [-1.        ,  0.78194912,  0.71860281,  0.58602198,  0.03709441,
         0.35065639,  0.56319068,  0.29972987,  0.51233415,  0.67346693],
       [-1.        ,  0.78194912,  0.71860281,  0.58602198,  0.03709441,
         0.35065639,  0.56319068,  0.29972987,  0.51233415,  0.67346693]])
In [91]:
# ee
In [92]:
print(dd.strides)
print(ee.strides)
(0, 8)
(8, 0)
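
A stride of 0 means the pointer never moves along that axis, so the same row or column is revisited on every step: the two (10,10) arrays are views onto the original 10 values, not copies (which is also why the change to d above showed up in every row of dd). This can be confirmed directly:

print(np.shares_memory(d,dd))  # True: dd reuses d's single row
print(np.shares_memory(e,ee))  # True: ee reuses e's single column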

Simple indexing

In [93]:
# Note: the simple slices below return views, so they use no additional memory
a = np.arange(100).reshape(10,10)
In [94]:
# Access rows
a[4:9]
Out[94]:
array([[40, 41, 42, 43, 44, 45, 46, 47, 48, 49],
       [50, 51, 52, 53, 54, 55, 56, 57, 58, 59],
       [60, 61, 62, 63, 64, 65, 66, 67, 68, 69],
       [70, 71, 72, 73, 74, 75, 76, 77, 78, 79],
       [80, 81, 82, 83, 84, 85, 86, 87, 88, 89]])
In [95]:
# Access columns
a[:,3:8]
Out[95]:
array([[ 3,  4,  5,  6,  7],
       [13, 14, 15, 16, 17],
       [23, 24, 25, 26, 27],
       [33, 34, 35, 36, 37],
       [43, 44, 45, 46, 47],
       [53, 54, 55, 56, 57],
       [63, 64, 65, 66, 67],
       [73, 74, 75, 76, 77],
       [83, 84, 85, 86, 87],
       [93, 94, 95, 96, 97]])
In [96]:
# Negative indices
a[:,-1]
Out[96]:
array([ 9, 19, 29, 39, 49, 59, 69, 79, 89, 99])
In [97]:
# Slices with start:stop:step, including negative steps
a[-2::-3,1:6:2]
Out[97]:
array([[81, 83, 85],
       [51, 53, 55],
       [21, 23, 25]])

Fancy indexing

In [98]:
a[:,[1,3,1]]
Out[98]:
array([[ 1,  3,  1],
       [11, 13, 11],
       [21, 23, 21],
       [31, 33, 31],
       [41, 43, 41],
       [51, 53, 51],
       [61, 63, 61],
       [71, 73, 71],
       [81, 83, 81],
       [91, 93, 91]])
In [99]:
a[[1,3,1]][:,[1,3,1]]
Out[99]:
array([[11, 13, 11],
       [31, 33, 31],
       [11, 13, 11]])
In [100]:
a[[1,3,1],[1,3,1]]
Out[100]:
array([11, 33, 11])
In [101]:
# Multidimensional arrays indexed by multidimensional arrays.
y = np.arange(35).reshape(5,7)
print(y)
[[ 0  1  2  3  4  5  6]
 [ 7  8  9 10 11 12 13]
 [14 15 16 17 18 19 20]
 [21 22 23 24 25 26 27]
 [28 29 30 31 32 33 34]]
In [102]:
# If the index arrays have a matching shape, 
# and there is an index array for each dimension of the array being indexed, 
# the resultant array has the same shape as the index arrays, 
# and the values correspond to the index set for each position in the index arrays.
# [0,0], [2,1], and [4,2] elements of the indexed array.
y[np.array([0,2,4]), np.array([0,1,2])]
Out[102]:
array([ 0, 15, 30])
In [103]:
# If the index arrays do not have the same shape, broadcasting is attempted.
# [0,1], [2,1], and [4,1] elements of the indexed array.
y[np.array([0,2,4]), 1]
Out[103]:
array([ 1, 15, 29])
In [104]:
# If only one index array is provided, it selects whole rows; all columns are kept.
y[np.array([0,2,4])]
Out[104]:
array([[ 0,  1,  2,  3,  4,  5,  6],
       [14, 15, 16, 17, 18, 19, 20],
       [28, 29, 30, 31, 32, 33, 34]])
In [105]:
# Fancy indexing.
i0 = np.random.randint(0,10,(8,1,8)) # Array of random integers in [0,10) with shape (8,1,8).
i1 = np.random.randint(0,10,(2,8)) # Array of random integers in [0,10) with shape (2,8).
In [106]:
a[i0,i1] # broadcasting the (8,1,8) and (2,8) index arrays creates an 8×2×8 array
Out[106]:
array([[[86, 48, 90, 24, 16, 13, 65, 39],
        [83, 48, 93, 20, 19, 16, 62, 33]],

       [[46, 58, 40,  4, 26, 83,  5, 79],
        [43, 58, 43,  0, 29, 86,  2, 73]],

       [[66, 58, 80, 24, 26,  3, 15, 29],
        [63, 58, 83, 20, 29,  6, 12, 23]],

       [[36, 58, 70, 84, 76, 73, 65, 99],
        [33, 58, 73, 80, 79, 76, 62, 93]],

       [[26,  8, 40, 54, 46, 63, 95,  9],
        [23,  8, 43, 50, 49, 66, 92,  3]],

       [[46, 68, 40, 94, 86,  3, 75, 19],
        [43, 68, 43, 90, 89,  6, 72, 13]],

       [[76, 18, 10, 54, 56,  3, 85, 39],
        [73, 18, 13, 50, 59,  6, 82, 33]],

       [[76, 18, 50, 24, 36, 43, 55, 79],
        [73, 18, 53, 20, 39, 46, 52, 73]]])
In [107]:
a[i0,i1].shape
Out[107]:
(8, 2, 8)
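
Unlike the simple slices of the previous section, fancy indexing always returns a copy, so writing into the result does not touch the original array; a sketch:

sub = a[[1,3,1]]   # fancy indexing -> independent copy
sub[0,0] = -999
print(a[1,0])      # still 10: the original is unchanged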

Pandas

In [108]:
import pandas as pd
In [109]:
!head SMI.csv
Date,Open,High,Low,Close,Adj Close,Volume
1990-11-09,1378.900024,1389.0,1375.300049,1387.099976,1387.099976,0.0
1990-11-12,1388.099976,1408.099976,1388.099976,1407.5,1407.5,0.0
1990-11-13,1412.199951,1429.400024,1411.400024,1415.199951,1415.199951,0.0
1990-11-14,1413.599976,1413.599976,1402.099976,1410.300049,1410.300049,0.0
1990-11-15,1410.599976,1416.699951,1405.099976,1405.699951,1405.699951,0.0
1990-11-16,1405.699951,1407.400024,1389.400024,1395.199951,1395.199951,0.0
1990-11-19,1395.599976,1417.900024,1395.599976,1416.0,1416.0,0.0
1990-11-20,1414.800049,1415.0,1404.699951,1405.800049,1405.800049,0.0
1990-11-21,1405.599976,1405.599976,1396.699951,1398.400024,1398.400024,0.0

Loading of data and basic manipulation

In [110]:
# Series object
s = pd.Series([1,3,5,np.nan,6,8])
print(s)
print(type(s))
0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64
<class 'pandas.core.series.Series'>
In [111]:
# Dataframe object
ts = pd.read_csv('SMI.csv')
In [112]:
type(ts)
Out[112]:
pandas.core.frame.DataFrame
In [113]:
ts.head()
Out[113]:
Date Open High Low Close Adj Close Volume
0 1990-11-09 1378.900024 1389.000000 1375.300049 1387.099976 1387.099976 0.0
1 1990-11-12 1388.099976 1408.099976 1388.099976 1407.500000 1407.500000 0.0
2 1990-11-13 1412.199951 1429.400024 1411.400024 1415.199951 1415.199951 0.0
3 1990-11-14 1413.599976 1413.599976 1402.099976 1410.300049 1410.300049 0.0
4 1990-11-15 1410.599976 1416.699951 1405.099976 1405.699951 1405.699951 0.0
In [114]:
ts.tail()
Out[114]:
Date Open High Low Close Adj Close Volume
6738 2017-08-28 8864.230469 8864.230469 8864.230469 8864.230469 8864.230469 0.0
6739 2017-08-29 8814.540039 8814.540039 8814.540039 8814.540039 8814.540039 0.0
6740 2017-08-30 8851.259766 8851.259766 8851.259766 8851.259766 8851.259766 0.0
6741 2017-08-31 8925.450195 8925.450195 8925.450195 8925.450195 8925.450195 0.0
6742 2017-09-01 8941.620117 8941.620117 8941.620117 8941.620117 8941.620117 0.0
In [115]:
ts.index
Out[115]:
RangeIndex(start=0, stop=6743, step=1)
In [116]:
ts.columns
Out[116]:
Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'], dtype='object')
In [117]:
ts['Open'][:10].values
Out[117]:
array([1378.900024, 1388.099976, 1412.199951, 1413.599976, 1410.599976,
       1405.699951, 1395.599976, 1414.800049, 1405.599976, 1400.      ])
In [118]:
ts = ts.sort_values('Date')
ts.head()
Out[118]:
Date Open High Low Close Adj Close Volume
0 1990-11-09 1378.900024 1389.000000 1375.300049 1387.099976 1387.099976 0.0
1 1990-11-12 1388.099976 1408.099976 1388.099976 1407.500000 1407.500000 0.0
2 1990-11-13 1412.199951 1429.400024 1411.400024 1415.199951 1415.199951 0.0
3 1990-11-14 1413.599976 1413.599976 1402.099976 1410.300049 1410.300049 0.0
4 1990-11-15 1410.599976 1416.699951 1405.099976 1405.699951 1405.699951 0.0
In [119]:
# Find minimum and maximum values in a given column
print(ts['Volume'].min())
print(ts['Volume'].max())
0.0
346767700.0
In [120]:
# Find the index corresponding to the minimum and maximum values in a given column
# Careful: idxmin/idxmax return index labels, not integer positions
print(ts['Volume'].idxmin())
print(ts['Volume'].idxmax())
0
6079
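
The caveat is that idxmin and idxmax return index labels, not integer positions. Here the index is still the default RangeIndex, so labels and positions coincide, but once the index is replaced (as done with dates below) labels must be looked up with .loc; a sketch:

label = ts['Volume'].idxmax()
print(ts.loc[label])   # label-based lookup, valid for any index
print(ts.iloc[6079])   # position-based lookup; equivalent only for a RangeIndex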
In [121]:
# Access rows
ts[6079:6080]
Out[121]:
Date Open High Low Close Adj Close Volume
6079 2015-01-15 9259.200195 9277.200195 7932.200195 8400.599609 8400.599609 346767700.0
In [122]:
# Modify index
ts.index = pd.to_datetime(ts.pop("Date"))
In [123]:
ts = ts.sort_index()
In [124]:
ts.tail()
Out[124]:
Open High Low Close Adj Close Volume
Date
2017-08-28 8864.230469 8864.230469 8864.230469 8864.230469 8864.230469 0.0
2017-08-29 8814.540039 8814.540039 8814.540039 8814.540039 8814.540039 0.0
2017-08-30 8851.259766 8851.259766 8851.259766 8851.259766 8851.259766 0.0
2017-08-31 8925.450195 8925.450195 8925.450195 8925.450195 8925.450195 0.0
2017-09-01 8941.620117 8941.620117 8941.620117 8941.620117 8941.620117 0.0
In [125]:
import datetime as dt
ts[ts.index>dt.datetime(2010,1,1)].head()
Out[125]:
Open High Low Close Adj Close Volume
Date
2010-01-04 6578.500000 6631.399902 6576.000000 6631.399902 6631.399902 59150000.0
2010-01-05 6620.700195 6622.399902 6547.399902 6579.299805 6579.299805 65848500.0
2010-01-06 6598.200195 6607.799805 6550.100098 6559.399902 6559.399902 52305400.0
2010-01-07 6536.500000 6574.200195 6494.899902 6555.399902 6555.399902 64539000.0
2010-01-08 6574.700195 6635.799805 6574.000000 6617.899902 6617.899902 74761300.0
In [126]:
ts["Adj Close"].head()
Out[126]:
Date
1990-11-09    1387.099976
1990-11-12    1407.500000
1990-11-13    1415.199951
1990-11-14    1410.300049
1990-11-15    1405.699951
Name: Adj Close, dtype: float64
In [127]:
ts["Adj Close"].describe()
Out[127]:
count    6743.000000
mean     5957.266658
std      2236.843089
min      1287.599976
25%      4561.000000
50%      6374.700195
75%      7790.649902
max      9531.500000
Name: Adj Close, dtype: float64
In [128]:
# Access individual statistics returned by describe
ts['Adj Close'].describe()['count']
Out[128]:
6743.0

Timeseries applications

In [129]:
# Resampling of time series
# Create a series with 9 timestamps, one minute apart (freq='T')
index = pd.date_range('1/6/2018', periods=9, freq='T')
series = pd.Series(range(9), index=index)
print(series)
2018-01-06 00:00:00    0
2018-01-06 00:01:00    1
2018-01-06 00:02:00    2
2018-01-06 00:03:00    3
2018-01-06 00:04:00    4
2018-01-06 00:05:00    5
2018-01-06 00:06:00    6
2018-01-06 00:07:00    7
2018-01-06 00:08:00    8
Freq: T, dtype: int64
In [130]:
# Downsample the series into 3-minute bins and sum within each bin
series.resample('3T').sum()
Out[130]:
2018-01-06 00:00:00     3
2018-01-06 00:03:00    12
2018-01-06 00:06:00    21
Freq: 3T, dtype: int64
In [131]:
# Label the bin using the upper bound
series.resample('3T', label='right').sum()
Out[131]:
2018-01-06 00:03:00     3
2018-01-06 00:06:00    12
2018-01-06 00:09:00    21
Freq: 3T, dtype: int64
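
Resampling also works in the other direction: upsampling to a finer grid creates new timestamps with missing values, which can then be filled, e.g. by carrying the last observation forward; a sketch on the same series:

series.resample('30S').ffill().head()  # 30-second bins, forward-filled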
In [132]:
# DataFrame.resample(rule, axis=0)
# The object must have a datetime-like index
ts_monthly = ts["Adj Close"].resample("M").apply(["median","mean","std","count","max","min"]).head()
In [133]:
ts_monthly
Out[133]:
median mean std count max min
Date
1990-11-30 1392.000000 1390.387497 20.156853 16 1416.000000 1353.699951
1990-12-31 1405.349976 1404.744446 21.675076 18 1450.300049 1371.199951
1991-01-31 1350.000000 1357.580956 44.510815 21 1438.599976 1287.599976
1991-02-28 1538.799988 1530.924988 50.931718 20 1603.199951 1448.099976
1991-03-31 1614.649964 1611.519995 24.735199 20 1650.599976 1559.000000

Basic visualisation

In [134]:
# Daily returns of the adjusted close, then two 30-day rolling statistics
day_return = ts["Adj Close"].pct_change().dropna()
mean_30day = day_return.rolling(30).mean()

# Mid-range (average of max and min) over each 30-day window
minmax_30day = day_return.rolling(30).apply(lambda x: (np.max(x)+np.min(x))*0.5)

# Aggregate both rolling series to monthly means and plot them
mean_30day.resample("M").apply(["mean"]).plot()
minmax_30day.resample("M").apply(["mean"]).plot()

import matplotlib.pyplot as plt
plt.show()

Creating timeseries and filling missing values

In [135]:
dates = pd.date_range(ts.index.min(),ts.index.max(),freq="D")
print(dates)
DatetimeIndex(['1990-11-09', '1990-11-10', '1990-11-11', '1990-11-12',
               '1990-11-13', '1990-11-14', '1990-11-15', '1990-11-16',
               '1990-11-17', '1990-11-18',
               ...
               '2017-08-23', '2017-08-24', '2017-08-25', '2017-08-26',
               '2017-08-27', '2017-08-28', '2017-08-29', '2017-08-30',
               '2017-08-31', '2017-09-01'],
              dtype='datetime64[ns]', length=9794, freq='D')
In [136]:
ts_alldays = pd.Series(index=dates,data=ts["Adj Close"])
In [137]:
ts_alldays.head()
Out[137]:
1990-11-09    1387.099976
1990-11-10            NaN
1990-11-11            NaN
1990-11-12    1407.500000
1990-11-13    1415.199951
Freq: D, Name: Adj Close, dtype: float64
In [138]:
ts_alldays.fillna(method="ffill",inplace=True)
ts_alldays.head()
Out[138]:
1990-11-09    1387.099976
1990-11-10    1387.099976
1990-11-11    1387.099976
1990-11-12    1407.500000
1990-11-13    1415.199951
Freq: D, Name: Adj Close, dtype: float64
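
Forward-filling carries the last quote over weekends and holidays. If a smoother series is preferred, the gaps can instead be interpolated; a sketch on a fresh, unfilled copy:

ts_gaps = pd.Series(index=dates,data=ts["Adj Close"])
ts_gaps.interpolate(method="time").head()  # linear in time across the NaNs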

Hook up to data sources

In [139]:
# pd.__version__
In [140]:
# pip install pandas-datareader
In [141]:
version = [int(v) for v in pd.__version__.split('.')]
if (version[0] == 0 and version[1] >= 17) or (version[0] == 1): # Test if version is >= 0.17
    from pandas_datareader import data, wb
else:
    from pandas.io import data, wb
/Applications/anaconda3/lib/python3.7/site-packages/pandas_datareader/compat/__init__.py:7: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.
  from pandas.util.testing import assert_frame_equal
In [142]:
# Retrieve information from FRED
import datetime
start = datetime.datetime(2010, 1, 1)
end = datetime.datetime(2018, 1, 1)
# df = data.DataReader('F', 'google', start, end)
df = data.DataReader('GDP', 'fred', start, end)
print(df.shape)
print(df.head())
print(df.tail())
(33, 1)
                  GDP
DATE                 
2010-01-01  14721.350
2010-04-01  14926.098
2010-07-01  15079.917
2010-10-01  15240.843
2011-01-01  15285.828
                  GDP
DATE                 
2017-01-01  19190.431
2017-04-01  19356.649
2017-07-01  19611.704
2017-10-01  19918.910
2018-01-01  20163.159
In [143]:
# Let's say we want to compare Gross Domestic Product per capita in constant dollars across several countries
# wb.search('gdp.*capita.*const')

Spreadsheet operations

In [144]:
# Let's use the download function to acquire the data from the World Bank’s servers
# gdp_data = wb.download(indicator='NY.GDP.PCAP.KD',country=['CH','US','GB','DE'],start=2006,end=2016)
# gdp_data.head(20)
In [145]:
# gdp_data.shape
In [146]:
# gdp_data.columns
In [147]:
# gdp_data.unstack(level=0)
In [148]:
# gdp_data.unstack(level=1)
In [149]:
# gdp_data.groupby(level=0).mean()
In [150]:
# gdp_data.groupby(level=0).std()

Further work with DataFrames

In [151]:
df_us_zip = pd.read_csv("us_postal_codes.csv")
In [152]:
df_us_zip.shape
Out[152]:
(40933, 7)
In [153]:
df_us_zip.columns
Out[153]:
Index(['Zip Code', 'Place Name', 'State', 'State Abbreviation', 'County',
       'Latitude', 'Longitude'],
      dtype='object')
In [154]:
df_us_zip.describe()
Out[154]:
Zip Code Latitude Longitude
count 40933.000000 40933.000000 40933.000000
mean 49819.569858 38.596225 -91.082332
std 27808.948650 5.255750 15.763730
min 501.000000 7.112800 -176.658100
25% 26451.000000 35.052600 -97.308100
50% 49036.000000 39.152200 -87.976700
75% 73042.000000 41.894300 -80.142300
max 99950.000000 71.234600 171.237000
In [155]:
df_us_zip.dtypes
Out[155]:
Zip Code                int64
Place Name             object
State                  object
State Abbreviation     object
County                 object
Latitude              float64
Longitude             float64
dtype: object
In [156]:
df_us_zip.head()
Out[156]:
Zip Code Place Name State State Abbreviation County Latitude Longitude
0 501 Holtsville New York NY Suffolk 40.8154 -73.0451
1 544 Holtsville New York NY Suffolk 40.8154 -73.0451
2 1001 Agawam Massachusetts MA Hampden 42.0702 -72.6227
3 1002 Amherst Massachusetts MA Hampshire 42.3671 -72.4646
4 1003 Amherst Massachusetts MA Hampshire 42.3919 -72.5248
In [157]:
df_us_state_coord = df_us_zip.get(["State Abbreviation","Latitude","Longitude"]).groupby(["State Abbreviation"]).mean()
In [158]:
df_us_state_coord.shape
Out[158]:
(57, 2)
In [159]:
df_us_state_coord.head()
Out[159]:
Latitude Longitude
State Abbreviation
AA 33.036400 -82.249300
AK 61.456423 -152.486981
AL 32.886361 -86.813639
AP 32.349325 -112.935950
AR 35.124723 -92.402676
In [160]:
# How many entries have "Washington" as "Place Name"?
# Let's count the unique values in the "Place Name" field
# Series.value_counts(normalize=False, sort=True, ascending=False, bins=None, dropna=True)
df_us_zip["Place Name"].value_counts().head()
Out[160]:
Washington    295
Houston       187
New York      146
El Paso       139
Dallas        114
Name: Place Name, dtype: int64
In [161]:
# Cross-check
df_us_zip[df_us_zip['Place Name']=='Washington'].shape
Out[161]:
(295, 7)
In [162]:
df_us_places = df_us_zip.get(["Place Name","State Abbreviation","Latitude","Longitude"])
df_us_places = df_us_places.groupby(["Place Name","State Abbreviation"]).mean()
print(df_us_places.shape)
df_us_places
(29545, 2)
Out[162]:
Latitude Longitude
Place Name State Abbreviation
APO AA 33.03640 -82.2493
AP 34.28285 -105.0628
Aaronsburg PA 40.89870 -77.4562
Abbeville AL 31.57550 -85.2790
GA 31.96480 -83.3068
... ... ... ...
Zuni NM 35.06840 -108.8336
VA 36.84370 -76.8110
Zurich MT 48.58440 -109.0304
Zwingle IA 42.27750 -90.7507
Zwolle LA 31.61380 -93.6636

29545 rows × 2 columns

In [163]:
df_us_places.reset_index(inplace=True)
print(df_us_places.shape)
print(df_us_places.columns)
(29545, 4)
Index(['Place Name', 'State Abbreviation', 'Latitude', 'Longitude'], dtype='object')
In [164]:
df_us_places["Place Name"].value_counts().head()
Out[164]:
Franklin      27
Clinton       26
Madison       26
Washington    26
Salem         24
Name: Place Name, dtype: int64
In [165]:
# Cross-check
df_us_places[df_us_places['Place Name']=='Franklin'].shape
Out[165]:
(27, 4)
In [166]:
# Mapping
# Map values of Series using input correspondence (a dict, Series, or function).
# Series.map(arg, na_action=None)
df_us_places["isSwiss"] = df_us_places["Place Name"].map(lambda x: any([s in x for s in ["Zurich", "Berne", "Basel", "Lucerne", "Glarus", "Geneva"]]))
df_us_places[df_us_places["isSwiss"]]
Out[166]:
Place Name State Abbreviation Latitude Longitude isSwiss
2096 Berne IN 40.6716 -84.9343 True
2097 Berne NY 42.6108 -74.1466 True
7523 East Berne NY 42.6191 -74.0555 True
9916 Geneva AL 31.0414 -85.8847 True
9917 Geneva FL 28.7503 -81.1114 True
9918 Geneva GA 32.5799 -84.5508 True
9919 Geneva IA 42.6755 -93.1294 True
9920 Geneva ID 42.3136 -111.0722 True
9921 Geneva IL 41.8860 -88.3110 True
9922 Geneva IN 40.6071 -84.9621 True
9923 Geneva MN 43.8235 -93.2671 True
9924 Geneva NE 40.5277 -97.6096 True
9925 Geneva NY 42.8637 -76.9913 True
9926 Geneva OH 41.8029 -80.9474 True
14181 Lake Geneva FL 29.7683 -81.9907 True
14182 Lake Geneva WI 42.5881 -88.4554 True
14253 Lake Zurich IL 42.2165 -88.0769 True
15522 Lucerne CA 39.0783 -122.7846 True
15523 Lucerne CO 40.4824 -104.7054 True
15524 Lucerne IN 40.8614 -86.4077 True
15525 Lucerne MO 40.4382 -93.2867 True
15526 Lucerne Valley CA 34.4470 -116.9189 True
15527 Lucernemines PA 40.5567 -79.1515 True
18544 New Geneva PA 39.7884 -79.9092 True
18547 New Glarus WI 42.8143 -89.6437 True
29542 Zurich MT 48.5844 -109.0304 True
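
The same flag can be computed without a Python-level lambda by joining the names into a regular expression and using pandas' vectorized string methods; a sketch:

pattern = "Zurich|Berne|Basel|Lucerne|Glarus|Geneva"
swiss_mask = df_us_places["Place Name"].str.contains(pattern)
print(swiss_mask.sum())  # should match the count of the map-based version above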

Merging data

In [167]:
df1 = df_us_zip[:5].copy()
df2 = df_us_zip[5:10].copy()
print(df1.head())
print(df2.head())
   Zip Code  Place Name          State State Abbreviation     County  \
0       501  Holtsville       New York                 NY    Suffolk   
1       544  Holtsville       New York                 NY    Suffolk   
2      1001      Agawam  Massachusetts                 MA    Hampden   
3      1002     Amherst  Massachusetts                 MA  Hampshire   
4      1003     Amherst  Massachusetts                 MA  Hampshire   

   Latitude  Longitude  
0   40.8154   -73.0451  
1   40.8154   -73.0451  
2   42.0702   -72.6227  
3   42.3671   -72.4646  
4   42.3919   -72.5248  
   Zip Code   Place Name          State State Abbreviation     County  \
5      1004      Amherst  Massachusetts                 MA  Hampshire   
6      1005        Barre  Massachusetts                 MA  Worcester   
7      1007  Belchertown  Massachusetts                 MA  Hampshire   
8      1008    Blandford  Massachusetts                 MA    Hampden   
9      1009   Bondsville  Massachusetts                 MA    Hampden   

   Latitude  Longitude  
5   42.3845   -72.5132  
6   42.4097   -72.1084  
7   42.2751   -72.4110  
8   42.1829   -72.9361  
9   42.2061   -72.3405  
In [168]:
dfs = [df1,df2]
In [169]:
result = df1.append(df2)
print(result)
   Zip Code   Place Name          State State Abbreviation     County  \
0       501   Holtsville       New York                 NY    Suffolk   
1       544   Holtsville       New York                 NY    Suffolk   
2      1001       Agawam  Massachusetts                 MA    Hampden   
3      1002      Amherst  Massachusetts                 MA  Hampshire   
4      1003      Amherst  Massachusetts                 MA  Hampshire   
5      1004      Amherst  Massachusetts                 MA  Hampshire   
6      1005        Barre  Massachusetts                 MA  Worcester   
7      1007  Belchertown  Massachusetts                 MA  Hampshire   
8      1008    Blandford  Massachusetts                 MA    Hampden   
9      1009   Bondsville  Massachusetts                 MA    Hampden   

   Latitude  Longitude  
0   40.8154   -73.0451  
1   40.8154   -73.0451  
2   42.0702   -72.6227  
3   42.3671   -72.4646  
4   42.3919   -72.5248  
5   42.3845   -72.5132  
6   42.4097   -72.1084  
7   42.2751   -72.4110  
8   42.1829   -72.9361  
9   42.2061   -72.3405  
In [170]:
result = pd.concat(dfs)
print(result)
   Zip Code   Place Name          State State Abbreviation     County  \
0       501   Holtsville       New York                 NY    Suffolk   
1       544   Holtsville       New York                 NY    Suffolk   
2      1001       Agawam  Massachusetts                 MA    Hampden   
3      1002      Amherst  Massachusetts                 MA  Hampshire   
4      1003      Amherst  Massachusetts                 MA  Hampshire   
5      1004      Amherst  Massachusetts                 MA  Hampshire   
6      1005        Barre  Massachusetts                 MA  Worcester   
7      1007  Belchertown  Massachusetts                 MA  Hampshire   
8      1008    Blandford  Massachusetts                 MA    Hampden   
9      1009   Bondsville  Massachusetts                 MA    Hampden   

   Latitude  Longitude  
0   40.8154   -73.0451  
1   40.8154   -73.0451  
2   42.0702   -72.6227  
3   42.3671   -72.4646  
4   42.3919   -72.5248  
5   42.3845   -72.5132  
6   42.4097   -72.1084  
7   42.2751   -72.4110  
8   42.1829   -72.9361  
9   42.2061   -72.3405  
In [171]:
df1 = df_us_zip[['Zip Code','Place Name','State']][:5].copy()
df2 = df_us_zip[['Zip Code','Latitude','Longitude']][3:8].copy()
dfs = [df1,df2]
print(df1)
print(df2)
   Zip Code  Place Name          State
0       501  Holtsville       New York
1       544  Holtsville       New York
2      1001      Agawam  Massachusetts
3      1002     Amherst  Massachusetts
4      1003     Amherst  Massachusetts
   Zip Code  Latitude  Longitude
3      1002   42.3671   -72.4646
4      1003   42.3919   -72.5248
5      1004   42.3845   -72.5132
6      1005   42.4097   -72.1084
7      1007   42.2751   -72.4110
In [172]:
result = pd.concat(dfs,axis=1)
print(result)
   Zip Code  Place Name          State  Zip Code  Latitude  Longitude
0     501.0  Holtsville       New York       NaN       NaN        NaN
1     544.0  Holtsville       New York       NaN       NaN        NaN
2    1001.0      Agawam  Massachusetts       NaN       NaN        NaN
3    1002.0     Amherst  Massachusetts    1002.0   42.3671   -72.4646
4    1003.0     Amherst  Massachusetts    1003.0   42.3919   -72.5248
5       NaN         NaN            NaN    1004.0   42.3845   -72.5132
6       NaN         NaN            NaN    1005.0   42.4097   -72.1084
7       NaN         NaN            NaN    1007.0   42.2751   -72.4110
In [173]:
result = pd.merge(df1,df2,how='inner',on='Zip Code')
print(result)
   Zip Code Place Name          State  Latitude  Longitude
0      1002    Amherst  Massachusetts   42.3671   -72.4646
1      1003    Amherst  Massachusetts   42.3919   -72.5248
In [174]:
result = pd.merge(df1,df2,how='left',on='Zip Code')
print(result)
   Zip Code  Place Name          State  Latitude  Longitude
0       501  Holtsville       New York       NaN        NaN
1       544  Holtsville       New York       NaN        NaN
2      1001      Agawam  Massachusetts       NaN        NaN
3      1002     Amherst  Massachusetts   42.3671   -72.4646
4      1003     Amherst  Massachusetts   42.3919   -72.5248
In [175]:
result = pd.merge(df1,df2,how='right',on='Zip Code')
print(result)
   Zip Code Place Name          State  Latitude  Longitude
0      1002    Amherst  Massachusetts   42.3671   -72.4646
1      1003    Amherst  Massachusetts   42.3919   -72.5248
2      1004        NaN            NaN   42.3845   -72.5132
3      1005        NaN            NaN   42.4097   -72.1084
4      1007        NaN            NaN   42.2751   -72.4110
In [176]:
result = pd.merge(df1,df2,how='outer',on='Zip Code')
print(result)
   Zip Code  Place Name          State  Latitude  Longitude
0       501  Holtsville       New York       NaN        NaN
1       544  Holtsville       New York       NaN        NaN
2      1001      Agawam  Massachusetts       NaN        NaN
3      1002     Amherst  Massachusetts   42.3671   -72.4646
4      1003     Amherst  Massachusetts   42.3919   -72.5248
5      1004         NaN            NaN   42.3845   -72.5132
6      1005         NaN            NaN   42.4097   -72.1084
7      1007         NaN            NaN   42.2751   -72.4110
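
When debugging a merge, the indicator argument adds a _merge column recording whether each row came from the left frame, the right frame, or both; a sketch:

result = pd.merge(df1,df2,how='outer',on='Zip Code',indicator=True)
print(result['_merge'].value_counts())  # both / left_only / right_only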

Pickle, JSON and YAML files

In [177]:
import pickle
import json
import yaml

Let's define a class Foo().

In [178]:
class Foo():
    def __init__(self):
        self.x = "bar"
In [179]:
# Create object of class Foo() and write to Pickle file
obj = Foo()
with open("example.pkl","wb") as f_o:
    pickle.dump(obj,f_o)
In [180]:
# Show as string
pickle.dumps(obj)
Out[180]:
b'\x80\x03c__main__\nFoo\nq\x00)\x81q\x01}q\x02X\x01\x00\x00\x00xq\x03X\x03\x00\x00\x00barq\x04sb.'
In [181]:
# Read from Pickle file
with open("example.pkl","rb") as f_i:
    new_obj = pickle.load(f_i)
print(new_obj.x)
bar
In [182]:
# Create a dictionary and write to JSON file
entry = {"1" : "Hello", "2" : "Bye", "3" : 4.35}
with open("example.json","w") as f_o:
    json.dump(entry,f_o)
In [183]:
# Show as string
json.dumps(entry)
Out[183]:
'{"1": "Hello", "2": "Bye", "3": 4.35}'
In [184]:
# Read from JSON file
with open("example.json","r") as f_i:
    new_entry = json.load(f_i)
print(new_entry)
{'1': 'Hello', '2': 'Bye', '3': 4.35}
In [185]:
# Create a dictionary and write to YAML file
data = {
    'first_data':[1,2,3,4,5],
    'second_data':'Just a string.',
    'third_data': dict(a=1.1,b=1.2,c=1.3),
}
with open('example.yaml','w') as f_o :
    yaml.dump(data,f_o,default_flow_style=False)
In [186]:
# Read from YAML file
with open('example.yaml','r') as f_i:
    new_data = yaml.load(f_i)
print(new_data)
print(new_data['third_data']['a'])
{'first_data': [1, 2, 3, 4, 5], 'second_data': 'Just a string.', 'third_data': {'a': 1.1, 'b': 1.2, 'c': 1.3}}
1.1
/Applications/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:3: YAMLLoadWarning: calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. Please read https://msg.pyyaml.org/load for full details.
  This is separate from the ipykernel package so we can avoid doing imports until
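
The warning means the default loader can construct arbitrary Python objects from a YAML file. For plain data, the safe loader is preferable; a sketch:

with open('example.yaml','r') as f_i:
    new_data = yaml.safe_load(f_i)  # no warning, refuses arbitrary object construction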
In [187]:
%%writefile example2.yaml
- &flag red
- green
- blue
- *flag
Writing example2.yaml
In [188]:
!head example2.yaml
- &flag red
- green
- blue
- *flag
In [189]:
with open('example2.yaml','r') as f_i:
    data_example2 = yaml.load(f_i)
print(data_example2)
['red', 'green', 'blue', 'red']
/Applications/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:2: YAMLLoadWarning: calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. Please read https://msg.pyyaml.org/load for full details.
  

sqlite3

In [190]:
import sqlite3 as sql
!cp Solar_bkup.db Solar.db
In [191]:
conn = sql.connect("Solar.db")
In [192]:
results = conn.execute("SELECT * FROM solarsystem")
In [193]:
for row in results:
    print(row)
(0, 'Sun', 332946.0, 0.00213, -0.0016, -0.00011999999999999999, 5.01e-06, 4.08e-06, -1.24e-07)
(1, 'Mercury', 0.055273525999999996, 0.162, 0.264, 0.006940000000000001, -0.0297, 0.0156, 0.004)
(2, 'Venus', 0.814997513, 0.302, 0.654, -0.008440000000000001, -0.0185, 0.008320000000000001, 0.00118)
(3, 'Earth', 1.0, 0.5660000000000001, -0.846, -9.120000000000001e-05, 0.014, 0.00949, -5.81e-07)
(4, 'Mars', 0.107446849, -0.434, -1.43, -0.0193, 0.0139, -0.00288, -0.00040199999999999996)
(5, 'Jupiter', 317.828133, -2.78, 4.47, 0.0435, -0.0065, -0.0036200000000000004, 0.000161)
(6, 'Saturn', 95.1609041, -6.08, -7.84, 0.37799999999999995, 0.0041, -0.00343, -0.00010400000000000001)
(7, 'Uranus', 14.5357566, 19.5, 4.68, -0.235, -0.0009480000000000001, 0.00364, 2.58e-05)
(8, 'Neptune', 17.147000000000002, 27.3, -12.3, -0.377, 0.0012699999999999999, 0.00288, -8.85e-05)
(9, 'Pluto', 0.002191, 6.91, -31.9, 1.42, 0.00314, 3.0799999999999996e-05, -0.000918)
(10, 'Halley', 3.68e-11, -20.5, 25.1, -9.76, -7.709999999999999e-05, 0.0009539999999999999, -0.00017900000000000001)
(11, 'Moon', 0.0123031, 0.564, -0.8440000000000001, -0.000323, 0.0136, 0.009179999999999999, 8.97e-06)
In [194]:
conn.execute("DELETE FROM solarsystem WHERE name='Pluto'")
conn.commit()
In [195]:
results = conn.execute("SELECT * FROM solarsystem")
for row in results:
    print(row)
(0, 'Sun', 332946.0, 0.00213, -0.0016, -0.00011999999999999999, 5.01e-06, 4.08e-06, -1.24e-07)
(1, 'Mercury', 0.055273525999999996, 0.162, 0.264, 0.006940000000000001, -0.0297, 0.0156, 0.004)
(2, 'Venus', 0.814997513, 0.302, 0.654, -0.008440000000000001, -0.0185, 0.008320000000000001, 0.00118)
(3, 'Earth', 1.0, 0.5660000000000001, -0.846, -9.120000000000001e-05, 0.014, 0.00949, -5.81e-07)
(4, 'Mars', 0.107446849, -0.434, -1.43, -0.0193, 0.0139, -0.00288, -0.00040199999999999996)
(5, 'Jupiter', 317.828133, -2.78, 4.47, 0.0435, -0.0065, -0.0036200000000000004, 0.000161)
(6, 'Saturn', 95.1609041, -6.08, -7.84, 0.37799999999999995, 0.0041, -0.00343, -0.00010400000000000001)
(7, 'Uranus', 14.5357566, 19.5, 4.68, -0.235, -0.0009480000000000001, 0.00364, 2.58e-05)
(8, 'Neptune', 17.147000000000002, 27.3, -12.3, -0.377, 0.0012699999999999999, 0.00288, -8.85e-05)
(10, 'Halley', 3.68e-11, -20.5, 25.1, -9.76, -7.709999999999999e-05, 0.0009539999999999999, -0.00017900000000000001)
(11, 'Moon', 0.0123031, 0.564, -0.8440000000000001, -0.000323, 0.0136, 0.009179999999999999, 8.97e-06)
In [196]:
death_star = [12,'Death Star',0.1,0.564,-0.845,-9.12e-05,0.014,0.00949,-5.81e-07]
conn.execute("INSERT INTO solarsystem VALUES (?,?,?,?,?,?,?,?,?)",death_star)
Out[196]:
<sqlite3.Cursor at 0x1234c5650>
In [197]:
results = conn.execute("SELECT * FROM solarsystem")
for row in results:
    print(row)
(0, 'Sun', 332946.0, 0.00213, -0.0016, -0.00011999999999999999, 5.01e-06, 4.08e-06, -1.24e-07)
(1, 'Mercury', 0.055273525999999996, 0.162, 0.264, 0.006940000000000001, -0.0297, 0.0156, 0.004)
(2, 'Venus', 0.814997513, 0.302, 0.654, -0.008440000000000001, -0.0185, 0.008320000000000001, 0.00118)
(3, 'Earth', 1.0, 0.5660000000000001, -0.846, -9.120000000000001e-05, 0.014, 0.00949, -5.81e-07)
(4, 'Mars', 0.107446849, -0.434, -1.43, -0.0193, 0.0139, -0.00288, -0.00040199999999999996)
(5, 'Jupiter', 317.828133, -2.78, 4.47, 0.0435, -0.0065, -0.0036200000000000004, 0.000161)
(6, 'Saturn', 95.1609041, -6.08, -7.84, 0.37799999999999995, 0.0041, -0.00343, -0.00010400000000000001)
(7, 'Uranus', 14.5357566, 19.5, 4.68, -0.235, -0.0009480000000000001, 0.00364, 2.58e-05)
(8, 'Neptune', 17.147000000000002, 27.3, -12.3, -0.377, 0.0012699999999999999, 0.00288, -8.85e-05)
(10, 'Halley', 3.68e-11, -20.5, 25.1, -9.76, -7.709999999999999e-05, 0.0009539999999999999, -0.00017900000000000001)
(11, 'Moon', 0.0123031, 0.564, -0.8440000000000001, -0.000323, 0.0136, 0.009179999999999999, 8.97e-06)
(12, 'Death Star', 0.1, 0.564, -0.845, -9.12e-05, 0.014, 0.00949, -5.81e-07)
In [198]:
more_death_stars = list()
for i in range(10):
    death_star[0] +=1
    death_star[1]  = "Death Star "+str(i)
    more_death_stars.append(death_star.copy())
In [199]:
conn.executemany("INSERT INTO solarsystem VALUES (?,?,?,?,?,?,?,?,?)",more_death_stars)
Out[199]:
<sqlite3.Cursor at 0x1234c5a40>
In [200]:
conn.commit()
In [201]:
# Row factory returning each row as a dict keyed by column name
def dict_factory(cursor, row):
    d = {}
    for idx,col in enumerate(cursor.description):
        d[col[0]] = row[idx]
    return d
conn.row_factory = dict_factory
In [202]:
max_mass = 1.0
results = conn.execute("SELECT name,mass FROM solarsystem WHERE mass<?",[max_mass])
for row in results:
    print(row)
{'name': 'Mercury', 'mass': 0.055273525999999996}
{'name': 'Venus', 'mass': 0.814997513}
{'name': 'Mars', 'mass': 0.107446849}
{'name': 'Halley', 'mass': 3.68e-11}
{'name': 'Moon', 'mass': 0.0123031}
{'name': 'Death Star', 'mass': 0.1}
{'name': 'Death Star 0', 'mass': 0.1}
{'name': 'Death Star 1', 'mass': 0.1}
{'name': 'Death Star 2', 'mass': 0.1}
{'name': 'Death Star 3', 'mass': 0.1}
{'name': 'Death Star 4', 'mass': 0.1}
{'name': 'Death Star 5', 'mass': 0.1}
{'name': 'Death Star 6', 'mass': 0.1}
{'name': 'Death Star 7', 'mass': 0.1}
{'name': 'Death Star 8', 'mass': 0.1}
{'name': 'Death Star 9', 'mass': 0.1}
In [203]:
results = conn.execute("SELECT AVG(mass) as mean_mass, COUNT(*) as n, mass>1.0 as larger_than_earth "+
                       "FROM solarsystem WHERE mass<>1.0 GROUP BY mass<1.0")
for row in results:
    print(row)
{'mean_mass': 66678.13435873999, 'n': 5, 'larger_than_earth': 1}
{'mean_mass': 0.13062631175230005, 'n': 16, 'larger_than_earth': 0}
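
As an aside, sqlite3 connections work as context managers: the with block wraps a transaction that is committed on success and rolled back on an exception (it does not close the connection). A sketch that cleans up the fictional entries:

with conn:
    conn.execute("DELETE FROM solarsystem WHERE name LIKE 'Death Star%'")
conn.close()  # the context manager commits, but closing is still explicit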

MongoDB

You need to install MongoDB and start the server by running mongod in your terminal.

In [204]:
# pip install pymongo
In [205]:
import pymongo
In [206]:
# Connect to the server; localhost and port 27017 are also the defaults
client = pymongo.MongoClient("localhost",27017)
# Get the database (if not existing it will be created)
db = client["tweets"]
# Get the collection (if not existing it will be created)
collection = db["uzh"]
In [207]:
print(collection)
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'tweets'), 'uzh')
In [208]:
with open("uzh.json","r") as f_i:
    tweets = json.load(f_i)
In [209]:
# print(tweets)
In [210]:
# collection.insert_one(tweets[0])
In [211]:
# print(collection)
In [212]:
# print(collection.find()[0])
In [213]:
# collection.insert_many(tweets[1:])
In [214]:
# collection.find_one()
In [215]:
# for tweet in collection.find({'timestamp' : {'$gt':'2017-01-01'}}):
      # print(tweet["text"])
In [216]:
import pprint
In [217]:
# pprint.pprint(collection.find_one())
In [218]:
# collection.delete_many({})
In [ ]: