Federica Lionetto (federica.lionetto@gmail.com)
The content of the lecture may be reused, in whole or in part, under the CC BY-SA 4.0 license
# Creating a standard Python list
L = list(range(1000))
# How long does it take to calculate the element-wise square?
%timeit [i**2 for i in L]
# Now do the same with a NumPy array
import numpy as np
a = np.arange(1000)
%timeit a**2
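# Aside (a minimal sketch, not in the original lecture): the same comparison
# outside IPython, using the standard-library timeit module; timings vary by machine.
import timeit
t_list = timeit.timeit('[i**2 for i in L]', globals={'L': list(range(1000))}, number=1000)
t_numpy = timeit.timeit('a**2', globals={'a': np.arange(1000)}, number=1000)
print('list comprehension:', t_list, 's for 1000 runs')
print('NumPy vectorized:', t_numpy, 's for 1000 runs')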
np.__version__
np.show_config()
a = np.array([1,2,4])  # from an explicit list
print(a)
b = np.arange(1,15,2)  # start, stop (exclusive), step
print(b)
c = np.linspace(0,1,6)  # 6 evenly spaced points, endpoints included
print(c)
d = np.empty((1,3))  # uninitialized: contents are arbitrary
print(d)
e = np.zeros((2,5,3))
print(e)
f = np.ones((3,3))
print(f)
g = np.eye(4)  # identity matrix
print(g)
h = np.identity(4)  # same as np.eye for square matrices
print(h)
i = np.diag(np.array([1,2,3,4]))  # diagonal matrix from a 1D array
print(i)
l = np.diag(np.array([1,2,3,4]),k=-1)  # shifted one below the main diagonal
print(l)
m = np.diag(np.array([1,2,3,4]),k=2)  # shifted two above the main diagonal
print(m)
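# Sketch (an assumed combination of the constructors above, not from the
# original lecture): the k offsets of np.diag can be combined, e.g. to build
# a tridiagonal matrix.
tri = np.diag(np.full(4,2)) + np.diag(np.full(3,-1),k=1) + np.diag(np.full(3,-1),k=-1)
print(tri)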
# arrays = [a,b,c,d,e,f,g,h,i,l,m]
# for array in arrays:
#     print(array)
#     print('')
a = np.random.rand(4)  # uniform in [0,1), shape (4,)
b = np.random.rand(4,3)  # uniform in [0,1), shape (4,3)
c = np.random.randint(1,3,(2,3))  # integers in {1,2} (upper bound exclusive), shape (2,3)
d = np.random.randn(4,5)  # standard normal, shape (4,5)
e = np.random.poisson(3,5)  # Poisson with mean 3, 5 samples
arrays = [a,b,c,d,e]
for array in arrays:
    print(array)
    print('')
# Random seed
np.random.seed(10)
arr1 = np.random.rand(5)
print('Array with 5 elements, random seed 10:')
print(arr1)
arr2 = np.random.rand(10)
print('Array with 10 elements, random seed not set:')
print(arr2)
np.random.seed(10)
arr3 = np.random.rand(10)
print('Array with 10 elements, random seed 10:')
print(arr3)
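# Aside (a minimal sketch, assuming NumPy >= 1.17): the Generator API is the
# modern replacement for the legacy np.random.seed interface; note that it
# produces a different stream than the legacy functions above.
rng = np.random.default_rng(10)
print(rng.random(5))  # uniform in [0,1)
print(rng.standard_normal((2,3)))  # standard normal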
a = np.random.rand(3,4)
b = np.random.rand(3,4)
print(a)
print('')
print(b)
a+b
a-b
a*b
a/b
# Add 3.0 to every element
a+3.0
# Element-wise comparisons return boolean arrays
a>b
a.min()  # global minimum
a.min(axis=0)  # minimum of each column
a.min(axis=1)  # minimum of each row
# NumPy has its own set of vectorized functions (ufuncs)
np.exp(b)
# The same holds for trigonometric functions
np.cos(b)
# Functions in the math library cannot handle multi-element data; this raises a TypeError
import math
math.exp(b)
a = np.array([1,0,-2],dtype=np.int64)
print(a)
b = np.array(a,dtype=np.int8)
print(b)
c = np.array(a,dtype=np.int_)  # platform default integer; the np.int alias was removed in NumPy 1.24
print(c)
d = np.array(a,dtype=np.float64)
print(d)
e = np.array(a,dtype=np.bool_)  # the np.bool alias was removed in NumPy 1.24
print(e)
e.dtype
print('3 elements np.int64 correspond to', a.nbytes, 'bytes')
print('3 elements np.int8 correspond to', b.nbytes, 'bytes')
print('3 elements np.int_ correspond to', c.nbytes, 'bytes')
print('3 elements np.float64 correspond to', d.nbytes, 'bytes')
print('3 elements np.bool_ correspond to', e.nbytes, 'bytes')
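# Sketch (not in the original lecture): small integer dtypes save memory but
# have a limited range; fixed-width integers wrap around on overflow
# (possibly with a warning, depending on the NumPy version).
x = np.array([127],dtype=np.int8)
print(x + 1)  # wraps around to -128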
a = np.ones((3,4),dtype=np.int8)
b = np.ones((3,4),dtype=np.int64)
print('np.int8 bytes:')
print(a.tobytes())
print('')
print('np.int64 bytes:')
print(b.tobytes())
# Array metadata: dimensions, shape, total number of elements, bytes per element
print(a.ndim)
print(a.shape)
print(a.size)
print(a.itemsize)
print(b.ndim)
print(b.shape)
print(b.size)
print(b.itemsize)
print(a.nbytes)
print(b.nbytes)
print(a.data)
print(a.data.tobytes())
print(a.tobytes())
print(a.flags)
print('')
print(a.T.flags)
# Strides: bytes to step in memory to advance one element along each axis
print(a.strides)
print(b.strides)
print(a.T.strides)
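# Sketch (a hypothetical helper, not part of NumPy): for a C-contiguous array
# the strides follow directly from the shape and the item size.
def c_strides(shape, itemsize):
    # stride of an axis = itemsize times the product of all later dimensions
    strides = [itemsize]
    for dim in reversed(shape[1:]):
        strides.insert(0, strides[0]*dim)
    return tuple(strides)
print(c_strides((3,4), a.itemsize), a.strides)  # the two should match
print(c_strides((3,4), b.itemsize), b.strides)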
# Let's define an array of values distributed according to a Normal distribution
a = np.random.randn(3,4)
print(a.reshape(1,12))
print('')
print(a)
# Unlike reshape, resize modifies the array in place and returns None
print(a.resize(1,12))
print('')
print(a)
# resize changed a in place, so redefine it
a = np.random.randn(3,4)
print(a.ravel())
print('')
print(a)
# Redefine a for a fresh example
a = np.random.randn(3,4)
print(a.T)
# Bad practice: transposing a 1D array has no effect
b = np.random.randn(4)
print(b.shape)
print(b.T.shape)
# Good practice: use an explicit 2D shape so the transpose is meaningful
c = np.random.randn(4,1)
print(c.shape)
print(c.T.shape)
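# Sketch (assumed idioms, not from the original lecture): two equivalent ways
# to turn a 1D array into an explicit column vector so that .T is meaningful.
b_col = b.reshape(-1,1)  # -1 lets NumPy infer the length
b_col2 = b[:,np.newaxis]  # same result via newaxis
print(b_col.shape, b_col2.shape)  # (4, 1) (4, 1)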
a = np.ones((3,4),dtype=np.int64)
print(a)
b = a  # assignment copies no data: b is another name for the same array
a[0,0]=0
print(b)
c = a.copy()  # an independent copy
a[1,1]=0
print(a)
print(c)
print(b)
d = 1*a  # arithmetic creates a new array
a[2,2] = 0
print(a)
print(d)
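# Aside (a minimal check, not in the original lecture): np.shares_memory
# distinguishes views from copies.
print(np.shares_memory(a,b))  # True: b is the same array as a
print(np.shares_memory(a,c))  # False: c is an independent copy
print(np.shares_memory(a,d))  # False: 1*a allocated a new array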
# Let's have a look at the loadEx.txt file
!head loadEx.txt
data = np.loadtxt('loadEx.txt',delimiter=' ',comments="#")
print(data)
print(data.shape)
data[0,1]
# A more complex example
dt = np.dtype([('name','S7'),('mass',float),
               ('position',[('x',float),('y',float),('z',float)]),
               ('velocity',[('x',float),('y',float),('z',float)])])
solarData = np.loadtxt('Solar.txt',dtype=dt)
print(solarData)
solarData['name']
solarData['position']['x']
solarData['position']['x'][np.where(solarData['name']==b'Sun')]
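# Sketch (assuming the Solar.txt columns loaded above): the nested position
# fields combine like ordinary arrays, e.g. to get each body's distance from
# the origin.
pos = solarData['position']
r = np.sqrt(pos['x']**2 + pos['y']**2 + pos['z']**2)
print(list(zip(solarData['name'], r)))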
a = np.random.rand(3,5)
b = np.random.rand(8)
c = a[...,np.newaxis]*b  # (3,5,1) broadcasts with (8,) to shape (3,5,8)
print(c.shape)
d = np.random.rand(1,10)
e = np.random.rand(10,1)
print(d.shape)
print(d)
print('')
print(e.shape)
print(e)
# Explicit broadcasting.
dd,ee = np.broadcast_arrays(d,e)
print(dd.shape)
print(ee.shape)
d[0,0]=-1.0
dd
# ee
print(dd.strides)
print(ee.strides)
# Notice that this does not use additional memory!!!
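# Sketch (not in the original lecture): np.broadcast_to makes the zero-stride
# trick explicit; the broadcast view repeats data without copying it.
row = np.arange(5)
tiled = np.broadcast_to(row,(4,5))
print(tiled.strides)  # the first stride is 0: every row reuses the same memory
print(np.shares_memory(row,tiled))  # True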
a = np.arange(100).reshape(10,10)
# Access rows
a[4:9]
# Access columns
a[:,3:8]
# Negative indices
a[:,-1]
# Slices with step: rows from index -2 going back in steps of 3, columns 1 to 5 in steps of 2
a[-2::-3,1:6:2]
a[:,[1,3,1]]  # fancy indexing on columns; repeated indices are allowed
a[[1,3,1]][:,[1,3,1]]  # rows first, then columns: a 3x3 block
a[[1,3,1],[1,3,1]]  # paired indices: elements (1,1), (3,3), (1,1)
# Multidimensional arrays indexed by multidimensional arrays.
y = np.arange(35).reshape(5,7)
print(y)
# If the index arrays have a matching shape,
# and there is an index array for each dimension of the array being indexed,
# the resultant array has the same shape as the index arrays,
# and the values correspond to the index set for each position in the index arrays.
# [0,0], [2,1], and [4,2] elements of the indexed array.
y[np.array([0,2,4]), np.array([0,1,2])]
# If the index arrays do not have the same shape, broadcasting is attempted.
# [0,1], [2,1], and [4,1] elements of the indexed array.
y[np.array([0,2,4]), 1]
# If we provide just one index array, the rows are selected but the columns are kept as they were in the indexed array.
y[np.array([0,2,4])]
# Fancy indexing.
i0 = np.random.randint(0,10,(8,1,8)) # random integers between 0 and 9 (upper bound exclusive), shape (8,1,8)
i1 = np.random.randint(0,10,(2,8)) # random integers between 0 and 9, shape (2,8)
a[i0,i1] # the index arrays broadcast to (8,2,8), so the result is an 8x2x8 array
a[i0,i1].shape
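# Related sketch (not in the original cell): boolean masks are another form of
# fancy indexing; they select the elements where the condition holds.
mask = a % 7 == 0
print(a[mask])  # all multiples of 7 in the 10x10 array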
import pandas as pd
!head SMI.csv
# Series object
s = pd.Series([1,3,5,np.nan,6,8])
print(s)
print(type(s))
# Dataframe object
ts = pd.read_csv('SMI.csv')
type(ts)
ts.head()
ts.tail()
ts.index
ts.columns
ts['Open'][:10].values
ts = ts.sort_values('Date')
ts.head()
# Find minimum and maximum values in a given column
print(ts['Volume'].min())
print(ts['Volume'].max())
# Find index corresponding to minimum and maximum values in a given column
# Careful: idxmin/idxmax return the index label of the extremum, not its position
print(ts['Volume'].idxmin())
print(ts['Volume'].idxmax())
# Access rows
ts[6079:6080]
# Modify index
ts.index = pd.to_datetime(ts.pop("Date"))
ts = ts.sort_index()
ts.tail()
import datetime as dt
ts[ts.index>dt.datetime(2010,1,1)].head()
ts["Adj Close"].head()
ts["Adj Close"].describe()
# Access parameters of describe
ts['Adj Close'].describe()['count']
# Resampling of time series
# Creating a series with 9 timestamps, each one corresponding to one minute
index = pd.date_range('1/6/2018', periods=9, freq='T')
series = pd.Series(range(9), index=index)
print(series)
# Downsample the series in bins of 3 minutes each and sum over the same bin
series.resample('3T').sum()
# Label the bin using the upper bound
series.resample('3T', label='right').sum()
# DataFrame.resample(rule, axis=0)
# The object must have a datetime-like index
ts_monthly = ts["Adj Close"].resample("M").apply(["median","mean","std","count","max","min"]).head()
ts_monthly
day_return = ts["Adj Close"].pct_change().dropna()
mean_30day = day_return.rolling(30).mean()
import numpy as np
minmax_30day = day_return.rolling(30).apply(lambda x: (np.max(x)+np.min(x))*0.5)
mean_30day.resample("M").apply(["mean"]).plot()
minmax_30day.resample("M").apply(["mean"]).plot()
import matplotlib.pyplot as plt
plt.show()
dates = pd.date_range(ts.index.min(),ts.index.max(),freq="D")
print(dates)
ts_alldays = pd.Series(index=dates,data=ts["Adj Close"])
ts_alldays.head()
# fillna(method="ffill") is deprecated in recent pandas; ffill() is equivalent
ts_alldays = ts_alldays.ffill()
ts_alldays.head()
# pd.__version__
# pip install pandas-datareader
version = [int(v) for v in pd.__version__.split('.')[:2]]
if version >= [0, 17]:  # pandas-datareader was split out of pandas in 0.17
    from pandas_datareader import data, wb
else:
    from pandas.io import data, wb
# Retrieve information from FRED
import datetime
start = datetime.datetime(2010, 1, 1)
end = datetime.datetime(2018, 1, 1)
# df = data.DataReader('F', 'google', start, end)
df = data.DataReader('GDP', 'fred', start, end)
print(df.shape)
print(df.head())
print(df.tail())
# Let's say we want to compare the Gross Domestic Product per capita in constant dollars for a few countries
# wb.search('gdp.*capita.*const')
# Let's use the download function to acquire the data from the World Bank’s servers
# gdp_data = wb.download(indicator='NY.GDP.PCAP.KD',country=['CH','US','GB','DE'],start=2006,end=2016)
# gdp_data.head(20)
# gdp_data.shape
# gdp_data.columns
# gdp_data.unstack(level=0)
# gdp_data.unstack(level=1)
# gdp_data.groupby(level=0).mean()
# gdp_data.groupby(level=0).std()
df_us_zip = pd.read_csv("us_postal_codes.csv")
df_us_zip.shape
df_us_zip.columns
df_us_zip.describe()
df_us_zip.dtypes
df_us_zip.head()
df_us_state_coord = df_us_zip.get(["State Abbreviation","Latitude","Longitude"]).groupby(["State Abbreviation"]).mean()
df_us_state_coord.shape
df_us_state_coord.head()
# How many entries have "Washington" as "Place Name"?
# Let's count the unique values in the "Place Name" field
# Series.value_counts(normalize=False, sort=True, ascending=False, bins=None, dropna=True)
df_us_zip["Place Name"].value_counts().head()
# Cross-check
df_us_zip[df_us_zip['Place Name']=='Washington'].shape
df_us_places = df_us_zip.get(["Place Name","State Abbreviation","Latitude","Longitude"])
df_us_places = df_us_places.groupby(["Place Name","State Abbreviation"]).mean()
print(df_us_places.shape)
df_us_places
df_us_places.reset_index(inplace=True)
print(df_us_places.shape)
print(df_us_places.columns)
df_us_places["Place Name"].value_counts().head()
# Cross-check
df_us_places[df_us_places['Place Name']=='Franklin'].shape
# Mapping
# Map values of Series using input correspondence (a dict, Series, or function).
# Series.map(arg, na_action=None)
df_us_places["isSwiss"] = df_us_places["Place Name"].map(lambda x: any([s in x for s in ["Zurich", "Berne", "Basel", "Lucerne", "Glarus", "Geneva"]]))
df_us_places[df_us_places["isSwiss"]]
df1 = df_us_zip[:5].copy()
df2 = df_us_zip[5:10].copy()
print(df1.head())
print(df2.head())
dfs = [df1,df2]
# result = df1.append(df2) worked in older pandas; DataFrame.append was removed in pandas 2.0
result = pd.concat(dfs)
print(result)
df1 = df_us_zip[['Zip Code','Place Name','State']][:5].copy()
df2 = df_us_zip[['Zip Code','Latitude','Longitude']][3:8].copy()
dfs = [df1,df2]
print(df1)
print(df2)
result = pd.concat(dfs,axis=1)
print(result)
result = pd.merge(df1,df2,how='inner',on='Zip Code')
print(result)
result = pd.merge(df1,df2,how='left',on='Zip Code')
print(result)
result = pd.merge(df1,df2,how='right',on='Zip Code')
print(result)
result = pd.merge(df1,df2,how='outer',on='Zip Code')
print(result)
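# Aside (a minimal sketch, not in the original lecture): the indicator flag of
# pd.merge records which side each row came from, which helps to check joins.
result = pd.merge(df1,df2,how='outer',on='Zip Code',indicator=True)
print(result['_merge'].value_counts())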
import pickle
import json
import yaml
# Let's define a class Foo
class Foo():
    def __init__(self):
        self.x = "bar"
# Create object of class Foo() and write to Pickle file
obj = Foo()
with open("example.pkl","wb") as f_o:
pickle.dump(obj,f_o)
# Show as string
pickle.dumps(obj)
# Read from Pickle file
with open("example.pkl","rb") as f_i:
new_obj = pickle.load(f_i)
print(new_obj.x)
# Create a dictionary and write to JSON file
entry = {"1" : "Hello", "2" : "Bye", "3" : 4.35}
with open("example.json","w") as f_o:
json.dump(entry,f_o)
# Show as string
json.dumps(entry)
# Read from JSON file
with open("example.json","r") as f_i:
new_entry = json.load(f_i)
print(new_entry)
# Create a dictionary and write to YAML file
data = {
    'first_data': [1,2,3,4,5],
    'second_data': 'Just a string.',
    'third_data': dict(a=1.1,b=1.2,c=1.3),
}
with open('example.yaml','w') as f_o:
    yaml.dump(data,f_o,default_flow_style=False)
# Read from YAML file
with open('example.yaml','r') as f_i:
    new_data = yaml.safe_load(f_i)  # yaml.load without a Loader is unsafe and rejected by PyYAML >= 5.1
print(new_data)
print(new_data['third_data']['a'])
%%writefile example2.yaml
- &flag red
- green
- blue
- *flag
!head example2.yaml
# &flag defines an anchor and *flag references it, so 'red' appears twice
with open('example2.yaml','r') as f_i:
    data_example2 = yaml.safe_load(f_i)
print(data_example2)
import sqlite3 as sql
!cp Solar_bkup.db Solar.db
conn = sql.connect("Solar.db")
results = conn.execute("SELECT * FROM solarsystem")
for row in results:
    print(row)
conn.execute("DELETE FROM solarsystem WHERE name='Pluto'")
conn.commit()
results = conn.execute("SELECT * FROM solarsystem")
for row in results:
    print(row)
death_star = [12,'Death Star',0.1,0.564,-0.845,-9.12e-05,0.014,0.00949,-5.81e-07]
conn.execute("INSERT INTO solarsystem VALUES (?,?,?,?,?,?,?,?,?)",death_star)
results = conn.execute("SELECT * FROM solarsystem")
for row in results:
    print(row)
more_death_stars = list()
for i in range(10):
    death_star[0] += 1
    death_star[1] = "Death Star "+str(i)
    more_death_stars.append(death_star.copy())
conn.executemany("INSERT INTO solarsystem VALUES (?,?,?,?,?,?,?,?,?)",more_death_stars)
conn.commit()
def dict_factory(cursor, row):
    d = {}
    for idx,col in enumerate(cursor.description):
        d[col[0]] = row[idx]
    return d
conn.row_factory = dict_factory
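# Aside (a standard-library alternative, not in the original lecture):
# sqlite3.Row gives dict-like access without a custom factory.
conn.row_factory = sql.Row
row = conn.execute("SELECT * FROM solarsystem LIMIT 1").fetchone()
print(row.keys())
conn.row_factory = dict_factory  # restore the factory used below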
max_mass = 1.0
results = conn.execute("SELECT name,mass FROM solarsystem WHERE mass<?",[max_mass])
for row in results:
    print(row)
results = conn.execute("SELECT AVG(mass) as mean_mass, COUNT(*) as n, mass>1.0 as larger_than_earth "+
"FROM solarsystem WHERE mass<>1.0 GROUP BY mass<1.0")
for row in results:
print(row)
# You need to install MongoDB and start the server with mongod in your terminal
# pip install pymongo
import pymongo
# Connecting to the server, localhost and 27017 as port would be default arguments
client = pymongo.MongoClient("localhost",27017)
# Get the database (if not existing it will be created)
db = client["tweets"]
# Get the collection (if not existing it will be created)
collection = db["uzh"]
print(collection)
with open("uzh.json","r") as f_i:
tweets = json.load(f_i)
# print(tweets)
# collection.insert_one(tweets[0])
# print(collection)
# print(collection.find()[0])
# collection.insert_many(tweets[1:])
# collection.find_one()
# for tweet in collection.find({'timestamp' : {'$gt':'2017-01-01'}}):
#     print(tweet["text"])
import pprint
# pprint.pprint(collection.find_one())
# collection.delete_many({})
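# Sketch (assuming the commented insert calls above were run, so the collection
# is not empty): count documents and iterate over the first few;
# count_documents and find are standard PyMongo calls.
print(collection.count_documents({}))
for tweet in collection.find().limit(3):
    pprint.pprint(tweet)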