Scientific Programming: Analytics

Christian Elsasser (che@physik.uzh.ch)

The content of the lecture might be reused, also in parts, under the CC-licence by-sa 4.0

Use case 1: Analysing of financial data

Disclaimer: The approaches presented here are for illustrative purposes and do not represent at all state-of-the-art approaches.

Topics discussed:

  • Distributions
  • Minimisation/Optimisation

1.a Maximum-likelihood estimation

In [1]:
from scipy import stats
from scipy.optimize import minimize
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
In [2]:
# Loading the data
df = pd.read_csv("stocks.csv")
df = df.fillna(method="ffill").set_index(["Date"])
df_pct = df.pct_change().fillna(0)
df_pct.head()
Out[2]:
SMI DJI
Date
26/06/17 0.000000 0.000000
27/06/17 -0.005297 -0.004619
28/06/17 0.000419 0.006755
29/06/17 -0.014615 -0.007811
30/06/17 -0.004152 0.002941
In [3]:
df.plot()
plt.show()
In [4]:
# Transform to percents
smi_pct = np.array(df_pct["SMI"])*100
dji_pct = np.array(df_pct["DJI"])*100
In [5]:
# define the probability density function
# Gaussian with p[0] as mu and p[1] as sigma
f = lambda x,p: stats.norm(p[0],p[1]).pdf(x)
In [6]:
gaussian = stats.norm(5,2)
In [10]:
gaussians = stats.norm(np.array([0,1,2]),5)
In [11]:
gaussians.cdf(1)
Out[11]:
array([0.57925971, 0.5       , 0.42074029])
In [12]:
# the log-likelihood function to be minimised
# here the data acts as parameter
ll = lambda p,data: -np.log(f(data,p)).sum()
In [14]:
minimize?
In [15]:
smi_result = minimize(ll,[0.1,0.5],args=(smi_pct,),method="TNC")
In [16]:
smi_result
Out[16]:
     fun: 281.6571204014134
     jac: array([0.00000000e+00, 1.13686838e-05])
 message: 'Linear search failed'
    nfev: 45
     nit: 9
  status: 4
 success: False
       x: array([-0.01997663,  0.74653703])
In [17]:
# Be aware:
print("BFGS")
print(minimize(ll,[0.0,0.5],args=(smi_pct,),method="BFGS"))
print("CG")
print(minimize(ll,[0.0,0.5],args=(smi_pct,),method="CG"))
BFGS
      fun: 281.6571204014135
 hess_inv: array([[2.24376251e-03, 2.20803085e-05],
       [2.20803085e-05, 1.14966362e-03]])
      jac: array([ 0.00000000e+00, -3.81469727e-06])
  message: 'Optimization terminated successfully.'
     nfev: 56
      nit: 9
     njev: 14
   status: 0
  success: True
        x: array([-0.01997663,  0.74653703])
CG
     fun: 284.908274106397
     jac: array([ 3.16257477, 62.6407814 ])
 message: 'Desired error not necessarily achieved due to precision loss.'
    nfev: 24
     nit: 1
    njev: 3
  status: 2
 success: False
       x: array([-0.01104488,  0.8402671 ])
In [18]:
# Different methods require different number of iterations and function evaluation until termination
# ... and also lead to different numerical precision
print("Nelder-Mead")
print(minimize(ll,[0.0,0.5],args=(smi_pct,),method="Nelder-Mead"))
Nelder-Mead
 final_simplex: (array([[-0.02001555,  0.74653212],
       [-0.01995295,  0.74648979],
       [-0.01996041,  0.74659233]]), array([281.65712075, 281.65712153, 281.65712183]))
           fun: 281.6571207522061
       message: 'Optimization terminated successfully.'
          nfev: 166
           nit: 85
        status: 0
       success: True
             x: array([-0.02001555,  0.74653212])
In [19]:
# "True" values
print([smi_pct.mean(),smi_pct.std()])
[-0.019976616609547904, 0.7465370350851463]
In [20]:
# Let's go with BFGS since it also gives us an approximation of the inverse Hessian
result_smi = minimize(ll,[0.0,0.5],args=(smi_pct,),method="BFGS")
result_dji = minimize(ll,[0.0,0.5],args=(dji_pct,),method="BFGS")
print(result_smi.success,result_dji.success)
True True
In [21]:
result_smi.x
Out[21]:
array([-0.01997663,  0.74653703])
In [22]:
# Create the data set specific pdf's with the estimated parameters as parameters
f_smi = lambda x: f(x,result_smi.x)
f_dji = lambda x: f(x,result_dji.x)
In [23]:
bins = np.linspace(-4,4,81)
h_smi,bin_smi = np.histogram(smi_pct,bins)
h_dji,bin_dji = np.histogram(dji_pct,bins)
In [24]:
import matplotlib.pyplot as plt
In [25]:
fig = plt.figure("smi",(6,4))
fig,ax = plt.subplots()
ax.bar(bin_smi[:-1],h_smi,width=bin_smi[1:]-bin_smi[:-1],color="b")
ax.plot(bin_smi,f_smi(bin_smi)*h_smi.sum()*(bin_smi[1:]-bin_smi[:-1]).mean(),'k--',linewidth=2)
ax.set_xlabel("Daily returns [%]")
ax.set_ylabel("Number of days")
ax.set_ylim([0,30])
plt.show()
<matplotlib.figure.Figure at 0x10c182908>
In [26]:
fig = plt.figure("dji",(6,4))
fig,ax = plt.subplots()
ax.bar(bin_dji[:-1],h_dji,width=bin_dji[1:]-bin_dji[:-1],color="r")
ax.plot(bin_dji,f_dji(bin_dji)*h_dji.sum()*(bin_dji[1:]-bin_dji[:-1]).mean(),'k--',linewidth=2)
ax.set_xlabel("Daily returns [%]")
ax.set_ylabel("Number of days")
ax.set_ylim([0,30])
plt.show()
<matplotlib.figure.Figure at 0x10c52c5f8>
In [27]:
# Create QQ (Quantile-quantile) plots to compare quantiles 
# of the data vs. the quantiles of the resulting distribution
x = np.linspace(-3,3,3)
fig = plt.figure("QQ",figsize=(6,6))
fig,ax = plt.subplots()
ax.plot(x,x,'k--')
ax.scatter(sorted(smi_pct),stats.norm(*(result_smi.x)).ppf(np.linspace(0,1,len(smi_pct))),color='b')
ax.scatter(sorted(dji_pct),stats.norm(*(result_dji.x)).ppf(np.linspace(0,1,len(dji_pct))),color='r')
ax.set_xlabel("Quantile data")
ax.set_ylabel("Quantile distribution")
ax.set_xlim([-3,3])
ax.set_ylim([-3,3])
plt.show()
<matplotlib.figure.Figure at 0x10c816ef0>
In [28]:
mu_smi = result_smi.x[0]*len(smi_pct)
mu_dji = result_dji.x[0]*len(dji_pct)
sig_smi = result_smi.x[1]*len(smi_pct)**0.5
sig_dji = result_dji.x[1]*len(dji_pct)**0.5
mu_rf = 1.0
In [29]:
mu_dji
Out[29]:
14.226607421183875
In [30]:
rho = np.corrcoef(smi_pct,dji_pct)[0,1]
print(rho)
-0.13337272485229681
In [31]:
# Define function to be minimised (risk) and function representing the constraint 
mu_min = 14.0
opt_f = lambda p,x : (p[0]*x[0])**2+(p[1]*x[1])**2+2*p[1]*p[0]*x[0]*x[1]*x[2]
con_f = lambda p,mu : mu[0]*p[0]+mu[1]*p[1]+mu[2]*(1-p[0]-p[1])-mu[3]
In [33]:
result = minimize(opt_f,[0.33,0.33],[sig_smi,sig_dji,rho],
                  bounds = [(0,1),(0,1)], # Example if I only want to have positive fractions (i.e. no short selling)
                  constraints=({'type':'ineq','fun' : con_f,'args':((mu_smi,mu_dji,mu_rf,mu_min),)}),
                  method="SLSQP")
print(result)
     fun: 171.83935170975178
     jac: array([-41.2742939, 349.669487 ])
 message: 'Optimization terminated successfully.'
    nfev: 13
     nit: 3
    njev: 3
  status: 0
 success: True
       x: array([7.84734611e-10, 9.82867306e-01])
In [34]:
p = result.x
In [35]:
# And get the final results
mu_pf = np.sum(np.array([mu_smi,mu_dji])*p)+mu_rf*(1-p.sum())
print("#mu =  {0:4.2f}%".format(mu_pf))
print("#sig = {0:4.2f}%".format(result.fun**0.5))
#mu =  14.00%
#sig = 13.11%

Use case 2 - Graph Theory

Topics discussed:

  • Matrices
  • Sparse matrices and Linear Algebra
In [36]:
np.matrix
Out[36]:
numpy.matrixlib.defmatrix.matrix
In [37]:
# Create a diagonal matrix
d = np.matrix(np.diag([1,2,3,4]))
print(d)
[[1 0 0 0]
 [0 2 0 0]
 [0 0 3 0]
 [0 0 0 4]]
In [38]:
# Create a 4x4 matrix with random entries and its inverse
s = np.matrix(np.random.rand(4,4))
s_inv = s.I
In [40]:
# De-diagonalise it
m = s.I*d*s
print(m)
[[-80.92359473 -81.30381756 -96.30781205 -54.16752095]
 [ 76.66780204  76.91378256  88.06266916  50.54894834]
 [ -8.94587026  -8.57621022  -7.27555334  -6.03797431]
 [ 29.39634425  28.63554884  32.83838461  21.2853655 ]]
In [41]:
# Get back the eigenvalues
# ... and the eigenvectors
values,transf = np.linalg.eig(m)
print(values)
print(transf)
[1. 4. 2. 3.]
[[ 0.72094402  0.70214441  0.69842016  0.71564148]
 [-0.63849384 -0.66197235 -0.6831088  -0.65771753]
 [ 0.0717867   0.08015072  0.08537474  0.05954324]
 [-0.25963823 -0.24972324 -0.1956599  -0.22741928]]
In [42]:
routes = pd.read_csv("routes.txt",sep=",")
In [43]:
routes.head()
Out[43]:
iata_airline airline_id dep dep_id des des_id codeshare stops equip
0 2B 410 AER 2965 KZN 2990 NaN 0 CR2
1 2B 410 ASF 2966 KZN 2990 NaN 0 CR2
2 2B 410 ASF 2966 MRV 2962 NaN 0 CR2
3 2B 410 CEK 2968 KZN 2990 NaN 0 CR2
4 2B 410 CEK 2968 OVB 4078 NaN 0 CR2
In [44]:
n_flights = (routes["des"].value_counts()+routes["dep"].value_counts()).sort_values(ascending=False)
In [45]:
n_flights.head(10)
Out[45]:
ATL    1826.0
ORD    1108.0
PEK    1069.0
LHR    1051.0
CDG    1041.0
LAX     990.0
FRA     990.0
DFW     936.0
JFK     911.0
AMS     903.0
dtype: float64
In [46]:
abrev = list(n_flights[:1000].index.sort_values())
In [47]:
keep_route = routes["dep"].isin(abrev)&routes["des"].isin(abrev)
In [48]:
routes_kept = routes[keep_route].get(["dep","des"])
In [49]:
routes_kept = routes_kept.drop_duplicates()
In [50]:
mat = np.matrix(np.zeros((len(abrev),len(abrev)),dtype=bool))
In [51]:
for i,r in routes_kept.iterrows():
    i_dep = abrev.index(r["dep"])
    i_des = abrev.index(r["des"])
    mat[i_dep,i_des] = True
In [52]:
mat.shape
Out[52]:
(1000, 1000)
In [53]:
eig,trans = np.linalg.eig(mat)
In [58]:
mat
Out[58]:
matrix([[False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        ...,
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False]])
In [56]:
bins = np.linspace(-100,100,51)
h,bins = np.histogram(eig,bins)
In [57]:
fig = plt.figure("graph_spectrum",(6,4))
fig,ax = plt.subplots()
ax.bar(bins[:-1],h,width=bins[1:]-bins[:-1])
plt.show()
<matplotlib.figure.Figure at 0x10ea756d8>
In [62]:
from scipy.sparse import dok_matrix,csr_matrix
import scipy.sparse.linalg
In [63]:
routes_kept = routes.get(["dep","des"])
routes_kept = routes_kept.drop_duplicates()
In [64]:
abrev = list(n_flights.index.sort_values())
In [65]:
mat_dok = dok_matrix((len(abrev),len(abrev)),dtype=np.int8)
for i,r in routes_kept.iterrows():
    i_dep = abrev.index(r["dep"])
    i_des = abrev.index(r["des"])
    mat_dok[i_dep,i_des] = True
In [66]:
mat_csr = csr_matrix(mat_dok)
In [67]:
mat_csr.dtype
Out[67]:
dtype('int8')
In [68]:
%timeit mat_dok*mat_dok
10 loops, best of 3: 51.6 ms per loop
In [69]:
%timeit mat_csr*mat_csr
100 loops, best of 3: 16.2 ms per loop
In [70]:
mat_dok = mat_dok.asfptype()
%timeit scipy.sparse.linalg.eigs(mat_dok)
1 loop, best of 3: 1.23 s per loop
In [71]:
mat_csr = mat_csr.asfptype()
%timeit scipy.sparse.linalg.eigs(mat_csr)
100 loops, best of 3: 12 ms per loop

Use case 3 - Signal processing

Topics discussed:

  • Fast-Fourier-Transformation
  • Integration

3.a Fast-Fourier Transformation

In [72]:
from scipy.fftpack import fft, fftfreq, fftshift, ifft
import numpy as np
In [73]:
N = 1000 # Number of sample points
T = 0.02*2*np.pi # Frequency Resolution
t = np.linspace(0.0,N*T,N) # Spanning up time space
y = (t%(2*np.pi))/(2*np.pi) # Tooth-saw function
In [74]:
plt.figure("tooth_saw_function",[8,12])
fig,ax = plt.subplots()
ax.plot(t,y)
ax.set_xlim([t.min(),t.max()])
ax.set_ylim([0,1])
plt.show()
<matplotlib.figure.Figure at 0x10c809550>
In [75]:
yf = fft(y) # Actual transformation
xf = fftfreq(N, T) # Generation of frequency values
xf = fftshift(xf) # Shift it so that zero is centered
yplot = fftshift(yf) # Corresponding shift for frequency amplitudes
In [76]:
plt.figure("freq_spectrum_tooth_saw",[8,12])
fig,ax = plt.subplots()
ax.plot(xf, 1/N*np.abs(yplot),'r') # yplot values are complex --> abs
ax.set_xlim([-2,2])
ax.set_ylim([0,0.6])
plt.show()
<matplotlib.figure.Figure at 0x10c80ecf8>
In [77]:
# Let's check the inverse transformation
plt.figure("inverse_transform",[8,12])
fig,ax = plt.subplots()
ax.plot(t, y,'b') # Original
ax.plot(t, ifft(yf),'r') # inverse back transformed
ax.set_xlim([t.min(),t.max()])
ax.set_ylim([0,1.0])
plt.show()
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/numpy/core/numeric.py:492: ComplexWarning: Casting complex values to real discards the imaginary part
  return array(a, dtype, copy=False, order=order)
<matplotlib.figure.Figure at 0x10c11a8d0>

3.b "Normal" Fourier Transform

In [78]:
from scipy.integrate import quad
In [79]:
# Generating two signals
f_gauss = lambda x: stats.norm(0,0.5).pdf(x) # Gaussian signal
f_g_osc = lambda x: f_gauss(x)*np.cos(2*np.pi*x*0.5) # ... with oscillation
In [80]:
# time range considered
t = np.linspace(-5,5,101)
In [81]:
plt.figure("signals",[8,12])
fig,ax = plt.subplots()
ax.plot(t, f_gauss(t),'b')
ax.plot(t, f_g_osc(t),'r')
ax.set_xlim([t.min(),t.max()])
ax.set_ylim([-1.0,1.0])
ax.set_xlabel(r"$t$")
ax.set_ylabel(r"$A(t)$")
plt.show()
<matplotlib.figure.Figure at 0x105534da0>
In [82]:
# Function that returns the fourier integrand as function, only considering the real part
def get_fourier_integrand(f):
    return lambda t,w: np.cos(w*t)*f(t)
In [83]:
# Define Fourier integrands for a given omega
f_i_gauss = lambda w : quad(get_fourier_integrand(f_gauss),-10,10,args=(w,))
f_i_g_osc = lambda w : quad(get_fourier_integrand(f_g_osc),-10,10,args=(w,))
In [84]:
f_i_gauss(1.0) # first entry: integration result, second entry: upper-bound on the error
Out[84]:
(0.8824969025845957, 6.463572515755245e-11)
In [85]:
# Vectorise the integrands (allows to calculate the integration for different omega in parallel)
int_f_gauss = np.vectorize(f_i_gauss)
int_f_g_osc = np.vectorize(f_i_g_osc)
In [86]:
# Spectrum we are considering
w = np.linspace(0,10,51)
In [87]:
# Actual performance of the integration
yw_gauss = int_f_gauss(w)
yw_g_osc = int_f_g_osc(w)
In [88]:
yw_gauss
Out[88]:
(array([1.00000000e+00, 9.95012479e-01, 9.80198673e-01, 9.55997482e-01,
        9.23116346e-01, 8.82496903e-01, 8.35270211e-01, 7.82704538e-01,
        7.26149037e-01, 6.66976811e-01, 6.06530660e-01, 5.46074427e-01,
        4.86752256e-01, 4.29557358e-01, 3.75311099e-01, 3.24652467e-01,
        2.78037300e-01, 2.35746077e-01, 1.97898699e-01, 1.64474457e-01,
        1.35335283e-01, 1.10250525e-01, 8.89216175e-02, 7.10053537e-02,
        5.61347628e-02, 4.39369336e-02, 3.40474547e-02, 2.61214099e-02,
        1.98410947e-02, 1.49207861e-02, 1.11089965e-02, 8.18870101e-03,
        5.97602290e-03, 4.31784001e-03, 3.08871541e-03, 2.18749112e-03,
        1.53381068e-03, 1.06476624e-03, 7.31802419e-04, 4.97955422e-04,
        3.35462628e-04, 2.23745794e-04, 1.47748360e-04, 9.65934137e-05,
        6.25215038e-05, 4.00652974e-05, 2.54193465e-05, 1.59667839e-05,
        9.92950431e-06, 6.11356797e-06, 3.72665317e-06]),
 array([8.67103044e-10, 8.71759742e-10, 8.65823089e-10, 7.86216492e-10,
        5.27645140e-10, 6.46357252e-11, 8.16586275e-10, 4.80060338e-09,
        7.97553740e-09, 8.25985364e-12, 5.33560001e-12, 4.00755154e-12,
        2.55833889e-12, 5.48197209e-13, 2.19692646e-12, 2.03017172e-11,
        3.15272127e-11, 3.68896589e-11, 4.21640461e-11, 4.12359313e-11,
        2.85877887e-11, 2.09011249e-12, 5.64819439e-11, 1.31280532e-10,
        2.23145719e-10, 3.11740273e-10, 3.71356595e-10, 3.53420517e-10,
        5.43545490e-11, 2.05276070e-10, 6.77224376e-10, 1.29296123e-09,
        1.98762025e-09, 2.58833702e-09, 1.59243956e-09, 1.00383517e-10,
        1.49000244e-09, 4.41577452e-09, 8.63796704e-09, 1.36829487e-08,
        1.46585692e-08, 9.88017601e-09, 1.37864404e-08, 1.00040538e-08,
        1.18979086e-08, 2.04162842e-10, 6.88497936e-11, 9.28605735e-09,
        4.70400366e-10, 1.21317811e-08, 3.06428681e-09]))
In [89]:
plt.figure("Signal_spectra",[8,12])
fig,ax = plt.subplots()
ax.plot(w, yw_gauss[0],'b')
ax.plot(w, yw_g_osc[0],'r')
ax.set_xlim([w.min(),w.max()])
ax.set_ylim([0,1.5])
ax.set_xlabel(r"Spectrum $\omega$")
plt.show()
<matplotlib.figure.Figure at 0x10c3c17b8>
In [90]:
# But actually quad allows already to basically perform a Fourier transfrom via the weighting option
f_i_gauss_alt = lambda w : quad(f_gauss,-10,10,weight="cos",wvar=(w,))
int_f_gauss_alt = np.vectorize(f_i_gauss)
yw_gauss_alt = int_f_gauss_alt(w)
In [91]:
# Comparing the original calculation with the weighting option calculation
plt.figure("Signal_spectrum_comparison",[8,12])
fig,ax = plt.subplots()
ax.plot(w, yw_gauss[0],'b')
ax.plot(w, yw_gauss_alt[0],'r')
ax.set_xlim([w.min(),w.max()])
ax.set_ylim([0,1.5])
ax.set_xlabel(r"Spectrum $\omega$")
plt.show()
<matplotlib.figure.Figure at 0x10c842588>
In [ ]: