pandas offers, among many other things, rolling-window operations (Series.rolling) and convenient file readers (the pd.read_* functions). We start by importing it:
import pandas as pd
pd.Series([11,13,17,19,23])
0    11
1    13
2    17
3    19
4    23
dtype: int64
Or, if you want a custom index:
series = pd.Series([11,13,17,19,23], index=['a', 'b', 'c', 'd', 'e'])
print(series)
a    11
b    13
c    17
d    19
e    23
dtype: int64
To get the contents back you can use:
series.index
Index(['a', 'b', 'c', 'd', 'e'], dtype='object')
series.values
array([11, 13, 17, 19, 23])
But the real power of pandas lies in all the other attributes and methods (use tab completion to explore them):
#series. [TAB]
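For example, a small (non-exhaustive) sample of built-in methods, applied to the series defined above — a quick sketch, not part of the original example:
series.mean()      # 16.6
series.max()       # 23
series.describe()  # count, mean, std, min, quartiles, max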
The DataFrame is the primary pandas data structure: a two-dimensional, size-mutable, potentially heterogeneous tabular data structure with labeled axes (index: row labels, columns: column labels). It can be thought of as a dict-like container for Series objects.
The easiest way to create a DataFrame is to read it from an input file (see later)
In addition, there are many ways to create DataFrames manually. The most straightforward is probably a dict of iterables (Series, lists, arrays); pandas tries to choose sensible indexes.
frame = pd.DataFrame({"primes": series, "fibo": [1,1,2,3,5], "0-4": range(5)})
print(frame)
   primes  fibo  0-4
a      11     1    0
b      13     1    1
c      17     2    2
d      19     3    3
e      23     5    4
We now want to use pandas to work with data from the World Bank. My goal is to create a plot showing the burden refugees put on different countries. For this we will plot the fraction of refugees in a given country's population versus that country's GDP.
I downloaded and extracted the following data sets from the World Bank website manually:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
Loading a data file with pandas is trivial:
refugees = pd.read_csv("data/refugee-population.csv", skiprows=4)
refugees.head()
| Country Name | Country Code | Indicator Name | Indicator Code | 1960 | 1961 | 1962 | 1963 | 1964 | 1965 | ... | 2011 | 2012 | 2013 | 2014 | 2015 | 2016 | 2017 | 2018 | 2019 | Unnamed: 64 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Aruba | ABW | Refugee population by country or territory of ... | SM.POP.REFG | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | 1.0 | NaN | 2.0 | 1.0 | NaN | NaN | NaN | NaN |
| 1 | Afghanistan | AFG | Refugee population by country or territory of ... | SM.POP.REFG | NaN | NaN | NaN | NaN | NaN | NaN | ... | 3009.0 | 16187.0 | 16863.0 | 300423.0 | 257554.0 | 59771.0 | 75927.0 | 72231.0 | NaN | NaN |
| 2 | Angola | AGO | Refugee population by country or territory of ... | SM.POP.REFG | NaN | NaN | NaN | NaN | NaN | NaN | ... | 16223.0 | 23413.0 | 23783.0 | 15474.0 | 15555.0 | 15555.0 | 41109.0 | 39865.0 | NaN | NaN |
| 3 | Albania | ALB | Refugee population by country or territory of ... | SM.POP.REFG | NaN | NaN | NaN | NaN | NaN | NaN | ... | 82.0 | 86.0 | 93.0 | 104.0 | 104.0 | 138.0 | 89.0 | 131.0 | NaN | NaN |
| 4 | Andorra | AND | Refugee population by country or territory of ... | SM.POP.REFG | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 65 columns
As you can see, pandas chose the right column labels and numbered the rows continuously.
We can easily change the row labels (the index) to one of the columns.
refugees.set_index(["Country Code"], inplace=True)
refugees.head()
| Country Name | Indicator Name | Indicator Code | 1960 | 1961 | 1962 | 1963 | 1964 | 1965 | 1966 | ... | 2011 | 2012 | 2013 | 2014 | 2015 | 2016 | 2017 | 2018 | 2019 | Unnamed: 64 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Country Code | |||||||||||||||||||||
| ABW | Aruba | Refugee population by country or territory of ... | SM.POP.REFG | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | 1.0 | NaN | 2.0 | 1.0 | NaN | NaN | NaN | NaN |
| AFG | Afghanistan | Refugee population by country or territory of ... | SM.POP.REFG | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 3009.0 | 16187.0 | 16863.0 | 300423.0 | 257554.0 | 59771.0 | 75927.0 | 72231.0 | NaN | NaN |
| AGO | Angola | Refugee population by country or territory of ... | SM.POP.REFG | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 16223.0 | 23413.0 | 23783.0 | 15474.0 | 15555.0 | 15555.0 | 41109.0 | 39865.0 | NaN | NaN |
| ALB | Albania | Refugee population by country or territory of ... | SM.POP.REFG | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 82.0 | 86.0 | 93.0 | 104.0 | 104.0 | 138.0 | 89.0 | 131.0 | NaN | NaN |
| AND | Andorra | Refugee population by country or territory of ... | SM.POP.REFG | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 64 columns
Now it's easy to select rows or columns
refugees.loc[["CHE","DEU"]]
| Country Name | Indicator Name | Indicator Code | 1960 | 1961 | 1962 | 1963 | 1964 | 1965 | 1966 | ... | 2011 | 2012 | 2013 | 2014 | 2015 | 2016 | 2017 | 2018 | 2019 | Unnamed: 64 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Country Code | |||||||||||||||||||||
| CHE | Switzerland | Refugee population by country or territory of ... | SM.POP.REFG | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 50416.0 | 50747.0 | 52464.0 | 62620.0 | 73336.0 | 82681.0 | 92995.0 | 104037.0 | NaN | NaN |
| DEU | Germany | Refugee population by country or territory of ... | SM.POP.REFG | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 571684.0 | 589737.0 | 187567.0 | 216973.0 | 316115.0 | 669482.0 | 970302.0 | 1063837.0 | NaN | NaN |
2 rows × 64 columns
refugees[["1990","2000"]].head()
| 1990 | 2000 | |
|---|---|---|
| Country Code | ||
| ABW | NaN | NaN |
| AFG | 50.0 | NaN |
| AGO | 11557.0 | 12086.0 |
| ALB | NaN | 523.0 |
| AND | NaN | NaN |
refugees.get(["1990","2000"]).head()
| 1990 | 2000 | |
|---|---|---|
| Country Code | ||
| ABW | NaN | NaN |
| AFG | 50.0 | NaN |
| AGO | 11557.0 | 12086.0 |
| ALB | NaN | 523.0 |
| AND | NaN | NaN |
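Row and column selectors can also be combined in a single loc call — a minimal sketch using the labels from above, which returns a 2×2 DataFrame:
refugees.loc[["CHE", "DEU"], ["1990", "2000"]]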
With this we now choose the data for one country, remove all missing values and then create a plot:
che = refugees.loc["CHE"][[str(year) for year in range(1990,2020)]]
che.dropna().plot()
plt.show()
Usually it is easier to work with real datetime objects instead of strings, so we convert the index to datetime:
che.index.values
array(['1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997',
'1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005',
'2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013',
'2014', '2015', '2016', '2017', '2018', '2019'], dtype=object)
che.index = pd.to_datetime(che.index, format="%Y")
print(che.index)
DatetimeIndex(['1990-01-01', '1991-01-01', '1992-01-01', '1993-01-01',
'1994-01-01', '1995-01-01', '1996-01-01', '1997-01-01',
'1998-01-01', '1999-01-01', '2000-01-01', '2001-01-01',
'2002-01-01', '2003-01-01', '2004-01-01', '2005-01-01',
'2006-01-01', '2007-01-01', '2008-01-01', '2009-01-01',
'2010-01-01', '2011-01-01', '2012-01-01', '2013-01-01',
'2014-01-01', '2015-01-01', '2016-01-01', '2017-01-01',
'2018-01-01', '2019-01-01'],
dtype='datetime64[ns]', freq=None)
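One advantage of a DatetimeIndex is partial-string indexing: a range of years can be selected directly with string labels (the upper bound is included). A small sketch, not part of the original analysis:
che.loc["2015":"2018"]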
As mentioned in the introduction, pandas offers a very useful rolling method:
che.plot()
che.rolling(center=False,window=5).mean().plot()
plt.show()
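rolling returns a window object, so other aggregations (median, sum, std, ...) work the same way. A small sketch, not part of the original analysis — min_periods shrinks the window at the edges of the series instead of producing NaN:
che.rolling(window=5, min_periods=1).median().plot()
plt.show()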
We now want to create a scatter plot of the refugee fraction (refugee population divided by total population) versus GDP per capita. For each data set we will use the mean over the most recent years.
Some of the rows and columns in the World Bank files are of no interest for this; we can remove them easily.
The World Bank provides metadata for each country, which lets us identify rows that are not countries (e.g. regional aggregates):
!head data/metadata-countries_population.csv
We load this file and extract the two relevant columns
meta = pd.read_csv("data/metadata-countries_population.csv")
meta.columns
Index(['Country Code', 'Region', 'IncomeGroup', 'SpecialNotes', 'TableName',
'Unnamed: 5'],
dtype='object')
meta = meta[['Country Code', 'Region']]
meta.head()
| Country Code | Region | |
|---|---|---|
| 0 | ABW | Latin America & Caribbean |
| 1 | AFG | South Asia |
| 2 | AGO | Sub-Saharan Africa |
| 3 | ALB | Europe & Central Asia |
| 4 | AND | Europe & Central Asia |
meta.set_index("Country Code", inplace=True)
From this we create a list of non-countries
non_countries = meta.loc[meta.Region.isnull()].index
print(non_countries)
Index(['ARB', 'CEB', 'CSS', 'EAP', 'EAR', 'EAS', 'ECA', 'ECS', 'EMU', 'EUU',
'FCS', 'HIC', 'HPC', 'IBD', 'IBT', 'IDA', 'IDB', 'IDX', 'LAC', 'LCN',
'LDC', 'LIC', 'LMC', 'LMY', 'LTE', 'MEA', 'MIC', 'MNA', 'NAC', 'OED',
'OSS', 'PRE', 'PSS', 'PST', 'SAS', 'SSA', 'SSF', 'SST', 'TEA', 'TEC',
'TLA', 'TMN', 'TSA', 'TSS', 'UMC', 'WLD'],
dtype='object', name='Country Code')
and finally drop these rows:
refugees = refugees.drop(non_countries)
The data also contains a few columns with unneeded text:
refugees.columns
Index(['Country Name', 'Indicator Name', 'Indicator Code', '1960', '1961',
'1962', '1963', '1964', '1965', '1966', '1967', '1968', '1969', '1970',
'1971', '1972', '1973', '1974', '1975', '1976', '1977', '1978', '1979',
'1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987', '1988',
'1989', '1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997',
'1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006',
'2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015',
'2016', '2017', '2018', '2019', 'Unnamed: 64'],
dtype='object')
In addition, the 2019 column is empty
np.any(refugees["2019"].notnull())
False
so we can create a list of all interesting columns
useful_cols = []
for year in range(2010,2019):
useful_cols.append(str(year))
useful_cols
['2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018']
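The same list can be written more compactly as a list comprehension, equivalent to the loop above:
useful_cols = [str(year) for year in range(2010, 2019)]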
With this we keep only the useful columns and compute each country's mean over those years:
refugees = refugees[useful_cols]
refugee_means = refugees.mean(axis=1)
Of course we could execute these commands again manually for the two remaining data files. However, the proper way is to wrap them in a function, especially since all the files have exactly the same structure.
def load_file(file):
"""Load and process a Worldbank File"""
data = pd.read_csv(file, skiprows=4)
data.set_index("Country Code", inplace=True)
data.drop(non_countries, inplace=True)
data = data[[str(year) for year in range(2010,2017)]]
return data.mean(axis=1), data
gdp_means, gdp = load_file("data/gdp-per-capita.csv")
gdp_means.head()
Country Code
ABW    24972.420561
AFG      593.293323
AGO     4519.970434
ALB     4263.977328
AND    39345.839439
dtype: float64
gdp.head()
| 2010 | 2011 | 2012 | 2013 | 2014 | 2015 | 2016 | |
|---|---|---|---|---|---|---|---|
| Country Code | |||||||
| ABW | 23512.602596 | 24985.993281 | 24713.698045 | 25025.099563 | 25533.569780 | 25796.380251 | 25239.600411 |
| AFG | 543.303042 | 591.162346 | 641.872034 | 637.165044 | 613.856333 | 578.466353 | 547.228110 |
| AGO | 3587.883798 | 4615.468028 | 5100.095808 | 5254.882338 | 5408.410496 | 4166.979684 | 3506.072885 |
| ALB | 4094.362119 | 4437.178067 | 4247.614279 | 4413.081743 | 4578.666720 | 3952.829458 | 4124.108907 |
| AND | 39736.354063 | 41100.729938 | 38392.943901 | 40626.751632 | 42300.334128 | 36039.653496 | 37224.108916 |
population_means, population = load_file("data/population.csv")
We now combine our three Series with means into one DataFrame and create our plot.
data = pd.DataFrame({"gdp": gdp_means, "refugees": refugee_means/population_means}).dropna()
(Here we lose some countries with missing data.)
data.plot.scatter("gdp", "refugees")
plt.show()
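If you want to know which countries were dropped by dropna, one way (a sketch, not part of the original analysis) is to compare the indexes:
dropped = gdp_means.index.difference(data.index)
print(len(dropped), "country codes without complete data")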
We can quickly find out who the three top countries are:
data.where(data["refugees"]>0.1).dropna()
| gdp | refugees | |
|---|---|---|
| Country Code | ||
| JOR | 3951.958197 | 0.317788 |
| LBN | 7766.668886 | 0.194719 |
| PSE | 2861.802636 | 0.508273 |
To improve readability:
ax = data[data["refugees"] > 1e-10].plot.scatter(y="refugees", x="gdp", loglog=True)
ax = data.loc[["CHE"]].plot.scatter(y="refugees", x="gdp", ax=ax, color="r", label="Switzerland")
plt.title("refugees fraction vs. gdp")
plt.show()
Again we can print the info for one country:
data.loc["CHE"]
gdp         82933.055377
refugees        0.008484
Name: CHE, dtype: float64
Based on the metadata provided by the World Bank, we can highlight a region:
europe = meta.loc[meta.Region == "Europe & Central Asia"].index
europe[:10]
Index(['ALB', 'AND', 'ARM', 'AUT', 'AZE', 'BEL', 'BGR', 'BIH', 'BLR', 'CHE'], dtype='object', name='Country Code')
ax = data[data["refugees"] > 1e-10].plot.scatter(y="refugees", x="gdp", loglog=True)
ax = data.loc[data.index.intersection(europe)].plot.scatter(y="refugees", x="gdp", ax=ax, color="r", label="Europe & Central Asia")
plt.title("refugees fraction vs. gdp")
plt.show()
(As we lost some countries with missing data when we called dropna above, we need the data.index.intersection call to select only those country codes that are actually contained in our data.)
We now look at a tiny subset of this data and explore ways to fit a function to it.
NumPy, SciPy, and statsmodels provide a huge number of options; we will look at a few of increasing complexity and flexibility.
First we select our subset:
europe_small = ['AUT',
'DEU',
'FRA',
'ITA',
]
data_eu = data.loc[europe_small].dropna()
ax = data_eu.plot.scatter(y="refugees", x="gdp", color="r")
plt.title("refugees fraction vs. gdp")
plt.show()
and we create a vector with all the x values we will need to plot our fit result
x = np.linspace(data_eu["gdp"].min(), data_eu["gdp"].max(), 100)
polyfit (from NumPy) is probably the easiest way to fit a polynomial to given data:
from numpy import polyfit, polyval
res = polyfit(data_eu["gdp"], data_eu["refugees"],1)
print(res)
[ 5.20472464e-07 -1.64261467e-02]
ax = data_eu.plot.scatter(y="refugees", x="gdp", color="r")
ax.plot(x, polyval(res, x))
plt.title("refugees fraction vs. gdp")
plt.show()
With curve_fit (from scipy.optimize) you can define an arbitrary fit function:
from scipy.optimize import curve_fit
def fit_function(x,b,c):
return b*x+c
res = curve_fit(fit_function, data_eu["gdp"], data_eu["refugees"])
print(res)
(array([ 5.20472463e-07, -1.64261467e-02]), array([[ 3.86848840e-15, -1.62289029e-10],
[-1.62289029e-10, 6.90771771e-06]]))
ax = data_eu.plot.scatter(y="refugees", x="gdp", color="r")
ax.plot(x, fit_function(x, *(res[0])))
plt.title("refugees fraction vs. gdp")
plt.show()
Finally, leastsq allows you to specify even the cost function. With this you can factor in uncertainties or weights for your data points (see the sketch after this example).
from scipy.optimize import leastsq
def fit_function(x, p):
return x*p[0]+p[1]
def error_function(params):
return data_eu["refugees"] - fit_function(data_eu["gdp"], params)
res = leastsq(error_function, [0,0])
print(res)
(array([ 5.20472464e-07, -1.64261467e-02]), 3)
ax = data_eu.plot.scatter(y="refugees", x="gdp", color="r")
ax.plot(x, fit_function(x, res[0]))
plt.title("refugees fraction vs. gdp")
plt.show()
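As mentioned above, leastsq also lets you weight the residuals, e.g. by per-point uncertainties. A minimal sketch with a purely hypothetical uncertainty Series sigma (here simply 10% of each value, assumed for illustration only):
# hypothetical uncertainties, assumed for illustration
sigma = 0.1 * data_eu["refugees"]

def weighted_error_function(params):
    # residuals divided by their uncertainty: noisy points contribute less
    return (data_eu["refugees"] - fit_function(data_eu["gdp"], params)) / sigma

res_weighted = leastsq(weighted_error_function, [0, 0])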
As a final option, statsmodels provides a full statistical summary via its formula interface:
import statsmodels.formula.api as smf
res = smf.ols("refugees ~ gdp", data=data_eu).fit()
print(res.summary())
OLS Regression Results
==============================================================================
Dep. Variable: refugees R-squared: 0.972
Model: OLS Adj. R-squared: 0.958
Method: Least Squares F-statistic: 70.03
Date: Tue, 12 Jul 2022 Prob (F-statistic): 0.0140
Time: 07:41:11 Log-Likelihood: 25.185
No. Observations: 4 AIC: -46.37
Df Residuals: 2 BIC: -47.60
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept -0.0164 0.003 -6.250 0.025 -0.028 -0.005
gdp 5.205e-07 6.22e-08 8.368 0.014 2.53e-07 7.88e-07
==============================================================================
Omnibus: nan Durbin-Watson: 3.279
Prob(Omnibus): nan Jarque-Bera (JB): 0.402
Skew: -0.551 Prob(JB): 0.818
Kurtosis: 1.905 Cond. No. 3.52e+05
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 3.52e+05. This might indicate that there are
strong multicollinearity or other numerical problems.
/usr/lib/python3/dist-packages/statsmodels/stats/stattools.py:74: ValueWarning: omni_normtest is not valid with less than 8 observations; 4 samples were given.
warn("omni_normtest is not valid with less than 8 observations; %i "
print(res.params)
Intercept   -1.642615e-02
gdp          5.204725e-07
dtype: float64
ax = data_eu.plot.scatter(y="refugees", x="gdp", color="r")
ax.plot(x, res.params[1]*x+res.params[0])
plt.title("refugees fraction vs. gdp")
plt.show()
Back to the small frame from the beginning: passing a single value to loc returns a Series.
frame.loc["a"]
primes    11
fibo       1
0-4        0
Name: a, dtype: int64
Passing a list to loc returns a DataFrame (even if the list contains a single value).
frame.loc[["a"]]
| primes | fibo | 0-4 | |
|---|---|---|---|
| a | 11 | 1 | 0 |
frame.loc[["a","c"]]
| primes | fibo | 0-4 | |
|---|---|---|---|
| a | 11 | 1 | 0 |
| c | 17 | 2 | 2 |
Slicing also works (but, unlike normal Python slicing, it includes the upper boundary).
frame.loc["b":"d"]
| primes | fibo | 0-4 | |
|---|---|---|---|
| b | 13 | 1 | 1 |
| c | 17 | 2 | 2 |
| d | 19 | 3 | 3 |
A list of boolean values, with one entry per row, is treated as a mask that selects rows.
frame.loc[[True,False,True,False,True]]
| primes | fibo | 0-4 | |
|---|---|---|---|
| a | 11 | 1 | 0 |
| c | 17 | 2 | 2 |
| e | 23 | 5 | 4 |
Instead of a list, a boolean Series can be used; rows are matched on the index. (frame[["primes"]] > 20 would not work, as it returns a DataFrame instead of a Series.)
frame.loc[frame["primes"] > 20]
| primes | fibo | 0-4 | |
|---|---|---|---|
| e | 23 | 5 | 4 |
When using a mask, .loc is optional (but recommended to avoid confusion with columns).
frame[frame["primes"] > 20]
| primes | fibo | 0-4 | |
|---|---|---|---|
| e | 23 | 5 | 4 |
Using iloc it is possible to access rows by position as well (without using the index).
frame.iloc[2:-1]
| primes | fibo | 0-4 | |
|---|---|---|---|
| c | 17 | 2 | 2 |
| d | 19 | 3 | 3 |
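iloc also accepts single positions and works on both axes — a small sketch:
frame.iloc[0]      # first row as a Series
frame.iloc[:, 0]   # first column as a Series
frame.iloc[1, 2]   # a single scalar value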
Columns are selected by subscripting the frame directly. Again, passing a single value returns a Series.
frame["primes"]
a    11
b    13
c    17
d    19
e    23
Name: primes, dtype: int64
While a list returns a DataFrame
frame[["primes"]]
| primes | |
|---|---|
| a | 11 |
| b | 13 |
| c | 17 |
| d | 19 |
| e | 23 |
frame[["primes","0-4"]]
| primes | 0-4 | |
|---|---|---|
| a | 11 | 0 |
| b | 13 | 1 |
| c | 17 | 2 |
| d | 19 | 3 |
| e | 23 | 4 |
Instead of subscripting, the get-method can be used.
frame.get(["primes","0-4"])
| primes | 0-4 | |
|---|---|---|
| a | 11 | 0 |
| b | 13 | 1 |
| c | 17 | 2 |
| d | 19 | 3 |
| e | 23 | 4 |
For single columns, an attribute with the same name exists
frame.primes
a    11
b    13
c    17
d    19
e    23
Name: primes, dtype: int64
But this fails if the column name is not a valid attribute name
# Raises SyntaxError
#frame.0-4
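In that case, fall back to subscripting:
frame["0-4"]
a    0
b    1
c    2
d    3
e    4
Name: 0-4, dtype: int64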
For even more options have a look at the pandas-website: https://pandas.pydata.org/pandas-docs/stable/indexing.html