import pandas as pd


# Build a Series from a plain list; pandas assigns the default integer index 0..4.
pd.Series([11,13,17,19,23])

0    11
1    13
2    17
3    19
4    23
dtype: int64


# Build a Series with explicit string labels: the dict maps each label to its value.
series = pd.Series({"a": 11, "b": 13, "c": 17, "d": 19, "e": 23})
print(series)

a    11
b    13
c    17
d    19
e    23
dtype: int64


# The index holds the row labels of the Series.
series.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')


# The underlying data, returned as a NumPy array.
series.values

array([11, 13, 17, 19, 23])


# Type "series." and press TAB to list the available attributes and methods.


# Build a DataFrame from a dict of columns: an existing Series (which supplies
# the row labels), a plain list and a range.
frame = pd.DataFrame({"primes": series, "fibo": [1,1,2,3,5], "0-4": range(5)})


print(frame)

   primes  fibo  0-4
a      11     1    0
b      13     1    1
c      17     2    2
d      19     3    3
e      23     5    4


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# Load the refugee data; the first four lines of the file are skipped.
refugees = pd.read_csv("data/refugee-population.csv", skiprows=4)


refugees.head()


# Use the country code as row label so that countries can be selected with .loc.
refugees.set_index(["Country Code"], inplace=True)


refugees.head()


# Select rows by label.
refugees.loc[["CHE","DEU"]]


# Select columns by name; the year columns are labelled with strings.
refugees[["1990","2000"]].head()


# The same column selection using .get().
refugees.get(["1990","2000"]).head()


# Row for Switzerland, restricted to the year columns 1990-2022.
che = refugees.loc["CHE"][[str(year) for year in range(1990,2023)]]


# Plot the series, leaving out years without data.
che.dropna().plot()
plt.show()


# The index labels are still strings at this point.
che.index.values

array(['1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997',
       '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005',
       '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013',
       '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021',
       '2022'], dtype=object)


# Convert the year strings into datetime values (1 January of each year).
che.index = pd.to_datetime(che.index, format="%Y")
print(che.index)

DatetimeIndex(['1990-01-01', '1991-01-01', '1992-01-01', '1993-01-01',
               '1994-01-01', '1995-01-01', '1996-01-01', '1997-01-01',
               '1998-01-01', '1999-01-01', '2000-01-01', '2001-01-01',
               '2002-01-01', '2003-01-01', '2004-01-01', '2005-01-01',
               '2006-01-01', '2007-01-01', '2008-01-01', '2009-01-01',
               '2010-01-01', '2011-01-01', '2012-01-01', '2013-01-01',
               '2014-01-01', '2015-01-01', '2016-01-01', '2017-01-01',
               '2018-01-01', '2019-01-01', '2020-01-01', '2021-01-01',
               '2022-01-01'],
              dtype='datetime64[ns]', freq=None)


# Plot the yearly values together with a rolling mean over 5 entries (5 years);
# center=False labels each mean at the right edge of its window.
che.plot()
che.rolling(center=False,window=5).mean().plot()
plt.show()


!head data/metadata-countries_population.csv



"AFG","South Asia","Low income","The reporting period for national accounts data is designated as either calendar year basis (CY) or fiscal year basis (FY). For this country, it is fiscal year-based (fiscal year-end: March 20). Also, an estimate (PA.NUS.ATLS) of the exchange rate covers the same period and thus differs from the official exchange rate (CY).


# Load the country metadata file.
meta = pd.read_csv("data/metadata-countries_population.csv")


meta.columns

Index(['Country Code', 'Region', 'IncomeGroup', 'SpecialNotes', 'TableName',
       'Unnamed: 5'],
      dtype='object')


# Only the country code and its region are needed below.
meta = meta[['Country Code', 'Region']]


meta.head()


meta.set_index("Country Code", inplace=True)


# Codes of all rows that have no region. These are taken to be aggregates
# rather than countries (hence the name) -- the printed list includes e.g. 'WLD'.
non_countries = meta.loc[meta.Region.isnull()].index
print(non_countries)

Index(['AFE', 'AFW', 'ARB', 'CEB', 'CSS', 'EAP', 'EAR', 'EAS', 'ECA', 'ECS',
       'EMU', 'EUU', 'FCS', 'HIC', 'HPC', 'IBD', 'IBT', 'IDA', 'IDB', 'IDX',
       'LAC', 'LCN', 'LDC', 'LIC', 'LMC', 'LMY', 'LTE', 'MEA', 'MIC', 'MNA',
       'NAC', 'OED', 'OSS', 'PRE', 'PSS', 'PST', 'SAS', 'SSA', 'SSF', 'SST',
       'TEA', 'TEC', 'TLA', 'TMN', 'TSA', 'TSS', 'UMC', 'WLD'],
      dtype='object', name='Country Code')


# Remove the non-country rows from the refugee data.
refugees = refugees.drop(non_countries)


refugees.columns

Index(['Country Name', 'Indicator Name', 'Indicator Code', '1960', '1961',
       '1962', '1963', '1964', '1965', '1966', '1967', '1968', '1969', '1970',
       '1971', '1972', '1973', '1974', '1975', '1976', '1977', '1978', '1979',
       '1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987', '1988',
       '1989', '1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997',
       '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006',
       '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015',
       '2016', '2017', '2018', '2019', '2020', '2021', '2022', 'Unnamed: 67'],
      dtype='object')


# Number of rows that have a value in the 2022 column.
refugees["2022"].notna().sum()

163


# Keep only the six most recent years. last_year is chosen by hand: it should be
# the latest year that still has enough non-null entries (see the count above).
last_year = 2022
useful_cols = [str(year) for year in range(last_year - 5, last_year + 1)]


useful_cols

['2017', '2018', '2019', '2020', '2021', '2022']


# Restrict the refugee data to the selected year columns.
refugees = refugees[useful_cols]


# Row-wise mean: one average over the selected years per country.
refugee_means = refugees.mean(axis=1)


def load_file(file, columns=None, exclude=None):
    """Load a World Bank CSV file and reduce it to the rows and columns of interest.

    The first four lines of the file are skipped, the ``Country Code`` column
    becomes the index, the rows listed in ``exclude`` are removed and only
    ``columns`` are kept.

    Parameters
    ----------
    file : str, path-like or file-like
        Anything accepted by ``pandas.read_csv``.
    columns : list of str, optional
        Column labels to keep. Defaults to the module-level ``useful_cols``.
    exclude : list-like of str, optional
        Index labels (country codes) to drop. Defaults to the module-level
        ``non_countries``.

    Returns
    -------
    means : pandas.Series
        Row-wise mean over ``columns`` (missing values are ignored).
    data : pandas.DataFrame
        The reduced table the means were computed from.

    Raises
    ------
    KeyError
        If a label in ``exclude`` or ``columns`` is not present in the file.
    """
    # Fall back to the notebook's globals so that existing one-argument calls
    # keep working unchanged.
    if columns is None:
        columns = useful_cols
    if exclude is None:
        exclude = non_countries

    data = (
        pd.read_csv(file, skiprows=4)
        .set_index("Country Code")
        .drop(exclude)
    )
    data = data[columns]
    return data.mean(axis=1), data


# Same processing for the GDP-per-capita file: per-country means plus the yearly values.
gdp_means, gdp = load_file("data/gdp-per-capita.csv")


gdp_means.head()

Country Code
ABW    29195.590031
AFG      482.654165
AGO     2219.687217
ALB     5622.992095
AND    41023.002828
dtype: float64


gdp.head()


population_means, population = load_file("data/population.csv")


# Combine mean GDP with the mean number of refugees divided by the mean
# population; rows with a missing value in either column are dropped.
data = pd.DataFrame({"gdp": gdp_means, "refugees": refugee_means/population_means}).dropna()


data.plot.scatter("gdp", "refugees")
plt.show()


# Rows whose refugee fraction exceeds 0.1; where() blanks the other rows and
# dropna() then removes them.
data.where(data["refugees"]>0.1).dropna()


# Log-log scatter plot. The tiny threshold presumably excludes zero values,
# which cannot be shown on a logarithmic axis. Switzerland is drawn in red.
ax = data[data["refugees"] > 1e-10].plot.scatter(y="refugees", x="gdp", loglog=True)
ax = data.loc[["CHE"]].plot.scatter(y="refugees", x="gdp", ax=ax, color="r", label="Switzerland")
plt.title("refugees fraction vs. gdp")
plt.show()


data.loc["CHE"]

gdp         86890.390629
refugees        0.014023
Name: CHE, dtype: float64


# Codes of all entries whose region is "Europe & Central Asia".
europe = meta.loc[meta.Region == "Europe & Central Asia"].index


europe[:10]

Index(['ALB', 'AND', 'ARM', 'AUT', 'AZE', 'BEL', 'BGR', 'BIH', 'BLR', 'CHE'], dtype='object', name='Country Code')


# Same log-log plot, now drawing in red every code of the region that is also
# present in data (intersection avoids looking up missing labels).
ax = data[data["refugees"] > 1e-10].plot.scatter(y="refugees", x="gdp", loglog=True)
ax = data.loc[data.index.intersection(europe)].plot.scatter(y="refugees", x="gdp", ax=ax, color="r", label="Europe & Central Asia")
plt.title("refugees fraction vs. gdp")
plt.show()


# Hand-picked sample of four European country codes.
europe_small = ["AUT", "DEU", "FRA", "ITA"]


# Rows of data for the selected countries.
data_eu = data.loc[europe_small].dropna()
data_eu


ax = data_eu.plot.scatter(y="refugees", x="gdp", color="r")
plt.title("refugees fraction vs. gdp")
plt.show()


# 100 evenly spaced GDP values spanning the range of data_eu.
x = np.linspace(data_eu["gdp"].min(), data_eu["gdp"].max(), 100)


from numpy import polyfit, polyval


# Least-squares fit of a degree-1 polynomial; the result is [slope, intercept].
res = polyfit(data_eu["gdp"], data_eu["refugees"],1)
print(res)

[ 8.99316549e-07 -2.81665523e-02]


# Scatter plot of the data with the fitted polynomial evaluated at x drawn on top.
ax = data_eu.plot.scatter(y="refugees", x="gdp", color="r")
ax.plot(x, polyval(res, x))
plt.title("refugees fraction vs. gdp")
plt.show()


from scipy.optimize import curve_fit


def fit_function(x, b, c):
    """Evaluate the straight line with slope ``b`` and intercept ``c`` at ``x``."""
    slope_part = b * x
    return slope_part + c


# curve_fit returns a tuple: the fitted parameters and their covariance matrix.
res = curve_fit(fit_function, data_eu["gdp"], data_eu["refugees"])
print(res)

(array([ 8.99316549e-07, -2.81665522e-02]), array([[ 1.54432316e-14, -6.66890325e-10],
       [-6.66890325e-10,  2.94526038e-05]]))


# res[0] holds the fitted parameters, unpacked here into b and c.
ax = data_eu.plot.scatter(y="refugees", x="gdp", color="r")
ax.plot(x, fit_function(x, *(res[0])))
plt.title("refugees fraction vs. gdp")
plt.show()


from scipy.optimize import leastsq


def fit_function(x, p):
    """Evaluate a straight line at ``x``; ``p[0]`` is the slope, ``p[1]`` the intercept."""
    slope = p[0]
    intercept = p[1]
    return x * slope + intercept


def error_function(params):
    """Return the residuals of the linear model for the given parameters.

    The residual is the ``refugees`` column of ``data_eu`` minus the value of
    ``fit_function`` evaluated at the ``gdp`` column with ``params``.
    """
    observed = data_eu["refugees"]
    predicted = fit_function(data_eu["gdp"], params)
    return observed - predicted


# Minimise the sum of squared residuals, starting from slope 0 and intercept 0.
res = leastsq(error_function, [0,0])
print(res)

(array([ 8.99316549e-07, -2.81665523e-02]), 3)


# res[0] holds the fitted parameter array that is passed on as p.
ax = data_eu.plot.scatter(y="refugees", x="gdp", color="r")
ax.plot(x, fit_function(x, res[0]))
plt.title("refugees fraction vs. gdp")
plt.show()


import statsmodels.formula.api as smf


# Ordinary least squares specified by a formula: refugees modelled as a linear
# function of gdp.
res = smf.ols("refugees ~ gdp", data=data_eu).fit()


print(res.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:               refugees   R-squared:                       0.963
Model:                            OLS   Adj. R-squared:                  0.945
Method:                 Least Squares   F-statistic:                     52.37
Date:                Tue, 11 Jul 2023   Prob (F-statistic):             0.0186
Time:                        12:17:27   Log-Likelihood:                 21.418
No. Observations:                   4   AIC:                            -38.84
Df Residuals:                       2   BIC:                            -40.06
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -0.0282      0.005     -5.190      0.035      -0.052      -0.005
gdp         8.993e-07   1.24e-07      7.237      0.019    3.65e-07    1.43e-06
==============================================================================
Omnibus:                          nan   Durbin-Watson:                   3.069
Prob(Omnibus):                    nan   Jarque-Bera (JB):                0.688
Skew:                          -0.913   Prob(JB):                        0.709
Kurtosis:                       2.112   Cond. No.                     2.93e+05
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.93e+05. This might indicate that there are
strong multicollinearity or other numerical problems.

/usr/lib/python3/dist-packages/statsmodels/stats/stattools.py:74: ValueWarning: omni_normtest is not valid with less than 8 observations; 4 samples were given.
  warn("omni_normtest is not valid with less than 8 observations; %i "


# Fitted coefficients, labelled by term.
print(res.params)

Intercept   -2.816655e-02
gdp          8.993165e-07
dtype: float64


# Draw the statsmodels fit. The coefficients are looked up by label rather than
# by integer position: res.params is a label-indexed Series, and treating
# integer keys as positions on such a Series is deprecated in recent pandas.
ax = data_eu.plot.scatter(y="refugees", x="gdp", color="r")
ax.plot(x, res.params["gdp"]*x+res.params["Intercept"])
plt.title("refugees fraction vs. gdp")
plt.show()


# A single label returns the row as a Series.
frame.loc["a"]

primes    11
fibo       1
0-4        0
Name: a, dtype: int64


# A list of labels returns a DataFrame, even for a single label.
frame.loc[["a"]]


frame.loc[["a","c"]]


# Label slices include both endpoints.
frame.loc["b":"d"]


# Boolean list: keeps the rows marked True.
frame.loc[[True,False,True,False,True]]


# Boolean Series as a row filter.
frame.loc[frame["primes"] > 20]


# The same filter without .loc.
frame[frame["primes"] > 20]


# Position-based slicing: from the third row up to, but excluding, the last.
frame.iloc[2:-1]


# A single column name returns the column as a Series.
frame["primes"]

a    11
b    13
c    17
d    19
e    23
Name: primes, dtype: int64


# A list of column names returns a DataFrame, even for a single name.
frame[["primes"]]


frame[["primes","0-4"]]


# The same column selection using .get().
frame.get(["primes","0-4"])


# Attribute access to a column.
frame.primes

a    11
b    13
c    17
d    19
e    23
Name: primes, dtype: int64


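# Attribute access only works for column names that are valid Python
# identifiers. "0-4" is not one, so the line below raises a SyntaxError;
# use frame["0-4"] instead.
#frame.0-4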

	Country Name	Country Code	Indicator Name	Indicator Code	1960	1961	1962	1963	1964	1965	...	2014	2015	2016	2017	2018	2019	2020	2021	2022	Unnamed: 67
0	Aruba	ABW	Refugee population by country or territory of ...	SM.POP.REFG	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1	Africa Eastern and Southern	AFE	Refugee population by country or territory of ...	SM.POP.REFG	NaN	NaN	NaN	NaN	NaN	NaN	...	2637640.0	3333273.0	3990478.0	5155400.0	5114399.0	5087755.0	5183533.0	5436720.0	5412266.0	NaN
2	Afghanistan	AFG	Refugee population by country or territory of ...	SM.POP.REFG	NaN	NaN	NaN	NaN	NaN	NaN	...	300421.0	257553.0	59770.0	75927.0	72228.0	72227.0	72278.0	66949.0	52159.0	NaN
3	Africa Western and Central	AFW	Refugee population by country or territory of ...	SM.POP.REFG	NaN	NaN	NaN	NaN	NaN	NaN	...	1108169.0	1138010.0	1200854.0	1172523.0	1285773.0	1315229.0	1474135.0	1631057.0	1705777.0	NaN
4	Angola	AGO	Refugee population by country or territory of ...	SM.POP.REFG	NaN	NaN	NaN	NaN	NaN	NaN	...	15468.0	15547.0	15547.0	41119.0	39856.0	25793.0	25791.0	26045.0	25514.0	NaN

	Country Name	Indicator Name	Indicator Code	1960	1961	1962	1963	1964	1965	1966	...	2014	2015	2016	2017	2018	2019	2020	2021	2022	Unnamed: 67
Country Code
ABW	Aruba	Refugee population by country or territory of ...	SM.POP.REFG	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
AFE	Africa Eastern and Southern	Refugee population by country or territory of ...	SM.POP.REFG	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	2637640.0	3333273.0	3990478.0	5155400.0	5114399.0	5087755.0	5183533.0	5436720.0	5412266.0	NaN
AFG	Afghanistan	Refugee population by country or territory of ...	SM.POP.REFG	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	300421.0	257553.0	59770.0	75927.0	72228.0	72227.0	72278.0	66949.0	52159.0	NaN
AFW	Africa Western and Central	Refugee population by country or territory of ...	SM.POP.REFG	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	1108169.0	1138010.0	1200854.0	1172523.0	1285773.0	1315229.0	1474135.0	1631057.0	1705777.0	NaN
AGO	Angola	Refugee population by country or territory of ...	SM.POP.REFG	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	15468.0	15547.0	15547.0	41119.0	39856.0	25793.0	25791.0	26045.0	25514.0	NaN

	Country Name	Indicator Name	Indicator Code	1960	1961	1962	1963	1964	1965	1966	...	2014	2015	2016	2017	2018	2019	2020	2021	2022	Unnamed: 67
Country Code
CHE	Switzerland	Refugee population by country or territory of ...	SM.POP.REFG	20000.0	20000.0	20000.0	20000.0	20000.0	20000.0	20500.0	...	62596.0	73326.0	82668.0	93030.0	104011.0	110162.0	115798.0	118829.0	182474.0	NaN
DEU	Germany	Refugee population by country or territory of ...	SM.POP.REFG	197000.0	190000.0	185000.0	182000.0	180000.0	180000.0	140000.0	...	216956.0	316098.0	669468.0	970357.0	1063835.0	1146682.0	1210596.0	1255694.0	2075445.0	NaN

	1990	2000
Country Code
ABW	NaN	NaN
AFE	4709569.0	2444941.0
AFG	50.0	NaN
AFW	932052.0	968325.0
AGO	11557.0	12086.0

	1990	2000
Country Code
ABW	NaN	NaN
AFE	4709569.0	2444941.0
AFG	50.0	NaN
AFW	932052.0	968325.0
AGO	11557.0	12086.0

Pandas

Basic Data Structures

Series

DataFrame

Refugee Example

Loading and Accessing Data

Working with a Single Country

Removing Unwanted Data

Excluding Non-Countries

Excluding Columns

Loading Additional Files

Creating the Plot

Highlighting a Full Region

Fitting

Preparations

polyfit

curve_fit

leastsq

statsmodels

Appendix: Selecting from DataFrames

Accessing Rows

Accessing Columns

	Country Code	Region
0	ABW	Latin America & Caribbean
1	AFE	NaN
2	AFG	South Asia
3	AFW	NaN
4	AGO	Sub-Saharan Africa

	2017	2018	2019	2020	2021	2022
Country Code
ABW	29326.708058	30918.515218	31902.762582	24487.863569	29342.100730	NaN
AFG	530.149863	502.057099	500.522981	516.866797	363.674087	NaN
AGO	2283.214233	2487.500996	2142.238757	1502.950754	1903.717405	2998.501158
ALB	4531.032207	5287.660817	5396.214227	5343.037704	6377.203096	6802.804519
AND	40632.231554	42904.828456	41328.600499	37207.222000	42072.341103	41992.793358

	gdp	refugees
Country Code
JOR	4103.047067	0.277147
LBN	7313.453707	0.239103
PSE	3590.180327	0.491745

	gdp	refugees
Country Code
AUT	50590.749602	0.017460
DEU	47632.398320	0.015473
FRA	40751.983647	0.006578
ITA	33758.229029	0.003165