import pandas as pd

pd.Series([11,13,17,19,23])

0    11
1    13
2    17
3    19
4    23
dtype: int64

series = pd.Series([11,13,17,19,23], index=['a', 'b', 'c', 'd', 'e'])
print(series)

a    11
b    13
c    17
d    19
e    23
dtype: int64

series.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

series.values

array([11, 13, 17, 19, 23])

# In IPython/Jupyter: type "series." and press TAB to list the available attributes and methods.

frame = pd.DataFrame({"primes": series, "fibo": [1,1,2,3,5], "0-4": range(5)})

print(frame)

   primes  fibo  0-4
a      11     1    0
b      13     1    1
c      17     2    2
d      19     3    3
e      23     5    4

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read the refugee-population CSV; skiprows=4 skips the first four lines of
# the file, so the fifth line is used as the header row.
refugees = pd.read_csv("data/refugee-population.csv", skiprows=4)

# Preview the first rows of the raw table.
refugees.head()

# Use the country code as the row index so rows can be selected by code.
# This modifies `refugees` in place.
refugees.set_index(["Country Code"], inplace=True)

refugees.head()

# A list of labels passed to .loc selects several rows and returns a DataFrame.
refugees.loc[["CHE","DEU"]]

# A list of column names inside [] selects several columns.
refugees[["1990","2000"]].head()

# .get() selects the same columns here; unlike [], it returns a default
# (None) instead of raising when the key is not found.
refugees.get(["1990","2000"]).head()

# Take the CHE row and keep only the year columns 1990-2023. The column
# labels are strings, hence the str() conversion.
che = refugees.loc["CHE"][[str(year) for year in range(1990,2024)]]

# Plot the series, leaving out the years that have no value.
che.dropna().plot()
plt.show()

che.index.values

array(['1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997',
       '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005',
       '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013',
       '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021',
       '2022', '2023'], dtype=object)

che.index = pd.to_datetime(che.index, format="%Y")
print(che.index)

DatetimeIndex(['1990-01-01', '1991-01-01', '1992-01-01', '1993-01-01',
               '1994-01-01', '1995-01-01', '1996-01-01', '1997-01-01',
               '1998-01-01', '1999-01-01', '2000-01-01', '2001-01-01',
               '2002-01-01', '2003-01-01', '2004-01-01', '2005-01-01',
               '2006-01-01', '2007-01-01', '2008-01-01', '2009-01-01',
               '2010-01-01', '2011-01-01', '2012-01-01', '2013-01-01',
               '2014-01-01', '2015-01-01', '2016-01-01', '2017-01-01',
               '2018-01-01', '2019-01-01', '2020-01-01', '2021-01-01',
               '2022-01-01', '2023-01-01'],
              dtype='datetime64[ns]', freq=None)

# Plot the yearly values together with a 5-point rolling mean on the same axes.
# center=False labels each window at its right edge, so the mean shown for a
# given year covers that year and the four years before it.
che.plot()
che.rolling(center=False,window=5).mean().plot()
plt.show()

!head data/metadata-countries_population.csv

﻿"Country Code","Region","IncomeGroup","SpecialNotes","TableName",

"ABW","Latin America & Caribbean","High income","","Aruba",

"AFE","","","26 countries, stretching from the Red Sea in the North to the Cape of Good Hope in the South (https://www.worldbank.org/en/region/afr/eastern-and-southern-africa)","Africa Eastern and Southern",

"AFG","South Asia","Low income","The reporting period for national accounts data is designated as either calendar year basis (CY) or fiscal year basis (FY). For this country, it is fiscal year-based (fiscal year-end: March 20). Also, an estimate (PA.NUS.ATLS) of the exchange rate covers the same period and thus differs from the official exchange rate (CY).

In addition, the World Bank systematically assesses the appropriateness of official exchange rates as conversion factors. In this country, multiple or dual exchange rate activity exists and must be accounted for appropriately in underlying statistics. An alternative estimate (“alternative conversion factor” - PA.NUS.ATLS) is thus calculated as a weighted average of the different exchange rates in use in the country. Doing so better reflects economic reality and leads to more accurate cross-country comparisons and country classifications by income level. For this country, this applies to the period 1960-2006. Alternative conversion factors are used in the Atlas methodology and elsewhere in World Development Indicators as single-year conversion factors.","Afghanistan",

"AFW","","","22 countries, stretching from the westernmost point of Africa, across the equator, and partly along the Atlantic Ocean till the Republic of Congo in the South (https://www.worldbank.org/en/region/afr/western-and-central-africa)","Africa Western and Central",

"AGO","Sub-Saharan Africa","Lower middle income","The World Bank systematically assesses the appropriateness of official exchange rates as conversion factors. In this country, multiple or dual exchange rate activity exists and must be accounted for appropriately in underlying statistics. An alternative estimate (“alternative conversion factor” - PA.NUS.ATLS) is thus calculated as a weighted average of the different exchange rates in use in the country. Doing so better reflects economic reality and leads to more accurate cross-country comparisons and country classifications by income level. For this country, this applies to the period 1994-2023. Alternative conversion factors are used in the Atlas methodology and elsewhere in World Development Indicators as single-year conversion factors.","Angola",

"ALB","Europe & Central Asia","Upper middle income","","Albania",

"AND","Europe & Central Asia","High income","","Andorra",

meta = pd.read_csv("data/metadata-countries_population.csv")

meta.columns

Index(['Country Code', 'Region', 'IncomeGroup', 'SpecialNotes', 'TableName',
       'Unnamed: 5'],
      dtype='object')

# Keep only the two columns that are used below.
meta = meta[['Country Code', 'Region']]

meta.head()

# Index by country code so the labels match the index of `refugees`.
meta.set_index("Country Code", inplace=True)

# Collect the codes of all entries that have no Region. Judging by the printed
# codes (e.g. WLD, EUU) these look like aggregates rather than countries —
# verify against the metadata file.
non_countries = meta.loc[meta.Region.isnull()].index
print(non_countries)

Index(['AFE', 'AFW', 'ARB', 'CEB', 'CSS', 'EAP', 'EAR', 'EAS', 'ECA', 'ECS',
       'EMU', 'EUU', 'FCS', 'HIC', 'HPC', 'IBD', 'IBT', 'IDA', 'IDB', 'IDX',
       'LAC', 'LCN', 'LDC', 'LIC', 'LMC', 'LMY', 'LTE', 'MEA', 'MIC', 'MNA',
       'NAC', 'OED', 'OSS', 'PRE', 'PSS', 'PST', 'SAS', 'SSA', 'SSF', 'SST',
       'TEA', 'TEC', 'TLA', 'TMN', 'TSA', 'TSS', 'UMC', 'WLD'],
      dtype='object', name='Country Code')

refugees = refugees.drop(non_countries)

refugees.columns

Index(['Country Name', 'Indicator Name', 'Indicator Code', '1960', '1961',
       '1962', '1963', '1964', '1965', '1966', '1967', '1968', '1969', '1970',
       '1971', '1972', '1973', '1974', '1975', '1976', '1977', '1978', '1979',
       '1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987', '1988',
       '1989', '1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997',
       '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006',
       '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015',
       '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023',
       'Unnamed: 68'],
      dtype='object')

np.sum(refugees["2023"].notnull())

np.int64(165)

# Year columns to keep: the six most recent years, ending with the last year
# that contains data (2023, depending on the non-null count printed above).
# The column labels are strings, hence the str() conversion.
last_year = 2023
useful_cols = [str(year) for year in range(last_year - 5, last_year + 1)]

useful_cols

['2018', '2019', '2020', '2021', '2022', '2023']

refugees = refugees[useful_cols]

refugee_means = refugees.mean(axis=1)

def load_file(file, *, exclude=None, columns=None):
    """Load a World Bank indicator CSV and average it per country.

    Parameters
    ----------
    file : str, path object or file-like object
        CSV source handed straight to ``pandas.read_csv``. The first four
        lines are skipped, so the header row is expected on the fifth line,
        and a "Country Code" column must be present.
    exclude : list-like of labels, optional
        Index labels (country codes) to remove. Defaults to the
        module-level ``non_countries``.
    columns : list of str, optional
        Columns to keep. Defaults to the module-level ``useful_cols``.

    Returns
    -------
    tuple of (pandas.Series, pandas.DataFrame)
        The row-wise mean over the kept columns (missing values are skipped,
        the pandas default) and the filtered table itself, both indexed by
        country code.

    Raises
    ------
    KeyError
        If a label in ``exclude`` or a name in ``columns`` is not present
        in the file.
    """
    # Resolve the defaults at call time so that the module-level selections
    # in effect when the function is called are the ones used.
    if exclude is None:
        exclude = non_countries
    if columns is None:
        columns = useful_cols

    table = pd.read_csv(file, skiprows=4).set_index("Country Code")
    # Drop the unwanted rows first, then narrow down to the wanted columns.
    table = table.drop(exclude)[columns]
    return table.mean(axis=1), table

gdp_means, gdp = load_file("data/gdp-per-capita.csv")

gdp_means.head()

Country Code
ABW    29851.603885
AFG      442.053744
AGO     2225.540349
ALB     6263.667603
AND    42067.093210
dtype: float64

gdp.head()

population_means, population = load_file("data/population.csv")

data = pd.DataFrame({"gdp": gdp_means, "refugees": refugee_means/population_means}).dropna()

data.plot.scatter("gdp", "refugees")
plt.show()

# Rows where refugees make up more than 10% of the population.
# Boolean indexing filters the rows directly, without masking to NaN first.
data[data["refugees"] > 0.1]

# Log-log scatter of all countries. The > 1e-10 filter removes rows whose
# refugee fraction is (practically) zero, which have no position on a log axis.
ax = data[data["refugees"] > 1e-10].plot.scatter(y="refugees", x="gdp", loglog=True)
# Draw Switzerland on the same axes in red. The list in .loc[["CHE"]] keeps the
# result a DataFrame, so .plot.scatter is available.
ax = data.loc[["CHE"]].plot.scatter(y="refugees", x="gdp", ax=ax, color="r", label="Switzerland")
plt.title("refugees fraction vs. gdp")
plt.show()

data.loc["CHE"]

gdp         90323.060451
refugees        0.015824
Name: CHE, dtype: float64

europe = meta.loc[meta.Region == "Europe & Central Asia"].index

europe[:10]

Index(['ALB', 'AND', 'ARM', 'AUT', 'AZE', 'BEL', 'BGR', 'BIH', 'BLR', 'CHE'], dtype='object', name='Country Code')

# Log-log scatter of all countries with a non-zero refugee fraction.
ax = data[data["refugees"] > 1e-10].plot.scatter(y="refugees", x="gdp", loglog=True)
# `europe` comes from the metadata, while `data` only holds the rows that
# survived dropna(). Intersecting the two indexes keeps only the codes that
# are present in both, so .loc is never asked for a missing label.
ax = data.loc[data.index.intersection(europe)].plot.scatter(y="refugees", x="gdp", ax=ax, color="r", label="Europe & Central Asia")
plt.title("refugees fraction vs. gdp")
plt.show()

europe_small = ['AUT',
 'DEU',
 'FRA',
 'ITA',
]

data_eu = data.loc[europe_small].dropna()
data_eu

ax = data_eu.plot.scatter(y="refugees", x="gdp", color="r")
plt.title("refugees fraction vs. gdp")
plt.show()

x = np.linspace(data_eu["gdp"].min(), data_eu["gdp"].max(), 100)

from numpy import polyfit, polyval

# Least-squares fit of a degree-1 polynomial (a straight line). The
# coefficients are returned highest power first: res[0] is the slope and
# res[1] the intercept.
res = polyfit(data_eu["gdp"], data_eu["refugees"],1)
print(res)

[ 1.04719896e-06 -3.41693157e-02]

ax = data_eu.plot.scatter(y="refugees", x="gdp", color="r")
ax.plot(x, polyval(res, x))
plt.title("refugees fraction vs. gdp")
plt.show()

from scipy.optimize import curve_fit

def fit_function(x, b, c):
    """Evaluate the straight line ``b*x + c`` (slope ``b``, intercept ``c``)."""
    scaled = b * x
    return scaled + c

res = curve_fit(fit_function, data_eu["gdp"], data_eu["refugees"])
print(res)

(array([ 1.04719896e-06, -3.41693157e-02]), array([[ 2.25835729e-14, -1.00418736e-09],
       [-1.00418736e-09,  4.56445915e-05]]))

ax = data_eu.plot.scatter(y="refugees", x="gdp", color="r")
ax.plot(x, fit_function(x, *(res[0])))
plt.title("refugees fraction vs. gdp")
plt.show()

from scipy.optimize import leastsq

def fit_function(x, p):
    """Evaluate a straight line whose parameters are packed in ``p``.

    ``p[0]`` is the slope and ``p[1]`` the intercept.
    """
    slope = p[0]
    intercept = p[1]
    return x * slope + intercept

def error_function(params):
    """Return the residuals of the line described by ``params``.

    The residuals are the observed "refugees" values of the module-level
    ``data_eu`` table minus the values ``fit_function`` computes from its
    "gdp" column.
    """
    observed = data_eu["refugees"]
    predicted = fit_function(data_eu["gdp"], params)
    return observed - predicted

# Minimise the sum of squared residuals, starting from the initial guess
# slope = 0, intercept = 0. The result is a tuple: the fitted parameter array
# followed by an integer status flag.
res = leastsq(error_function, [0,0])
print(res)

(array([ 1.04719896e-06, -3.41693157e-02]), 1)

ax = data_eu.plot.scatter(y="refugees", x="gdp", color="r")
ax.plot(x, fit_function(x, res[0]))
plt.title("refugees fraction vs. gdp")
plt.show()

import statsmodels.formula.api as smf

res = smf.ols("refugees ~ gdp", data=data_eu).fit()

print(res.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:               refugees   R-squared:                       0.960
Model:                            OLS   Adj. R-squared:                  0.941
Method:                 Least Squares   F-statistic:                     48.56
Date:                Tue, 24 Jun 2025   Prob (F-statistic):             0.0200
Time:                        12:15:39   Log-Likelihood:                 20.583
No. Observations:                   4   AIC:                            -37.17
Df Residuals:                       2   BIC:                            -38.39
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -0.0342      0.007     -5.058      0.037      -0.063      -0.005
gdp         1.047e-06    1.5e-07      6.968      0.020    4.01e-07    1.69e-06
==============================================================================
Omnibus:                          nan   Durbin-Watson:                   3.329
Prob(Omnibus):                    nan   Jarque-Bera (JB):                0.452
Skew:                          -0.507   Prob(JB):                        0.798
Kurtosis:                       1.703   Cond. No.                     3.05e+05
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 3.05e+05. This might indicate that there are
strong multicollinearity or other numerical problems.

/usr/lib/python3/dist-packages/statsmodels/stats/stattools.py:74: ValueWarning: omni_normtest is not valid with less than 8 observations; 4 samples were given.
  warn("omni_normtest is not valid with less than 8 observations; %i "

print(res.params)

Intercept   -0.034169
gdp          0.000001
dtype: float64

ax = data_eu.plot.scatter(y="refugees", x="gdp", color="r")
ax.plot(x, res.params.gdp*x+res.params.Intercept)
plt.title("refugees fraction vs. gdp")
plt.show()

frame.loc["a"]

primes    11
fibo       1
0-4        0
Name: a, dtype: int64

# A list with one label returns a one-row DataFrame (a bare label returns a Series).
frame.loc[["a"]]

# Several labels select several rows.
frame.loc[["a","c"]]

# Label slices include BOTH endpoints: rows b, c and d.
frame.loc["b":"d"]

# A boolean list with one entry per row keeps the rows marked True.
frame.loc[[True,False,True,False,True]]

# A boolean Series works the same way: only rows where the condition holds.
frame.loc[frame["primes"] > 20]

# Shorthand for the previous selection.
frame[frame["primes"] > 20]

# .iloc is position-based and excludes the end of the slice:
# from position 2 up to, but not including, the last row.
frame.iloc[2:-1]

frame["primes"]

a    11
b    13
c    17
d    19
e    23
Name: primes, dtype: int64

# A list of names returns a DataFrame, even when it holds a single column.
frame[["primes"]]

# Several names select several columns.
frame[["primes","0-4"]]

# .get() returns the same selection; unlike [], it returns a default (None)
# instead of raising when the key is not found.
frame.get(["primes","0-4"])

# Attribute access works for column names that are valid Python identifiers.
frame.primes

a    11
b    13
c    17
d    19
e    23
Name: primes, dtype: int64

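# Attribute access only works for column names that are valid Python
# identifiers. "0-4" is not one, so the line below raises a SyntaxError;
# use frame["0-4"] instead.
#frame.0-4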

	Country Name	Country Code	Indicator Name	Indicator Code	1960	1961	1962	1963	1964	1965	...	2015	2016	2017	2018	2019	2020	2021	2022	2023	Unnamed: 68
0	Aruba	ABW	Refugee population by country or territory of ...	SM.POP.REFG	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1	Africa Eastern and Southern	AFE	Refugee population by country or territory of ...	SM.POP.REFG	NaN	NaN	NaN	NaN	NaN	NaN	...	3333273.0	3990478.0	5155400.0	5114399.0	5087755.0	5183533.0	5436720.0	5412266.0	5553759.0	NaN
2	Afghanistan	AFG	Refugee population by country or territory of ...	SM.POP.REFG	NaN	NaN	NaN	NaN	NaN	NaN	...	257553.0	59770.0	75927.0	72228.0	72227.0	72278.0	66949.0	52159.0	34826.0	NaN
3	Africa Western and Central	AFW	Refugee population by country or territory of ...	SM.POP.REFG	NaN	NaN	NaN	NaN	NaN	NaN	...	1138010.0	1200854.0	1172523.0	1285773.0	1315229.0	1474135.0	1631057.0	1702392.0	2296159.0	NaN
4	Angola	AGO	Refugee population by country or territory of ...	SM.POP.REFG	NaN	NaN	NaN	NaN	NaN	NaN	...	15547.0	15547.0	41119.0	39856.0	25793.0	25791.0	26045.0	25514.0	25174.0	NaN

	Country Name	Indicator Name	Indicator Code	1960	1961	1962	1963	1964	1965	1966	...	2015	2016	2017	2018	2019	2020	2021	2022	2023	Unnamed: 68
Country Code
ABW	Aruba	Refugee population by country or territory of ...	SM.POP.REFG	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
AFE	Africa Eastern and Southern	Refugee population by country or territory of ...	SM.POP.REFG	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	3333273.0	3990478.0	5155400.0	5114399.0	5087755.0	5183533.0	5436720.0	5412266.0	5553759.0	NaN
AFG	Afghanistan	Refugee population by country or territory of ...	SM.POP.REFG	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	257553.0	59770.0	75927.0	72228.0	72227.0	72278.0	66949.0	52159.0	34826.0	NaN
AFW	Africa Western and Central	Refugee population by country or territory of ...	SM.POP.REFG	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	1138010.0	1200854.0	1172523.0	1285773.0	1315229.0	1474135.0	1631057.0	1702392.0	2296159.0	NaN
AGO	Angola	Refugee population by country or territory of ...	SM.POP.REFG	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	15547.0	15547.0	41119.0	39856.0	25793.0	25791.0	26045.0	25514.0	25174.0	NaN

	Country Name	Indicator Name	Indicator Code	1960	1961	1962	1963	1964	1965	1966	...	2015	2016	2017	2018	2019	2020	2021	2022	2023	Unnamed: 68
Country Code
CHE	Switzerland	Refugee population by country or territory of ...	SM.POP.REFG	20000.0	20000.0	20000.0	20000.0	20000.0	20000.0	20500.0	...	73326.0	82668.0	93030.0	104011.0	110162.0	115798.0	118829.0	182474.0	192507.0	NaN
DEU	Germany	Refugee population by country or territory of ...	SM.POP.REFG	197000.0	190000.0	185000.0	182000.0	180000.0	180000.0	140000.0	...	316098.0	669468.0	970357.0	1063835.0	1146682.0	1210596.0	1255694.0	2075445.0	2593007.0	NaN

	1990	2000
Country Code
ABW	NaN	NaN
AFE	4709569.0	2444941.0
AFG	50.0	NaN
AFW	932052.0	968325.0
AGO	11557.0	12086.0

	1990	2000
Country Code
ABW	NaN	NaN
AFE	4709569.0	2444941.0
AFG	50.0	NaN
AFW	932052.0	968325.0
AGO	11557.0	12086.0

Pandas

Basic Data Structures

Series

DataFrame

Refugee Example

Loading and Accessing Data

Working with a Single Country

Removing Unwanted Data

Excluding Non-Countries

Excluding Columns

Loading Additional Files

Creating the Plot

Highlighting a Full Region

Fitting

Preparations

polyfit

curve_fit

leastsq

statsmodels

Appendix: Selecting from DataFrames

Accessing Rows

Accessing Columns

	Country Code	Region
0	ABW	Latin America & Caribbean
1	AFE	NaN
2	AFG	South Asia
3	AFW	NaN
4	AGO	Sub-Saharan Africa

	2018	2019	2020	2021	2022	2023
Country Code
ABW	30918.483584	31902.809818	24008.127822	29127.759384	33300.838819	NaN
AFG	492.090631	497.741431	512.055098	355.777826	352.603733	NaN
AGO	2540.508879	2191.347764	1450.905111	1927.474078	2933.484644	2309.521620
ALB	5287.660801	5396.214243	5343.037704	6377.203096	6810.114041	8367.775731
AND	42904.828456	41328.600498	37207.221999	42066.490518	42350.697069	46544.720720

	gdp	refugees
Country Code
JOR	4208.429706	0.274945
LBN	5843.601048	0.238846
PSE	3549.825962	0.488927

	gdp	refugees
Country Code
AUT	52072.030011	0.020003
DEU	49064.065027	0.018667
FRA	41708.479489	0.007363
ITA	35016.983603	0.003547