Setup¶

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from numpy.random import normal, exponential, randn, rand, random_sample

# Custom color palette
# NOTE(review): the first four entries coincide with the Okabe-Ito colour-blind-safe
# palette, whose reddish purple is #cc79a7 -- confirm that "#cc799c" is intentional.
from cycler import cycler
colors = ["#e69f00", "#56b4e9", "#009e73", "#d55e00", "#cc799c"]
# Install the palette as the default colour cycle for axes.
plt.rc("axes", prop_cycle=cycler("color", colors))

1D data¶

Histograms¶

# Draw 1000 standard-normal samples and histogram them, normalised to a density.
x = randn(1000)
bins = np.linspace(-10, 10)  # bin edges; linspace defaults to 50 points
plt.hist(x, bins=bins, density=True)
plt.show()

# The same sample through seaborn, saved to disk before being shown.
# NOTE(review): distplot is deprecated since seaborn 0.11 (histplot/displot are the
# replacements) -- confirm the seaborn version this notebook targets.
sns.distplot(x)
plt.savefig("figs/sns_distplot.png")  # savefig does not create figs/; it must exist
plt.show()

# Cumulative kernel density estimate of the sample, on the same x-range as the histogram.
sns.kdeplot(x, cumulative=True)
plt.xlim(-10, 10)
plt.show()

2D data¶

Line plotting¶

# Plot y = x**2 directly with matplotlib.
x = np.array([1, 2, 3, 4])
y = x**2
plt.plot(x, y)
plt.show()

# The same curve through DataFrame.plot, naming the x and y columns.
df = pd.DataFrame({"x": x, "y":y})
df.plot("x", "y")
plt.show()

Filling areas¶

# Register pandas' date converters with matplotlib before plotting the datetime index.
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

np.random.seed(33)  # fixed seed so the random walk below is reproducible
time = pd.date_range('2000-1-1', periods=150, freq='B')  # 150 business days
price = pd.Series(100+randn(150).cumsum(), index=time)  # cumulative sum of normal steps, offset by 100
# 20-period rolling mean and standard deviation of the price series.
avg = price.rolling(20).mean()
std = price.rolling(20).std()

plt.plot(price.index, price, 'k')
plt.plot(avg.index, avg, 'b')
# Shade the band of two rolling standard deviations around the rolling mean.
plt.fill_between(std.index, avg-2*std, avg+2*std, color='b', alpha=0.2)
plt.ylabel("Price")
plt.savefig("figs/plt_moving_average.png")
plt.show()

Errorbars¶

# Noisy sine curve: the noise is normal with standard deviation yerr, and the same
# yerr is drawn as the error bar on every point.
x = np.arange(0, 2*np.pi, 0.1)
yerr = 0.3
noise = yerr * np.random.randn(*x.shape)
y = np.sin(x) + noise
plt.errorbar(x, y, yerr=yerr, fmt="o")  # fmt="o": markers only, no connecting line
plt.savefig("figs/plt_errorbar.png")
plt.show()

Scatter plots¶

# Two point clouds sharing the same normal x-values: y is exponential,
# z is an exponential sample subtracted from 15.
x = randn(1000)
y = exponential(1, 1000)
z = 15 - exponential(1, 1000)

plt.scatter(x, y, label="y")
plt.scatter(x, z, label="z")
plt.legend()
# NOTE(review): unlike the other cells, no plt.show() follows the savefig here.
plt.savefig("figs/plt_scatter.png")

2D histogram¶

# Joint 2D histogram of a normal x against an exponential y.
x = randn(1000)
y = exponential(size=1000)
hist = plt.hist2d(x, y)  # return value is kept but not used again in this cell
plt.xlabel("x")
plt.ylabel("y")
plt.show()

Images¶

# Load an image file into an array and display it.
path = "figs/python.png"
img = plt.imread(path)
fig1 = plt.imshow(img)
plt.savefig("figs/plt_imshow.png")
plt.show()

# Uniform random values with the image's shape, set to 1 wherever the image exceeds 0.95.
# NOTE(review): presumably img holds floats in [0, 1] so 0.95 selects near-white pixels -- confirm.
data = rand(*img.shape)
data[img > 0.95] = 1
fig = plt.imshow(data)
plt.show()

Violin plots¶

# One violin per mean: 1000 normal samples with unit standard deviation for each mu,
# with each violin placed at its own mu on the x-axis.
mus = 0, 1.5, 2.2
data = [normal(mu, 1, 1000) for mu in mus]
plt.violinplot(data, positions=mus)
plt.xlabel(r"$\mu$")
plt.show()

Split violin plots¶

# Distribution of the tip relative to the bill for each day, split by sex.
tips = sns.load_dataset("tips")
tips["percent"] = tips.tip / tips.total_bill  # a fraction of the bill, despite the column name
# x/y/hue are passed by keyword: seaborn >= 0.12 only accepts them as keywords,
# and older releases accept keywords as well.
sns.violinplot(x="day", y="percent", hue="sex", data=tips, split=True,
               palette=["#e69f00", "#56b4e9"], saturation=1, linewidth=1)
plt.savefig("figs/sns_violin.png")
plt.show()

Geospatial plotting¶

Folium¶

import folium
# Interactive map centred on the given [latitude, longitude] pair.
m = folium.Map(location=[47.3686, 8.5391])
# A bare expression at the end of a notebook cell renders the map inline.
m

import pandas as pd


# Example data from the folium repository: state outlines (GeoJSON) and a CSV table.
# Both are fetched over the network when the cell runs.
url = 'https://raw.githubusercontent.com/python-visualization/folium/master/examples/data'
state_geo = f'{url}/us-states.json'
state_unemployment = f'{url}/US_Unemployment_Oct2012.csv'
state_data = pd.read_csv(state_unemployment)

m = folium.Map(location=[48, -102], zoom_start=3)

# Colour each GeoJSON feature by the 'Unemployment' column, matching the 'State'
# column of the table against the feature's `id` field (key_on).
folium.Choropleth(
    geo_data=state_geo,
    name='choropleth',
    data=state_data,
    columns=['State', 'Unemployment'],
    key_on='feature.id',
    fill_color='YlGn',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='Unemployment Rate (%)'
).add_to(m)

# Add the layer-control widget to the map.
folium.LayerControl().add_to(m)

m

Geopandas¶

import geopandas

geopandas.datasets.available

['naturalearth_cities', 'naturalearth_lowres', 'nybb']

world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))
world.head()

world.plot()

<matplotlib.axes._subplots.AxesSubplot at 0x7f991a529438>

world.columns

Index(['pop_est', 'continent', 'name', 'iso_a3', 'gdp_md_est', 'geometry'], dtype='object')

# World map coloured by GDP per capita.
fig = plt.figure(figsize=(20, 5))
ax = fig.gca()
# Drop rows without a positive population estimate and the row named Antarctica.
# The explicit copy makes the column assignment below write to an independent
# frame instead of a slice of the original (avoids SettingWithCopyWarning).
world = world[(world.pop_est>0) & (world.name!="Antarctica")].copy()
world['gdp_per_cap'] = world.gdp_md_est / world.pop_est
world.plot(column='gdp_per_cap', ax=ax, legend=True, cmap="OrRd")
plt.savefig("figs/geopandas_gdp_per_cap.png")
plt.show()

# Country outlines with the cities dataset drawn on the same axes.
fig = plt.figure(figsize=(20, 5))
ax = fig.gca()

cities = geopandas.read_file(geopandas.datasets.get_path('naturalearth_cities'))
world.plot(color='white', edgecolor='black', ax=ax)
cities.plot(ax=ax, marker='*', color='green', markersize=5)

<matplotlib.axes._subplots.AxesSubplot at 0x7f98d7a554e0>

3D data¶

Contour plots¶

import noise
# Wrap noise.pnoise2 with np.vectorize so it can be applied element-wise to arrays.
# NOTE(review): presumably pnoise2 is 2D Perlin noise taking scalar coordinates -- confirm.
pnoise2 = np.vectorize(noise.pnoise2)

# Evaluate the noise function on a regular grid over [-3, 3) x [-3, 3) with step 0.1.
x = np.arange(-3, 3, 0.1)
y = np.arange(-3, 3, 0.1)
X, Y = np.meshgrid(x, y)
z = pnoise2(X, Y)
# Contour lines only.
plt.contour(X, Y, z)
plt.savefig("figs/plt_contour.png")
plt.show()

# Filled contours with 20 levels and a diverging colormap.
plt.contourf(X, Y, z, 20, cmap='RdGy')
plt.savefig("figs/plt_contourf.png")
plt.show()

3D visualization¶

# NOTE(review): Axes3D is not referenced below; the import is presumably there to
# register the '3d' projection on older matplotlib releases -- confirm before removing.
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

cmap = plt.cm.viridis
# plot_trisurf is given flat 1D coordinate arrays, so flatten the grid (X, Y, z)
# computed in the contour cell.
x_, y_, z_ = X.flatten(), Y.flatten(), z.flatten()
surf = ax.plot_trisurf(x_, y_, z_, cmap=cmap)
plt.colorbar(surf)
plt.savefig("figs/plt_trisurf.png")
plt.show()

4D data¶

Scatterplot¶

# Four variables in one scatter plot: position (distance, orbital_period),
# colour (year) and marker size (mass).
planets = sns.load_dataset("planets")

cmap = sns.cubehelix_palette(rot=-.2, as_cmap=True)
ax = sns.scatterplot(x="distance", y="orbital_period",
                     hue="year", size="mass",
                     palette=cmap, sizes=(10, 200),
                     data=planets)
plt.savefig("figs/sns_scatterplot.png")
plt.show()

ND data¶

Parallel coordinates¶

from pandas.plotting import parallel_coordinates

# One line per row of the table, coloured by the class column "Name".
iris = pd.read_csv("data/iris.csv")
parallel_coordinates(iris, "Name")
plt.show()

Pairplot¶

iris.columns

Index(['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth', 'Name'], dtype='object')

# Pairwise scatter plots of all numeric columns, coloured by "Name",
# with kernel density estimates on the diagonal.
sns.pairplot(iris, diag_kind="kde", hue="Name")
plt.savefig("figs/sns_pairplot.png")
plt.show()

Combining and styling plots¶

Plotting styles¶

plt.style.available

['fast',
 'tableau-colorblind10',
 'seaborn-paper',
 'seaborn-deep',
 'grayscale',
 'seaborn-colorblind',
 'seaborn-poster',
 'Solarize_Light2',
 'seaborn-white',
 'fivethirtyeight',
 'seaborn',
 'seaborn-whitegrid',
 'seaborn-dark',
 'bmh',
 'dark_background',
 'seaborn-ticks',
 'seaborn-talk',
 '_classic_test',
 'seaborn-muted',
 'seaborn-darkgrid',
 'seaborn-dark-palette',
 'seaborn-notebook',
 'seaborn-bright',
 'seaborn-pastel',
 'ggplot',
 'classic',
 'LHCb']

# Apply the 'ggplot' style only inside this block; the global style is left unchanged.
# x and y are whatever an earlier cell left bound (the contour grid axes when the
# notebook is run top to bottom).
with plt.style.context('ggplot'):
    plt.plot(x, y)
    plt.savefig("figs/plt_line_ggplot.pdf")
    plt.show()

# plt.style.use("LHCb")
# with plt.style.context('LHCb'):
#     plt.plot(x, y)
#     plt.savefig("figs/plt_line_lhcb.pdf")
#     plt.show()

Subplots¶

# Data shared by the subplot examples below: a seeded random walk y over x,
# and its step-to-step differences d.
np.random.seed(42)
x = np.arange(0, 10, 0.01)
y = np.random.randn(len(x)).cumsum()
d = np.diff(y)

plt.subplot?

# State-machine interface: select the first cell of a 2x2 grid and plot the walk.
plt.subplot(2, 2, 1)
plt.plot(x, y)

# "224" is shorthand for (2, 2, 4): the fourth cell of the same grid.
plt.subplot(224)
plt.hist(d, bins=20, density=True)

plt.savefig("figs/plt_subplot.png")
plt.show()

plt.subplots?

# Object-oriented equivalent: create the whole 2x2 grid at once and index into it.
# The two axes that are never drawn on stay visible as empty frames.
fig, axes = plt.subplots(2, 2)
axes[0,0].plot(x, y)
axes[1,1].hist(d, bins=20, density=True)
plt.savefig("figs/plt_subplots.png")
plt.show()

Picture-in-Picture¶

# Main plot, then a small inset axes placed on top of it.
plt.plot(x, y)
# [left, bottom, width, height] in figure-fraction coordinates; the new axes becomes current.
plt.axes([0.2, .6, .2, .2])
plt.hist(d, bins=20, density=True)
# Remove the inset's tick marks and labels.
plt.xticks([])
plt.yticks([])
plt.savefig("figs/plt_pip.png")
plt.show()

Multiple axes¶

# Two curves with different scales sharing one x-axis.
plt.figure()
x = np.linspace(-5, 5)
y = 2*x + 3
y2 = x**2

ax1 = plt.gca()
ax1.plot(x, y)
ax1.set_ylabel("Linear")
# twinx() creates a second axes sharing the x-axis, with its own y-axis on the right.
ax2 = ax1.twinx()
ax2.plot(x, y2)
ax2.set_ylabel("Quadratic")

plt.savefig("figs/plt_twinx.png")
plt.show()

Networks and Natural Language Processing (NLP)¶

import networkx as nx
import nltk
from nltk.util import ngrams
import matplotlib

from collections import Counter
from operator import itemgetter

# Draw one of networkx's built-in example graphs with default layout and styling.
G = nx.cubical_graph()
nx.draw(G)
plt.savefig("figs/networkx_cubical.png")
plt.show()

/usr/local/lib/python3.6/dist-packages/networkx/drawing/nx_pylab.py:579: MatplotlibDeprecationWarning: 
The iterable function was deprecated in Matplotlib 3.1 and will be removed in 3.3. Use np.iterable instead.
  if not cb.iterable(width):

# Punctuation-like tokens that get_words drops from the token stream.
ignore = {",", ".", "``", "''", "'", "'s", "?", "!", "-", "--", "...", ";"}

def get_words(file_name, encoding="utf-8"):
    """Yield the lower-cased tokens of a text file, skipping ignored tokens.

    The file is read line by line; every non-blank line is tokenised with
    ``nltk.word_tokenize``. Tokens whose lower-cased form is in the
    module-level ``ignore`` set are dropped.

    Args:
        file_name: Path of the text file to read.
        encoding: Text encoding used to open the file.

    Yields:
        str: One lower-cased token at a time, in file order.
    """
    with open(file_name, encoding=encoding) as f:
        for line in f:
            # Lines read from a file keep their trailing newline, so a blank
            # line is only falsy once it has been stripped.
            line = line.strip()
            if not line:
                continue
            for word in nltk.word_tokenize(line):
                # Lower-case before the lookup: the entries of `ignore` are
                # lower case, so e.g. "'S" is filtered just like "'s".
                word = word.lower()
                if word not in ignore:
                    yield word

# Tokenise the whole book once and keep the tokens as a list.
words = list(get_words("data/Harry Potter and the Sorcerer.txt", 'iso-8859-1'))

counter = Counter(words)
# Split the 15 most common (word, count) pairs into a tuple of words and a tuple of counts.
w, c = zip(*counter.most_common(15))
# Convert the counts to a percentage of all tokens.
c = np.array(c) / len(words) * 100.

plt.plot(w, c)
plt.ylabel("Relative frequency [%]")
plt.show()

# Re-tokenise the book, keeping only words longer than three characters, and recount.
words = [word
         for word in get_words("data/Harry Potter and the Sorcerer.txt", 'iso-8859-1')
         if len(word) > 3]
counter = Counter(words)

# One node per distinct word, storing its frequency as the "size" attribute.
g = nx.Graph()
g.add_nodes_from((word, {"size": freq}) for word, freq in counter.items())

# Count adjacent word pairs irrespective of order: each pair is sorted into a
# canonical tuple before counting.
digrams = Counter(tuple(sorted(pair)) for pair in ngrams(words, 2)).items()
# Edge weight = pair count divided by the count of the pair's first word in sort
# order (pair[0]), i.e. the normalisation is not symmetric in the two words.
g.add_edges_from((*pair, {"weight": freq / counter[pair[0]]}) for pair, freq in digrams)

def filter_edges(g, most_common_nodes, cutoff=0.5):
    """Lazily select the edges of ``g`` that link retained nodes strongly enough.

    Args:
        g: Graph-like object exposing ``edges`` and ``get_edge_data(*edge)``;
            the data of every examined edge must contain a ``"weight"`` entry.
        most_common_nodes: Iterable of the nodes to retain.
        cutoff: Edges whose weight is below this value are dropped.

    Yields:
        Each edge of ``g.edges`` whose endpoints are all retained and whose
        weight is not below ``cutoff``, in ``g.edges`` order.
    """
    retained = set(most_common_nodes)
    for candidate in g.edges:
        # Endpoint test first: the weight is only looked up for edges that
        # run between retained nodes.
        if retained.issuperset(candidate) and not g.get_edge_data(*candidate)["weight"] < cutoff:
            yield candidate

# Restrict the word graph to the 100 most frequent words and the edges between them.
# cutoff=0. keeps every such edge regardless of weight.
most_common_nodes = list(map(itemgetter(0), counter.most_common(100)))
edges = list(filter_edges(g, most_common_nodes, cutoff=0.))

# Copy the selected nodes and edges, with their attribute dicts, into a new graph.
# Graph.nodes[...] is used because the Graph.node alias was removed in networkx 2.4.
g2 = nx.Graph()
g2.add_nodes_from((node, g.nodes[node]) for node in most_common_nodes)
g2.add_edges_from((*edge, g.get_edge_data(*edge)) for edge in edges)

plt.figure(figsize=(20, 20))
layout = nx.spring_layout(g2, k=0.1, iterations=50)
# Node area scales with word frequency; edge colour encodes the edge weight.
sizes = [g2.nodes[node]["size"] * 5 for node in g2]
edge_weights = np.array([g2.get_edge_data(*edge)["weight"] for edge in g2.edges])
nx.draw(g2, pos=layout,
        with_labels=True, node_size=sizes, node_color="#cccccc",
        edge_cmap=plt.cm.Blues, edge_color=edge_weights, edge_vmin=edge_weights.min(), edge_vmax=edge_weights.max())

/usr/local/lib/python3.6/dist-packages/networkx/drawing/nx_pylab.py:579: MatplotlibDeprecationWarning: 
The iterable function was deprecated in Matplotlib 3.1 and will be removed in 3.3. Use np.iterable instead.
  if not cb.iterable(width):
/usr/local/lib/python3.6/dist-packages/networkx/drawing/nx_pylab.py:585: MatplotlibDeprecationWarning: 
The iterable function was deprecated in Matplotlib 3.1 and will be removed in 3.3. Use np.iterable instead.
  and cb.iterable(edge_color) \
/usr/local/lib/python3.6/dist-packages/networkx/drawing/nx_pylab.py:595: MatplotlibDeprecationWarning: 
The iterable function was deprecated in Matplotlib 3.1 and will be removed in 3.3. Use np.iterable instead.
  for c in edge_color]):

# Distribution of the edge weights on a natural-log scale.
plt.hist(np.log(edge_weights))
plt.show()

	pop_est	continent	name	iso_a3	gdp_md_est	geometry
0	920938	Oceania	Fiji	FJI	8374.0	(POLYGON ((180 -16.06713266364245, 180 -16.555...
1	53950935	Africa	Tanzania	TZA	150600.0	POLYGON ((33.90371119710453 -0.950000000000000...
2	603253	Africa	W. Sahara	ESH	906.5	POLYGON ((-8.665589565454809 27.65642588959236...
3	35623680	North America	Canada	CAN	1674000.0	(POLYGON ((-122.84 49.00000000000011, -122.974...
4	326625791	North America	United States of America	USA	18560000.0	(POLYGON ((-122.84 49.00000000000011, -120 49....