Setup

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from numpy.random import normal, exponential, randn, rand, random_sample
In [2]:
# Custom color palette
from cycler import cycler
# Five hex colours that become the default colour cycle of every axes.
colors = ["#e69f00", "#56b4e9", "#009e73", "#d55e00", "#cc799c"]
# Install the cycle globally through matplotlib's rc settings.
plt.rc("axes", prop_cycle=cycler("color", colors))

1D data

Histograms

In [3]:
# Draw 1000 samples with randn and histogram them on fixed bin edges.
x = randn(1000)
# linspace is called without `num`, so the number of edges is numpy's default.
bins = np.linspace(-10, 10)
# density=True asks for a normalised histogram instead of raw counts.
plt.hist(x, bins=bins, density=True)
plt.show()
In [4]:
# Seaborn distribution plot of the sample `x` from the histogram cell.
# NOTE(review): distplot is flagged as deprecated in newer seaborn releases —
# confirm against the installed version before relying on it.
sns.distplot(x)
# The figure is written to disk before it is shown.
plt.savefig("figs/sns_distplot.png")
plt.show()
In [5]:
# Cumulative kernel density estimate of `x`, with the x-axis limited to [-10, 10].
sns.kdeplot(x, cumulative=True)
plt.xlim(-10, 10)
plt.show()

2D data

Line plotting

In [6]:
# The same four points plotted twice: first with pyplot, then through pandas.
x = np.array([1, 2, 3, 4])
y = np.square(x)
plt.plot(x, y)
plt.show()

# DataFrame.plot takes the column names for the horizontal and vertical axes.
df = pd.DataFrame(dict(x=x, y=y))
df.plot(x="x", y="y")
plt.show()

Filling areas

In [7]:
from pandas.plotting import register_matplotlib_converters
# Register pandas' converters with matplotlib before plotting the datetime index.
register_matplotlib_converters()

# Fixed seed so the random walk below is reproducible.
np.random.seed(33)
# 150 timestamps with business-day frequency, starting 2000-01-01.
time = pd.date_range('2000-1-1', periods=150, freq='B')
# Synthetic price: cumulative sum of randn steps, offset by 100.
price = pd.Series(100+randn(150).cumsum(), index=time)
# 20-period rolling mean and standard deviation of the price.
avg = price.rolling(20).mean()
std = price.rolling(20).std()

plt.plot(price.index, price, 'k')
plt.plot(avg.index, avg, 'b')
# Shade the band from avg - 2*std to avg + 2*std around the rolling mean.
plt.fill_between(std.index, avg-2*std, avg+2*std, color='b', alpha=0.2)
plt.ylabel("Price")
plt.savefig("figs/plt_moving_average.png")
plt.show()

Errorbars

In [8]:
# Noisy sine wave drawn with constant-size error bars.
x = np.arange(0, 2*np.pi, 0.1)
yerr = 0.3
# The noise is scaled by yerr, so the drawn error bars use the same scale as the noise.
noise = yerr * np.random.randn(*x.shape)
y = np.sin(x) + noise
# fmt="o" selects circle markers for the data points.
plt.errorbar(x, y, yerr=yerr, fmt="o")
plt.savefig("figs/plt_errorbar.png")
plt.show()

Scatter plots

In [9]:
# Two point clouds that share the same x coordinates.
x = randn(1000)
y = exponential(scale=1, size=1000)
z = 15 - exponential(scale=1, size=1000)

# Each series gets its own label so the legend can tell them apart.
plt.scatter(x=x, y=y, label="y")
plt.scatter(x=x, y=z, label="z")
plt.legend()
plt.savefig("figs/plt_scatter.png")

2D histogram

In [10]:
# 2D histogram of randn samples (x) against exponential samples (y).
x = randn(1000)
y = exponential(size=1000)
# The return value is kept in `hist` but is not used again in this cell.
hist = plt.hist2d(x, y)
plt.xlabel("x")
plt.ylabel("y")
plt.show()

Images

In [11]:
# Load an image from disk, display it and save the rendered figure.
path = "figs/python.png"
img = plt.imread(path)
# NOTE: `fig1` (and `fig` below) hold whatever imshow returns, not a Figure created here.
fig1 = plt.imshow(img)
plt.savefig("figs/plt_imshow.png")
plt.show()

# Random array with the image's shape; entries where the image value exceeds 0.95 are set to 1.
data = rand(*img.shape)
data[img > 0.95] = 1
fig = plt.imshow(data)
plt.show()

Violin plots

In [12]:
# One sample of 1000 normal draws per mean; each violin is positioned at its mean.
mus = (0, 1.5, 2.2)
data = [normal(loc=mu, scale=1, size=1000) for mu in mus]
plt.violinplot(dataset=data, positions=mus)
plt.xlabel(r"$\mu$")
plt.show()

Split violin plots

In [14]:
# Split violin plot: tip as a percentage of the total bill per day, split by sex.
tips = sns.load_dataset("tips")
# Scale the ratio by 100 so the "percent" column (and its axis label) really
# holds percentages rather than fractions.
tips["percent"] = 100 * tips.tip / tips.total_bill
# x / y / hue are passed by keyword: newer seaborn releases reject positional
# data variables, while the keyword form works on old and new versions alike.
sns.violinplot(x="day", y="percent", hue="sex", data=tips, split=True,
               palette=["#e69f00", "#56b4e9"], saturation=1, linewidth=1)
plt.savefig("figs/sns_violin.png")
plt.show()

Geospatial plotting

Folium

In [56]:
import folium
# Interactive map centred on the given location.
m = folium.Map(location=[47.3686, 8.5391])
# Bare last expression so the notebook renders the map as the cell output.
m
Out[56]:
In [16]:
import pandas as pd


# Base URL of the example data in the folium GitHub repository.
url = 'https://raw.githubusercontent.com/python-visualization/folium/master/examples/data'
state_geo = f'{url}/us-states.json'
state_unemployment = f'{url}/US_Unemployment_Oct2012.csv'
# The CSV is fetched and parsed by pandas; the GeoJSON URL is handed to folium as a string.
state_data = pd.read_csv(state_unemployment)

m = folium.Map(location=[48, -102], zoom_start=3)

# Colour the regions by the 'Unemployment' column; 'State' values are matched
# against 'feature.id' in the GeoJSON.
folium.Choropleth(
    geo_data=state_geo,
    name='choropleth',
    data=state_data,
    columns=['State', 'Unemployment'],
    key_on='feature.id',
    fill_color='YlGn',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='Unemployment Rate (%)'
).add_to(m)

# Add a layer control widget to the map.
folium.LayerControl().add_to(m)

# Bare last expression so the notebook renders the map.
m
Out[16]:

Geopandas

In [17]:
import geopandas
In [18]:
# List the names of the example datasets that geopandas exposes.
geopandas.datasets.available
Out[18]:
['naturalearth_cities', 'naturalearth_lowres', 'nybb']
In [19]:
# Read the 'naturalearth_lowres' example dataset and preview its first rows.
world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))
world.head()
Out[19]:
pop_est continent name iso_a3 gdp_md_est geometry
0 920938 Oceania Fiji FJI 8374.0 (POLYGON ((180 -16.06713266364245, 180 -16.555...
1 53950935 Africa Tanzania TZA 150600.0 POLYGON ((33.90371119710453 -0.950000000000000...
2 603253 Africa W. Sahara ESH 906.5 POLYGON ((-8.665589565454809 27.65642588959236...
3 35623680 North America Canada CAN 1674000.0 (POLYGON ((-122.84 49.00000000000011, -122.974...
4 326625791 North America United States of America USA 18560000.0 (POLYGON ((-122.84 49.00000000000011, -120 49....
In [20]:
# Default plot of the world geometries (no styling arguments).
world.plot()
Out[20]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f991a529438>
In [21]:
# Inspect the available column names before deriving new ones.
world.columns
Out[21]:
Index(['pop_est', 'continent', 'name', 'iso_a3', 'gdp_md_est', 'geometry'], dtype='object')
In [57]:
# World map coloured by GDP per capita.
fig = plt.figure(figsize=(20, 5))
ax = fig.gca()
# Drop rows with non-positive population (avoids division by zero) and Antarctica.
# .copy() makes the filtered frame independent, so adding a column below does
# not write into a slice of the original frame (SettingWithCopyWarning).
world = world[(world.pop_est>0) & (world.name!="Antarctica")].copy()
world['gdp_per_cap'] = world.gdp_md_est / world.pop_est
# Draw on the wide axes created above and attach a legend for the colour scale.
world.plot(column='gdp_per_cap', ax=ax, legend=True, cmap="OrRd")
plt.savefig("figs/geopandas_gdp_per_cap.png")
plt.show()
In [23]:
fig = plt.figure(figsize=(20, 5))
ax = fig.gca()

cities = geopandas.read_file(geopandas.datasets.get_path('naturalearth_cities'))
# Draw the country outlines first, then the cities as green stars on the same axes.
world.plot(color='white', edgecolor='black', ax=ax)
cities.plot(ax=ax, marker='*', color='green', markersize=5)
Out[23]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f98d7a554e0>

3D data

Contour plots

In [24]:
# NOTE: this import rebinds the name `noise`, which the errorbar cell earlier used for an array.
import noise
# Wrap noise.pnoise2 with np.vectorize so it can be called on whole arrays.
pnoise2 = np.vectorize(noise.pnoise2)
In [25]:
# Evaluate the vectorised noise function on a regular grid and draw contour lines.
x = np.arange(-3, 3, 0.1)
y = np.arange(-3, 3, 0.1)
# X and Y are the coordinate grids built from the two axes.
X, Y = np.meshgrid(x, y)
z = pnoise2(X, Y)
plt.contour(X, Y, z)
plt.savefig("figs/plt_contour.png")
plt.show()
In [26]:
# Filled contours of the grid values from the previous cell (X, Y, z), using the 'RdGy' colormap.
plt.contourf(X, Y, z, 20, cmap='RdGy')
plt.savefig("figs/plt_contourf.png")
plt.show()

3D visualization

In [27]:
# NOTE(review): Axes3D is not referenced by name below — presumably the import
# registers the '3d' projection on older matplotlib; confirm before removing.
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

cmap = plt.cm.viridis
# Flatten the 2-D grid arrays to 1-D before passing them to plot_trisurf.
x_, y_, z_ = X.flatten(), Y.flatten(), z.flatten()
surf = ax.plot_trisurf(x_, y_, z_, cmap=cmap)
# The colour bar is built from the surface returned by plot_trisurf.
plt.colorbar(surf)
plt.savefig("figs/plt_trisurf.png")
plt.show()

4D data

Scatterplot

In [28]:
# Four variables in one scatter plot: position (x, y), colour (hue) and marker size.
planets = sns.load_dataset("planets")

cmap = sns.cubehelix_palette(rot=-.2, as_cmap=True)
# sizes=(10, 200) is the marker-size range used for the "mass" variable.
ax = sns.scatterplot(x="distance", y="orbital_period",
                     hue="year", size="mass",
                     palette=cmap, sizes=(10, 200),
                     data=planets)
plt.savefig("figs/sns_scatterplot.png")
plt.show()

ND data

Parallel coordinates

In [29]:
from pandas.plotting import parallel_coordinates

# Load the iris table from the local data directory.
iris = pd.read_csv("data/iris.csv")
# "Name" is passed as the class column.
parallel_coordinates(iris, "Name")
plt.show()

Pairplot

In [64]:
# Inspect the column names of the iris table.
iris.columns
Out[64]:
Index(['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth', 'Name'], dtype='object')
In [30]:
# Pairwise plots of the iris columns, coloured by "Name"; diag_kind="kde" selects KDE plots on the diagonal.
sns.pairplot(iris, diag_kind="kde", hue="Name")
plt.savefig("figs/sns_pairplot.png")
plt.show()

Combining and styling plots

Plotting styles

In [31]:
# List the style names that can be passed to plt.style.use / plt.style.context.
plt.style.available
Out[31]:
['fast',
 'tableau-colorblind10',
 'seaborn-paper',
 'seaborn-deep',
 'grayscale',
 'seaborn-colorblind',
 'seaborn-poster',
 'Solarize_Light2',
 'seaborn-white',
 'fivethirtyeight',
 'seaborn',
 'seaborn-whitegrid',
 'seaborn-dark',
 'bmh',
 'dark_background',
 'seaborn-ticks',
 'seaborn-talk',
 '_classic_test',
 'seaborn-muted',
 'seaborn-darkgrid',
 'seaborn-dark-palette',
 'seaborn-notebook',
 'seaborn-bright',
 'seaborn-pastel',
 'ggplot',
 'classic',
 'LHCb']
In [32]:
# Apply the 'ggplot' style only inside this block.
# NOTE(review): `x` and `y` are not defined in this cell; they are whatever an
# earlier cell left in the kernel (presumably the contour grid axes) — confirm.
with plt.style.context('ggplot'):
    plt.plot(x, y)
    plt.savefig("figs/plt_line_ggplot.pdf")
    plt.show()
In [33]:
# plt.style.use("LHCb")
# with plt.style.context('LHCb'):
#     plt.plot(x, y)
#     plt.savefig("figs/plt_line_lhcb.pdf")
#     plt.show()

Subplots

In [62]:
# Shared data for the subplot examples: a seeded random walk and its increments.
np.random.seed(42)
x = np.arange(0, 10, 0.01)
# One random step per x value, accumulated into the walk.
y = np.cumsum(np.random.randn(x.size))
# First differences of the walk (one element shorter than y).
d = y[1:] - y[:-1]
In [59]:
# IPython help query for plt.subplot (interactive aid; it does not draw anything).
plt.subplot?
In [35]:
# Position 1 of a 2x2 grid: the random walk.
plt.subplot(2, 2, 1)
plt.plot(x, y)

# Position 4 of the same grid, written in the three-digit shorthand: histogram of the increments.
plt.subplot(224)
plt.hist(d, bins=20, density=True)

plt.savefig("figs/plt_subplot.png")
plt.show()
In [60]:
# IPython help query for plt.subplots (interactive aid; it does not draw anything).
plt.subplots?
In [36]:
# Create the 2x2 grid up front; `axes` is indexed by [row, column].
fig, axes = plt.subplots(2, 2)
# Only two of the four axes are drawn on: [0, 0] and [1, 1].
axes[0,0].plot(x, y)
axes[1,1].hist(d, bins=20, density=True)
plt.savefig("figs/plt_subplots.png")
plt.show()

Picture-in-picture

In [63]:
# Main plot: the random walk.
plt.plot(x, y)
# Add a second, small axes on the same figure; the list gives its position and size.
plt.axes([0.2, .6, .2, .2])
plt.hist(d, bins=20, density=True)
# Empty tick lists remove the ticks from the inset.
plt.xticks([])
plt.yticks([])
plt.savefig("figs/plt_pip.png")
plt.show()

Multiple axes

In [38]:
# Two curves with separate y-axes on one plot.
plt.figure()
x = np.linspace(-5, 5)
y = 2*x + 3
y2 = x**2

ax1 = plt.gca()
ax1.plot(x, y)
ax1.set_ylabel("Linear")
# twinx() gives a second axes for the quadratic curve, with its own y label.
ax2 = ax1.twinx()
ax2.plot(x, y2)
ax2.set_ylabel("Quadratic")

plt.savefig("figs/plt_twinx.png")
plt.show()

Networks and Natural Language Processing (NLP)

In [39]:
import networkx as nx
import nltk
from nltk.util import ngrams
import matplotlib

from collections import Counter
from operator import itemgetter
In [58]:
# Draw one of networkx's built-in example graphs with the default layout and styling.
G = nx.cubical_graph()
nx.draw(G)
plt.savefig("figs/networkx_cubical.png")
plt.show()
/usr/local/lib/python3.6/dist-packages/networkx/drawing/nx_pylab.py:579: MatplotlibDeprecationWarning: 
The iterable function was deprecated in Matplotlib 3.1 and will be removed in 3.3. Use np.iterable instead.
  if not cb.iterable(width):
In [41]:
# Tokens that are dropped from the word stream (punctuation and quote marks).
ignore = {",", ".", "``", "''", "'", "'s", "?", "!", "-", "--", "...", ";"}

def get_words(file_name, encoding="utf-8"):
    """Yield the lower-cased word tokens of a text file, one at a time.

    Parameters
    ----------
    file_name : str
        Path of the text file to read.
    encoding : str, optional
        Text encoding used to open the file (default ``"utf-8"``).

    Yields
    ------
    str
        Each token produced by ``nltk.word_tokenize`` that is not in
        ``ignore``, converted to lower case.
    """
    with open(file_name, encoding=encoding) as f:
        for line in f:
            # Lines read from a file keep their trailing newline, so a blank
            # line is never falsy until it has been stripped.
            line = line.strip()
            if not line:
                continue
            for word in nltk.word_tokenize(line):
                if word not in ignore:
                    yield word.lower()
In [42]:
# Materialise every token of the book into a list; the file is opened with the 'iso-8859-1' encoding.
words = list(get_words("data/Harry Potter and the Sorcerer.txt", 'iso-8859-1'))
In [43]:
# Count every token, then split the 15 most common (word, count) pairs into two tuples.
counter = Counter(words)
w, c = zip(*counter.most_common(15))
# Convert the counts into a percentage of all tokens.
c = np.array(c) / len(words) * 100.
In [44]:
# Relative frequency of the 15 most common tokens, in most-common-first order.
plt.plot(w, c)
plt.ylabel("Relative frequency [%]")
plt.show()
In [45]:
# Re-read the book, keeping only tokens longer than three characters, and recount.
words = [
    word
    for word in get_words("data/Harry Potter and the Sorcerer.txt", 'iso-8859-1')
    if len(word) > 3
]
counter = Counter(words)
In [47]:
g = nx.Graph()
# One node per distinct word, with its occurrence count stored as the "size" attribute.
g.add_nodes_from((word, {"size": freq}) for word, freq in counter.items())

# Count 2-grams of the word list; sorting each pair makes the count independent of word order.
digrams = Counter(tuple(sorted(x)) for x in ngrams(words, 2)).items()
# Edge weight = pair count divided by the count of the first word of the sorted pair.
g.add_edges_from((*pair, {"weight": freq / counter[pair[0]]}) for pair, freq in digrams)
In [48]:
def filter_edges(g, most_common_nodes, cutoff=0.5):
    """Yield the edges of ``g`` that link two selected nodes with enough weight.

    An edge is yielded when every one of its endpoints is contained in
    ``most_common_nodes`` and its ``"weight"`` attribute is not below ``cutoff``.
    """
    selected = set(most_common_nodes)
    for edge in g.edges:
        endpoints_selected = all(node in selected for node in edge)
        # The weight is only looked up for edges whose endpoints were all selected.
        if endpoints_selected and not g.get_edge_data(*edge)["weight"] < cutoff:
            yield edge
In [49]:
# The 100 most frequent words, and every edge of g that joins two of them.
most_common_nodes = [pair[0] for pair in counter.most_common(100)]
# cutoff=0. is passed as the minimum edge weight.
edges = [*filter_edges(g, most_common_nodes, cutoff=0.)]
In [50]:
# Build a smaller graph from the most common words and the edges selected between them.
g2 = nx.Graph()
# Use the `nodes` view: the older `Graph.node` alias is deprecated and absent
# from newer networkx releases, while `nodes` works across networkx 2.x.
g2.add_nodes_from((node, g.nodes[node]) for node in most_common_nodes)
# Each edge keeps the attribute dict it has in the full graph.
g2.add_edges_from((*edge, g.get_edge_data(*edge)) for edge in edges)
In [51]:
# Draw the word graph: node area follows word frequency, edge colour follows edge weight.
plt.figure(figsize=(20, 20))
layout = nx.spring_layout(g2, k=0.1, iterations=50)
# Use the `nodes` view: the older `Graph.node` alias is deprecated and absent
# from newer networkx releases.
sizes = [g2.nodes[node]["size"] * 5 for node in g2]
# Weights are collected in g2.edges order and passed to nx.draw as the edge colours.
edge_weights = np.array([g2.get_edge_data(*edge)["weight"] for edge in g2.edges])
nx.draw(g2, pos=layout,
        with_labels=True, node_size=sizes, node_color="#cccccc",
        edge_cmap=plt.cm.Blues, edge_color=edge_weights, edge_vmin=edge_weights.min(), edge_vmax=edge_weights.max())
/usr/local/lib/python3.6/dist-packages/networkx/drawing/nx_pylab.py:579: MatplotlibDeprecationWarning: 
The iterable function was deprecated in Matplotlib 3.1 and will be removed in 3.3. Use np.iterable instead.
  if not cb.iterable(width):
/usr/local/lib/python3.6/dist-packages/networkx/drawing/nx_pylab.py:585: MatplotlibDeprecationWarning: 
The iterable function was deprecated in Matplotlib 3.1 and will be removed in 3.3. Use np.iterable instead.
  and cb.iterable(edge_color) \
/usr/local/lib/python3.6/dist-packages/networkx/drawing/nx_pylab.py:595: MatplotlibDeprecationWarning: 
The iterable function was deprecated in Matplotlib 3.1 and will be removed in 3.3. Use np.iterable instead.
  for c in edge_color]):
In [52]:
# Histogram of the natural logarithm of the edge weights computed in the drawing cell.
plt.hist(np.log(edge_weights))
plt.show()