import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from numpy.random import normal, exponential, randn, rand, random_sample
# Custom color palette: install these five hex colors as matplotlib's default
# axes color cycle, so successive plot calls on an axes pick them in order.
from cycler import cycler
colors = ["#e69f00", "#56b4e9", "#009e73", "#d55e00", "#cc799c"]
plt.rc("axes", prop_cycle=cycler("color", colors))
# Draw 1000 standard-normal samples and show their distribution three ways.
x = randn(1000)
# np.linspace defaults to 50 points, so this gives 49 equal-width bins
# spanning [-10, 10].
bins = np.linspace(-10, 10)
# 1) Plain matplotlib histogram, normalized so the bars integrate to 1.
plt.hist(x, bins=bins, density=True)
plt.show()
# 2) seaborn's combined histogram + kernel density estimate.
# NOTE(review): distplot is deprecated in newer seaborn releases (histplot /
# displot replace it) — confirm which seaborn version this file targets
# before migrating.
sns.distplot(x)
plt.savefig("figs/sns_distplot.png")
plt.show()
# 3) Cumulative kernel density estimate, shown on the same x range as the
# first histogram.
sns.kdeplot(x, cumulative=True)
plt.xlim(-10, 10)
plt.show()
# The same four-point parabola plotted twice: first through pyplot directly,
# then through the pandas DataFrame plotting wrapper.
x = np.array([1, 2, 3, 4])
y = x**2
plt.plot(x, y)
plt.show()
df = pd.DataFrame({"x": x, "y":y})
# Positional arguments are the column names used for the x and y axes.
df.plot("x", "y")
plt.show()
# Time-series example: a random-walk "price" with a 20-period rolling mean
# and a +/- 2 standard deviation band around it.
# Register pandas' date converters with matplotlib so the DatetimeIndex can
# be passed to plt.plot directly.
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
# Fix the global NumPy seed so the random walk is reproducible.
np.random.seed(33)
# 150 consecutive business days starting at 2000-01-01.
time = pd.date_range('2000-1-1', periods=150, freq='B')
# Cumulative sum of standard-normal steps, offset by 100.
price = pd.Series(100+randn(150).cumsum(), index=time)
# Rolling statistics over a 20-sample window; the first 19 entries are NaN
# because the window is not yet full.
avg = price.rolling(20).mean()
std = price.rolling(20).std()
plt.plot(price.index, price, 'k')
plt.plot(avg.index, avg, 'b')
# Shaded band: rolling mean +/- two rolling standard deviations.
plt.fill_between(std.index, avg-2*std, avg+2*std, color='b', alpha=0.2)
plt.ylabel("Price")
plt.savefig("figs/plt_moving_average.png")
plt.show()
# Noisy sine curve drawn with constant-size error bars.
x = np.arange(0, 2*np.pi, 0.1)
# Used both as the noise scale and as the error-bar half-length.
yerr = 0.3
# Gaussian noise with standard deviation yerr, one value per x sample.
noise = yerr * np.random.randn(*x.shape)
y = np.sin(x) + noise
# fmt="o" draws markers only, without a connecting line.
plt.errorbar(x, y, yerr=yerr, fmt="o")
plt.savefig("figs/plt_errorbar.png")
plt.show()
# Scatter plot: two exponential-based samples against a shared normal x.
x = randn(1000)
y = exponential(1, 1000)
# z is an exponential sample reflected so that it lies below 15.
z = 15 - exponential(1, 1000)
plt.scatter(x, y, label="y")
plt.scatter(x, z, label="z")
plt.legend()
plt.savefig("figs/plt_scatter.png")
# Finish this figure before starting the next plot; without this call the
# 2-D histogram below is drawn into the same axes as the scatter when the
# file runs as a plain script.
plt.show()
# 2-D histogram of a fresh normal/exponential sample pair.
x = randn(1000)
y = exponential(size=1000)
hist = plt.hist2d(x, y)
plt.xlabel("x")
plt.ylabel("y")
plt.show()
# Load an image file into an array and display it.
path = "figs/python.png"
img = plt.imread(path)
# NOTE: imshow returns the image artist (AxesImage), not a Figure, despite
# the variable names used here.
fig1 = plt.imshow(img)
plt.savefig("figs/plt_imshow.png")
plt.show()
# Uniform random array with the image's shape; every element where the image
# value exceeds 0.95 is forced to 1, then the result is displayed.
data = rand(*img.shape)
data[img > 0.95] = 1
fig = plt.imshow(data)
plt.show()
# Violin plot of three unit-variance normal samples, each violin placed on
# the x axis at its own mean.
mus = 0, 1.5, 2.2
# One sample of 1000 draws per mean.
data = [normal(mu, 1, 1000) for mu in mus]
plt.violinplot(data, positions=mus)
plt.xlabel(r"$\mu$")
plt.show()
# Split violin plot of the tip fraction per day, separated by sex, using
# seaborn's example "tips" dataset.
tips = sns.load_dataset("tips")
# Tip as a fraction of the total bill.
tips["percent"] = tips.tip / tips.total_bill
# x / y / hue are passed by keyword: newer seaborn releases no longer accept
# them positionally, and the keyword form works on older releases too.
sns.violinplot(x="day", y="percent", hue="sex", data=tips, split=True,
               palette=["#e69f00", "#56b4e9"], saturation=1, linewidth=1)
plt.savefig("figs/sns_violin.png")
plt.show()
# Interactive maps with folium.
import folium
# Basic map centred on the given [latitude, longitude].
m = folium.Map(location=[47.3686, 8.5391])
# Bare expression: displays the map when run as a notebook cell; it has no
# effect when the file runs as a plain script.
m
import pandas as pd
# Choropleth example data is fetched from the folium GitHub repository, so
# this section needs network access.
url = 'https://raw.githubusercontent.com/python-visualization/folium/master/examples/data'
state_geo = f'{url}/us-states.json'
state_unemployment = f'{url}/US_Unemployment_Oct2012.csv'
state_data = pd.read_csv(state_unemployment)
m = folium.Map(location=[48, -102], zoom_start=3)
# Colour each state by the 'Unemployment' column; rows are matched to the
# GeoJSON features by comparing the 'State' column against 'feature.id'.
folium.Choropleth(
    geo_data=state_geo,
    name='choropleth',
    data=state_data,
    columns=['State', 'Unemployment'],
    key_on='feature.id',
    fill_color='YlGn',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='Unemployment Rate (%)'
).add_to(m)
folium.LayerControl().add_to(m)
# Notebook display of the finished map (no effect in a plain script).
m
# Static maps with geopandas.
import geopandas
# NOTE(review): geopandas.datasets is deprecated/removed in recent geopandas
# releases — confirm the targeted version before relying on it.
# The bare expressions in this section display their value in a notebook
# and are no-ops when run as a script.
geopandas.datasets.available
world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))
world.head()
world.plot()
world.columns
# World map coloured by GDP per capita.
fig = plt.figure(figsize=(20, 5))
ax = fig.gca()
# Drop rows without a positive population estimate and Antarctica. The
# explicit copy makes the new column below an assignment on an independent
# frame instead of on a slice of the original (avoids pandas'
# SettingWithCopyWarning).
world = world[(world.pop_est>0) & (world.name!="Antarctica")].copy()
world['gdp_per_cap'] = world.gdp_md_est / world.pop_est
world.plot(column='gdp_per_cap', ax=ax, legend=True, cmap="OrRd")
plt.savefig("figs/geopandas_gdp_per_cap.png")
plt.show()
# Country outlines with city locations overlaid as green stars.
fig = plt.figure(figsize=(20, 5))
ax = fig.gca()
cities = geopandas.read_file(geopandas.datasets.get_path('naturalearth_cities'))
world.plot(color='white', edgecolor='black', ax=ax)
cities.plot(ax=ax, marker='*', color='green', markersize=5)
# Finish this figure so that later pyplot calls start on a fresh one instead
# of drawing into the map axes.
plt.show()
# Contour plots of a 2-D noise field on a regular grid.
# NOTE: this import rebinds the module-level name `noise`, replacing any
# earlier variable of that name.
import noise
# Wrap noise.pnoise2 with np.vectorize so it can be applied element-wise to
# whole coordinate arrays.
pnoise2 = np.vectorize(noise.pnoise2)
# Grid over [-3, 3) in steps of 0.1 along both axes.
x = np.arange(-3, 3, 0.1)
y = np.arange(-3, 3, 0.1)
X, Y = np.meshgrid(x, y)
z = pnoise2(X, Y)
# Line contours with default levels.
plt.contour(X, Y, z)
plt.savefig("figs/plt_contour.png")
plt.show()
# Filled contours with 20 levels and a diverging red/grey colormap.
plt.contourf(X, Y, z, 20, cmap='RdGy')
plt.savefig("figs/plt_contourf.png")
plt.show()
# 3-D triangulated surface of the same grid data (X, Y, z).
# Axes3D is not referenced by name below.
# NOTE(review): presumably imported for its side effect of registering the
# '3d' projection on older matplotlib — confirm before removing.
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
cmap = plt.cm.viridis
# plot_trisurf is given 1-D coordinate arrays, so flatten the grids first.
x_, y_, z_ = X.flatten(), Y.flatten(), z.flatten()
surf = ax.plot_trisurf(x_, y_, z_, cmap=cmap)
plt.colorbar(surf)
plt.savefig("figs/plt_trisurf.png")
plt.show()
# Scatter plot of seaborn's example "planets" dataset: distance against
# orbital period, with colour encoding the year and marker size the mass.
planets = sns.load_dataset("planets")
# Continuous cubehelix colormap used for the hue variable.
cmap = sns.cubehelix_palette(rot=-.2, as_cmap=True)
# sizes=(10, 200) sets the smallest and largest marker size.
ax = sns.scatterplot(x="distance", y="orbital_period",
                     hue="year", size="mass",
                     palette=cmap, sizes=(10, 200),
                     data=planets)
plt.savefig("figs/sns_scatterplot.png")
plt.show()
# Multivariate views of the iris table read from a local CSV file.
from pandas.plotting import parallel_coordinates
iris = pd.read_csv("data/iris.csv")
# One line per row across all remaining columns, coloured by the "Name"
# column.
parallel_coordinates(iris, "Name")
plt.show()
# Notebook display of the column names (no effect in a plain script).
iris.columns
# Pairwise scatter plots with kernel density estimates on the diagonal,
# coloured by the "Name" column.
sns.pairplot(iris, diag_kind="kde", hue="Name")
plt.savefig("figs/sns_pairplot.png")
plt.show()
# Matplotlib style sheets.
# Notebook display of the installed style names (no effect in a script).
plt.style.available
# Apply the 'ggplot' style only inside this block; the previous rc settings
# are restored on exit. x and y are whatever the preceding code left in
# scope.
with plt.style.context('ggplot'):
    plt.plot(x, y)
    plt.savefig("figs/plt_line_ggplot.pdf")
    plt.show()
# Disabled example using a style named "LHCb".
# plt.style.use("LHCb")
# with plt.style.context('LHCb'):
# plt.plot(x, y)
# plt.savefig("figs/plt_line_lhcb.pdf")
# plt.show()
# Three ways to place several axes in one figure, all showing a random walk
# and the histogram of its increments.
np.random.seed(42)  # reproducible walk
x = np.arange(0, 10, 0.01)
y = np.random.randn(len(x)).cumsum()
# Step-to-step increments of the walk.
d = np.diff(y)
# help() replaces the IPython-only `plt.subplot?` syntax, which is a
# SyntaxError outside IPython; it prints the same docstring.
help(plt.subplot)
# 1) State-machine interface: select cell 1 of a 2x2 grid, then cell 4
# ("224" is shorthand for (2, 2, 4)).
plt.subplot(2, 2, 1)
plt.plot(x, y)
plt.subplot(224)
plt.hist(d, bins=20, density=True)
plt.savefig("figs/plt_subplot.png")
plt.show()
help(plt.subplots)
# 2) Object interface: create the whole 2x2 grid at once and index the axes
# array; the two unused cells stay empty.
fig, axes = plt.subplots(2, 2)
axes[0,0].plot(x, y)
axes[1,1].hist(d, bins=20, density=True)
plt.savefig("figs/plt_subplots.png")
plt.show()
# 3) Picture-in-picture: a small inset axes placed by
# [left, bottom, width, height] in figure-fraction coordinates, with its
# ticks removed.
plt.plot(x, y)
plt.axes([0.2, .6, .2, .2])
plt.hist(d, bins=20, density=True)
plt.xticks([])
plt.yticks([])
plt.savefig("figs/plt_pip.png")
plt.show()
# Two curves with very different ranges sharing one x axis, each with its
# own y axis.
plt.figure()
# np.linspace defaults to 50 sample points.
x = np.linspace(-5, 5)
y = 2*x + 3
y2 = x**2
ax1 = plt.gca()
ax1.plot(x, y)
ax1.set_ylabel("Linear")
# twinx() creates a second axes that shares ax1's x axis and puts its own
# y axis on the right-hand side.
ax2 = ax1.twinx()
ax2.plot(x, y2)
ax2.set_ylabel("Quadratic")
plt.savefig("figs/plt_twinx.png")
plt.show()
import networkx as nx
import nltk
from nltk.util import ngrams
import matplotlib
from collections import Counter
from operator import itemgetter
# Draw networkx's built-in cubical graph with the default layout and
# styling.
G = nx.cubical_graph()
nx.draw(G)
plt.savefig("figs/networkx_cubical.png")
plt.show()
# Punctuation and quote tokens that are dropped from the word stream.
ignore = {",", ".", "``", "''", "'", "'s", "?", "!", "-", "--", "...", ";"}


def get_words(file_name, encoding="utf-8"):
    """Yield the lower-cased word tokens of a text file, in reading order.

    The file is read line by line; each non-blank line is split with
    ``nltk.word_tokenize`` and tokens found in ``ignore`` are skipped.

    Parameters
    ----------
    file_name : path of the text file to read.
    encoding : text encoding used to open the file (default "utf-8").

    Yields
    ------
    str
        One lower-cased token at a time.
    """
    with open(file_name, encoding=encoding) as f:
        for line in f:
            # Lines read from a file keep their trailing newline, so the
            # stripped text must be tested to actually skip blank lines.
            line = line.strip()
            if not line:
                continue
            for word in nltk.word_tokenize(line):
                # Membership is tested on the raw token, before lower-casing.
                if word not in ignore:
                    yield word.lower()
# Relative frequency of the 15 most common tokens in the book text.
words = list(get_words("data/Harry Potter and the Sorcerer.txt", 'iso-8859-1'))
counter = Counter(words)
# Unzip the (word, count) pairs into a tuple of words and a tuple of counts.
w, c = zip(*counter.most_common(15))
# Convert the counts to a percentage of all tokens.
c = np.array(c) / len(words) * 100.
# The words are passed as x values, so they become categorical tick labels.
plt.plot(w, c)
plt.ylabel("Relative frequency [%]")
plt.show()
# Build a word co-occurrence graph from the book text.
# Re-read the file, keeping only tokens longer than three characters.
words = list(filter(lambda word: len(word) > 3, get_words("data/Harry Potter and the Sorcerer.txt", 'iso-8859-1')))
counter = Counter(words)
g = nx.Graph()
# One node per distinct word, carrying its occurrence count as "size".
g.add_nodes_from((word, {"size": freq}) for word, freq in counter.items())
# Count adjacent word pairs; sorting each pair makes the count independent
# of the order in which the two words appear.
digrams = Counter(tuple(sorted(x)) for x in ngrams(words, 2)).items()
# Edge weight = pair count divided by the occurrence count of the pair's
# first word in sorted order.
# NOTE(review): normalizing by the alphabetically first word looks
# arbitrary — confirm this is the intended weight.
g.add_edges_from((*pair, {"weight": freq / counter[pair[0]]}) for pair, freq in digrams)
def filter_edges(g, most_common_nodes, cutoff=0.5):
    """Yield the edges of ``g`` that link selected nodes strongly enough.

    An edge is yielded when every element of the edge tuple is contained in
    ``most_common_nodes`` and the edge's "weight" attribute is not below
    ``cutoff``.

    Parameters
    ----------
    g : graph object exposing ``edges`` and ``get_edge_data(*edge)``.
    most_common_nodes : iterable of nodes to which edges are restricted.
    cutoff : lower bound on the edge weight (default 0.5).

    Yields
    ------
    Edge tuples, in the iteration order of ``g.edges``.
    """
    allowed = set(most_common_nodes)
    for candidate in g.edges:
        # Check the endpoints first, so the weight is only looked up for
        # edges whose nodes are all selected.
        if allowed.issuperset(candidate):
            weight = g.get_edge_data(*candidate)["weight"]
            if not weight < cutoff:
                yield candidate
# Draw the co-occurrence graph restricted to the 100 most frequent words.
most_common_nodes = list(map(itemgetter(0), counter.most_common(100)))
# cutoff=0. keeps every edge between two selected words.
edges = list(filter_edges(g, most_common_nodes, cutoff=0.))
g2 = nx.Graph()
# Copy the selected nodes and edges together with their attribute dicts.
# `Graph.nodes[...]` replaces the `Graph.node[...]` alias, which was removed
# in networkx 2.4.
g2.add_nodes_from((node, g.nodes[node]) for node in most_common_nodes)
g2.add_edges_from((*edge, g.get_edge_data(*edge)) for edge in edges)
plt.figure(figsize=(20, 20))
layout = nx.spring_layout(g2, k=0.1, iterations=50)
# Node area is proportional to the word count.
sizes = [g2.nodes[node]["size"] * 5 for node in g2]
edge_weights = np.array([g2.get_edge_data(*edge)["weight"] for edge in g2.edges])
# Edges are coloured by weight on the Blues colormap, scaled to the observed
# weight range.
nx.draw(g2, pos=layout,
        with_labels=True, node_size=sizes, node_color="#cccccc",
        edge_cmap=plt.cm.Blues, edge_color=edge_weights,
        edge_vmin=edge_weights.min(), edge_vmax=edge_weights.max())
# Finish the network figure first; otherwise the histogram below is drawn on
# top of it when the file runs as a plain script.
plt.show()
# Distribution of the log edge weights.
plt.hist(np.log(edge_weights))
plt.show()