{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Setup" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2020-06-25T10:17:44.765681Z", "start_time": "2020-06-25T10:17:44.327507Z" } }, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "try:\n", " import mplhep\n", "except ImportError:\n", " !pip install mplhep\n", " import mplhep\n", "\n", "from numpy.random import normal, exponential, randn, rand, random_sample" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2020-06-25T10:17:45.775755Z", "start_time": "2020-06-25T10:17:45.772723Z" } }, "outputs": [], "source": [ "# Custom color palette\n", "from cycler import cycler\n", "colors = [\"#e69f00\", \"#56b4e9\", \"#009e73\", \"#d55e00\", \"#cc799c\"]\n", "plt.rc(\"axes\", prop_cycle=cycler(\"color\", colors))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 1D data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Histograms" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-27T14:40:07.773557Z", "start_time": "2019-06-27T14:40:07.314295Z" } }, "outputs": [], "source": [ "x = randn(1000)\n", "bins = np.linspace(-10, 10)\n", "plt.hist(x, bins=bins, density=True)\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-27T14:40:08.193738Z", "start_time": "2019-06-27T14:40:07.775582Z" } }, "outputs": [], "source": [ "sns.distplot(x)\n", "plt.savefig(\"figs/sns_distplot.png\")\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-27T14:40:08.597310Z", "start_time": "2019-06-27T14:40:08.196079Z" } }, "outputs": [], "source": [ "sns.kdeplot(x, cumulative=True)\n", "plt.xlim(-10, 10)\n", "plt.show()" ] }, { 
"cell_type": "markdown", "metadata": {}, "source": [ "# 2D data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Line plotting" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-27T14:40:09.027719Z", "start_time": "2019-06-27T14:40:08.599427Z" } }, "outputs": [], "source": [ "x = np.array([1, 2, 3, 4])\n", "y = x**2\n", "plt.plot(x, y)\n", "plt.show()\n", "\n", "df = pd.DataFrame({\"x\": x, \"y\":y})\n", "df.plot(\"x\", \"y\")\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Filling areas" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-27T14:40:09.327108Z", "start_time": "2019-06-27T14:40:09.029811Z" } }, "outputs": [], "source": [ "from pandas.plotting import register_matplotlib_converters\n", "register_matplotlib_converters()\n", "\n", "np.random.seed(33)\n", "time = pd.date_range('2000-1-1', periods=150, freq='B')\n", "price = pd.Series(100+randn(150).cumsum(), index=time)\n", "avg = price.rolling(20).mean()\n", "std = price.rolling(20).std()\n", "\n", "plt.plot(price.index, price, 'k')\n", "plt.plot(avg.index, avg, 'b')\n", "plt.fill_between(std.index, avg-2*std, avg+2*std, color='b', alpha=0.2)\n", "plt.ylabel(\"Price\")\n", "plt.savefig(\"figs/plt_moving_average.png\")\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Errorbars" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-27T14:40:09.590264Z", "start_time": "2019-06-27T14:40:09.330503Z" } }, "outputs": [], "source": [ "x = np.arange(0, 2*np.pi, 0.1)\n", "yerr = 0.3\n", "noise = yerr * np.random.randn(*x.shape)\n", "y = np.sin(x) + noise\n", "plt.errorbar(x, y, yerr=yerr, fmt=\"o\")\n", "plt.savefig(\"figs/plt_errorbar.png\")\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Histograms plotting from HEP" ] }, { "cell_type": "code", 
"execution_count": null, "metadata": {}, "outputs": [], "source": [ "# if a histogram is already given\n", "import mplhep\n", "h, bins = np.histogram(np.random.normal(10, 3, 400), bins=10)\n", "\n", "f, axs = plt.subplots(2, 2, sharex=True)\n", "axs = axs.flatten()\n", "\n", "mplhep.histplot(h, bins, yerr=True, ax=axs[0])\n", "mplhep.histplot(h, bins, yerr=np.sqrt(h), histtype='errorbar', ax=axs[1])\n", "mplhep.histplot(h, bins, histtype='fill', ax=axs[2])\n", "mplhep.histplot([1.5 * h, h], bins, histtype=\"fill\", stack=True, ax=axs[3])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Scatter plots" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-27T14:40:10.041270Z", "start_time": "2019-06-27T14:40:09.593166Z" } }, "outputs": [], "source": [ "x = randn(1000)\n", "y = exponential(1, 1000)\n", "z = 15 - exponential(1, 1000)\n", "\n", "plt.scatter(x, y, label=\"y\")\n", "plt.scatter(x, z, label=\"z\")\n", "plt.legend()\n", "plt.savefig(\"figs/plt_scatter.png\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 2D histogram" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-27T14:40:10.269109Z", "start_time": "2019-06-27T14:40:10.044168Z" } }, "outputs": [], "source": [ "x = randn(1000)\n", "y = exponential(size=1000)\n", "hist = plt.hist2d(x, y)\n", "plt.xlabel(\"x\")\n", "plt.ylabel(\"y\")\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Hexagonal histogram" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2020-05-24T20:21:52.346089Z", "start_time": "2020-05-24T20:21:51.007779Z" } }, "outputs": [], "source": [ "x, y = randn(2, 10000)\n", "sns.jointplot(x, y, kind=\"hex\", color=\"k\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Images" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { 
"end_time": "2019-06-27T14:40:10.894817Z", "start_time": "2019-06-27T14:40:10.271379Z" } }, "outputs": [], "source": [ "path = \"figs/python.png\"\n", "img = plt.imread(path)\n", "fig1 = plt.imshow(img)\n", "plt.savefig(\"figs/plt_imshow.png\")\n", "plt.show()\n", "\n", "data = rand(*img.shape)\n", "data[img > 0.95] = 1\n", "fig = plt.imshow(data)\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Violin plots" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-27T14:40:11.745545Z", "start_time": "2019-06-27T14:40:10.897721Z" } }, "outputs": [], "source": [ "mus = 0, 1.5, 2.2\n", "data = [normal(mu, 1, 1000) for mu in mus]\n", "plt.violinplot(data, positions=mus)\n", "plt.xlabel(r\"$\\mu$\")\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Split violin plots" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2020-05-25T07:40:22.685647Z", "start_time": "2020-05-25T07:40:22.418372Z" } }, "outputs": [], "source": [ "tips = sns.load_dataset(\"tips\")\n", "tips[\"percent\"] = tips.tip / tips.total_bill\n", "sns.violinplot(\"day\", \"percent\", \"sex\", data=tips, split=True, palette=[\"#e69f00\", \"#56b4e9\"], saturation=1, linewidth=1)\n", "plt.savefig(\"figs/sns_violin.png\")\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Bar charts" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2020-05-25T07:41:54.723246Z", "start_time": "2020-05-25T07:41:54.528843Z" } }, "outputs": [], "source": [ "bottom = 0\n", "for sex, df in tips.groupby(\"sex\", sort=False):\n", " df = df.groupby(\"day\")[\"tip\"].mean().reset_index()\n", " plt.bar(df[\"day\"], df[\"tip\"], 0.8, bottom, label=sex)\n", " bottom = df[\"tip\"]\n", "plt.legend()\n", "plt.savefig(\"figs/plt_bar.png\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 
Geospatial plotting" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Folium" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-27T15:08:40.651659Z", "start_time": "2019-06-27T15:08:40.641575Z" } }, "outputs": [], "source": [ "import folium\n", "m = folium.Map(location=[47.3686, 8.5391])\n", "m" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-27T14:40:13.111354Z", "start_time": "2019-06-27T14:40:12.846500Z" } }, "outputs": [], "source": [ "import pandas as pd\n", "\n", "\n", "url = 'https://raw.githubusercontent.com/python-visualization/folium/master/examples/data'\n", "state_geo = f'{url}/us-states.json'\n", "state_unemployment = f'{url}/US_Unemployment_Oct2012.csv'\n", "state_data = pd.read_csv(state_unemployment)\n", "\n", "m = folium.Map(location=[48, -102], zoom_start=3)\n", "\n", "folium.Choropleth(\n", " geo_data=state_geo,\n", " name='choropleth',\n", " data=state_data,\n", " columns=['State', 'Unemployment'],\n", " key_on='feature.id',\n", " fill_color='YlGn',\n", " fill_opacity=0.7,\n", " line_opacity=0.2,\n", " legend_name='Unemployment Rate (%)'\n", ").add_to(m)\n", "\n", "folium.LayerControl().add_to(m)\n", "\n", "m" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Geopandas" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-27T14:40:13.359886Z", "start_time": "2019-06-27T14:40:13.114563Z" } }, "outputs": [], "source": [ "import geopandas" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-27T14:40:13.368839Z", "start_time": "2019-06-27T14:40:13.363658Z" } }, "outputs": [], "source": [ "geopandas.datasets.available" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-27T14:40:13.454241Z", "start_time": "2019-06-27T14:40:13.371385Z" } }, "outputs": [], 
"source": [ "world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))\n", "world.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-27T14:40:13.951802Z", "start_time": "2019-06-27T14:40:13.456856Z" } }, "outputs": [], "source": [ "world.plot()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-27T14:40:13.963576Z", "start_time": "2019-06-27T14:40:13.955739Z" } }, "outputs": [], "source": [ "world.columns" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-27T15:16:22.136045Z", "start_time": "2019-06-27T15:16:21.384431Z" } }, "outputs": [], "source": [ "fig = plt.figure(figsize=(20, 5))\n", "ax = fig.gca()\n", "world = world[(world.pop_est>0) & (world.name!=\"Antarctica\")]\n", "world['gdp_per_cap'] = world.gdp_md_est / world.pop_est\n", "world.plot(column='gdp_per_cap', ax=ax, legend=True, cmap=\"OrRd\")\n", "plt.savefig(\"figs/geopandas_gdp_per_cap.png\")\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-27T14:40:15.548621Z", "start_time": "2019-06-27T14:40:14.783129Z" } }, "outputs": [], "source": [ "fig = plt.figure(figsize=(20, 5))\n", "ax = fig.gca()\n", "\n", "cities = geopandas.read_file(geopandas.datasets.get_path('naturalearth_cities'))\n", "world.plot(color='white', edgecolor='black', ax=ax)\n", "cities.plot(ax=ax, marker='*', color='green', markersize=5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 3D data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Contour plots" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-27T14:40:15.562236Z", "start_time": "2019-06-27T14:40:15.551601Z" } }, "outputs": [], "source": [ "import noise\n", "pnoise2 = np.vectorize(noise.pnoise2)" ] }, { "cell_type": "code", 
"execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-27T14:40:15.981339Z", "start_time": "2019-06-27T14:40:15.565983Z" } }, "outputs": [], "source": [ "x = np.arange(-3, 3, 0.1)\n", "y = np.arange(-3, 3, 0.1)\n", "X, Y = np.meshgrid(x, y)\n", "z = pnoise2(X, Y)\n", "plt.contour(X, Y, z)\n", "plt.savefig(\"figs/plt_contour.png\")\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-27T14:40:16.425673Z", "start_time": "2019-06-27T14:40:15.984466Z" } }, "outputs": [], "source": [ "plt.contourf(X, Y, z, 20, cmap='RdGy')\n", "plt.savefig(\"figs/plt_contourf.png\")\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 3D visualization" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-27T14:40:18.433559Z", "start_time": "2019-06-27T14:40:16.429281Z" } }, "outputs": [], "source": [ "from mpl_toolkits.mplot3d import Axes3D\n", "fig = plt.figure()\n", "ax = fig.add_subplot(111, projection='3d')\n", "\n", "cmap = plt.cm.viridis\n", "x_, y_, z_ = X.flatten(), Y.flatten(), z.flatten()\n", "surf = ax.plot_trisurf(x_, y_, z_, cmap=cmap)\n", "plt.colorbar(surf)\n", "plt.savefig(\"figs/plt_trisurf.png\")\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Ternary plot" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2020-06-25T10:17:56.654392Z", "start_time": "2020-06-25T10:17:56.297017Z" } }, "outputs": [], "source": [ "import ternary\n", "fig, tax = ternary.figure(scale=100)\n", "fig.set_size_inches(5, 5)\n", "k, n = 100, 50\n", "a = k * rand(n); b = (k - a) * rand(n)\n", "c = k - a - b\n", "tax.scatter(np.array([[a, b, c]]))\n", "tax.scatter([[20, 35, 45]])\n", "tax.right_corner_label(\"A\")\n", "tax.top_corner_label(\"B\")\n", "tax.left_corner_label(\"C\")\n", "tax.left_axis_label(\"c [%]\")\n", "tax.right_axis_label(\"b 
[%]\")\n", "tax.bottom_axis_label(\"a [%]\")\n", "tax.gridlines(multiple=20, color=\"gray\")\n", "tax.ticks(axis='lbr', multiple=20)\n", "tax.boundary(linewidth=1)\n", "tax.get_axes().axis('off')\n", "tax.savefig(\"figs/ternary.png\")\n", "tax.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 4D data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Scatterplot" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-27T14:40:19.172645Z", "start_time": "2019-06-27T14:40:18.436763Z" } }, "outputs": [], "source": [ "planets = sns.load_dataset(\"planets\")\n", "\n", "cmap = sns.cubehelix_palette(rot=-.2, as_cmap=True)\n", "ax = sns.scatterplot(x=\"distance\", y=\"orbital_period\",\n", " hue=\"year\", size=\"mass\",\n", " palette=cmap, sizes=(10, 200),\n", " data=planets)\n", "plt.savefig(\"figs/sns_scatterplot.png\")\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# ND data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Parallel coordinates" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-27T14:40:19.933292Z", "start_time": "2019-06-27T14:40:19.179551Z" } }, "outputs": [], "source": [ "from pandas.plotting import parallel_coordinates\n", "\n", "iris = pd.read_csv(\"data/iris.csv\")\n", "parallel_coordinates(iris, \"Name\")\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": { "ExecuteTime": { "end_time": "2020-03-27T18:48:12.487758Z", "start_time": "2020-03-27T18:48:12.485686Z" } }, "source": [ "## Spider graph" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2020-03-27T19:17:34.113642Z", "start_time": "2020-03-27T19:17:33.044161Z" } }, "outputs": [], "source": [ "from math import pi\n", " \n", "# Set data\n", "df = pd.DataFrame({\n", "'group': ['A','B','C','D'],\n", "'var1': [38, 1.5, 30, 4],\n", "'var2': [29, 10, 9, 
34],\n", "'var3': [8, 39, 23, 24],\n", "'var4': [7, 31, 33, 14],\n", "'var5': [28, 15, 32, 14]\n", "})\n", " \n", "# ------- PART 1: Define a function that do a plot for one line of the dataset!\n", " \n", "def make_spider(row, title, color):\n", " \n", " # number of variable\n", " categories = list(df)[1:]\n", " N = len(categories)\n", "\n", " # What will be the angle of each axis in the plot? (we divide the plot / number of variable)\n", " angles = [n / float(N) * 2 * pi for n in range(N)]\n", " angles += angles[:1]\n", "\n", " # Initialise the spider plot\n", " ax = plt.subplot(2,2,row+1, polar=True, )\n", "\n", " # If you want the first axis to be on top:\n", " ax.set_theta_offset(pi / 2)\n", " ax.set_theta_direction(-1)\n", "\n", " # Draw one axe per variable + add labels labels yet\n", " plt.xticks(angles[:-1], categories, color='grey', size=8)\n", "\n", " # Draw ylabels\n", " ax.set_rlabel_position(0)\n", " plt.yticks([10,20,30], [\"10\",\"20\",\"30\"], color=\"grey\", size=7)\n", " plt.ylim(0,40)\n", "\n", " # Ind1\n", " values=df.loc[row].drop('group').values.flatten().tolist()\n", " values += values[:1]\n", " ax.plot(angles, values, color=color, linewidth=2, linestyle='solid')\n", " ax.fill(angles, values, color=color, alpha=0.4)\n", "\n", " # Add a title\n", " plt.title(title, size=11, color=color, y=1.1)\n", "\n", "# ------- PART 2: Apply to all individuals\n", "# initialize the figure\n", "my_dpi=96\n", "plt.figure(figsize=(1000/my_dpi, 1000/my_dpi), dpi=my_dpi)\n", "\n", "# Create a color palette:\n", "my_palette = plt.cm.get_cmap(\"Set2\", len(df.index))\n", " \n", "# Loop to plot\n", "for row in range(0, len(df.index)):\n", " make_spider( row=row, title='group '+df['group'][row], color=my_palette(row))\n", "plt.savefig(\"figs/plt_radar.png\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Pairplot" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-28T05:52:26.243814Z", 
"start_time": "2019-06-28T05:52:26.240123Z" } }, "outputs": [], "source": [ "iris.columns" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-27T14:40:29.710936Z", "start_time": "2019-06-27T14:40:19.936853Z" } }, "outputs": [], "source": [ "sns.pairplot(iris, diag_kind=\"kde\", hue=\"Name\")\n", "plt.savefig(\"figs/sns_pairplot.png\")\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Combining and styling plots" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Plotting styles" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-27T14:40:29.722110Z", "start_time": "2019-06-27T14:40:29.714179Z" } }, "outputs": [], "source": [ "plt.style.available" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-27T14:40:30.242358Z", "start_time": "2019-06-27T14:40:29.725776Z" } }, "outputs": [], "source": [ "with plt.style.context('ggplot'):\n", " plt.plot(x, y)\n", " plt.savefig(\"figs/plt_line_ggplot.pdf\")\n", " plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-27T14:40:30.252447Z", "start_time": "2019-06-27T14:40:30.246994Z" } }, "outputs": [], "source": [ "# plt.style.use(\"LHCb\")\n", "# with plt.style.context('LHCb'):\n", "# plt.plot(x, y)\n", "# plt.savefig(\"figs/plt_line_lhcb.pdf\")\n", "# plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Subplots" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-28T05:24:36.847930Z", "start_time": "2019-06-28T05:24:36.842176Z" } }, "outputs": [], "source": [ "np.random.seed(42)\n", "x = np.arange(0, 10, 0.01)\n", "y = np.random.randn(len(x)).cumsum()\n", "d = np.diff(y)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": 
"2019-06-28T05:06:22.409055Z", "start_time": "2019-06-28T05:06:22.403692Z" } }, "outputs": [], "source": [ "plt.subplot?" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-27T14:40:30.911511Z", "start_time": "2019-06-27T14:40:30.269246Z" } }, "outputs": [], "source": [ "plt.subplot(2, 2, 1)\n", "plt.plot(x, y)\n", "\n", "plt.subplot(224)\n", "plt.hist(d, bins=20, density=True)\n", "\n", "plt.savefig(\"figs/plt_subplot.png\")\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-28T05:07:47.298098Z", "start_time": "2019-06-28T05:07:47.292394Z" } }, "outputs": [], "source": [ "plt.subplots?" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-27T14:40:32.239210Z", "start_time": "2019-06-27T14:40:30.915361Z" } }, "outputs": [], "source": [ "fig, axes = plt.subplots(2, 2)\n", "axes[0,0].plot(x, y)\n", "axes[1,1].hist(d, bins=20, density=True)\n", "plt.savefig(\"figs/plt_subplots.png\")\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Picture-in Picture" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-28T05:24:40.262539Z", "start_time": "2019-06-28T05:24:39.852408Z" } }, "outputs": [], "source": [ "plt.plot(x, y)\n", "plt.axes([0.2, .6, .2, .2])\n", "plt.hist(d, bins=20, density=True)\n", "plt.xticks([])\n", "plt.yticks([])\n", "plt.savefig(\"figs/plt_pip.png\")\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Multiple axes" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-27T14:40:33.126327Z", "start_time": "2019-06-27T14:40:32.633716Z" } }, "outputs": [], "source": [ "plt.figure()\n", "x = np.linspace(-5, 5)\n", "y = 2*x + 3\n", "y2 = x**2\n", "\n", "ax1 = plt.gca()\n", "ax1.plot(x, y)\n", "ax1.set_ylabel(\"Linear\")\n", 
"ax2 = ax1.twinx()\n", "ax2.plot(x, y2)\n", "ax2.set_ylabel(\"Quadratic\")\n", "\n", "plt.savefig(\"figs/plt_twinx.png\")\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Networks and Natural Language Processing (NLP)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-30T18:50:59.550235Z", "start_time": "2019-06-30T18:50:59.546131Z" } }, "outputs": [], "source": [ "import networkx as nx\n", "import nltk\n", "from nltk.util import ngrams\n", "import matplotlib\n", "\n", "from collections import Counter, defaultdict\n", "from operator import itemgetter" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-30T18:50:05.436181Z", "start_time": "2019-06-30T18:50:05.204203Z" } }, "outputs": [], "source": [ "G = nx.cubical_graph()\n", "nx.draw(G)\n", "plt.savefig(\"figs/networkx_cubical.png\")\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-30T18:50:06.689941Z", "start_time": "2019-06-30T18:50:06.682965Z" } }, "outputs": [], "source": [ "ignore = {\",\", \".\", \"``\", \"''\", \"'\", \"'s\", \"?\", \"!\", \"-\", \"--\", \"...\", \";\"}\n", "\n", "def get_words(file_name, encoding=\"utf-8\"):\n", " with open(file_name, encoding=encoding) as f:\n", " for line in f:\n", " if not line:\n", " continue\n", " for word in nltk.word_tokenize(line.strip()):\n", " if word not in ignore:\n", " yield word.lower()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-30T18:50:08.351232Z", "start_time": "2019-06-30T18:50:07.466634Z" } }, "outputs": [], "source": [ "words = list(get_words(\"data/Harry Potter and the Sorcerer.txt\", 'iso-8859-1'))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-30T18:50:09.206595Z", "start_time": "2019-06-30T18:50:09.193709Z" } }, 
"outputs": [], "source": [ "counter = Counter(words)\n", "w, c = zip(*counter.most_common(15))\n", "c = np.array(c) / len(words) * 100." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-30T18:50:10.038766Z", "start_time": "2019-06-30T18:50:09.839734Z" } }, "outputs": [], "source": [ "plt.plot(w, c)\n", "plt.ylabel(\"Relative frequency [%]\")\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-30T18:50:13.588485Z", "start_time": "2019-06-30T18:50:12.586774Z" } }, "outputs": [], "source": [ "words = list(filter(lambda word: len(word) > 3, get_words(\"data/Harry Potter and the Sorcerer.txt\", 'iso-8859-1')))\n", "counter = Counter(words)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-30T18:50:13.592890Z", "start_time": "2019-06-30T18:50:13.590527Z" } }, "outputs": [], "source": [ "from itertools import islice" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "$\\phi=\\frac{n_{11}n_{00}-n_{10}n_{01}}{\\sqrt{n_{1\\cdot}n_{0\\cdot}n_{\\cdot0}n_{\\cdot1}}}$" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def phi(word1, word2, counter, digram_graph):\n", " n1tot = \n", " n11 = digram_graph[word1][word2]\n", " n00 = sum(counter.values()) - " ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-30T18:51:05.411712Z", "start_time": "2019-06-30T18:51:05.196713Z" } }, "outputs": [], "source": [ "g = nx.Graph()\n", "g.add_nodes_from((word, {\"size\": freq}) for word, freq in counter.items())\n", "\n", "digram_graph = defaultdict(Counter)\n", "for word1, word2 in ngrams(words, 2):\n", " digram_graph[word1][word2] += 1\n", " digram_graph[word2][word1] += 1\n", "\n", "digrams = Counter(tuple(sorted(x)) for x in ngrams(words, 2)).items()\n", "\n", "g.add_edges_from((*pair, {\"weight\": freq 
/ counter[pair[0]]}) for pair, freq in digrams)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-30T18:51:09.167583Z", "start_time": "2019-06-30T18:51:08.550691Z" } }, "outputs": [], "source": [ "digram_graph" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-27T14:40:37.288331Z", "start_time": "2019-06-27T14:40:37.282050Z" } }, "outputs": [], "source": [ "def filter_edges(g, most_common_nodes, cutoff=0.5):\n", "    \"\"\"Yield the edges of ``g`` that connect two of the given nodes strongly enough.\n", "\n", "    g: graph whose edges carry a \"weight\" attribute.\n", "    most_common_nodes: iterable of nodes; both ends of an edge must be in it.\n", "    cutoff: edges with a weight below this value are dropped.\n", "    \"\"\"\n", "    most_common_nodes = set(most_common_nodes)\n", "    for edge in g.edges:\n", "        if not all(node in most_common_nodes for node in edge):\n", "            continue\n", "        if g.get_edge_data(*edge)[\"weight\"] < cutoff:\n", "            continue\n", "        yield edge" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-27T14:40:37.346301Z", "start_time": "2019-06-27T14:40:37.290140Z" } }, "outputs": [], "source": [ "most_common_nodes = list(map(itemgetter(0), counter.most_common(100)))\n", "edges = list(filter_edges(g, most_common_nodes, cutoff=0.))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-27T14:40:37.363477Z", "start_time": "2019-06-27T14:40:37.349133Z" } }, "outputs": [], "source": [ "# Sub-graph of the most common words. Node attributes are read through\n", "# Graph.nodes; the old Graph.node alias was removed in networkx 2.4.\n", "g2 = nx.Graph()\n", "g2.add_nodes_from((node, g.nodes[node]) for node in most_common_nodes)\n", "g2.add_edges_from((*edge, g.get_edge_data(*edge)) for edge in edges)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-27T14:40:40.874333Z", "start_time": "2019-06-27T14:40:37.366331Z" } }, "outputs": [], "source": [ "plt.figure(figsize=(20, 20))\n", "layout = nx.spring_layout(g2, k=0.1, iterations=50)\n", "sizes = [g2.nodes[node][\"size\"] * 5 for node in g2]\n", "edge_weights = np.array([g2.get_edge_data(*edge)[\"weight\"] for edge in g2.edges])\n", "nx.draw(g2, pos=layout,\n", " with_labels=True, node_size=sizes, 
node_color=\"#cccccc\",\n", " edge_cmap=plt.cm.Blues, edge_color=edge_weights, edge_vmin=edge_weights.min(), edge_vmax=edge_weights.max())" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-27T14:40:41.158594Z", "start_time": "2019-06-27T14:40:40.879342Z" } }, "outputs": [], "source": [ "plt.hist(np.log(edge_weights))\n", "plt.show()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.10" }, "latex_envs": { "LaTeX_envs_menu_present": true, "autoclose": false, "autocomplete": true, "bibliofile": "biblio.bib", "cite_by": "apalike", "current_citInitial": 1, "eqLabelWithNumbers": true, "eqNumInitial": 1, "hotkeys": { "equation": "Ctrl-E", "itemize": "Ctrl-I" }, "labels_anchors": false, "latex_user_defs": false, "report_style_numbering": false, "user_envs_cfg": false }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": { "height": "calc(100% - 180px)", "left": "10px", "top": "150px", "width": "384px" }, "toc_section_display": true, "toc_window_display": true } }, "nbformat": 4, "nbformat_minor": 2 }