{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-25T10:17:44.765681Z",
     "start_time": "2020-06-25T10:17:44.327507Z"
    }
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "try:\n",
    "    import mplhep\n",
    "except ImportError:\n",
    "    !pip install mplhep\n",
    "    import mplhep\n",
    "\n",
    "from numpy.random import normal, exponential, randn, rand, random_sample"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-25T10:17:45.775755Z",
     "start_time": "2020-06-25T10:17:45.772723Z"
    }
   },
   "outputs": [],
   "source": [
    "# Custom color palette\n",
    "from cycler import cycler\n",
    "colors = [\"#e69f00\", \"#56b4e9\", \"#009e73\", \"#d55e00\", \"#cc799c\"]\n",
    "plt.rc(\"axes\", prop_cycle=cycler(\"color\", colors))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 1D data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Histograms"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-06-27T14:40:07.773557Z",
     "start_time": "2019-06-27T14:40:07.314295Z"
    }
   },
   "outputs": [],
   "source": [
    "x = randn(1000)\n",
    "bins = np.linspace(-10, 10)\n",
    "plt.hist(x, bins=bins, density=True)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-06-27T14:40:08.193738Z",
     "start_time": "2019-06-27T14:40:07.775582Z"
    }
   },
   "outputs": [],
   "source": [
    "sns.distplot(x)\n",
    "plt.savefig(\"figs/sns_distplot.png\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-06-27T14:40:08.597310Z",
     "start_time": "2019-06-27T14:40:08.196079Z"
    }
   },
   "outputs": [],
   "source": [
    "sns.kdeplot(x, cumulative=True)\n",
    "plt.xlim(-10, 10)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 2D data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Line plotting"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-06-27T14:40:09.027719Z",
     "start_time": "2019-06-27T14:40:08.599427Z"
    }
   },
   "outputs": [],
   "source": [
    "x = np.array([1, 2, 3, 4])\n",
    "y = x**2\n",
    "plt.plot(x, y)\n",
    "plt.show()\n",
    "\n",
    "df = pd.DataFrame({\"x\": x, \"y\":y})\n",
    "df.plot(\"x\", \"y\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Filling areas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-06-27T14:40:09.327108Z",
     "start_time": "2019-06-27T14:40:09.029811Z"
    }
   },
   "outputs": [],
   "source": [
    "from pandas.plotting import register_matplotlib_converters\n",
    "register_matplotlib_converters()\n",
    "\n",
    "np.random.seed(33)\n",
    "time = pd.date_range('2000-1-1', periods=150, freq='B')\n",
    "price = pd.Series(100+randn(150).cumsum(), index=time)\n",
    "avg = price.rolling(20).mean()\n",
    "std = price.rolling(20).std()\n",
    "\n",
    "plt.plot(price.index, price, 'k')\n",
    "plt.plot(avg.index, avg, 'b')\n",
    "plt.fill_between(std.index, avg-2*std, avg+2*std, color='b', alpha=0.2)\n",
    "plt.ylabel(\"Price\")\n",
    "plt.savefig(\"figs/plt_moving_average.png\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Errorbars"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-06-27T14:40:09.590264Z",
     "start_time": "2019-06-27T14:40:09.330503Z"
    }
   },
   "outputs": [],
   "source": [
    "x = np.arange(0, 2*np.pi, 0.1)\n",
    "yerr = 0.3\n",
    "noise = yerr * np.random.randn(*x.shape)\n",
    "y = np.sin(x) + noise\n",
    "plt.errorbar(x, y, yerr=yerr, fmt=\"o\")\n",
    "plt.savefig(\"figs/plt_errorbar.png\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Histograms plotting from HEP"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# if a histogram is already given\n",
    "import mplhep\n",
    "h, bins = np.histogram(np.random.normal(10, 3, 400), bins=10)\n",
    "\n",
    "f, axs = plt.subplots(2, 2, sharex=True)\n",
    "axs = axs.flatten()\n",
    "\n",
    "mplhep.histplot(h, bins, yerr=True, ax=axs[0])\n",
    "mplhep.histplot(h, bins, yerr=np.sqrt(h), histtype='errorbar', ax=axs[1])\n",
    "mplhep.histplot(h, bins, histtype='fill', ax=axs[2])\n",
    "mplhep.histplot([1.5 * h, h], bins, histtype=\"fill\", stack=True, ax=axs[3])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Scatter plots"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-06-27T14:40:10.041270Z",
     "start_time": "2019-06-27T14:40:09.593166Z"
    }
   },
   "outputs": [],
   "source": [
    "x = randn(1000)\n",
    "y = exponential(1, 1000)\n",
    "z = 15 - exponential(1, 1000)\n",
    "\n",
    "plt.scatter(x, y, label=\"y\")\n",
    "plt.scatter(x, z, label=\"z\")\n",
    "plt.legend()\n",
    "plt.savefig(\"figs/plt_scatter.png\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2D histogram"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-06-27T14:40:10.269109Z",
     "start_time": "2019-06-27T14:40:10.044168Z"
    }
   },
   "outputs": [],
   "source": [
    "x = randn(1000)\n",
    "y = exponential(size=1000)\n",
    "hist = plt.hist2d(x, y)\n",
    "plt.xlabel(\"x\")\n",
    "plt.ylabel(\"y\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Hexagonal histogram"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-24T20:21:52.346089Z",
     "start_time": "2020-05-24T20:21:51.007779Z"
    }
   },
   "outputs": [],
   "source": [
    "x, y = randn(2, 10000)\n",
    "sns.jointplot(x, y, kind=\"hex\", color=\"k\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Images"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-06-27T14:40:10.894817Z",
     "start_time": "2019-06-27T14:40:10.271379Z"
    }
   },
   "outputs": [],
   "source": [
    "path = \"figs/python.png\"\n",
    "img = plt.imread(path)\n",
    "fig1 = plt.imshow(img)\n",
    "plt.savefig(\"figs/plt_imshow.png\")\n",
    "plt.show()\n",
    "\n",
    "data = rand(*img.shape)\n",
    "data[img > 0.95] = 1\n",
    "fig = plt.imshow(data)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Violin plots"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-06-27T14:40:11.745545Z",
     "start_time": "2019-06-27T14:40:10.897721Z"
    }
   },
   "outputs": [],
   "source": [
    "mus = 0, 1.5, 2.2\n",
    "data = [normal(mu, 1, 1000) for mu in mus]\n",
    "plt.violinplot(data, positions=mus)\n",
    "plt.xlabel(r\"$\\mu$\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Split violin plots"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-25T07:40:22.685647Z",
     "start_time": "2020-05-25T07:40:22.418372Z"
    }
   },
   "outputs": [],
   "source": [
    "tips = sns.load_dataset(\"tips\")\n",
    "tips[\"percent\"] = tips.tip / tips.total_bill\n",
    "sns.violinplot(\"day\", \"percent\", \"sex\", data=tips, split=True, palette=[\"#e69f00\", \"#56b4e9\"], saturation=1, linewidth=1)\n",
    "plt.savefig(\"figs/sns_violin.png\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Bar charts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-25T07:41:54.723246Z",
     "start_time": "2020-05-25T07:41:54.528843Z"
    }
   },
   "outputs": [],
   "source": [
    "bottom = 0\n",
    "for sex, df in tips.groupby(\"sex\", sort=False):\n",
    "    df = df.groupby(\"day\")[\"tip\"].mean().reset_index()\n",
    "    plt.bar(df[\"day\"], df[\"tip\"], 0.8, bottom, label=sex)\n",
    "    bottom = df[\"tip\"]\n",
    "plt.legend()\n",
    "plt.savefig(\"figs/plt_bar.png\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Geospatial plotting"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Folium"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-06-27T15:08:40.651659Z",
     "start_time": "2019-06-27T15:08:40.641575Z"
    }
   },
   "outputs": [],
   "source": [
    "import folium\n",
    "m = folium.Map(location=[47.3686, 8.5391])\n",
    "m"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-06-27T14:40:13.111354Z",
     "start_time": "2019-06-27T14:40:12.846500Z"
    }
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "\n",
    "url = 'https://raw.githubusercontent.com/python-visualization/folium/master/examples/data'\n",
    "state_geo = f'{url}/us-states.json'\n",
    "state_unemployment = f'{url}/US_Unemployment_Oct2012.csv'\n",
    "state_data = pd.read_csv(state_unemployment)\n",
    "\n",
    "m = folium.Map(location=[48, -102], zoom_start=3)\n",
    "\n",
    "folium.Choropleth(\n",
    "    geo_data=state_geo,\n",
    "    name='choropleth',\n",
    "    data=state_data,\n",
    "    columns=['State', 'Unemployment'],\n",
    "    key_on='feature.id',\n",
    "    fill_color='YlGn',\n",
    "    fill_opacity=0.7,\n",
    "    line_opacity=0.2,\n",
    "    legend_name='Unemployment Rate (%)'\n",
    ").add_to(m)\n",
    "\n",
    "folium.LayerControl().add_to(m)\n",
    "\n",
    "m"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Geopandas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-06-27T14:40:13.359886Z",
     "start_time": "2019-06-27T14:40:13.114563Z"
    }
   },
   "outputs": [],
   "source": [
    "import geopandas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-06-27T14:40:13.368839Z",
     "start_time": "2019-06-27T14:40:13.363658Z"
    }
   },
   "outputs": [],
   "source": [
    "geopandas.datasets.available"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-06-27T14:40:13.454241Z",
     "start_time": "2019-06-27T14:40:13.371385Z"
    }
   },
   "outputs": [],
   "source": [
    "world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))\n",
    "world.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-06-27T14:40:13.951802Z",
     "start_time": "2019-06-27T14:40:13.456856Z"
    }
   },
   "outputs": [],
   "source": [
    "world.plot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-06-27T14:40:13.963576Z",
     "start_time": "2019-06-27T14:40:13.955739Z"
    }
   },
   "outputs": [],
   "source": [
    "world.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-06-27T15:16:22.136045Z",
     "start_time": "2019-06-27T15:16:21.384431Z"
    }
   },
   "outputs": [],
   "source": [
    "fig = plt.figure(figsize=(20, 5))\n",
    "ax = fig.gca()\n",
    "world = world[(world.pop_est>0) & (world.name!=\"Antarctica\")]\n",
    "world['gdp_per_cap'] = world.gdp_md_est / world.pop_est\n",
    "world.plot(column='gdp_per_cap', ax=ax, legend=True, cmap=\"OrRd\")\n",
    "plt.savefig(\"figs/geopandas_gdp_per_cap.png\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-06-27T14:40:15.548621Z",
     "start_time": "2019-06-27T14:40:14.783129Z"
    }
   },
   "outputs": [],
   "source": [
    "fig = plt.figure(figsize=(20, 5))\n",
    "ax = fig.gca()\n",
    "\n",
    "cities = geopandas.read_file(geopandas.datasets.get_path('naturalearth_cities'))\n",
    "world.plot(color='white', edgecolor='black', ax=ax)\n",
    "cities.plot(ax=ax, marker='*', color='green', markersize=5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 3D data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Contour plots"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-06-27T14:40:15.562236Z",
     "start_time": "2019-06-27T14:40:15.551601Z"
    }
   },
   "outputs": [],
   "source": [
    "import noise\n",
    "pnoise2 = np.vectorize(noise.pnoise2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-06-27T14:40:15.981339Z",
     "start_time": "2019-06-27T14:40:15.565983Z"
    }
   },
   "outputs": [],
   "source": [
    "x = np.arange(-3, 3, 0.1)\n",
    "y = np.arange(-3, 3, 0.1)\n",
    "X, Y = np.meshgrid(x, y)\n",
    "z = pnoise2(X, Y)\n",
    "plt.contour(X, Y, z)\n",
    "plt.savefig(\"figs/plt_contour.png\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-06-27T14:40:16.425673Z",
     "start_time": "2019-06-27T14:40:15.984466Z"
    }
   },
   "outputs": [],
   "source": [
    "plt.contourf(X, Y, z, 20, cmap='RdGy')\n",
    "plt.savefig(\"figs/plt_contourf.png\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3D visualization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-06-27T14:40:18.433559Z",
     "start_time": "2019-06-27T14:40:16.429281Z"
    }
   },
   "outputs": [],
   "source": [
    "from mpl_toolkits.mplot3d import Axes3D\n",
    "fig = plt.figure()\n",
    "ax = fig.add_subplot(111, projection='3d')\n",
    "\n",
    "cmap = plt.cm.viridis\n",
    "x_, y_, z_ = X.flatten(), Y.flatten(), z.flatten()\n",
    "surf = ax.plot_trisurf(x_, y_, z_, cmap=cmap)\n",
    "plt.colorbar(surf)\n",
    "plt.savefig(\"figs/plt_trisurf.png\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Ternary plot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-25T10:17:56.654392Z",
     "start_time": "2020-06-25T10:17:56.297017Z"
    }
   },
   "outputs": [],
   "source": [
    "import ternary\n",
    "fig, tax = ternary.figure(scale=100)\n",
    "fig.set_size_inches(5, 5)\n",
    "k, n = 100, 50\n",
    "a = k * rand(n); b = (k - a) * rand(n)\n",
    "c = k - a - b\n",
    "tax.scatter(np.array([[a, b, c]]))\n",
    "tax.scatter([[20, 35, 45]])\n",
    "tax.right_corner_label(\"A\")\n",
    "tax.top_corner_label(\"B\")\n",
    "tax.left_corner_label(\"C\")\n",
    "tax.left_axis_label(\"c [%]\")\n",
    "tax.right_axis_label(\"b [%]\")\n",
    "tax.bottom_axis_label(\"a [%]\")\n",
    "tax.gridlines(multiple=20, color=\"gray\")\n",
    "tax.ticks(axis='lbr', multiple=20)\n",
    "tax.boundary(linewidth=1)\n",
    "tax.get_axes().axis('off')\n",
    "tax.savefig(\"figs/ternary.png\")\n",
    "tax.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 4D data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Scatterplot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-06-27T14:40:19.172645Z",
     "start_time": "2019-06-27T14:40:18.436763Z"
    }
   },
   "outputs": [],
   "source": [
    "planets = sns.load_dataset(\"planets\")\n",
    "\n",
    "cmap = sns.cubehelix_palette(rot=-.2, as_cmap=True)\n",
    "ax = sns.scatterplot(x=\"distance\", y=\"orbital_period\",\n",
    "                     hue=\"year\", size=\"mass\",\n",
    "                     palette=cmap, sizes=(10, 200),\n",
    "                     data=planets)\n",
    "plt.savefig(\"figs/sns_scatterplot.png\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# ND data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##  Parallel coordinates"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-06-27T14:40:19.933292Z",
     "start_time": "2019-06-27T14:40:19.179551Z"
    }
   },
   "outputs": [],
   "source": [
    "from pandas.plotting import parallel_coordinates\n",
    "\n",
    "iris = pd.read_csv(\"data/iris.csv\")\n",
    "parallel_coordinates(iris, \"Name\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-03-27T18:48:12.487758Z",
     "start_time": "2020-03-27T18:48:12.485686Z"
    }
   },
   "source": [
    "## Spider graph"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-03-27T19:17:34.113642Z",
     "start_time": "2020-03-27T19:17:33.044161Z"
    }
   },
   "outputs": [],
   "source": [
    "from math import pi\n",
    " \n",
    "# Set data\n",
    "df = pd.DataFrame({\n",
    "'group': ['A','B','C','D'],\n",
    "'var1': [38, 1.5, 30, 4],\n",
    "'var2': [29, 10, 9, 34],\n",
    "'var3': [8, 39, 23, 24],\n",
    "'var4': [7, 31, 33, 14],\n",
    "'var5': [28, 15, 32, 14]\n",
    "})\n",
    " \n",
    "# ------- PART 1: Define a function that do a plot for one line of the dataset!\n",
    " \n",
    "def make_spider(row, title, color):\n",
    " \n",
    "    # number of variable\n",
    "    categories = list(df)[1:]\n",
    "    N = len(categories)\n",
    "\n",
    "    # What will be the angle of each axis in the plot? (we divide the plot / number of variable)\n",
    "    angles = [n / float(N) * 2 * pi for n in range(N)]\n",
    "    angles += angles[:1]\n",
    "\n",
    "    # Initialise the spider plot\n",
    "    ax = plt.subplot(2,2,row+1, polar=True, )\n",
    "\n",
    "    # If you want the first axis to be on top:\n",
    "    ax.set_theta_offset(pi / 2)\n",
    "    ax.set_theta_direction(-1)\n",
    "\n",
    "    # Draw one axe per variable + add labels labels yet\n",
    "    plt.xticks(angles[:-1], categories, color='grey', size=8)\n",
    "\n",
    "    # Draw ylabels\n",
    "    ax.set_rlabel_position(0)\n",
    "    plt.yticks([10,20,30], [\"10\",\"20\",\"30\"], color=\"grey\", size=7)\n",
    "    plt.ylim(0,40)\n",
    "\n",
    "    # Ind1\n",
    "    values=df.loc[row].drop('group').values.flatten().tolist()\n",
    "    values += values[:1]\n",
    "    ax.plot(angles, values, color=color, linewidth=2, linestyle='solid')\n",
    "    ax.fill(angles, values, color=color, alpha=0.4)\n",
    "\n",
    "    # Add a title\n",
    "    plt.title(title, size=11, color=color, y=1.1)\n",
    "\n",
    "# ------- PART 2: Apply to all individuals\n",
    "# initialize the figure\n",
    "my_dpi=96\n",
    "plt.figure(figsize=(1000/my_dpi, 1000/my_dpi), dpi=my_dpi)\n",
    "\n",
    "# Create a color palette:\n",
    "my_palette = plt.cm.get_cmap(\"Set2\", len(df.index))\n",
    " \n",
    "# Loop to plot\n",
    "for row in range(0, len(df.index)):\n",
    "    make_spider( row=row, title='group '+df['group'][row], color=my_palette(row))\n",
    "plt.savefig(\"figs/plt_radar.png\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Pairplot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-06-28T05:52:26.243814Z",
     "start_time": "2019-06-28T05:52:26.240123Z"
    }
   },
   "outputs": [],
   "source": [
    "iris.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-06-27T14:40:29.710936Z",
     "start_time": "2019-06-27T14:40:19.936853Z"
    }
   },
   "outputs": [],
   "source": [
    "sns.pairplot(iris, diag_kind=\"kde\", hue=\"Name\")\n",
    "plt.savefig(\"figs/sns_pairplot.png\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Combining and styling plots"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Plotting styles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-06-27T14:40:29.722110Z",
     "start_time": "2019-06-27T14:40:29.714179Z"
    }
   },
   "outputs": [],
   "source": [
    "plt.style.available"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-06-27T14:40:30.242358Z",
     "start_time": "2019-06-27T14:40:29.725776Z"
    }
   },
   "outputs": [],
   "source": [
    "with plt.style.context('ggplot'):\n",
    "    plt.plot(x, y)\n",
    "    plt.savefig(\"figs/plt_line_ggplot.pdf\")\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-06-27T14:40:30.252447Z",
     "start_time": "2019-06-27T14:40:30.246994Z"
    }
   },
   "outputs": [],
   "source": [
    "# plt.style.use(\"LHCb\")\n",
    "# with plt.style.context('LHCb'):\n",
    "#     plt.plot(x, y)\n",
    "#     plt.savefig(\"figs/plt_line_lhcb.pdf\")\n",
    "#     plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Subplots"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-06-28T05:24:36.847930Z",
     "start_time": "2019-06-28T05:24:36.842176Z"
    }
   },
   "outputs": [],
   "source": [
    "np.random.seed(42)\n",
    "x = np.arange(0, 10, 0.01)\n",
    "y = np.random.randn(len(x)).cumsum()\n",
    "d = np.diff(y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-06-28T05:06:22.409055Z",
     "start_time": "2019-06-28T05:06:22.403692Z"
    }
   },
   "outputs": [],
   "source": [
    "plt.subplot?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-06-27T14:40:30.911511Z",
     "start_time": "2019-06-27T14:40:30.269246Z"
    }
   },
   "outputs": [],
   "source": [
    "plt.subplot(2, 2, 1)\n",
    "plt.plot(x, y)\n",
    "\n",
    "plt.subplot(224)\n",
    "plt.hist(d, bins=20, density=True)\n",
    "\n",
    "plt.savefig(\"figs/plt_subplot.png\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-06-28T05:07:47.298098Z",
     "start_time": "2019-06-28T05:07:47.292394Z"
    }
   },
   "outputs": [],
   "source": [
    "plt.subplots?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-06-27T14:40:32.239210Z",
     "start_time": "2019-06-27T14:40:30.915361Z"
    }
   },
   "outputs": [],
   "source": [
    "fig, axes = plt.subplots(2, 2)\n",
    "axes[0,0].plot(x, y)\n",
    "axes[1,1].hist(d, bins=20, density=True)\n",
    "plt.savefig(\"figs/plt_subplots.png\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Picture-in Picture"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-06-28T05:24:40.262539Z",
     "start_time": "2019-06-28T05:24:39.852408Z"
    }
   },
   "outputs": [],
   "source": [
    "plt.plot(x, y)\n",
    "plt.axes([0.2, .6, .2, .2])\n",
    "plt.hist(d, bins=20, density=True)\n",
    "plt.xticks([])\n",
    "plt.yticks([])\n",
    "plt.savefig(\"figs/plt_pip.png\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Multiple axes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-06-27T14:40:33.126327Z",
     "start_time": "2019-06-27T14:40:32.633716Z"
    }
   },
   "outputs": [],
   "source": [
    "plt.figure()\n",
    "x = np.linspace(-5, 5)\n",
    "y = 2*x + 3\n",
    "y2 = x**2\n",
    "\n",
    "ax1 = plt.gca()\n",
    "ax1.plot(x, y)\n",
    "ax1.set_ylabel(\"Linear\")\n",
    "ax2 = ax1.twinx()\n",
    "ax2.plot(x, y2)\n",
    "ax2.set_ylabel(\"Quadratic\")\n",
    "\n",
    "plt.savefig(\"figs/plt_twinx.png\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Networks and Natural Language Processing (NLP)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-06-30T18:50:59.550235Z",
     "start_time": "2019-06-30T18:50:59.546131Z"
    }
   },
   "outputs": [],
   "source": [
    "import networkx as nx\n",
    "import nltk\n",
    "from nltk.util import ngrams\n",
    "import matplotlib\n",
    "\n",
    "from collections import Counter, defaultdict\n",
    "from operator import itemgetter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-06-30T18:50:05.436181Z",
     "start_time": "2019-06-30T18:50:05.204203Z"
    }
   },
   "outputs": [],
   "source": [
    "G = nx.cubical_graph()\n",
    "nx.draw(G)\n",
    "plt.savefig(\"figs/networkx_cubical.png\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-06-30T18:50:06.689941Z",
     "start_time": "2019-06-30T18:50:06.682965Z"
    }
   },
   "outputs": [],
   "source": [
    "ignore = {\",\", \".\", \"``\", \"''\", \"'\", \"'s\", \"?\", \"!\", \"-\", \"--\", \"...\", \";\"}\n",
    "\n",
    "def get_words(file_name, encoding=\"utf-8\"):\n",
    "    with open(file_name, encoding=encoding) as f:\n",
    "        for line in f:\n",
    "            if not line:\n",
    "                continue\n",
    "            for word in nltk.word_tokenize(line.strip()):\n",
    "                if word not in ignore:\n",
    "                    yield word.lower()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-06-30T18:50:08.351232Z",
     "start_time": "2019-06-30T18:50:07.466634Z"
    }
   },
   "outputs": [],
   "source": [
    "words = list(get_words(\"data/Harry Potter and the Sorcerer.txt\", 'iso-8859-1'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-06-30T18:50:09.206595Z",
     "start_time": "2019-06-30T18:50:09.193709Z"
    }
   },
   "outputs": [],
   "source": [
    "counter = Counter(words)\n",
    "w, c = zip(*counter.most_common(15))\n",
    "c = np.array(c) / len(words) * 100."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-06-30T18:50:10.038766Z",
     "start_time": "2019-06-30T18:50:09.839734Z"
    }
   },
   "outputs": [],
   "source": [
    "plt.plot(w, c)\n",
    "plt.ylabel(\"Relative frequency [%]\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-06-30T18:50:13.588485Z",
     "start_time": "2019-06-30T18:50:12.586774Z"
    }
   },
   "outputs": [],
   "source": [
    "words = list(filter(lambda word: len(word) > 3, get_words(\"data/Harry Potter and the Sorcerer.txt\", 'iso-8859-1')))\n",
    "counter = Counter(words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-06-30T18:50:13.592890Z",
     "start_time": "2019-06-30T18:50:13.590527Z"
    }
   },
   "outputs": [],
   "source": [
    "from itertools import islice"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "$\\phi=\\frac{n_{11}n_{00}-n_{10}n_{01}}{\\sqrt{n_{1\\cdot}n_{0\\cdot}n_{\\cdot0}n_{\\cdot1}}}$"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def phi(word1, word2, counter, digram_graph):\n",
    "    n1tot = \n",
    "    n11 = digram_graph[word1][word2]\n",
    "    n00 = sum(counter.values()) - "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-06-30T18:51:05.411712Z",
     "start_time": "2019-06-30T18:51:05.196713Z"
    }
   },
   "outputs": [],
   "source": [
    "g = nx.Graph()\n",
    "g.add_nodes_from((word, {\"size\": freq}) for word, freq in counter.items())\n",
    "\n",
    "digram_graph = defaultdict(Counter)\n",
    "for word1, word2 in ngrams(words, 2):\n",
    "    digram_graph[word1][word2] += 1\n",
    "    digram_graph[word2][word1] += 1\n",
    "\n",
    "digrams = Counter(tuple(sorted(x)) for x in ngrams(words, 2)).items()\n",
    "\n",
    "g.add_edges_from((*pair, {\"weight\": freq / counter[pair[0]]}) for pair, freq in digrams)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-06-30T18:51:09.167583Z",
     "start_time": "2019-06-30T18:51:08.550691Z"
    }
   },
   "outputs": [],
   "source": [
    "digram_graph"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-06-27T14:40:37.288331Z",
     "start_time": "2019-06-27T14:40:37.282050Z"
    }
   },
   "outputs": [],
   "source": [
    "def filter_edges(g, most_common_nodes, cutoff=0.5):\n",
    "    most_common_nodes = set(most_common_nodes)\n",
    "    for edge in g.edges:\n",
    "        if not all(node in most_common_nodes for node in edge):\n",
    "            continue\n",
    "        if g.get_edge_data(*edge)[\"weight\"] < cutoff:\n",
    "            continue\n",
    "        yield edge"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-06-27T14:40:37.346301Z",
     "start_time": "2019-06-27T14:40:37.290140Z"
    }
   },
   "outputs": [],
   "source": [
    "most_common_nodes = list(map(itemgetter(0), counter.most_common(100)))\n",
    "edges = list(filter_edges(g, most_common_nodes, cutoff=0.))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-06-27T14:40:37.363477Z",
     "start_time": "2019-06-27T14:40:37.349133Z"
    }
   },
   "outputs": [],
   "source": [
    "g2 = nx.Graph()\n",
    "g2.add_nodes_from((node, g.node[node]) for node in most_common_nodes)\n",
    "g2.add_edges_from((*edge, g.get_edge_data(*edge)) for edge in edges)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-06-27T14:40:40.874333Z",
     "start_time": "2019-06-27T14:40:37.366331Z"
    }
   },
   "outputs": [],
   "source": [
    "plt.figure(figsize=(20, 20))\n",
    "layout = nx.spring_layout(g2, k=0.1, iterations=50)\n",
    "sizes = [g2.node[node][\"size\"] * 5 for node in g2]\n",
    "edge_weights = np.array([g2.get_edge_data(*edge)[\"weight\"] for edge in g2.edges])\n",
    "nx.draw(g2, pos=layout,\n",
    "        with_labels=True, node_size=sizes, node_color=\"#cccccc\",\n",
    "        edge_cmap=plt.cm.Blues, edge_color=edge_weights, edge_vmin=edge_weights.min(), edge_vmax=edge_weights.max())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-06-27T14:40:41.158594Z",
     "start_time": "2019-06-27T14:40:40.879342Z"
    }
   },
   "outputs": [],
   "source": [
    "plt.hist(np.log(edge_weights))\n",
    "plt.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  },
  "latex_envs": {
   "LaTeX_envs_menu_present": true,
   "autoclose": false,
   "autocomplete": true,
   "bibliofile": "biblio.bib",
   "cite_by": "apalike",
   "current_citInitial": 1,
   "eqLabelWithNumbers": true,
   "eqNumInitial": 1,
   "hotkeys": {
    "equation": "Ctrl-E",
    "itemize": "Ctrl-I"
   },
   "labels_anchors": false,
   "latex_user_defs": false,
   "report_style_numbering": false,
   "user_envs_cfg": false
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {
    "height": "calc(100% - 180px)",
    "left": "10px",
    "top": "150px",
    "width": "384px"
   },
   "toc_section_display": true,
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}