{ "cells": [ { "cell_type": "code", "execution_count": 76, "metadata": {}, "outputs": [], "source": [ "import requests\n", "from tqdm.auto import tqdm\n", "import feedparser\n", "from pathlib import Path\n", "import shutil\n", "from plotly import express as ex\n", "import numpy as np\n", "from markdownify import markdownify\n", "from bs4 import BeautifulSoup\n", "import re\n", "import codecs" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Get feed" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\elie\\AppData\\Roaming\\Python\\Python310\\site-packages\\urllib3\\connectionpool.py:1045: InsecureRequestWarning: Unverified HTTPS request is being made to host 'www.provos.org'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", " warnings.warn(\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "b'\\n\\ncount=%{y}", "legendgroup": "", "marker": { "color": "#636efa", "pattern": { "shape": "" } }, "name": "", "offsetgroup": "", "orientation": "v", "showlegend": false, "type": "histogram", "x": [ 428, 313, 45, 101, 322, 71, 514, 143, 200, 120, 44, 38, 44, 26, 72, 87, 102, 49, 46, 59, 99, 98, 44, 96, 93, 212, 98, 143, 113, 263, 185, 295, 117, 151, 138, 91, 17, 40, 65, 151, 163, 187, 210, 161, 252, 76, 48, 103, 44, 114, 179, 95, 50, 182, 266, 60, 127, 55, 18, 79, 70, 393, 52, 150, 274, 204, 214, 168, 289, 227, 125, 189, 163, 123, 222, 94, 177, 296, 109, 51, 76, 445, 331, 98, 170, 105, 1, 73, 1, 162, 127, 158, 213, 344, 46, 236, 73, 101, 74, 78, 144, 402, 42, 75, 47, 82, 173, 535, 274, 88, 95, 92, 56, 79, 131, 56, 196, 54, 315, 21, 80, 34, 56, 31, 81, 145, 79, 58, 42, 80, 59, 122, 42, 75, 274, 117, 77, 211, 91, 76, 109, 92, 58, 114, 54, 222, 52, 265, 70 ], "xaxis": "x", "yaxis": "y" } ], "layout": { "barmode": "relative", "legend": { "tracegroupgap": 0 }, "margin": { "t": 60 }, "template": { "data": { "bar": [ { "error_x": { "color": "#2a3f5f" }, "error_y": { "color": "#2a3f5f" }, "marker": { "line": { "color": "#E5ECF6", "width": 0.5 }, "pattern": { "fillmode": "overlay", "size": 10, "solidity": 0.2 } }, "type": "bar" } ], "barpolar": [ { "marker": { "line": { "color": "#E5ECF6", "width": 0.5 }, "pattern": { "fillmode": "overlay", "size": 10, "solidity": 0.2 } }, "type": "barpolar" } ], "carpet": [ { "aaxis": { "endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white", "minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "baxis": { "endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white", "minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "type": "carpet" } ], "choropleth": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "choropleth" } ], "contour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "contour" } ], "contourcarpet": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "contourcarpet" } ], "heatmap": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmap" } ], "heatmapgl": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmapgl" } ], "histogram": [ { "marker": { "pattern": { "fillmode": "overlay", "size": 10, "solidity": 0.2 } }, "type": "histogram" } ], "histogram2d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2d" } ], "histogram2dcontour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2dcontour" } ], "mesh3d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "mesh3d" } ], "parcoords": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "parcoords" } ], "pie": [ { "automargin": true, "type": "pie" } ], "scatter": [ { "fillpattern": { "fillmode": "overlay", "size": 10, "solidity": 0.2 }, "type": "scatter" } ], "scatter3d": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatter3d" } ], "scattercarpet": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattercarpet" } ], "scattergeo": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergeo" } ], "scattergl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergl" } ], "scattermapbox": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattermapbox" } ], "scatterpolar": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolar" } ], "scatterpolargl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolargl" } ], "scatterternary": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterternary" } ], "surface": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "surface" } ], "table": [ { "cells": { "fill": { "color": "#EBF0F8" }, "line": { "color": "white" } }, "header": { "fill": { "color": "#C8D4E3" }, "line": { "color": "white" } }, "type": "table" } ] }, "layout": { "annotationdefaults": { "arrowcolor": "#2a3f5f", "arrowhead": 0, "arrowwidth": 1 }, "autotypenumbers": "strict", "coloraxis": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "colorscale": { "diverging": [ [ 0, "#8e0152" ], [ 0.1, "#c51b7d" ], [ 0.2, "#de77ae" ], [ 0.3, "#f1b6da" ], [ 0.4, "#fde0ef" ], [ 0.5, "#f7f7f7" ], [ 0.6, "#e6f5d0" ], [ 0.7, "#b8e186" ], [ 0.8, "#7fbc41" ], [ 0.9, "#4d9221" ], [ 1, "#276419" ] ], "sequential": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "sequentialminus": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ] }, "colorway": [ "#636efa", "#EF553B", "#00cc96", "#ab63fa", "#FFA15A", "#19d3f3", "#FF6692", "#B6E880", "#FF97FF", "#FECB52" ], "font": { "color": "#2a3f5f" }, "geo": { "bgcolor": "white", "lakecolor": "white", "landcolor": "#E5ECF6", "showlakes": true, "showland": true, "subunitcolor": "white" }, "hoverlabel": { "align": "left" }, "hovermode": "closest", "mapbox": { "style": "light" }, "paper_bgcolor": "white", "plot_bgcolor": "#E5ECF6", "polar": { "angularaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "radialaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "scene": { "xaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "yaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "zaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" } }, "shapedefaults": { "line": { "color": "#2a3f5f" } }, "ternary": { "aaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "baxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "caxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "title": { "x": 0.05 }, "xaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 }, "yaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 } } }, "xaxis": { "anchor": "y", "domain": [ 0, 1 ], "title": { "text": "x" } }, "yaxis": { "anchor": "x", "domain": [ 0, 1 ], "title": { "text": "count" } } } } }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# sanity check\n", "ex.histogram(x=word_count)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## setup data" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "clean-up previous export\n" ] } ], "source": [ "workdir = Path(\"../data/\")\n", "mddir = workdir / 'md'\n", "imgdir = workdir / 'static' / 'images'\n", "if workdir.exists():\n", " print(\"clean-up previous export\")\n", " shutil.rmtree(workdir)\n", "workdir.mkdir(parents=True)\n", "mddir.mkdir(parents=True)\n", "imgdir.mkdir(parents=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## parsing" ] }, { "cell_type": "code", "execution_count": 73, "metadata": {}, "outputs": [], "source": [ "def get_images(html, img_dir):\n", " bs = BeautifulSoup(html, 'html.parser')\n", " images = bs.find_all('img') #, {'src':re.compile('(.jpg|.png|.gif)')})\n", " srcs = [i['src'] for i in images]\n", " r = []\n", " for src in srcs:\n", " if not \"provos.org\" in src:\n", " continue\n", " img_name = src.split('/')[-1]\n", " fname = img_dir / img_name\n", " \n", " # download if only needed\n", " if not fname.exists():\n", " req = requests.get(src, verify=False)\n", " if req.status_code == 200:\n", " with open(fname, 'wb') as f:\n", " f.write(req.content)\n", " r.append({\"url\": src, \"fname\": str(img_name)})\n", " return r" ] }, { "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "dict_keys(['title', 'title_detail', 'links', 'link', 'tags', 'comments', 'wfw_comment', 'slash_comments', 'wfw_commentrss', 'authors', 'author', 'author_detail', 'content', 'summary', 'published', 'published_parsed', 'id', 'guidislink'])" ] }, "execution_count": 68, "metadata": {}, "output_type": "execute_result" } ], "source": [ "posts[10].keys()" ] }, { "cell_type": "code", "execution_count": 77, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "creating posts: 61%|██████ | 91/149 [00:00<00:00, 901.04it/s]C:\\Users\\elie\\AppData\\Roaming\\Python\\Python310\\site-packages\\urllib3\\connectionpool.py:1045: InsecureRequestWarning:\n", "\n", "Unverified HTTPS request is being made to host 'www.provos.org'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", "\n", "C:\\Users\\elie\\AppData\\Roaming\\Python\\Python310\\site-packages\\urllib3\\connectionpool.py:1045: InsecureRequestWarning:\n", "\n", "Unverified HTTPS request is being made to host 'www.provos.org'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", "\n", "C:\\Users\\elie\\AppData\\Roaming\\Python\\Python310\\site-packages\\urllib3\\connectionpool.py:1045: InsecureRequestWarning:\n", "\n", "Unverified HTTPS request is being made to host 'www.provos.org'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", "\n", "C:\\Users\\elie\\AppData\\Roaming\\Python\\Python310\\site-packages\\urllib3\\connectionpool.py:1045: InsecureRequestWarning:\n", "\n", "Unverified HTTPS request is being made to host 'www.provos.org'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", "\n", "C:\\Users\\elie\\AppData\\Roaming\\Python\\Python310\\site-packages\\urllib3\\connectionpool.py:1045: InsecureRequestWarning:\n", "\n", "Unverified HTTPS request is being made to host 'www.provos.org'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", "\n", "C:\\Users\\elie\\AppData\\Roaming\\Python\\Python310\\site-packages\\urllib3\\connectionpool.py:1045: InsecureRequestWarning:\n", "\n", "Unverified HTTPS request is being made to host 'www.provos.org'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", "\n", "C:\\Users\\elie\\AppData\\Roaming\\Python\\Python310\\site-packages\\urllib3\\connectionpool.py:1045: InsecureRequestWarning:\n", "\n", "Unverified HTTPS request is being made to host 'www.provos.org'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", "\n", "creating posts: 100%|██████████| 149/149 [00:02<00:00, 63.35it/s]\n" ] } ], "source": [ "for p in tqdm(posts, desc=\"creating posts\"):\n", " title = p.title\n", " slug = p.title.lower().replace(' ', '-').replace('--', '-')\n", " slug = re.sub('[^a-z0-9-]', '', slug)\n", " postpath = mddir / f\"{slug}.md\" \n", " \n", " date = p.published\n", " if 'tags' in p:\n", " tags = [t['term'] for t in p.tags]\n", " else:\n", " tags = []\n", " content = p.summary\n", "\n", " # dl images and get info\n", " img_info = get_images(content, imgdir)\n", "\n", " # markdown\n", " md = markdownify(content) \n", " md = md.replace('\\n\\n', '\\n') \n", "\n", " # replace with relative images\n", " for i in img_info:\n", " md = md.replace(i['url'], f\"/static/images/{i['fname']}\")\n", "\n", " with codecs.open(postpath, 'w+', encoding='utf-8') as o:\n", " o.write('---\\n')\n", " o.write(f'layout: post\\n')\n", " o.write(f'title: {title}\\n')\n", " o.write(f'slug: {slug}\\n')\n", " o.write(f'permalink: posts/{slug}\\n')\n", " o.write(\"author: Niels Provos\\n\")\n", " o.write(f'date: {date}\\n') \n", " o.write(f'tags: {\",\".join(tags)}\\n') \n", " o.write(f'published: true\\n')\n", " o.write('---\\n')\n", " o.write(f'# {title}\\n')\n", " o.write(md)\n", "\n", " " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.2 (default, Feb 28 2021, 17:03:44) \n[GCC 10.2.1 20210110]" }, "orig_nbformat": 4, "vscode": { "interpreter": { "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" } } }, "nbformat": 4, "nbformat_minor": 2 }