{ "cells": [ { "cell_type": "code", "execution_count": 76, "metadata": {}, "outputs": [], "source": [ "import requests\n", "from tqdm.auto import tqdm\n", "import feedparser\n", "from pathlib import Path\n", "import shutil\n", "from plotly import express as ex\n", "import numpy as np\n", "from markdownify import markdownify\n", "from bs4 import BeautifulSoup\n", "import re\n", "import codecs" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Get feed" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\elie\\AppData\\Roaming\\Python\\Python310\\site-packages\\urllib3\\connectionpool.py:1045: InsecureRequestWarning: Unverified HTTPS request is being made to host 'www.provos.org'. Adding certificate verification is strongly advised. "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 } } }, "xaxis": { "anchor": "y", "domain": [ 0, 1 ], "title": { "text": "x" } }, "yaxis": { "anchor": "x", "domain": [ 0, 1 ], "title": { "text": "count" } } } } }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# sanity check\n", "ex.histogram(x=word_count)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## setup data" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "clean-up previous export\n" ] } ], "source": [ "workdir = Path(\"../data/\")\n", "mddir = workdir / 'md'\n", "imgdir = workdir / 'static' / 'images'\n", "if workdir.exists():\n", " print(\"clean-up previous export\")\n", " shutil.rmtree(workdir)\n", "workdir.mkdir(parents=True)\n", "mddir.mkdir(parents=True)\n", "imgdir.mkdir(parents=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## parsing" ] }, { "cell_type": "code", "execution_count": 73, "metadata": {}, "outputs": [], "source": [ 
"def get_images(html, img_dir):\n", " bs = BeautifulSoup(html, 'html.parser')\n", " images = bs.find_all('img') #, {'src':re.compile('(.jpg|.png|.gif)')})\n", " srcs = [i['src'] for i in images]\n", " r = []\n", " for src in srcs:\n", " if not \"provos.org\" in src:\n", " continue\n", " img_name = src.split('/')[-1]\n", " fname = img_dir / img_name\n", " \n", " # download if only needed\n", " if not fname.exists():\n", " req = requests.get(src, verify=False)\n", " if req.status_code == 200:\n", " with open(fname, 'wb') as f:\n", " f.write(req.content)\n", " r.append({\"url\": src, \"fname\": str(img_name)})\n", " return r" ] }, { "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "dict_keys(['title', 'title_detail', 'links', 'link', 'tags', 'comments', 'wfw_comment', for p in tqdm(posts, desc="creating posts"):
 # Body of the per-post export loop: for each feed entry `p`, download its
 # images, convert the HTML summary to Markdown and write one Markdown file
 # with YAML front matter into `mddir`.
 title = p.title

 # Build a URL/file-safe slug: lowercase, spaces -> dashes, drop everything
 # outside [a-z0-9-]. Dashes are collapsed *after* stripping punctuation,
 # because removing characters (e.g. in "Foo: Bar & Baz" or "A - B") is what
 # creates runs of dashes; leading/trailing dashes are trimmed as well.
 slug = re.sub('[^a-z0-9-]', '', title.lower().replace(' ', '-'))
 slug = re.sub('-{2,}', '-', slug).strip('-')
 postpath = mddir / f"{slug}.md"

 date = p.published
 # Entries without a 'tags' key get an empty tag list.
 tags = [t['term'] for t in p.tags] if 'tags' in p else []
 content = p.summary

 # dl images and get info
 img_info = get_images(content, imgdir)

 # markdown
 md = markdownify(content)
 # NOTE(review): this collapses every blank line, which also removes Markdown
 # paragraph breaks -- confirm this is the intended output format.
 md = md.replace('\n\n', '\n')

 # replace with relative images
 for i in img_info:
     md = md.replace(i['url'], f"/static/images/{i['fname']}")

 # A plain (unquoted) YAML scalar breaks on titles containing ": ", " #" or a
 # leading quote, so emit the title as a double-quoted scalar with backslashes
 # and double quotes escaped.
 yaml_title = '"' + title.replace('\\', '\\\\').replace('"', '\\"') + '"'

 front_matter = [
     '---\n',
     'layout: post\n',
     f'title: {yaml_title}\n',
     f'slug: {slug}\n',
     f'permalink: posts/{slug}\n',
     'author: Niels Provos\n',
     f'date: {date}\n',
     f'tags: {",".join(tags)}\n',
     'published: true\n',
     '---\n',
 ]

 # Write-only access is enough; newline='\n' disables newline translation so
 # the bytes on disk match what codecs.open produced on every platform.
 with postpath.open('w', encoding='utf-8', newline='\n') as o:
     o.write(''.join(front_matter))
     o.write(f'# {title}\n')
     o.write(md)