Add draft of infection data scraper.

author Christian Heller <c.heller@plomlompom.de>

Tue, 21 Jul 2020 17:02:27 +0000 (19:02 +0200)

committer Christian Heller <c.heller@plomlompom.de>

Tue, 21 Jul 2020 17:02:27 +0000 (19:02 +0200)
author Christian Heller <c.heller@plomlompom.de>
Tue, 21 Jul 2020 17:02:27 +0000 (19:02 +0200)
committer Christian Heller <c.heller@plomlompom.de>
Tue, 21 Jul 2020 17:02:27 +0000 (19:02 +0200)
diff --git a/scrape.py b/scrape.py

new file mode 100755 (executable)

index 0000000..7c7196c
--- /dev/null
+++ b/scrape.py
@@ -0,0 +1,230 @@
+#!/usr/bin/env python3
+import urllib.request
+import datetime
+import bs4
+import re
+
+url_prefix = 'https://www.berlin.de'  # prepended to every path that gets fetched
+pm_dir = '/sen/gpg/service/presse/2020/'  # index page that is scanned for pagination links
+pm_nav_path = pm_dir + '?page_at_1_0='  # pagination link prefix; the page number follows the '='
+
+# Map district abbreviations to sets of full names (incl. alternate spellings) as matched against the first table column.
+abbrevs = {
+  'CW': {'Charlottenburg-Wilmersdorf'},
+  'FK': {'Friedrichshain-Kreuzberg'},
+  'Li': {'Lichtenberg'},
+  'MH': {'Marzahn-Hellersdorf'},
+  'Mi': {'Mitte'},
+  'Ne': {'Neukölln', 'Neuköln'},
+  'Pa': {'Pankow'},
+  'Re': {'Reinickendorf'},
+  'Sp': {'Spandau'},
+  'SZ': {'Steglitz-Zehlendorf'},
+  'TS': {'Tempelhof-Schöneberg'},
+  'TK': {'Treptow-Köpenick'},
+  'sum': {'Summe', 'Berlin'},
+}
+
+# Fallback numbers for days where only image files are available for the table data, i.e. the page has no <table> to parse.
+unparsable_graphics_fallback = {
+    datetime.datetime(2020, 7, 2): {
+        'CW': {'growth': 4, 'total': 851},
+        'FK': {'growth': 10, 'total': 681},
+        'Li': {'growth': 3, 'total': 427},
+        'MH': {'growth': 4, 'total': 468},
+        'Mi': {'growth': 0, 'total': 1202},
+        'Ne': {'growth': 7, 'total': 1031},
+        'Pa': {'growth': 3, 'total': 784},
+        'Re': {'growth': 6, 'total': 660},
+        'Sp': {'growth': 3, 'total': 450},
+        'SZ': {'growth': 0, 'total': 591},
+        'TS': {'growth': 3, 'total': 798},
+        'TK': {'growth': 0, 'total': 401},
+        'sum': {'growth': 43, 'total': 8344}
+    },
+    datetime.datetime(2020, 4, 5): {
+        'CW': {'growth': 9, 'total': 462},
+        'FK': {'growth': 2, 'total': 352},
+        'Li': {'growth': 0, 'total': 142},
+        'MH': {'growth': 3, 'total': 127},
+        'Mi': {'growth': 14, 'total': 537},
+        'Ne': {'growth': 0, 'total': 392},
+        'Pa': {'growth': 10, 'total': 378},
+        'Re': {'growth': 9, 'total': 248},
+        'Sp': {'growth': 3, 'total': 150},
+        'SZ': {'growth': 0, 'total': 312},
+        'TS': {'growth': 8, 'total': 394},
+        'TK': {'growth': 3, 'total': 193},
+        'sum': {'growth': 61, 'total': 3687}
+    },
+    datetime.datetime(2020, 4, 4): {
+        'CW': {'growth': 2, 'total': 453},
+        'FK': {'growth': 7, 'total': 350},
+        'Li': {'growth': 0, 'total': 142},
+        'MH': {'growth': 15, 'total': 124},
+        'Mi': {'growth': 22, 'total': 523},
+        'Ne': {'growth': 15, 'total': 392},
+        'Pa': {'growth': 10, 'total': 368},
+        'Re': {'growth': 5, 'total': 239},
+        'Sp': {'growth': 21, 'total': 147},
+        'SZ': {'growth': 12, 'total': 312},
+        'TS': {'growth': 24, 'total': 386},
+        'TK': {'growth': 7, 'total': 190},
+        'sum': {'growth': 140, 'total': 3626}
+    },
+    datetime.datetime(2020, 4, 3): {
+        'CW': {'growth': 44, 'total': 451},
+        'FK': {'growth': 17, 'total': 343},
+        'Li': {'growth': 7, 'total': 142},
+        'MH': {'growth': 4, 'total': 109},
+        'Mi': {'growth': 4, 'total': 501},
+        'Ne': {'growth': 40, 'total': 377},
+        'Pa': {'growth': 39, 'total': 358},
+        'Re': {'growth': 26, 'total': 234},
+        'Sp': {'growth': 9, 'total': 126},
+        'SZ': {'growth': 18, 'total': 300},
+        'TS': {'growth': 41, 'total': 362},
+        'TK': {'growth': 14, 'total': 183},
+        'sum': {'growth': 263, 'total': 3486}
+    }
+}
+fixes = {  # per-date overrides written over the collected values; these dates also skip the integrity checks
+   # Here the official total is 215, while the summation of district
+   # numbers only adds up to 125 – pretty much looks like a mere
+   # transposition of digits.
+   datetime.datetime(2020, 3, 27): {
+       'sum': {
+           'growth': 125
+       }
+   },
+   # Here the official total is 1937, while the summation of district
+   # numbers only adds up to 1917; furthermore, the original value for
+   # SZ is 118 (+18), which makes no sense, as the day before is
+   # 120 (+15) and the day after is 147 (+15).  The following is a
+   # compromise to keep as many surrounding numbers stable as possible.
+   datetime.datetime(2020, 3, 26): {
+       'SZ': {
+           'growth': 12,
+           'total': 132
+       },
+       'sum': {
+           'growth': 286,
+           'total': 1931
+       }
+   },
+   # Here the official total is 220, while the summation of district
+   # numbers adds up to 228 – looks like someone misread an 8 as a 0.
+   datetime.datetime(2020, 3, 25): {
+       'sum': {
+           'growth': 220  # NOTE(review): the comment above points to 228 as the corrected value – confirm
+       }
+   },
+}
+
+# Scan the index page's links for the highest pagination value.
+url = url_prefix + pm_dir
+with urllib.request.urlopen(url) as response:
+    html = response.read()
+soup = bs4.BeautifulSoup(html, 'html.parser')
+max_page = 0
+# href=True skips anchors without an href, on which link['href'] would raise.
+for link in soup.find_all('a', href=True):
+    href = link['href']
+    if href.startswith(pm_nav_path):
+        max_page = max(max_page, int(href.split('=')[1]))
+
+# Walk every paginated index page and keep the daily Corona briefing links.
+briefing_titles = ('Coronavirus: Derzeit',
+                   'Coronavirus in Berlin: Bestätigte Fälle')
+day_urls = []
+for page_no in range(1, max_page + 1):
+    url = url_prefix + pm_nav_path + str(page_no)
+    with urllib.request.urlopen(url) as response:
+        html = response.read()
+    soup = bs4.BeautifulSoup(html, 'html.parser')
+    for link in soup.find_all('a'):
+        title = link.string
+        if title and title.startswith(briefing_titles):
+            day_urls.append(link['href'])
+
+# Collect infection data; stop at the first briefing older than date_limit.
+data = {}
+first_run = True
+districts_sorted = []
+# TODO: Push limit further back (might need more data fixes for that).
+date_limit = datetime.datetime(2020, 3, 16)
+for path in day_urls:
+    url = url_prefix + path
+    with urllib.request.urlopen(url) as response:
+        html = response.read()
+    soup = bs4.BeautifulSoup(html, 'html.parser')
+    # The release date is read from the 'pressnumber' div, formatted d.m.Y.
+    date_title = soup.find('div', class_='pressnumber')
+    m = re.search(r'[0-9]+\.[0-9]+\.[0-9]+', date_title.string)
+    date_formatted = m.group(0)
+    date = datetime.datetime.strptime(date_formatted, '%d.%m.%Y')
+    if date_limit > date:
+        break
+    if date in data:
+        # Use % so the date lands in the message instead of in a second arg.
+        raise Exception('Double date %s' % date)
+    data[date] = {}
+    table = soup.find('table')
+    if table is None:
+        # No <table> on the page: fall back to the hard-coded numbers.
+        data[date] = unparsable_graphics_fallback[date]
+        continue
+    # Only Tag children are rows/cells; other children (text nodes) are dropped.
+    rows = [c for c in table.children if isinstance(c, bs4.element.Tag)]
+    for tr in rows[1:]:  # the first row is skipped (presumably the header)
+        cells = [c for c in tr.children if isinstance(c, bs4.element.Tag)][:2]
+        printable_tds = [' '.join(td.strings).strip() for td in cells]
+        district_long = printable_tds[0]
+        district_short = [k for k in abbrevs if district_long in abbrevs[k]][0]
+        if first_run:
+            districts_sorted.append(district_short)
+        # Cell is presumably "<total> (+<growth>)", with or without the space.
+        split_char = ' ' if ' ' in printable_tds[1] else '('
+        total_str, growth_str = printable_tds[1].split(split_char)
+        growth = int(growth_str.replace('(', '').replace(')', '').replace('+', ''))
+        total = int(total_str.replace('.', ''))  # dots are stripped before parsing
+        data[date][district_short] = {'growth': growth, 'total': total}
+    first_run = False
+# Newest date first.
+dates_sorted = sorted(data, reverse=True)
+
+# Apply fixes and ensure integrity of results
+for date, district_fixes in fixes.items():
+    for district, overrides in district_fixes.items():
+        data[date][district].update(overrides)
+real_districts = [d for d in districts_sorted if d != 'sum']
+one_day = datetime.timedelta(days=1)
+sum_checks = (
+    ('total', 'Questionable district infection total sum in %s'),
+    ('growth', 'Questionable district infection growth sum in %s'),
+)
+for date in dates_sorted:
+    if date in fixes:
+        continue  # manually fixed days are exempt from all checks
+    day = data[date]
+    # Each district's total minus its growth must equal the previous day's
+    # total; skipped when no data exists for the previous day.
+    # TODO: ensure dates until end of list are continuous
+    prev_day = data.get(date - one_day)
+    if prev_day is not None:
+        for district in real_districts:
+            prev_total = prev_day[district]['total']
+            if day[district]['total'] - day[district]['growth'] != prev_total:
+                raise Exception('Questionable district infection total in %s/%s' % (district, date))
+    # The 'sum' row must match the districts added up, for totals and growth.
+    for type_, complaint in sum_checks:
+        if sum(day[d][type_] for d in real_districts) != day['sum'][type_]:
+            raise Exception(complaint % date)
+
+# Final output: a header of district abbreviations, then one row of growth
+# numbers per day (newest first), every column right-aligned to width 3.
+header_cells = ['%3s' % name for name in districts_sorted]
+print(' '*10, ' '.join(header_cells))
+for date in dates_sorted:
+    row_cells = ['%3s' % data[date][name]['growth'] for name in districts_sorted]
+    print(date.strftime('%Y-%m-%d'), ' '.join(row_cells))
author	Christian Heller <c.heller@plomlompom.de>
	Tue, 21 Jul 2020 17:02:27 +0000 (19:02 +0200)
committer	Christian Heller <c.heller@plomlompom.de>
	Tue, 21 Jul 2020 17:02:27 +0000 (19:02 +0200)