--- /dev/null
+#!/usr/bin/env python3
+import urllib.request
+import datetime
+import bs4
+import re
+# Host of the site being scraped, and the press-release section path on it.
+url_prefix = 'https://www.berlin.de'
+pm_dir = '/sen/gpg/service/presse/2020/'
+# Section path plus the pagination query key; the page number is appended
+# to this when the paginated listing is fetched.
+pm_nav_path = '%s?page_at_1_0=' % pm_dir
+# Map each district abbreviation to the set of long names that resolve to
+# it. A set holds more than one entry where the scraped tables use
+# alternate spellings (e.g. the single-l 'Neuköln'); 'sum' covers the row
+# carrying the city-wide figures.
+abbrevs = {
+    'CW': {'Charlottenburg-Wilmersdorf'},
+    'FK': {'Friedrichshain-Kreuzberg'},
+    'Li': {'Lichtenberg'},
+    'MH': {'Marzahn-Hellersdorf'},
+    'Mi': {'Mitte'},
+    'Ne': {'Neukölln', 'Neuköln'},
+    'Pa': {'Pankow'},
+    'Re': {'Reinickendorf'},
+    'Sp': {'Spandau'},
+    'SZ': {'Steglitz-Zehlendorf'},
+    'TS': {'Tempelhof-Schöneberg'},
+    'TK': {'Treptow-Köpenick'},
+    'sum': {'Summe', 'Berlin'},
+}
+# Hand-entered figures for days on which the table data is only available
+# as image files. The collection loop falls back to these entries when a
+# press release page contains no <table> element. Layout per day:
+# {district abbreviation: {'growth': ..., 'total': ...}}.
+unparsable_graphics_fallback = {
+    datetime.datetime(2020, 7, 2): {
+        'CW': {'growth': 4, 'total': 851},
+        'FK': {'growth': 10, 'total': 681},
+        'Li': {'growth': 3, 'total': 427},
+        'MH': {'growth': 4, 'total': 468},
+        'Mi': {'growth': 0, 'total': 1202},
+        'Ne': {'growth': 7, 'total': 1031},
+        'Pa': {'growth': 3, 'total': 784},
+        'Re': {'growth': 6, 'total': 660},
+        'Sp': {'growth': 3, 'total': 450},
+        'SZ': {'growth': 0, 'total': 591},
+        'TS': {'growth': 3, 'total': 798},
+        'TK': {'growth': 0, 'total': 401},
+        'sum': {'growth': 43, 'total': 8344},
+    },
+    datetime.datetime(2020, 4, 5): {
+        'CW': {'growth': 9, 'total': 462},
+        'FK': {'growth': 2, 'total': 352},
+        'Li': {'growth': 0, 'total': 142},
+        'MH': {'growth': 3, 'total': 127},
+        'Mi': {'growth': 14, 'total': 537},
+        'Ne': {'growth': 0, 'total': 392},
+        'Pa': {'growth': 10, 'total': 378},
+        'Re': {'growth': 9, 'total': 248},
+        'Sp': {'growth': 3, 'total': 150},
+        'SZ': {'growth': 0, 'total': 312},
+        'TS': {'growth': 8, 'total': 394},
+        'TK': {'growth': 3, 'total': 193},
+        'sum': {'growth': 61, 'total': 3687},
+    },
+    datetime.datetime(2020, 4, 4): {
+        'CW': {'growth': 2, 'total': 453},
+        'FK': {'growth': 7, 'total': 350},
+        'Li': {'growth': 0, 'total': 142},
+        'MH': {'growth': 15, 'total': 124},
+        'Mi': {'growth': 22, 'total': 523},
+        'Ne': {'growth': 15, 'total': 392},
+        'Pa': {'growth': 10, 'total': 368},
+        'Re': {'growth': 5, 'total': 239},
+        'Sp': {'growth': 21, 'total': 147},
+        'SZ': {'growth': 12, 'total': 312},
+        'TS': {'growth': 24, 'total': 386},
+        'TK': {'growth': 7, 'total': 190},
+        'sum': {'growth': 140, 'total': 3626},
+    },
+    datetime.datetime(2020, 4, 3): {
+        'CW': {'growth': 44, 'total': 451},
+        'FK': {'growth': 17, 'total': 343},
+        'Li': {'growth': 7, 'total': 142},
+        'MH': {'growth': 4, 'total': 109},
+        'Mi': {'growth': 4, 'total': 501},
+        'Ne': {'growth': 40, 'total': 377},
+        'Pa': {'growth': 39, 'total': 358},
+        'Re': {'growth': 26, 'total': 234},
+        'Sp': {'growth': 9, 'total': 126},
+        'SZ': {'growth': 18, 'total': 300},
+        'TS': {'growth': 41, 'total': 362},
+        'TK': {'growth': 14, 'total': 183},
+        'sum': {'growth': 263, 'total': 3486},
+    },
+}
+# Manual corrections, keyed by date, then district abbreviation, then
+# field ('growth' / 'total'). Each listed value overwrites the scraped one;
+# dates that appear here are also skipped by the integrity checks.
+fixes = {
+    # Here the official total is 215, while the summation of district
+    # numbers only adds up to 125 – pretty much looks like a mere
+    # transposition of digits.
+    datetime.datetime(2020, 3, 27): {
+        'sum': {
+            'growth': 125,
+        },
+    },
+    # Here the official total is 1937, while the summation of district
+    # numbers only adds up to 1917; furthermore, the original value for
+    # SZ is 118 (+18), which makes no sense, as the day before is
+    # 120 (+15) and the day after is 147 (+15). The following is a
+    # compromise to keep as many surrounding numbers stable as possible.
+    datetime.datetime(2020, 3, 26): {
+        'SZ': {
+            'growth': 12,
+            'total': 132,
+        },
+        'sum': {
+            'growth': 286,
+            'total': 1931,
+        },
+    },
+    # Here the official total is 220, while the summation of district
+    # numbers adds up to 228 – looks like someone misread an 8 as a 0.
+    # NOTE(review): the value set below equals the "official" figure quoted
+    # above rather than the district sum; possibly the two numbers in the
+    # comment are swapped – confirm against the press release.
+    datetime.datetime(2020, 3, 25): {
+        'sum': {
+            'growth': 220,
+        },
+    },
+}
+# Scan navigation bar for maximum pagination value.
+url = url_prefix + pm_dir
+with urllib.request.urlopen(url) as response:
+    html = response.read()
+soup = bs4.BeautifulSoup(html, 'html.parser')
+# Needs a defined starting value: it is compared against inside the loop
+# and read afterwards even if no pagination link is found at all.
+max_page = 0
+for link in soup.find_all('a'):
+    # Anchors without an href attribute cannot be pagination links; .get()
+    # yields None for them instead of raising KeyError.
+    href = link.get('href')
+    if href is None or not href.startswith(pm_nav_path):
+        continue
+    # The page number is whatever follows the first '=' of the link.
+    max_page = max(max_page, int(href.split('=')[1]))
+# Walk every page of the paginated press release listing and gather the
+# links to the daily Corona number briefings.
+briefing_prefixes = (
+    'Coronavirus: Derzeit',
+    'Coronavirus in Berlin: Bestätigte Fälle',
+)
+day_urls = []
+for page in range(1, max_page + 1):
+    url = url_prefix + pm_nav_path + str(page)
+    with urllib.request.urlopen(url) as response:
+        html = response.read()
+    soup = bs4.BeautifulSoup(html, 'html.parser')
+    for link in soup.find_all('a'):
+        title = link.string
+        # Keep only anchors that have a text and whose text starts with one
+        # of the known briefing headlines.
+        if title and title.startswith(briefing_prefixes):
+            day_urls.append(link['href'])
+# Collect infection data.
+# data maps each briefing date to
+# {district abbreviation: {'growth': int, 'total': int}}.
+data = {}
+first_run = True
+# District abbreviations in the row order of the first table that is parsed.
+districts_sorted = []
+# TODO: Push limit further back (might need more data fixes for that).
+date_limit = datetime.datetime(2020, 3, 16)
+date_pattern = re.compile(r'[0-9]+\.[0-9]+\.[0-9]+')
+for path in day_urls:
+    url = url_prefix + path
+    with urllib.request.urlopen(url) as response:
+        html = response.read()
+    soup = bs4.BeautifulSoup(html, 'html.parser')
+    # The release date is read from the text of the 'pressnumber' div.
+    date_title = soup.find('div', class_='pressnumber')
+    date_text = date_title.string if date_title is not None else None
+    m = date_pattern.search(date_text) if date_text else None
+    if m is None:
+        raise Exception('No release date found in %s' % url)
+    date = datetime.datetime.strptime(m.group(0), '%d.%m.%Y')
+    # The first release older than the limit ends the whole scan, not just
+    # this iteration (presumably the links are listed newest first).
+    if date < date_limit:
+        break
+    if date in data:
+        raise Exception('Double date %s' % date)
+    table = soup.find('table')
+    if table is None:
+        # No HTML table on the page: use the hand-entered numbers instead.
+        if date not in unparsable_graphics_fallback:
+            raise Exception('No table and no fallback data for %s' % date)
+        data[date] = unparsable_graphics_fallback[date]
+        continue
+    data[date] = {}
+    # Only element children count; the first row is skipped (presumably
+    # the table header).
+    rows = [tr for tr in table.children if isinstance(tr, bs4.element.Tag)]
+    for tr in rows[1:]:
+        cells = [td for td in tr.children if isinstance(td, bs4.element.Tag)]
+        # Only the first two cells are used: district name and the
+        # combined "total (+growth)" figure.
+        printable_tds = [' '.join(td.strings).strip() for td in cells[:2]]
+        district_long = printable_tds[0]
+        matches = [k for k in abbrevs if district_long in abbrevs[k]]
+        if not matches:
+            raise Exception('Unknown district name %r in table for %s'
+                            % (district_long, date))
+        district_short = matches[0]
+        if first_run:
+            districts_sorted.append(district_short)
+        # Total and growth are separated by a space where one is present,
+        # otherwise directly by the opening parenthesis.
+        split_char = ' ' if ' ' in printable_tds[1] else '('
+        total_str, growth_str = printable_tds[1].split(split_char)
+        growth = int(growth_str.replace('(', '').replace(')', '').replace('+', ''))
+        # '.' is removed from the total before conversion (presumably a
+        # thousands separator).
+        total = int(total_str.replace('.', ''))
+        data[date][district_short] = {'growth': growth, 'total': total}
+    # Reached only after a table was parsed; days served from the fallback
+    # data leave first_run untouched.
+    first_run = False
+# NOTE: despite the name, this keeps the order in which the dates were
+# collected; no sorting is applied here.
+dates_sorted = list(data.keys())
+# Apply fixes and ensure integrity of results
+for date, district_fixes in fixes.items():
+    if date not in data:
+        # A fix for a day that was not collected has nothing to correct.
+        continue
+    for district, corrections in district_fixes.items():
+        data[date][district].update(corrections)
+districts_only = [d for d in districts_sorted if d != 'sum']
+one_day = datetime.timedelta(days=1)
+for date in dates_sorted:
+    # Manually fixed days are exempt from all checks below.
+    if date in fixes:
+        continue
+    prev_date = date - one_day
+    # TODO: ensure dates until end of list are continuous
+    if prev_date in data:
+        # Each district's total minus its growth must equal the total of
+        # the day before.
+        for district in districts_only:
+            prev_total = data[prev_date][district]['total']
+            cur_total = data[date][district]['total']
+            if cur_total - data[date][district]['growth'] != prev_total:
+                raise Exception('Questionable district infection total in %s/%s' % (district, date))
+    # The district figures must add up to the 'sum' row, for totals ...
+    day_sum = sum(data[date][d]['total'] for d in districts_only)
+    if day_sum != data[date]['sum']['total']:
+        raise Exception('Questionable district infection total sum in %s' % date)
+    # ... and for growth values alike.
+    day_sum = sum(data[date][d]['growth'] for d in districts_only)
+    if day_sum != data[date]['sum']['growth']:
+        raise Exception('Questionable district infection growth sum in %s' % date)
+# Final output.
+# Header line: a ten-space gutter above the date column, followed by the
+# district abbreviations, each right-aligned to width 3.
+header_cells = ['%3s' % name for name in districts_sorted]
+print(' ' * 10, ' '.join(header_cells))
+# One line per collected date: ISO date, then that day's growth value for
+# every district in the same column order as the header.
+for date in dates_sorted:
+    row = ['%3s' % data[date][name]['growth'] for name in districts_sorted]
+    print(date.strftime('%Y-%m-%d'), ' '.join(row))