'sum': {'Summe', 'Berlin'},
}
-# Here only image files are available for the table data.
-unparsable_graphics_fallback = {
+# Some pre-filled values.
+data = {
+ # For these, only image files are available for the table data.
datetime.datetime(2020, 7, 2): {
'CW': {'growth': 4, 'total': 851},
'FK': {'growth': 10, 'total': 681},
'TS': {'growth': 41, 'total': 362},
'TK': {'growth': 14, 'total': 183},
'sum': {'growth': 263, 'total': 3486}
- }
+ },
+ # Here the totals needed to be reconstructed.
+ datetime.datetime(2020, 3, 9): {
+ 'CW': {'growth': 4, 'total': 13},
+ 'FK': {'growth': 3, 'total': 12},
+ 'Li': {'growth': 0, 'total': 1},
+ 'MH': {'growth': 1, 'total': 2},
+ 'Mi': {'growth': 0, 'total': 8},
+ 'Ne': {'growth': 1, 'total': 3},
+ 'Pa': {'growth': 1, 'total': 6},
+ 'Re': {'growth': 0, 'total': 3},
+ 'Sp': {'growth': 0, 'total': 2},
+ 'SZ': {'growth': 0, 'total': 3},
+ 'TS': {'growth': 0, 'total': 5},
+ 'TK': {'growth': 0, 'total': 0},
+ 'sum': {'growth': 10, 'total': 58}
+ },
+ # Here the growth numbers needed to be reconstructed.
+ datetime.datetime(2020, 3, 8): {
+ 'CW': {'growth': 0, 'total': 9},
+ 'FK': {'growth': 4, 'total': 9},
+ 'Li': {'growth': 1, 'total': 1},
+ 'MH': {'growth': 0, 'total': 1},
+ 'Mi': {'growth': 0, 'total': 8},
+ 'Ne': {'growth': 0, 'total': 2},
+ 'Pa': {'growth': 0, 'total': 5},
+ 'Re': {'growth': 0, 'total': 3},
+ 'Sp': {'growth': 2, 'total': 2},
+ 'SZ': {'growth': 1, 'total': 3},
+ 'TS': {'growth': 0, 'total': 5},
+ 'TK': {'growth': 0, 'total': 0},
+ 'sum': {'growth': 8, 'total': 48}
+ },
+ # Here the growth numbers needed to be reconstructed.
+ datetime.datetime(2020, 3, 7): {
+ 'CW': {'growth': 6, 'total': 9},
+ 'FK': {'growth': 1, 'total': 5},
+ 'Li': {'growth': 0, 'total': 0},
+ 'MH': {'growth': 0, 'total': 1},
+ 'Mi': {'growth': 1, 'total': 8},
+ 'Ne': {'growth': 0, 'total': 2},
+ 'Pa': {'growth': 1, 'total': 5},
+ 'Re': {'growth': 0, 'total': 3},
+ 'Sp': {'growth': 0, 'total': 0},
+ 'SZ': {'growth': 2, 'total': 2},
+ 'TS': {'growth': 1, 'total': 5},
+ 'TK': {'growth': 0, 'total': 0},
+ 'sum': {'growth': 12, 'total': 40}
+ },
+ # Here the growth numbers needed to be reconstructed.
+ datetime.datetime(2020, 3, 6): {
+ 'CW': {'growth': 1, 'total': 3},
+ 'FK': {'growth': 0, 'total': 4},
+ 'Li': {'growth': 0, 'total': 0},
+ 'MH': {'growth': 0, 'total': 1},
+ 'Mi': {'growth': 4, 'total': 7},
+ 'Ne': {'growth': 1, 'total': 2},
+ 'Pa': {'growth': 1, 'total': 4},
+ 'Re': {'growth': 0, 'total': 3},
+ 'Sp': {'growth': 0, 'total': 0},
+ 'SZ': {'growth': 0, 'total': 0},
+ 'TS': {'growth': 2, 'total': 4},
+ 'TK': {'growth': 0, 'total': 0},
+ 'sum': {'growth': 9, 'total': 28}
+ },
+ # Here the growth numbers needed to be reconstructed.
+ datetime.datetime(2020, 3, 5): {
+ 'CW': {'growth': 2, 'total': 2},
+ 'FK': {'growth': 0, 'total': 4},
+ 'Li': {'growth': 0, 'total': 0},
+ 'MH': {'growth': 0, 'total': 1},
+ 'Mi': {'growth': 0, 'total': 3},
+ 'Ne': {'growth': 0, 'total': 1},
+ 'Pa': {'growth': 1, 'total': 3},
+ 'Re': {'growth': 2, 'total': 3},
+ 'Sp': {'growth': 0, 'total': 0},
+ 'SZ': {'growth': 0, 'total': 0},
+ 'TS': {'growth': 1, 'total': 2},
+ 'TK': {'growth': 0, 'total': 0},
+ 'sum': {'growth': 6, 'total': 19}
+ },
+ # Here the growth numbers needed to be reconstructed.
+ datetime.datetime(2020, 3, 4): {
+ 'CW': {'growth': 0, 'total': 0},
+ 'FK': {'growth': 2, 'total': 4},
+ 'Li': {'growth': 0, 'total': 0},
+ 'MH': {'growth': 0, 'total': 1},
+ 'Mi': {'growth': 0, 'total': 3},
+ 'Ne': {'growth': 0, 'total': 1},
+ 'Pa': {'growth': 1, 'total': 2},
+ 'Re': {'growth': 1, 'total': 1},
+ 'Sp': {'growth': 0, 'total': 0},
+ 'SZ': {'growth': 0, 'total': 0},
+ 'TS': {'growth': 0, 'total': 1},
+ 'TK': {'growth': 0, 'total': 0},
+ 'sum': {'growth': 4, 'total': 13}
+ },
+ # Here the growth numbers needed to be reconstructed.
+ datetime.datetime(2020, 3, 3): {
+ 'CW': {'growth': 0, 'total': 0},
+ 'FK': {'growth': 2, 'total': 2},
+ 'Li': {'growth': 0, 'total': 0},
+ 'MH': {'growth': 0, 'total': 1},
+ 'Mi': {'growth': 0, 'total': 3},
+ 'Ne': {'growth': 0, 'total': 1},
+ 'Pa': {'growth': 1, 'total': 1},
+ 'Re': {'growth': 0, 'total': 0},
+ 'Sp': {'growth': 0, 'total': 0},
+ 'SZ': {'growth': 0, 'total': 0},
+ 'TS': {'growth': 0, 'total': 1},
+ 'TK': {'growth': 0, 'total': 0},
+ 'sum': {'growth': 3, 'total': 9}
+ },
}
fixes = {
# Here the official total is 215, while the summation of district
# compromise to keep as many surrounding numbers stable as possible.
datetime.datetime(2020, 3, 26): {
'SZ': {
- 'growth': 12,
- 'total': 132
+ 'growth': 12
},
'sum': {
- 'growth': 286,
- 'total': 1931
+ 'growth': 286
}
},
# Here the official total is 220, while the summation of district
# numbers adds up to 228 – looks like someone misread an 8 as a 0.
datetime.datetime(2020, 3, 25): {
'sum': {
- 'growth': 220
+ 'growth': 228
}
},
}
-# Scan navigation bar for maximum pagination value.
+# Scan navigation bar for maximum pagination value.
url = url_prefix + pm_dir
with urllib.request.urlopen(url) as response:
html = response.read()
continue
day_urls += [link['href']]
-# Collect infection data.
-data = {}
+# Collect infection data.
first_run = True
districts_sorted = []
-# TODO: Push limit further back (might need more data fixes for that).
-date_limit = datetime.datetime(2020, 3, 16)
+date_limit = datetime.datetime(2020, 3, 11)
for path in day_urls:
url = url_prefix + path
with urllib.request.urlopen(url) as response:
date = datetime.datetime.strptime(date_formatted , '%d.%m.%Y')
if date_limit > date:
break
- if date in data:
- raise Exception('Double date %s', date)
- #date -= datetime.timedelta(days=1)
- data[date] = {}
+ # On 15th of March, two press releases were published: one for that day and one for the previous day.
+ if date == datetime.datetime(2020, 3, 15) and date in data:
+ date = datetime.datetime(2020, 3, 14)
+ # For 13th of March and earlier, press releases describe the numbers of the previous day.
+ if date <= datetime.datetime(2020, 3, 13):
+ date = date - datetime.timedelta(days=1)
table = soup.find('table')
- if table is None:
- data[date] = unparsable_graphics_fallback[date]
+ # For 13th of March we lack a press release.
+ if table is None and (date in data or date == datetime.datetime(2020, 3, 13)):
continue
+ data[date] = {}
for tr in [tr for tr in table.children if type(tr) == bs4.element.Tag][1:]:
- printable_tds = []
- for td in [td for td in tr.children if type(td) == bs4.element.Tag][:2]:
- printable_string = ' '.join([s for s in td.strings])
- printable_tds += [printable_string.strip()]
- district_long = printable_tds[0]
- district_short = [k for k in abbrevs if district_long in abbrevs[k]][0]
- if first_run:
- districts_sorted += [district_short]
- split_char = ' '
- if not split_char in printable_tds[1]:
- split_char = '('
- total_str, growth_str = printable_tds[1].split(split_char)
- growth = int(growth_str.replace('(', '').replace(')', '').replace('+', ''))
- total = int(total_str.replace('.', ''))
- data[date][district_short] = {'growth': growth, 'total': total}
+ printable_tds = []
+ for td in [td for td in tr.children if type(td) == bs4.element.Tag][:2]:
+ printable_string = ' '.join([s for s in td.strings])
+ printable_tds += [printable_string.strip()]
+ district_long = printable_tds[0]
+ district_short = [k for k in abbrevs if district_long in abbrevs[k]][0]
+ if first_run:
+ districts_sorted += [district_short]
+ if date == datetime.datetime(2020, 3, 10):
+ # For this date we only get totals.
+ data[date][district_short] = {'total': int(printable_tds[1])}
+ else:
+ split_char = ' '
+ if not split_char in printable_tds[1]:
+ split_char = '('
+ total_str, growth_str = printable_tds[1].split(split_char)
+ growth = int(growth_str.replace('(', '').replace(')', '').\
+ replace('+', ''))
+ total = int(total_str.replace('.', ''))
+ data[date][district_short] = {'growth': growth, 'total': total}
first_run = False
+
+def neighbor_days(day_target):  # Return the two days adjacent to day_target.
+ day_delta = datetime.timedelta(days=1)
+ return day_target + day_delta, day_target - day_delta  # order: (day after, day before)
+
+# Reconstruct growth for 10th of March (only totals are parsed for that day).
+day_target = datetime.datetime(2020, 3, 10)
+day_after, day_before = neighbor_days(day_target)  # only day_before is used below
+for district in [d for d in districts_sorted]:
+ total_target = data[day_target][district]['total']
+ total_before = data[day_before][district]['total']
+ data[day_target][district]['growth'] = total_target - total_before  # growth = difference to the previous day's total
+
+# Reconstruct data for 13th of March.
+day_target = datetime.datetime(2020, 3, 13)
+day_after, day_before = neighbor_days(day_target)
+data[day_target] = {}
+for district in [d for d in districts_sorted]:
+ data[day_target][district] = {}
+ total_after = data[day_after][district]['total']
+ growth_after = data[day_after][district]['growth']
+ total_target = total_after - growth_after  # undo the next day's growth to get this day's total
+ data[day_target][district]['total'] = total_target
+ total_before = data[day_before][district]['total']
+ data[day_target][district]['growth'] = total_target - total_before  # growth = difference to the previous day's total
+
dates_sorted = list(data.keys())
dates_sorted.sort()
dates_sorted.reverse()
for district in [d for d in districts_sorted if not d=='sum']:
prev_date = date - datetime.timedelta(days=1)
if prev_date not in dates_sorted:
- # TODO: ensure dates until end of list are continuous
- continue
- prev_total = data[date - datetime.timedelta(days=1)][district]['total']
+ if prev_date >= date_limit:
+ raise Exception('Dates not contiguous: %s missing', prev_date)
+ else:
+ continue
+ prev_total = data[prev_date][district]['total']
cur_total = data[date][district]['total']
if cur_total - data[date][district]['growth'] != prev_total:
raise Exception('Questionable district infection total in %s/%s' % (district, date))
raise Exception('Questionable district infection growth sum in %s' % date)
# Final output.
+dates_sorted.reverse()
print(' '*10, ' '.join(['%3s' % d for d in districts_sorted]))
for date in dates_sorted:
growths = []