X-Git-Url: https://plomlompom.com/repos/?p=berlin-corona-table;a=blobdiff_plain;f=scrape.py;h=f11f315f5e73e8f3d6b5b3b98c0f7fbfac63267a;hp=41a0db9d3c44a5781a66d7dbf02be6200c2df34c;hb=88e62da91d09edd688abd38a5924b4a712eecbf5;hpb=402755409e719a9e2cba7a9dcb1482b5834a3aa1 diff --git a/scrape.py b/scrape.py index 41a0db9..f11f315 100755 --- a/scrape.py +++ b/scrape.py @@ -25,8 +25,9 @@ abbrevs = { 'sum': {'Summe', 'Berlin'}, } -# Here only image files are available for the table data. -unparsable_graphics_fallback = { +# some pre-filled values +data = { + # For these, only image files are available for the table data. datetime.datetime(2020, 7, 2): { 'CW': {'growth': 4, 'total': 851}, 'FK': {'growth': 10, 'total': 681}, @@ -86,7 +87,135 @@ unparsable_graphics_fallback = { 'TS': {'growth': 41, 'total': 362}, 'TK': {'growth': 14, 'total': 183}, 'sum': {'growth': 263, 'total': 3486} - } + }, + # Here the growth numbers needed to be reconstructed. + datetime.datetime(2020, 3, 10): { + 'CW': {'growth': 2, 'total': 15}, + 'FK': {'growth': 0, 'total': 12}, + 'Li': {'growth': 4, 'total': 5}, + 'MH': {'growth': 1, 'total': 3}, + 'Mi': {'growth': 0, 'total': 8}, + 'Ne': {'growth': 2, 'total': 5}, + 'Pa': {'growth': 2, 'total': 8}, + 'Re': {'growth': 0, 'total': 3}, + 'Sp': {'growth': 4, 'total': 6}, + 'SZ': {'growth': 3, 'total': 6}, + 'TS': {'growth': 2, 'total': 7}, + 'TK': {'growth': 3, 'total': 3}, + 'sum': {'growth': 23, 'total': 81} + }, + # Here the totals needed to be reconstructed. + datetime.datetime(2020, 3, 9): { + 'CW': {'growth': 4, 'total': 13}, + 'FK': {'growth': 3, 'total': 12}, + 'Li': {'growth': 0, 'total': 1}, + 'MH': {'growth': 1, 'total': 2}, + 'Mi': {'growth': 0, 'total': 8}, + 'Ne': {'growth': 1, 'total': 3}, + 'Pa': {'growth': 1, 'total': 6}, + 'Re': {'growth': 0, 'total': 3}, + 'Sp': {'growth': 0, 'total': 2}, + 'SZ': {'growth': 0, 'total': 3}, + 'TS': {'growth': 0, 'total': 5}, + 'TK': {'growth': 0, 'total': 0}, + 'sum': {'growth': 10, 'total': 58} + }, + # Here the growth numbers needed to be reconstructed. + datetime.datetime(2020, 3, 8): { + 'CW': {'growth': 0, 'total': 9}, + 'FK': {'growth': 4, 'total': 9}, + 'Li': {'growth': 1, 'total': 1}, + 'MH': {'growth': 0, 'total': 1}, + 'Mi': {'growth': 0, 'total': 8}, + 'Ne': {'growth': 0, 'total': 2}, + 'Pa': {'growth': 0, 'total': 5}, + 'Re': {'growth': 0, 'total': 3}, + 'Sp': {'growth': 2, 'total': 2}, + 'SZ': {'growth': 1, 'total': 3}, + 'TS': {'growth': 0, 'total': 5}, + 'TK': {'growth': 0, 'total': 0}, + 'sum': {'growth': 8, 'total': 48} + }, + # Here the growth numbers needed to be reconstructed. + datetime.datetime(2020, 3, 7): { + 'CW': {'growth': 6, 'total': 9}, + 'FK': {'growth': 1, 'total': 5}, + 'Li': {'growth': 0, 'total': 0}, + 'MH': {'growth': 0, 'total': 1}, + 'Mi': {'growth': 1, 'total': 8}, + 'Ne': {'growth': 0, 'total': 2}, + 'Pa': {'growth': 1, 'total': 5}, + 'Re': {'growth': 0, 'total': 3}, + 'Sp': {'growth': 0, 'total': 0}, + 'SZ': {'growth': 2, 'total': 2}, + 'TS': {'growth': 1, 'total': 5}, + 'TK': {'growth': 0, 'total': 0}, + 'sum': {'growth': 12, 'total': 40} + }, + # Here the growth numbers needed to be reconstructed. + datetime.datetime(2020, 3, 6): { + 'CW': {'growth': 1, 'total': 3}, + 'FK': {'growth': 0, 'total': 4}, + 'Li': {'growth': 0, 'total': 0}, + 'MH': {'growth': 0, 'total': 1}, + 'Mi': {'growth': 4, 'total': 7}, + 'Ne': {'growth': 1, 'total': 2}, + 'Pa': {'growth': 1, 'total': 4}, + 'Re': {'growth': 0, 'total': 3}, + 'Sp': {'growth': 0, 'total': 0}, + 'SZ': {'growth': 0, 'total': 0}, + 'TS': {'growth': 2, 'total': 4}, + 'TK': {'growth': 0, 'total': 0}, + 'sum': {'growth': 9, 'total': 28} + }, + # Here the growth numbers needed to be reconstructed. + datetime.datetime(2020, 3, 5): { + 'CW': {'growth': 2, 'total': 2}, + 'FK': {'growth': 0, 'total': 4}, + 'Li': {'growth': 0, 'total': 0}, + 'MH': {'growth': 0, 'total': 1}, + 'Mi': {'growth': 0, 'total': 3}, + 'Ne': {'growth': 0, 'total': 1}, + 'Pa': {'growth': 1, 'total': 3}, + 'Re': {'growth': 2, 'total': 3}, + 'Sp': {'growth': 0, 'total': 0}, + 'SZ': {'growth': 0, 'total': 0}, + 'TS': {'growth': 1, 'total': 2}, + 'TK': {'growth': 0, 'total': 0}, + 'sum': {'growth': 6, 'total': 19} + }, + # Here the growth numbers needed to be reconstructed. + datetime.datetime(2020, 3, 4): { + 'CW': {'growth': 0, 'total': 0}, + 'FK': {'growth': 2, 'total': 4}, + 'Li': {'growth': 0, 'total': 0}, + 'MH': {'growth': 0, 'total': 1}, + 'Mi': {'growth': 0, 'total': 3}, + 'Ne': {'growth': 0, 'total': 1}, + 'Pa': {'growth': 1, 'total': 2}, + 'Re': {'growth': 1, 'total': 1}, + 'Sp': {'growth': 0, 'total': 0}, + 'SZ': {'growth': 0, 'total': 0}, + 'TS': {'growth': 0, 'total': 1}, + 'TK': {'growth': 0, 'total': 0}, + 'sum': {'growth': 4, 'total': 13} + }, + # Here the growth numbers needed to be reconstructed. + datetime.datetime(2020, 3, 3): { + 'CW': {'growth': 0, 'total': 0}, + 'FK': {'growth': 2, 'total': 2}, + 'Li': {'growth': 0, 'total': 0}, + 'MH': {'growth': 0, 'total': 1}, + 'Mi': {'growth': 0, 'total': 3}, + 'Ne': {'growth': 0, 'total': 1}, + 'Pa': {'growth': 1, 'total': 1}, + 'Re': {'growth': 0, 'total': 0}, + 'Sp': {'growth': 0, 'total': 0}, + 'SZ': {'growth': 0, 'total': 0}, + 'TS': {'growth': 0, 'total': 1}, + 'TK': {'growth': 0, 'total': 0}, + 'sum': {'growth': 3, 'total': 9} + }, } fixes = { # Here the official total is 215, while the summation of district @@ -104,12 +233,10 @@ fixes = { # compromise to keep as many surrounding numbers stable as possible. datetime.datetime(2020, 3, 26): { 'SZ': { - 'growth': 12, - 'total': 132 + 'growth': 12 }, 'sum': { - 'growth': 286, - 'total': 1931 + 'growth': 286 } }, # Here the official total is 220, while the summation of district @@ -121,7 +248,7 @@ fixes = { }, } -# Scan navigation bar for maximum pagination value. +# Scan navigation bar for maximum pagination value. url = url_prefix + pm_dir with urllib.request.urlopen(url) as response: html = response.read() @@ -147,12 +274,11 @@ for i in range(max_page): continue day_urls += [link['href']] -# Collect infection data. -data = {} +# Collect infection data. first_run = True districts_sorted = [] # TODO: Push limit further back (might need more data fixes for that). -date_limit = datetime.datetime(2020, 3, 16) +date_limit = datetime.datetime(2020, 3, 12) for path in day_urls: url = url_prefix + path with urllib.request.urlopen(url) as response: @@ -164,14 +290,17 @@ for path in day_urls: date = datetime.datetime.strptime(date_formatted , '%d.%m.%Y') if date_limit > date: break - if date in data: - raise Exception('Double date %s', date) - #date -= datetime.timedelta(days=1) - data[date] = {} + # On that day, two press releases were released, for that and the prev day. + if date == datetime.datetime(2020, 3, 15) and date in data: + date = datetime.datetime(2020, 3, 14) + # From here on, press releases describe numbers from prev day. + if date <= datetime.datetime(2020, 3, 13): + date = date - datetime.timedelta(days=1) table = soup.find('table') - if table is None: - data[date] = unparsable_graphics_fallback[date] + # For 13th of March we lack a press release. + if table is None and (date in data or date == datetime.datetime(2020, 3, 13)): continue + data[date] = {} for tr in [tr for tr in table.children if type(tr) == bs4.element.Tag][1:]: printable_tds = [] for td in [td for td in tr.children if type(td) == bs4.element.Tag][:2]: @@ -189,6 +318,21 @@ for path in day_urls: total = int(total_str.replace('.', '')) data[date][district_short] = {'growth': growth, 'total': total} first_run = False + +# Reconstruct data for 13th of March. +day_target = datetime.datetime(2020, 3, 13) +day_after = day_target + datetime.timedelta(days=1) +day_before = day_target - datetime.timedelta(days=1) +data[day_target] = {} +for district in [d for d in districts_sorted]: + data[day_target][district] = {} + total_after = data[day_after][district]['total'] + growth_after = data[day_after][district]['growth'] + total_target = total_after - growth_after + data[day_target][district]['total'] = total_target + total_before = data[day_before][district]['total'] + data[day_target][district]['growth'] = total_target - total_before + dates_sorted = list(data.keys()) dates_sorted.sort() dates_sorted.reverse() @@ -208,7 +352,7 @@ for date in dates_sorted: raise Exception('Dates not contiguous: %s missing', prev_date) else: continue - prev_total = data[date - datetime.timedelta(days=1)][district]['total'] + prev_total = data[prev_date][district]['total'] cur_total = data[date][district]['total'] if cur_total - data[date][district]['growth'] != prev_total: raise Exception('Questionable district infection total in %s/%s' % (district, date))