X-Git-Url: https://plomlompom.com/repos/?p=berlin-corona-table;a=blobdiff_plain;f=scrape.py;h=d05fe909707bae674362e84b9dfa6ffeeff87073;hp=de458f4234a241bff448dc6252df12b6ed2e9636;hb=6bc3d0cd7fb6e4872e47951dc66aa72f4014b0aa;hpb=6238ffaeaaacf95a632b691ec1529966cd92a381 diff --git a/scrape.py b/scrape.py index de458f4..d05fe90 100755 --- a/scrape.py +++ b/scrape.py @@ -88,23 +88,118 @@ data = { 'TK': {'growth': 14, 'total': 183}, 'sum': {'growth': 263, 'total': 3486} }, - # This one has no press release but can be reconstructed from - # the neighbour ones. - datetime.datetime(2020, 3, 13): { - 'CW': {'growth': 16, 'total': 47}, - 'FK': {'growth': 8, 'total': 22}, - 'Li': {'growth': 2, 'total': 8}, - 'MH': {'growth': 1, 'total': 4}, - 'Mi': {'growth': 9, 'total': 29}, - 'Ne': {'growth': 6, 'total': 16}, - 'Pa': {'growth': 11, 'total': 26}, - 'Re': {'growth': 0, 'total': 11}, - 'Sp': {'growth': 1, 'total': 9}, - 'SZ': {'growth': 0, 'total': 20}, - 'TS': {'growth': 1, 'total': 17}, - 'TK': {'growth': 3, 'total': 7}, - 'sum': {'growth': 58, 'total': 216} - } + # Here the totals needed to be reconstructed. + datetime.datetime(2020, 3, 9): { + 'CW': {'growth': 4, 'total': 13}, + 'FK': {'growth': 3, 'total': 12}, + 'Li': {'growth': 0, 'total': 1}, + 'MH': {'growth': 1, 'total': 2}, + 'Mi': {'growth': 0, 'total': 8}, + 'Ne': {'growth': 1, 'total': 3}, + 'Pa': {'growth': 1, 'total': 6}, + 'Re': {'growth': 0, 'total': 3}, + 'Sp': {'growth': 0, 'total': 2}, + 'SZ': {'growth': 0, 'total': 3}, + 'TS': {'growth': 0, 'total': 5}, + 'TK': {'growth': 0, 'total': 0}, + 'sum': {'growth': 10, 'total': 58} + }, + # Here the growth numbers needed to be reconstructed. + datetime.datetime(2020, 3, 8): { + 'CW': {'growth': 0, 'total': 9}, + 'FK': {'growth': 4, 'total': 9}, + 'Li': {'growth': 1, 'total': 1}, + 'MH': {'growth': 0, 'total': 1}, + 'Mi': {'growth': 0, 'total': 8}, + 'Ne': {'growth': 0, 'total': 2}, + 'Pa': {'growth': 0, 'total': 5}, + 'Re': {'growth': 0, 'total': 3}, + 'Sp': {'growth': 2, 'total': 2}, + 'SZ': {'growth': 1, 'total': 3}, + 'TS': {'growth': 0, 'total': 5}, + 'TK': {'growth': 0, 'total': 0}, + 'sum': {'growth': 8, 'total': 48} + }, + # Here the growth numbers needed to be reconstructed. + datetime.datetime(2020, 3, 7): { + 'CW': {'growth': 6, 'total': 9}, + 'FK': {'growth': 1, 'total': 5}, + 'Li': {'growth': 0, 'total': 0}, + 'MH': {'growth': 0, 'total': 1}, + 'Mi': {'growth': 1, 'total': 8}, + 'Ne': {'growth': 0, 'total': 2}, + 'Pa': {'growth': 1, 'total': 5}, + 'Re': {'growth': 0, 'total': 3}, + 'Sp': {'growth': 0, 'total': 0}, + 'SZ': {'growth': 2, 'total': 2}, + 'TS': {'growth': 1, 'total': 5}, + 'TK': {'growth': 0, 'total': 0}, + 'sum': {'growth': 12, 'total': 40} + }, + # Here the growth numbers needed to be reconstructed. + datetime.datetime(2020, 3, 6): { + 'CW': {'growth': 1, 'total': 3}, + 'FK': {'growth': 0, 'total': 4}, + 'Li': {'growth': 0, 'total': 0}, + 'MH': {'growth': 0, 'total': 1}, + 'Mi': {'growth': 4, 'total': 7}, + 'Ne': {'growth': 1, 'total': 2}, + 'Pa': {'growth': 1, 'total': 4}, + 'Re': {'growth': 0, 'total': 3}, + 'Sp': {'growth': 0, 'total': 0}, + 'SZ': {'growth': 0, 'total': 0}, + 'TS': {'growth': 2, 'total': 4}, + 'TK': {'growth': 0, 'total': 0}, + 'sum': {'growth': 9, 'total': 28} + }, + # Here the growth numbers needed to be reconstructed. + datetime.datetime(2020, 3, 5): { + 'CW': {'growth': 2, 'total': 2}, + 'FK': {'growth': 0, 'total': 4}, + 'Li': {'growth': 0, 'total': 0}, + 'MH': {'growth': 0, 'total': 1}, + 'Mi': {'growth': 0, 'total': 3}, + 'Ne': {'growth': 0, 'total': 1}, + 'Pa': {'growth': 1, 'total': 3}, + 'Re': {'growth': 2, 'total': 3}, + 'Sp': {'growth': 0, 'total': 0}, + 'SZ': {'growth': 0, 'total': 0}, + 'TS': {'growth': 1, 'total': 2}, + 'TK': {'growth': 0, 'total': 0}, + 'sum': {'growth': 6, 'total': 19} + }, + # Here the growth numbers needed to be reconstructed. + datetime.datetime(2020, 3, 4): { + 'CW': {'growth': 0, 'total': 0}, + 'FK': {'growth': 2, 'total': 4}, + 'Li': {'growth': 0, 'total': 0}, + 'MH': {'growth': 0, 'total': 1}, + 'Mi': {'growth': 0, 'total': 3}, + 'Ne': {'growth': 0, 'total': 1}, + 'Pa': {'growth': 1, 'total': 2}, + 'Re': {'growth': 1, 'total': 1}, + 'Sp': {'growth': 0, 'total': 0}, + 'SZ': {'growth': 0, 'total': 0}, + 'TS': {'growth': 0, 'total': 1}, + 'TK': {'growth': 0, 'total': 0}, + 'sum': {'growth': 4, 'total': 13} + }, + # Here the growth numbers needed to be reconstructed. + datetime.datetime(2020, 3, 3): { + 'CW': {'growth': 0, 'total': 0}, + 'FK': {'growth': 2, 'total': 2}, + 'Li': {'growth': 0, 'total': 0}, + 'MH': {'growth': 0, 'total': 1}, + 'Mi': {'growth': 0, 'total': 3}, + 'Ne': {'growth': 0, 'total': 1}, + 'Pa': {'growth': 1, 'total': 1}, + 'Re': {'growth': 0, 'total': 0}, + 'Sp': {'growth': 0, 'total': 0}, + 'SZ': {'growth': 0, 'total': 0}, + 'TS': {'growth': 0, 'total': 1}, + 'TK': {'growth': 0, 'total': 0}, + 'sum': {'growth': 3, 'total': 9} + }, } fixes = { # Here the official total is 215, while the summation of district @@ -166,8 +261,7 @@ for i in range(max_page): # Collect infection data. first_run = True districts_sorted = [] -# TODO: Push limit further back (might need more data fixes for that). -date_limit = datetime.datetime(2020, 3, 12) +date_limit = datetime.datetime(2020, 3, 11) for path in day_urls: url = url_prefix + path with urllib.request.urlopen(url) as response: @@ -186,26 +280,58 @@ for path in day_urls: if date <= datetime.datetime(2020, 3, 13): date = date - datetime.timedelta(days=1) table = soup.find('table') - if table is None and date in data: + # For 13th of March we lack a press release. + if table is None and (date in data or date == datetime.datetime(2020, 3, 13)): continue data[date] = {} for tr in [tr for tr in table.children if type(tr) == bs4.element.Tag][1:]: - printable_tds = [] - for td in [td for td in tr.children if type(td) == bs4.element.Tag][:2]: - printable_string = ' '.join([s for s in td.strings]) - printable_tds += [printable_string.strip()] - district_long = printable_tds[0] - district_short = [k for k in abbrevs if district_long in abbrevs[k]][0] - if first_run: - districts_sorted += [district_short] - split_char = ' ' - if not split_char in printable_tds[1]: - split_char = '(' - total_str, growth_str = printable_tds[1].split(split_char) - growth = int(growth_str.replace('(', '').replace(')', '').replace('+', '')) - total = int(total_str.replace('.', '')) - data[date][district_short] = {'growth': growth, 'total': total} + printable_tds = [] + for td in [td for td in tr.children if type(td) == bs4.element.Tag][:2]: + printable_string = ' '.join([s for s in td.strings]) + printable_tds += [printable_string.strip()] + district_long = printable_tds[0] + district_short = [k for k in abbrevs if district_long in abbrevs[k]][0] + if first_run: + districts_sorted += [district_short] + if date == datetime.datetime(2020, 3, 10): + # For this date we only get totals. + data[date][district_short] = {'total': int(printable_tds[1])} + else: + split_char = ' ' + if not split_char in printable_tds[1]: + split_char = '(' + total_str, growth_str = printable_tds[1].split(split_char) + growth = int(growth_str.replace('(', '').replace(')', '').\ + replace('+', '')) + total = int(total_str.replace('.', '')) + data[date][district_short] = {'growth': growth, 'total': total} first_run = False + +def neighbor_days(day_target): + day_delta = datetime.timedelta(days=1) + return day_target + day_delta, day_target - day_delta + +# Reconstruct growth for 10th of March. +day_target = datetime.datetime(2020, 3, 10) +day_after, day_before = neighbor_days(day_target) +for district in [d for d in districts_sorted]: + total_target = data[day_target][district]['total'] + total_before = data[day_before][district]['total'] + data[day_target][district]['growth'] = total_target - total_before + +# Reconstruct data for 13th of March. +day_target = datetime.datetime(2020, 3, 13) +day_after, day_before = neighbor_days(day_target) +data[day_target] = {} +for district in [d for d in districts_sorted]: + data[day_target][district] = {} + total_after = data[day_after][district]['total'] + growth_after = data[day_after][district]['growth'] + total_target = total_after - growth_after + data[day_target][district]['total'] = total_target + total_before = data[day_before][district]['total'] + data[day_target][district]['growth'] = total_target - total_before + dates_sorted = list(data.keys()) dates_sorted.sort() dates_sorted.reverse() @@ -241,6 +367,7 @@ for date in dates_sorted: raise Exception('Questionable district infection growth sum in %s' % date) # Final output. +dates_sorted.reverse() print(' '*10, ' '.join(['%3s' % d for d in districts_sorted])) for date in dates_sorted: growths = []