X-Git-Url: https://plomlompom.com/repos/?p=berlin-corona-table;a=blobdiff_plain;f=scrape.py;h=e449b998b2821f4fe1fe9539fc1fbf28b2898b9c;hp=7c7196c93eef2cde7adc6a1586b1c42d7c394443;hb=4289e3ee6177babe630e973d5a2d5049538ab96e;hpb=d08b45bff0b5147825c60fa8f17b2ec384521963

diff --git a/scrape.py b/scrape.py
index 7c7196c..e449b99 100755
--- a/scrape.py
+++ b/scrape.py
@@ -104,12 +104,10 @@ fixes = {
     # compromise to keep as many surrounding numbers stable as possible.
     datetime.datetime(2020, 3, 26): {
         'SZ': {
-            'growth': 12,
-            'total': 132
+            'growth': 12
         },
         'sum': {
-            'growth': 286,
-            'total': 1931
+            'growth': 286
         }
     },
     # Here the official total is 220, while the summation of district
@@ -121,7 +119,7 @@ fixes = {
     },
 }
 
-# Scan navigation bar for maximum pagination value.
+# Scan navigation bar for maximum pagination value.
 url = url_prefix + pm_dir
 with urllib.request.urlopen(url) as response:
     html = response.read()
@@ -147,7 +145,7 @@ for i in range(max_page):
             continue
         day_urls += [link['href']]
 
-# Collect infection data.
+# Collect infection data.
 data = {}
 first_run = True
 districts_sorted = []
@@ -204,8 +202,11 @@ for date in dates_sorted:
     for district in [d for d in districts_sorted if not d=='sum']:
         prev_date = date - datetime.timedelta(days=1)
         if prev_date not in dates_sorted:
-            # TODO: ensure dates until end of list are continuous
-            continue
+            # Gaps at or after date_limit are errors; earlier ones are skipped.
+            if prev_date >= date_limit:
+                raise Exception('Dates not contiguous: %s missing' % prev_date)
+            else:
+                continue
         prev_total = data[date - datetime.timedelta(days=1)][district]['total']
         cur_total = data[date][district]['total']
         if cur_total - data[date][district]['growth'] != prev_total: