#!/usr/bin/env python3
"""Scrape daily per-district Corona infection numbers for Berlin.

The script walks the paginated 2020 press release index below
``url_prefix + pm_dir``, collects the links of the daily Corona briefings,
reads the first table of each briefing page into ``data`` (keyed by date,
then by district abbreviation), applies the manual corrections from
``fixes``, checks the numbers for internal consistency and finally prints
a table of the daily growth values per district.
"""
# Provenance (from the patch this script was extracted from):
#   From: Christian Heller
#   Date: Tue, 21 Jul 2020 17:02:27 +0000 (+0200)
#   Subject: Add draft of infection data scraper.
#   X-Git-Url: https://plomlompom.com/repos/test?a=commitdiff_plain;h=d08b45bff0b5147825c60fa8f17b2ec384521963;p=berlin-corona-table
import urllib.request
import datetime
import bs4
import re

url_prefix = 'https://www.berlin.de'
pm_dir = '/sen/gpg/service/presse/2020/'
pm_nav_path = pm_dir + '?page_at_1_0='

# Map abbreviations to full names (and their alternate spellings).
abbrevs = {
    'CW': {'Charlottenburg-Wilmersdorf'},
    'FK': {'Friedrichshain-Kreuzberg'},
    'Li': {'Lichtenberg'},
    'MH': {'Marzahn-Hellersdorf'},
    'Mi': {'Mitte'},
    'Ne': {'Neukölln', 'Neuköln'},
    'Pa': {'Pankow'},
    'Re': {'Reinickendorf'},
    'Sp': {'Spandau'},
    'SZ': {'Steglitz-Zehlendorf'},
    'TS': {'Tempelhof-Schöneberg'},
    'TK': {'Treptow-Köpenick'},
    'sum': {'Summe', 'Berlin'},
}

# Here only image files are available for the table data.
unparsable_graphics_fallback = {
    datetime.datetime(2020, 7, 2): {
        'CW': {'growth': 4, 'total': 851},
        'FK': {'growth': 10, 'total': 681},
        'Li': {'growth': 3, 'total': 427},
        'MH': {'growth': 4, 'total': 468},
        'Mi': {'growth': 0, 'total': 1202},
        'Ne': {'growth': 7, 'total': 1031},
        'Pa': {'growth': 3, 'total': 784},
        'Re': {'growth': 6, 'total': 660},
        'Sp': {'growth': 3, 'total': 450},
        'SZ': {'growth': 0, 'total': 591},
        'TS': {'growth': 3, 'total': 798},
        'TK': {'growth': 0, 'total': 401},
        'sum': {'growth': 43, 'total': 8344}
    },
    datetime.datetime(2020, 4, 5): {
        'CW': {'growth': 9, 'total': 462},
        'FK': {'growth': 2, 'total': 352},
        'Li': {'growth': 0, 'total': 142},
        'MH': {'growth': 3, 'total': 127},
        'Mi': {'growth': 14, 'total': 537},
        'Ne': {'growth': 0, 'total': 392},
        'Pa': {'growth': 10, 'total': 378},
        'Re': {'growth': 9, 'total': 248},
        'Sp': {'growth': 3, 'total': 150},
        'SZ': {'growth': 0, 'total': 312},
        'TS': {'growth': 8, 'total': 394},
        'TK': {'growth': 3, 'total': 193},
        'sum': {'growth': 61, 'total': 3687}
    },
    datetime.datetime(2020, 4, 4): {
        'CW': {'growth': 2, 'total': 453},
        'FK': {'growth': 7, 'total': 350},
        'Li': {'growth': 0, 'total': 142},
        'MH': {'growth': 15, 'total': 124},
        'Mi': {'growth': 22, 'total': 523},
        'Ne': {'growth': 15, 'total': 392},
        'Pa': {'growth': 10, 'total': 368},
        'Re': {'growth': 5, 'total': 239},
        'Sp': {'growth': 21, 'total': 147},
        'SZ': {'growth': 12, 'total': 312},
        'TS': {'growth': 24, 'total': 386},
        'TK': {'growth': 7, 'total': 190},
        'sum': {'growth': 140, 'total': 3626}
    },
    datetime.datetime(2020, 4, 3): {
        'CW': {'growth': 44, 'total': 451},
        'FK': {'growth': 17, 'total': 343},
        'Li': {'growth': 7, 'total': 142},
        'MH': {'growth': 4, 'total': 109},
        'Mi': {'growth': 4, 'total': 501},
        'Ne': {'growth': 40, 'total': 377},
        'Pa': {'growth': 39, 'total': 358},
        'Re': {'growth': 26, 'total': 234},
        'Sp': {'growth': 9, 'total': 126},
        'SZ': {'growth': 18, 'total': 300},
        'TS': {'growth': 41, 'total': 362},
        'TK': {'growth': 14, 'total': 183},
        'sum': {'growth': 263, 'total': 3486}
    }
}
fixes = {
    # Here the official total is 215, while the summation of district
    # numbers only adds up to 125 – pretty much looks like a mere
    # transposition of digits.
    datetime.datetime(2020, 3, 27): {
        'sum': {
            'growth': 125
        }
    },
    # Here the official total is 1937, while the summation of district
    # numbers only adds up to 1917; furthermore, the original value for
    # SZ is 118 (+18), which makes no sense, as the day before is
    # 120 (+15) and the day after is 147 (+15). The following is a
    # compromise to keep as many surrounding numbers stable as possible.
    datetime.datetime(2020, 3, 26): {
        'SZ': {
            'growth': 12,
            'total': 132
        },
        'sum': {
            'growth': 286,
            'total': 1931
        }
    },
    # Here the official total is 220, while the summation of district
    # numbers adds up to 228 – looks like someone misread an 8 as a 0.
    datetime.datetime(2020, 3, 25): {
        'sum': {
            'growth': 220
        }
    },
}

# Scan navigation bar for maximum pagination value.
url = url_prefix + pm_dir
with urllib.request.urlopen(url) as response:
    html = response.read()
soup = bs4.BeautifulSoup(html, 'html.parser')
max_page = 0
for link in soup.find_all('a'):
    # Anchors without an href attribute would raise KeyError on
    # link['href'], so look the attribute up defensively and skip them.
    href = link.get('href')
    if href is None or not href.startswith(pm_nav_path):
        continue
    max_page = max(max_page, int(href.split('=')[1]))

# Scan paginated press release links for daily Corona number briefing links.
briefing_prefixes = (
    'Coronavirus: Derzeit',
    'Coronavirus in Berlin: Bestätigte Fälle',
)
day_urls = []
for page_number in range(1, max_page + 1):
    url = url_prefix + pm_nav_path + str(page_number)
    with urllib.request.urlopen(url) as response:
        html = response.read()
    soup = bs4.BeautifulSoup(html, 'html.parser')
    for link in soup.find_all('a'):
        # link.string is None for anchors without a single text child.
        if link.string and link.string.startswith(briefing_prefixes):
            day_urls.append(link['href'])

# Collect infection data.
data = {}
first_run = True
districts_sorted = []
# TODO: Push limit further back (might need more data fixes for that).
date_limit = datetime.datetime(2020, 3, 16)
date_pattern = re.compile(r'[0-9]+\.[0-9]+\.[0-9]+')
for path in day_urls:
    url = url_prefix + path
    with urllib.request.urlopen(url) as response:
        html = response.read()
    soup = bs4.BeautifulSoup(html, 'html.parser')
    date_title = soup.find('div', class_='pressnumber')
    m = date_pattern.search(date_title.string)
    date_formatted = m.group(0)
    date = datetime.datetime.strptime(date_formatted, '%d.%m.%Y')
    # Stop at the first briefing older than the limit.
    if date_limit > date:
        break
    if date in data:
        # The date must be %-formatted into the message; passing it as a
        # second argument to Exception would leave '%s' uninterpolated.
        raise Exception('Double date %s' % date)
    # NOTE(review): the draft carried a disabled
    # `date -= datetime.timedelta(days=1)` at this point; presumably it is
    # undecided whether the release date or the day before should key the
    # data – confirm with the author.
    table = soup.find('table')
    if table is None:
        # No HTML table on the page: use the manually transcribed numbers.
        if date not in unparsable_graphics_fallback:
            raise Exception('No table and no fallback data for %s' % date)
        data[date] = unparsable_graphics_fallback[date]
        continue
    data[date] = {}
    rows = [c for c in table.children if isinstance(c, bs4.element.Tag)]
    # The first row is skipped; only the first two cells of each remaining
    # row are read (district name, then "total growth").
    for tr in rows[1:]:
        cells = [c for c in tr.children if isinstance(c, bs4.element.Tag)]
        printable_tds = [' '.join(td.strings).strip() for td in cells[:2]]
        district_long = printable_tds[0]
        matches = [k for k in abbrevs if district_long in abbrevs[k]]
        if not matches:
            raise Exception('Unknown district name %r in %s'
                            % (district_long, date))
        district_short = matches[0]
        if first_run:
            # The column order of the output follows the row order of the
            # first parsed table.
            districts_sorted.append(district_short)
        # Total and growth are separated by a space where one is present,
        # otherwise by the opening parenthesis.
        split_char = ' '
        if split_char not in printable_tds[1]:
            split_char = '('
        total_str, growth_str = printable_tds[1].split(split_char)
        growth = int(
            growth_str.replace('(', '').replace(')', '').replace('+', ''))
        # Dots in the total are stripped before conversion.
        total = int(total_str.replace('.', ''))
        data[date][district_short] = {'growth': growth, 'total': total}
    first_run = False
dates_sorted = sorted(data, reverse=True)

# Apply fixes and ensure integrity of results
for date in fixes:
    for district in fixes[date]:
        for type_ in fixes[date][district]:
            data[date][district][type_] = fixes[date][district][type_]
one_day = datetime.timedelta(days=1)
districts_only = [d for d in districts_sorted if d != 'sum']
for date in dates_sorted:
    # Manually corrected days are exempt from the consistency checks.
    if date in fixes:
        continue
    prev_date = date - one_day
    # TODO: ensure dates until end of list are continuous
    if prev_date in data:
        for district in districts_only:
            prev_total = data[prev_date][district]['total']
            cur_total = data[date][district]['total']
            if cur_total - data[date][district]['growth'] != prev_total:
                raise Exception(
                    'Questionable district infection total in %s/%s'
                    % (district, date))
    day_sum = sum(data[date][d]['total'] for d in districts_only)
    if day_sum != data[date]['sum']['total']:
        raise Exception(
            'Questionable district infection total sum in %s' % date)
    day_sum = sum(data[date][d]['growth'] for d in districts_only)
    if day_sum != data[date]['sum']['growth']:
        raise Exception(
            'Questionable district infection growth sum in %s' % date)

# Final output.
print(' '*10, ' '.join(['%3s' % d for d in districts_sorted]))
for date in dates_sorted:
    growths = [data[date][d]['growth'] for d in districts_sorted]
    print(date.strftime('%Y-%m-%d'), ' '.join(['%3s' % g for g in growths]))