# Base URL of the Berlin state portal and the path of the 2020
# press-release archive of the health administration (SenGPG).
url_prefix = 'https://www.berlin.de'
pm_dir = '/sen/gpg/service/presse/2020/'
# Query-string prefix used by the archive's pagination links
# (page number is appended to this).
pm_nav_path = pm_dir + '?page_at_1_0='
11 # Map abbreviations to full names (and their alternate spellings).
# NOTE(review): this excerpt is missing the enclosing dict assignment
# (presumably `abbrevs = {`, referenced by the parsing loop below) as
# well as the entries for 'Mi', 'Pa' and 'Sp', which do occur as keys
# in the fallback data — confirm against the full file.
13 'CW': {'Charlottenburg-Wilmersdorf'},
14 'FK': {'Friedrichshain-Kreuzberg'},
15 'Li': {'Lichtenberg'},
16 'MH': {'Marzahn-Hellersdorf'},
# 'Neuköln' is presumably a misspelling found on some source pages.
18 'Ne': {'Neukölln', 'Neuköln'},
20 'Re': {'Reinickendorf'},
22 'SZ': {'Steglitz-Zehlendorf'},
23 'TS': {'Tempelhof-Schöneberg'},
24 'TK': {'Treptow-Köpenick'},
# 'sum' is the city-wide total row, labelled "Summe" or "Berlin".
25 'sum': {'Summe', 'Berlin'},
# Fallback numbers for briefings where the table data is published
# only as image files and therefore cannot be parsed from HTML.
# Keys are briefing dates; values map district abbreviation to that
# day's new cases ('growth') and cumulative cases ('total').
unparsable_graphics_fallback = {
    datetime.datetime(2020, 7, 2): {
        'CW': {'growth': 4, 'total': 851},
        'FK': {'growth': 10, 'total': 681},
        'Li': {'growth': 3, 'total': 427},
        'MH': {'growth': 4, 'total': 468},
        'Mi': {'growth': 0, 'total': 1202},
        'Ne': {'growth': 7, 'total': 1031},
        'Pa': {'growth': 3, 'total': 784},
        'Re': {'growth': 6, 'total': 660},
        'Sp': {'growth': 3, 'total': 450},
        'SZ': {'growth': 0, 'total': 591},
        'TS': {'growth': 3, 'total': 798},
        'TK': {'growth': 0, 'total': 401},
        'sum': {'growth': 43, 'total': 8344},
    },
    datetime.datetime(2020, 4, 5): {
        'CW': {'growth': 9, 'total': 462},
        'FK': {'growth': 2, 'total': 352},
        'Li': {'growth': 0, 'total': 142},
        'MH': {'growth': 3, 'total': 127},
        'Mi': {'growth': 14, 'total': 537},
        'Ne': {'growth': 0, 'total': 392},
        'Pa': {'growth': 10, 'total': 378},
        'Re': {'growth': 9, 'total': 248},
        'Sp': {'growth': 3, 'total': 150},
        'SZ': {'growth': 0, 'total': 312},
        'TS': {'growth': 8, 'total': 394},
        'TK': {'growth': 3, 'total': 193},
        'sum': {'growth': 61, 'total': 3687},
    },
    datetime.datetime(2020, 4, 4): {
        'CW': {'growth': 2, 'total': 453},
        'FK': {'growth': 7, 'total': 350},
        'Li': {'growth': 0, 'total': 142},
        'MH': {'growth': 15, 'total': 124},
        'Mi': {'growth': 22, 'total': 523},
        'Ne': {'growth': 15, 'total': 392},
        'Pa': {'growth': 10, 'total': 368},
        'Re': {'growth': 5, 'total': 239},
        'Sp': {'growth': 21, 'total': 147},
        'SZ': {'growth': 12, 'total': 312},
        'TS': {'growth': 24, 'total': 386},
        'TK': {'growth': 7, 'total': 190},
        'sum': {'growth': 140, 'total': 3626},
    },
    datetime.datetime(2020, 4, 3): {
        'CW': {'growth': 44, 'total': 451},
        'FK': {'growth': 17, 'total': 343},
        'Li': {'growth': 7, 'total': 142},
        'MH': {'growth': 4, 'total': 109},
        'Mi': {'growth': 4, 'total': 501},
        'Ne': {'growth': 40, 'total': 377},
        'Pa': {'growth': 39, 'total': 358},
        'Re': {'growth': 26, 'total': 234},
        'Sp': {'growth': 9, 'total': 126},
        'SZ': {'growth': 18, 'total': 300},
        'TS': {'growth': 41, 'total': 362},
        'TK': {'growth': 14, 'total': 183},
        'sum': {'growth': 263, 'total': 3486},
    },
}
# NOTE(review): the lines below are the date keys and explanatory
# comments of a manual-correction table (referenced later as
# `fixes[date][district][type_]`); its enclosing assignment
# (presumably `fixes = {`) and the per-district correction values
# are not visible in this excerpt — confirm against the full file.
92 # Here the official total is 215, while the summation of district
93 # numbers only adds up to 125 – pretty much looks like a mere
94 # transposition of digits.
95 datetime.datetime(2020, 3, 27): {
100 # Here the official total is 1937, while the summation of district
101 # numbers only adds up to 1917; furthermore, the original value for
102 # SZ is 118 (+18), which makes no sense, as the day before is
103 # 120 (+15) and the day after is 147 (+15). The following is a
104 # compromise to keep as many surrounding numbers stable as possible.
105 datetime.datetime(2020, 3, 26): {
113 # Here the official total is 220, while the summation of district
114 # numbers adds up to 228 – looks like someone misread an 8 as a 0.
115 datetime.datetime(2020, 3, 25): {
# Scan the archive's navigation bar for the maximum pagination value,
# i.e. the highest page number linked via the pm_nav_path query.
max_page = 0
url = url_prefix + pm_dir
with urllib.request.urlopen(url) as response:
    html = response.read()
soup = bs4.BeautifulSoup(html, 'html.parser')
for link in soup.find_all('a'):
    # Anchors without an href attribute yield '' and are skipped.
    href = link.get('href', '')
    if href.startswith(pm_nav_path):
        # The page number is the value after '=' in the query string.
        max_page = max(max_page, int(href.split('=')[1]))
# Scan paginated press release links for daily Corona number briefing
# links, recognizable by one of two known title prefixes.
day_urls = []
for i in range(max_page):
    # Pagination is 1-based.
    url = url_prefix + pm_nav_path + str(i + 1)
    with urllib.request.urlopen(url) as response:
        html = response.read()
    soup = bs4.BeautifulSoup(html, 'html.parser')
    for link in soup.find_all('a'):
        title = link.string
        # Skip anchors without a plain-string title.
        if not title:
            continue
        if title.startswith(('Coronavirus: Derzeit',
                             'Coronavirus in Berlin: Bestätigte Fälle')):
            day_urls.append(link['href'])
148 # Collect infection data.
# NOTE(review): several lines are missing from this excerpt — among
# them the initialisations of `data` and `printable_tds`, the
# definition of `split_char`, and the guard/`continue` statements that
# belong with the `if` at the date limit, the duplicate-date check
# preceding the `raise`, the table/fallback handling, and the
# `split_char` membership test. Confirm against the full file.
151 districts_sorted = []
152 # TODO: Push limit further back (might need more data fixes for that).
153 date_limit = datetime.datetime(2020, 3, 16)
154 for path in day_urls:
155 url = url_prefix + path
156 with urllib.request.urlopen(url) as response:
157 html = response.read()
158 soup = bs4.BeautifulSoup(html, 'html.parser')
# The briefing date is embedded in the 'pressnumber' div as d.m.Y.
159 date_title = soup.find('div', class_='pressnumber')
160 m = re.search('[0-9]+\\.[0-9]+\\.[0-9]+', date_title.string)
161 date_formatted = m.group(0)
162 date = datetime.datetime.strptime(date_formatted , '%d.%m.%Y')
163 if date_limit > date:
# Raised when the same briefing date is encountered twice.
166 raise Exception('Double date %s', date)
167 #date -= datetime.timedelta(days=1)
169 table = soup.find('table')
# Image-only briefings fall back to the hand-transcribed numbers.
171 data[date] = unparsable_graphics_fallback[date]
# Walk the table rows (skipping the header row) and take the first
# two cells: district name and case-number cell.
173 for tr in [tr for tr in table.children if type(tr) == bs4.element.Tag][1:]:
175 for td in [td for td in tr.children if type(td) == bs4.element.Tag][:2]:
176 printable_string = ' '.join([s for s in td.strings])
177 printable_tds += [printable_string.strip()]
# Reverse-map the printed district name to its abbreviation.
178 district_long = printable_tds[0]
179 district_short = [k for k in abbrevs if district_long in abbrevs[k]][0]
181 districts_sorted += [district_short]
# NOTE(review): `split_char` is defined outside this excerpt; the cell
# apparently holds the cumulative total and a parenthesised growth
# value separated by it, e.g. a thousands-dotted total and "(+n)".
183 if not split_char in printable_tds[1]:
185 total_str, growth_str = printable_tds[1].split(split_char)
186 growth = int(growth_str.replace('(', '').replace(')', '').replace('+', ''))
187 total = int(total_str.replace('.', ''))
188 data[date][district_short] = {'growth': growth, 'total': total}
# NOTE(review): missing from this excerpt are (at least) a sort of
# `dates_sorted` before the reverse, the outer `for date in fixes:`
# loop header for the fix application, and the `day_sum = 0` resets
# before each summation loop — confirm against the full file.
190 dates_sorted = list(data.keys())
192 dates_sorted.reverse()
194 # Apply fixes and ensure integrity of results
# Overwrite parsed values with the manual corrections from `fixes`.
196 for district in fixes[date]:
197 for type_ in fixes[date][district]:
198 data[date][district][type_] = fixes[date][district][type_]
199 for date in dates_sorted:
# Check 1: each district's total must equal the previous day's total
# plus the reported growth (requires contiguous dates above the limit).
202 for district in [d for d in districts_sorted if not d=='sum']:
203 prev_date = date - datetime.timedelta(days=1)
204 if prev_date not in dates_sorted:
205 if prev_date >= date_limit:
206 raise Exception('Dates not contiguous: %s missing', prev_date)
209 prev_total = data[date - datetime.timedelta(days=1)][district]['total']
210 cur_total = data[date][district]['total']
211 if cur_total - data[date][district]['growth'] != prev_total:
212 raise Exception('Questionable district infection total in %s/%s' % (district, date))
# Check 2: district totals must add up to the official 'sum' total.
214 for district in [d for d in districts_sorted if not d=='sum']:
215 day_sum += data[date][district]['total']
216 if day_sum != data[date]['sum']['total']:
217 raise Exception('Questionable district infection total sum in %s' % date)
# Check 3: district growths must add up to the official 'sum' growth.
219 for district in [d for d in districts_sorted if not d=='sum']:
220 day_sum += data[date][district]['growth']
221 if day_sum != data[date]['sum']['growth']:
222 raise Exception('Questionable district infection growth sum in %s' % date)
# Print a table of daily growth per district: a header row with the
# district abbreviations, then one row per date (newest first, per the
# ordering of dates_sorted).
print(' '*10, ' '.join(['%3s' % d for d in districts_sorted]))
for date in dates_sorted:
    growths = [data[date][d]['growth'] for d in districts_sorted]
    print(date.strftime('%Y-%m-%d'), ' '.join(['%3s' % g for g in growths]))