# Base URL of the Berlin state web portal; all scraped paths are relative to it.
7 url_prefix = 'https://www.berlin.de'
# Directory of the 2020 press releases of the Berlin health administration.
8 pm_dir = '/sen/gpg/service/presse/2020/'
# Prefix for the paginated listing of press releases; a page number is appended.
9 pm_nav_path = pm_dir + '?page_at_1_0='
11 # Map abbreviations to full names (and their alternate spellings).
# Each key is a short district code; the value is the set of spellings under
# which that district appears in the press-release tables (e.g. 'Neuköln' is a
# misspelling that occurs in some releases).
# NOTE(review): the dict opening (presumably `abbrevs = {`) and the entries for
# Mitte ('Mi'), Pankow ('Pa') and Spandau ('Sp') fall on lines not visible in
# this chunk — confirm against the full file.
13 'CW': {'Charlottenburg-Wilmersdorf'},
14 'FK': {'Friedrichshain-Kreuzberg'},
15 'Li': {'Lichtenberg'},
16 'MH': {'Marzahn-Hellersdorf'},
18 'Ne': {'Neukölln', 'Neuköln'},
20 'Re': {'Reinickendorf'},
22 'SZ': {'Steglitz-Zehlendorf'},
23 'TS': {'Tempelhof-Schöneberg'},
24 'TK': {'Treptow-Köpenick'},
# 'sum' is not a district: it matches the table's totals row ('Summe'/'Berlin').
25 'sum': {'Summe', 'Berlin'},
28 # some pre-filled values
# Structure: {datetime (report day) -> {district abbrev -> {'growth': new cases
# on that day, 'total': cumulative cases}}}. These dates cannot be scraped from
# the HTML press releases (image-only tables, missing releases, or numbers that
# had to be reconstructed by hand), so they are hard-coded here.
# NOTE(review): the enclosing assignment (presumably `data = {`) and the
# closing brace/comma after each date entry fall on lines not visible in this
# chunk — confirm against the full file.
30 # For these, only image files are available for the table data.
31 datetime.datetime(2020, 7, 2): {
32 'CW': {'growth': 4, 'total': 851},
33 'FK': {'growth': 10, 'total': 681},
34 'Li': {'growth': 3, 'total': 427},
35 'MH': {'growth': 4, 'total': 468},
36 'Mi': {'growth': 0, 'total': 1202},
37 'Ne': {'growth': 7, 'total': 1031},
38 'Pa': {'growth': 3, 'total': 784},
39 'Re': {'growth': 6, 'total': 660},
40 'Sp': {'growth': 3, 'total': 450},
41 'SZ': {'growth': 0, 'total': 591},
42 'TS': {'growth': 3, 'total': 798},
43 'TK': {'growth': 0, 'total': 401},
44 'sum': {'growth': 43, 'total': 8344}
46 datetime.datetime(2020, 4, 5): {
47 'CW': {'growth': 9, 'total': 462},
48 'FK': {'growth': 2, 'total': 352},
49 'Li': {'growth': 0, 'total': 142},
50 'MH': {'growth': 3, 'total': 127},
51 'Mi': {'growth': 14, 'total': 537},
52 'Ne': {'growth': 0, 'total': 392},
53 'Pa': {'growth': 10, 'total': 378},
54 'Re': {'growth': 9, 'total': 248},
55 'Sp': {'growth': 3, 'total': 150},
56 'SZ': {'growth': 0, 'total': 312},
57 'TS': {'growth': 8, 'total': 394},
58 'TK': {'growth': 3, 'total': 193},
59 'sum': {'growth': 61, 'total': 3687}
61 datetime.datetime(2020, 4, 4): {
62 'CW': {'growth': 2, 'total': 453},
63 'FK': {'growth': 7, 'total': 350},
64 'Li': {'growth': 0, 'total': 142},
65 'MH': {'growth': 15, 'total': 124},
66 'Mi': {'growth': 22, 'total': 523},
67 'Ne': {'growth': 15, 'total': 392},
68 'Pa': {'growth': 10, 'total': 368},
69 'Re': {'growth': 5, 'total': 239},
70 'Sp': {'growth': 21, 'total': 147},
71 'SZ': {'growth': 12, 'total': 312},
72 'TS': {'growth': 24, 'total': 386},
73 'TK': {'growth': 7, 'total': 190},
74 'sum': {'growth': 140, 'total': 3626}
76 datetime.datetime(2020, 4, 3): {
77 'CW': {'growth': 44, 'total': 451},
78 'FK': {'growth': 17, 'total': 343},
79 'Li': {'growth': 7, 'total': 142},
80 'MH': {'growth': 4, 'total': 109},
81 'Mi': {'growth': 4, 'total': 501},
82 'Ne': {'growth': 40, 'total': 377},
83 'Pa': {'growth': 39, 'total': 358},
84 'Re': {'growth': 26, 'total': 234},
85 'Sp': {'growth': 9, 'total': 126},
86 'SZ': {'growth': 18, 'total': 300},
87 'TS': {'growth': 41, 'total': 362},
88 'TK': {'growth': 14, 'total': 183},
89 'sum': {'growth': 263, 'total': 3486}
91 # This one has no press release but can be reconstructed from
93 datetime.datetime(2020, 3, 13): {
94 'CW': {'growth': 16, 'total': 47},
95 'FK': {'growth': 8, 'total': 22},
96 'Li': {'growth': 2, 'total': 8},
97 'MH': {'growth': 1, 'total': 4},
98 'Mi': {'growth': 9, 'total': 29},
99 'Ne': {'growth': 6, 'total': 16},
100 'Pa': {'growth': 11, 'total': 26},
101 'Re': {'growth': 0, 'total': 11},
102 'Sp': {'growth': 1, 'total': 9},
103 'SZ': {'growth': 0, 'total': 20},
104 'TS': {'growth': 1, 'total': 17},
105 'TK': {'growth': 3, 'total': 7},
106 'sum': {'growth': 58, 'total': 216}
108 # Here the growth numbers needed to be reconstructed.
109 datetime.datetime(2020, 3, 10): {
110 'CW': {'growth': 2, 'total': 15},
111 'FK': {'growth': 0, 'total': 12},
112 'Li': {'growth': 4, 'total': 5},
113 'MH': {'growth': 1, 'total': 3},
114 'Mi': {'growth': 0, 'total': 8},
115 'Ne': {'growth': 2, 'total': 5},
116 'Pa': {'growth': 2, 'total': 8},
117 'Re': {'growth': 0, 'total': 3},
118 'Sp': {'growth': 4, 'total': 6},
119 'SZ': {'growth': 3, 'total': 6},
120 'TS': {'growth': 2, 'total': 7},
121 'TK': {'growth': 3, 'total': 3},
122 'sum': {'growth': 23, 'total': 81}
124 # Here the totals needed to be reconstructed.
125 datetime.datetime(2020, 3, 9): {
126 'CW': {'growth': 4, 'total': 13},
127 'FK': {'growth': 3, 'total': 12},
128 'Li': {'growth': 0, 'total': 1},
129 'MH': {'growth': 1, 'total': 2},
130 'Mi': {'growth': 0, 'total': 8},
131 'Ne': {'growth': 1, 'total': 3},
132 'Pa': {'growth': 1, 'total': 6},
133 'Re': {'growth': 0, 'total': 3},
134 'Sp': {'growth': 0, 'total': 2},
135 'SZ': {'growth': 0, 'total': 3},
136 'TS': {'growth': 0, 'total': 5},
137 'TK': {'growth': 0, 'total': 0},
138 'sum': {'growth': 10, 'total': 58}
140 # Here the growth numbers needed to be reconstructed.
141 datetime.datetime(2020, 3, 8): {
142 'CW': {'growth': 0, 'total': 9},
143 'FK': {'growth': 4, 'total': 9},
144 'Li': {'growth': 1, 'total': 1},
145 'MH': {'growth': 0, 'total': 1},
146 'Mi': {'growth': 0, 'total': 8},
147 'Ne': {'growth': 0, 'total': 2},
148 'Pa': {'growth': 0, 'total': 5},
149 'Re': {'growth': 0, 'total': 3},
150 'Sp': {'growth': 2, 'total': 2},
151 'SZ': {'growth': 1, 'total': 3},
152 'TS': {'growth': 0, 'total': 5},
153 'TK': {'growth': 0, 'total': 0},
154 'sum': {'growth': 8, 'total': 48}
156 # Here the growth numbers needed to be reconstructed.
157 datetime.datetime(2020, 3, 7): {
158 'CW': {'growth': 6, 'total': 9},
159 'FK': {'growth': 1, 'total': 5},
160 'Li': {'growth': 0, 'total': 0},
161 'MH': {'growth': 0, 'total': 1},
162 'Mi': {'growth': 1, 'total': 8},
163 'Ne': {'growth': 0, 'total': 2},
164 'Pa': {'growth': 1, 'total': 5},
165 'Re': {'growth': 0, 'total': 3},
166 'Sp': {'growth': 0, 'total': 0},
167 'SZ': {'growth': 2, 'total': 2},
168 'TS': {'growth': 1, 'total': 5},
169 'TK': {'growth': 0, 'total': 0},
170 'sum': {'growth': 12, 'total': 40}
172 # Here the growth numbers needed to be reconstructed.
173 datetime.datetime(2020, 3, 6): {
174 'CW': {'growth': 1, 'total': 3},
175 'FK': {'growth': 0, 'total': 4},
176 'Li': {'growth': 0, 'total': 0},
177 'MH': {'growth': 0, 'total': 1},
178 'Mi': {'growth': 4, 'total': 7},
179 'Ne': {'growth': 1, 'total': 2},
180 'Pa': {'growth': 1, 'total': 4},
181 'Re': {'growth': 0, 'total': 3},
182 'Sp': {'growth': 0, 'total': 0},
183 'SZ': {'growth': 0, 'total': 0},
184 'TS': {'growth': 2, 'total': 4},
185 'TK': {'growth': 0, 'total': 0},
186 'sum': {'growth': 9, 'total': 28}
188 # Here the growth numbers needed to be reconstructed.
189 datetime.datetime(2020, 3, 5): {
190 'CW': {'growth': 2, 'total': 2},
191 'FK': {'growth': 0, 'total': 4},
192 'Li': {'growth': 0, 'total': 0},
193 'MH': {'growth': 0, 'total': 1},
194 'Mi': {'growth': 0, 'total': 3},
195 'Ne': {'growth': 0, 'total': 1},
196 'Pa': {'growth': 1, 'total': 3},
197 'Re': {'growth': 2, 'total': 3},
198 'Sp': {'growth': 0, 'total': 0},
199 'SZ': {'growth': 0, 'total': 0},
200 'TS': {'growth': 1, 'total': 2},
201 'TK': {'growth': 0, 'total': 0},
202 'sum': {'growth': 6, 'total': 19}
204 # Here the growth numbers needed to be reconstructed.
205 datetime.datetime(2020, 3, 4): {
206 'CW': {'growth': 0, 'total': 0},
207 'FK': {'growth': 2, 'total': 4},
208 'Li': {'growth': 0, 'total': 0},
209 'MH': {'growth': 0, 'total': 1},
210 'Mi': {'growth': 0, 'total': 3},
211 'Ne': {'growth': 0, 'total': 1},
212 'Pa': {'growth': 1, 'total': 2},
213 'Re': {'growth': 1, 'total': 1},
214 'Sp': {'growth': 0, 'total': 0},
215 'SZ': {'growth': 0, 'total': 0},
216 'TS': {'growth': 0, 'total': 1},
217 'TK': {'growth': 0, 'total': 0},
218 'sum': {'growth': 4, 'total': 13}
220 # Here the growth numbers needed to be reconstructed.
221 datetime.datetime(2020, 3, 3): {
222 'CW': {'growth': 0, 'total': 0},
223 'FK': {'growth': 2, 'total': 2},
224 'Li': {'growth': 0, 'total': 0},
225 'MH': {'growth': 0, 'total': 1},
226 'Mi': {'growth': 0, 'total': 3},
227 'Ne': {'growth': 0, 'total': 1},
228 'Pa': {'growth': 1, 'total': 1},
229 'Re': {'growth': 0, 'total': 0},
230 'Sp': {'growth': 0, 'total': 0},
231 'SZ': {'growth': 0, 'total': 0},
232 'TS': {'growth': 0, 'total': 1},
233 'TK': {'growth': 0, 'total': 0},
234 'sum': {'growth': 3, 'total': 9}
# Manual overrides for dates where the published numbers are inconsistent.
# NOTE(review): the dict opening (presumably `fixes = {`) and the override
# values for each date fall on lines not visible in this chunk — only the
# explanatory comments and the date keys are visible here.
238 # Here the official total is 215, while the summation of district
239 # numbers only adds up to 125 – pretty much looks like a mere
240 # transposition of digits.
241 datetime.datetime(2020, 3, 27): {
246 # Here the official total is 1937, while the summation of district
247 # numbers only adds up to 1917; furthermore, the original value for
248 # SZ is 118 (+18), which makes no sense, as the day before is
249 # 120 (+15) and the day after is 147 (+15). The following is a
250 # compromise to keep as many surrounding numbers stable as possible.
251 datetime.datetime(2020, 3, 26): {
259 # Here the official total is 220, while the summation of district
260 # numbers adds up to 228 – looks like someone misread an 8 as a 0.
261 datetime.datetime(2020, 3, 25): {
268 # Scan navigation bar for maximum pagination value.
# Fetches the press-release index page and inspects every anchor whose href
# starts with the pagination path, keeping the highest page number seen.
# NOTE(review): the initialization of `max_page` and the extraction of `href`
# from each link fall on lines not visible in this chunk — confirm against
# the full file.
269 url = url_prefix + pm_dir
270 with urllib.request.urlopen(url) as response:
271 html = response.read()
272 soup = bs4.BeautifulSoup(html, 'html.parser')
274 for link in soup.find_all('a'):
276 if str.startswith(href, pm_nav_path):
# The page number is the value after '=' in e.g. '?page_at_1_0=7'.
277 max_test = int(href.split('=')[1])
278 max_page = max_test if max_test > max_page else max_page
280 # Scan paginated press release links for daily Corona number briefing links.
# Walks every listing page (1..max_page) and collects the hrefs of links whose
# title matches one of the two known daily-briefing headline prefixes.
# NOTE(review): the initialization of `day_urls` and the `continue` that skips
# non-matching links fall on lines not visible in this chunk — confirm against
# the full file.
282 for i in range(max_page):
283 url = url_prefix + pm_nav_path + str(i + 1)
284 with urllib.request.urlopen(url) as response:
285 html = response.read()
286 soup = bs4.BeautifulSoup(html, 'html.parser')
287 for link in soup.find_all('a'):
# Skip links without text or with a headline that is not a daily briefing.
288 if (not link.string) or\
289 (not link.string.startswith('Coronavirus: Derzeit') and
290 not link.string.startswith('Coronavirus in Berlin: Bestätigte Fälle')):
292 day_urls += [link['href']]
294 # Collect infection data.
# For each daily briefing URL: parse the publication date from the press
# number, normalize which day the numbers refer to, then parse the HTML table
# of per-district totals/growth into `data[date][district_abbrev]`.
# NOTE(review): several interior lines (e.g. the `data` initialization, some
# `continue` statements, and the `printable_tds`/`split_char` setup) are not
# visible in this chunk — confirm against the full file.
296 districts_sorted = []
297 # TODO: Push limit further back (might need more data fixes for that).
298 date_limit = datetime.datetime(2020, 3, 12)
299 for path in day_urls:
300 url = url_prefix + path
301 with urllib.request.urlopen(url) as response:
302 html = response.read()
303 soup = bs4.BeautifulSoup(html, 'html.parser')
# The publication date is embedded in the press-number div as DD.MM.YYYY.
304 date_title = soup.find('div', class_='pressnumber')
305 m = re.search('[0-9]+\\.[0-9]+\\.[0-9]+', date_title.string)
306 date_formatted = m.group(0)
307 date = datetime.datetime.strptime(date_formatted , '%d.%m.%Y')
# Ignore briefings older than the supported date range.
308 if date_limit > date:
310 # On that day, two press releases were released, for that and the prev day.
311 if date == datetime.datetime(2020, 3, 15) and date in data:
312 date = datetime.datetime(2020, 3, 14)
313 # From here on, press releases describe numbers from prev day.
314 if date <= datetime.datetime(2020, 3, 13):
315 date = date - datetime.timedelta(days=1)
# Skip pages without a data table when the date is already pre-filled.
316 table = soup.find('table')
317 if table is None and date in data:
# Iterate table rows (skipping the header row), keeping only real tags
# (bs4 also yields whitespace NavigableStrings as children).
320 for tr in [tr for tr in table.children if type(tr) == bs4.element.Tag][1:]:
# First cell: district name; second cell: case numbers.
322 for td in [td for td in tr.children if type(td) == bs4.element.Tag][:2]:
323 printable_string = ' '.join([s for s in td.strings])
324 printable_tds += [printable_string.strip()]
# Resolve the (possibly misspelled) district name to its abbreviation.
325 district_long = printable_tds[0]
326 district_short = [k for k in abbrevs if district_long in abbrevs[k]][0]
328 districts_sorted += [district_short]
330 if not split_char in printable_tds[1]:
# The numbers cell looks like 'TOTAL (+GROWTH)'; totals may contain
# a thousands separator '.'.
332 total_str, growth_str = printable_tds[1].split(split_char)
333 growth = int(growth_str.replace('(', '').replace(')', '').replace('+', ''))
334 total = int(total_str.replace('.', ''))
335 data[date][district_short] = {'growth': growth, 'total': total}
# Build a chronologically ascending list of dates, apply the manual `fixes`
# overrides, then validate the dataset: per-district growth must match the
# day-over-day total difference, and the per-district totals/growth must add
# up to the reported 'sum' row for each date.
# NOTE(review): several interior lines (e.g. the sort call, the loop over
# `fixes`, the `day_sum` initializations, and some guard `continue`s) are not
# visible in this chunk — confirm against the full file.
337 dates_sorted = list(data.keys())
339 dates_sorted.reverse()
341 # Apply fixes and ensure integrity of results
343 for district in fixes[date]:
344 for type_ in fixes[date][district]:
345 data[date][district][type_] = fixes[date][district][type_]
346 for date in dates_sorted:
349 for district in [d for d in districts_sorted if not d=='sum']:
# Growth check needs the previous day's total; dates must be contiguous
# within the supported range.
350 prev_date = date - datetime.timedelta(days=1)
351 if prev_date not in dates_sorted:
352 if prev_date >= date_limit:
353 raise Exception('Dates not contiguous: %s missing', prev_date)
356 prev_total = data[prev_date][district]['total']
357 cur_total = data[date][district]['total']
358 if cur_total - data[date][district]['growth'] != prev_total:
359 raise Exception('Questionable district infection total in %s/%s' % (district, date))
# District totals must add up to the official 'sum' row.
361 for district in [d for d in districts_sorted if not d=='sum']:
362 day_sum += data[date][district]['total']
363 if day_sum != data[date]['sum']['total']:
364 raise Exception('Questionable district infection total sum in %s' % date)
# Same consistency check for the growth numbers.
366 for district in [d for d in districts_sorted if not d=='sum']:
367 day_sum += data[date][district]['growth']
368 if day_sum != data[date]['sum']['growth']:
369 raise Exception('Questionable district infection growth sum in %s' % date)
# Print a table: header row of district abbreviations, then one row per date
# with each district's daily growth, columns right-aligned to width 3.
# NOTE(review): the initialization of `growths` falls on a line not visible in
# this chunk — confirm against the full file.
372 print(' '*10, ' '.join(['%3s' % d for d in districts_sorted]))
373 for date in dates_sorted:
375 for d in districts_sorted:
376 growths += [data[date][d]['growth']]
377 print(date.strftime('%Y-%m-%d'), ' '.join(['%3s' % g for g in growths]))