plomlompom.com Git - berlin-corona-table/blob - scrape.py

   1 #!/usr/bin/env python3
   2 import urllib.request
   3 import datetime
   4 import bs4
   5 import re
   6
   7 url_prefix = 'https://www.berlin.de'
   8 pm_dir = '/sen/gpg/service/presse/2020/'
   9 pm_nav_path = pm_dir + '?page_at_1_0='
  10
  11 # Map abbreviations to full names (and their alternate spellings).
  12 abbrevs = {
  13   'CW': {'Charlottenburg-Wilmersdorf'},
  14   'FK': {'Friedrichshain-Kreuzberg'},
  15   'Li': {'Lichtenberg'},
  16   'MH': {'Marzahn-Hellersdorf'},
  17   'Mi': {'Mitte'},
  18   'Ne': {'Neukölln', 'Neuköln'},
  19   'Pa': {'Pankow'},
  20   'Re': {'Reinickendorf'},
  21   'Sp': {'Spandau'},
  22   'SZ': {'Steglitz-Zehlendorf'},
  23   'TS': {'Tempelhof-Schöneberg'},
  24   'TK': {'Treptow-Köpenick'},
  25   'sum': {'Summe', 'Berlin'},
  26 }
  27
  28 # some pre-filled values
  29 data = {
  30    # For these, only image files are available for the table data.
  31     datetime.datetime(2020, 7, 2): {
  32         'CW': {'growth': 4, 'total': 851},
  33         'FK': {'growth': 10, 'total': 681},
  34         'Li': {'growth': 3, 'total': 427},
  35         'MH': {'growth': 4, 'total': 468},
  36         'Mi': {'growth': 0, 'total': 1202},
  37         'Ne': {'growth': 7, 'total': 1031},
  38         'Pa': {'growth': 3, 'total': 784},
  39         'Re': {'growth': 6, 'total': 660},
  40         'Sp': {'growth': 3, 'total': 450},
  41         'SZ': {'growth': 0, 'total': 591},
  42         'TS': {'growth': 3, 'total': 798},
  43         'TK': {'growth': 0, 'total': 401},
  44         'sum': {'growth': 43, 'total': 8344}
  45     },
  46     datetime.datetime(2020, 4, 5): {
  47         'CW': {'growth': 9, 'total': 462},
  48         'FK': {'growth': 2, 'total': 352},
  49         'Li': {'growth': 0, 'total': 142},
  50         'MH': {'growth': 3, 'total': 127},
  51         'Mi': {'growth': 14, 'total': 537},
  52         'Ne': {'growth': 0, 'total': 392},
  53         'Pa': {'growth': 10, 'total': 378},
  54         'Re': {'growth': 9, 'total': 248},
  55         'Sp': {'growth': 3, 'total': 150},
  56         'SZ': {'growth': 0, 'total': 312},
  57         'TS': {'growth': 8, 'total': 394},
  58         'TK': {'growth': 3, 'total': 193},
  59         'sum': {'growth': 61, 'total': 3687}
  60     },
  61     datetime.datetime(2020, 4, 4): {
  62         'CW': {'growth': 2, 'total': 453},
  63         'FK': {'growth': 7, 'total': 350},
  64         'Li': {'growth': 0, 'total': 142},
  65         'MH': {'growth': 15, 'total': 124},
  66         'Mi': {'growth': 22, 'total': 523},
  67         'Ne': {'growth': 15, 'total': 392},
  68         'Pa': {'growth': 10, 'total': 368},
  69         'Re': {'growth': 5, 'total': 239},
  70         'Sp': {'growth': 21, 'total': 147},
  71         'SZ': {'growth': 12, 'total': 312},
  72         'TS': {'growth': 24, 'total': 386},
  73         'TK': {'growth': 7, 'total': 190},
  74         'sum': {'growth': 140, 'total': 3626}
  75     },
  76     datetime.datetime(2020, 4, 3): {
  77         'CW': {'growth': 44, 'total': 451},
  78         'FK': {'growth': 17, 'total': 343},
  79         'Li': {'growth': 7, 'total': 142},
  80         'MH': {'growth': 4, 'total': 109},
  81         'Mi': {'growth': 4, 'total': 501},
  82         'Ne': {'growth': 40, 'total': 377},
  83         'Pa': {'growth': 39, 'total': 358},
  84         'Re': {'growth': 26, 'total': 234},
  85         'Sp': {'growth': 9, 'total': 126},
  86         'SZ': {'growth': 18, 'total': 300},
  87         'TS': {'growth': 41, 'total': 362},
  88         'TK': {'growth': 14, 'total': 183},
  89         'sum': {'growth': 263, 'total': 3486}
  90     },
  91    # This one has no press release but can be reconstructed from
  92    # the neighbour ones.
  93    datetime.datetime(2020, 3, 13): {
  94         'CW': {'growth': 16, 'total': 47},
  95         'FK': {'growth': 8, 'total': 22},
  96         'Li': {'growth': 2, 'total': 8},
  97         'MH': {'growth': 1, 'total': 4},
  98         'Mi': {'growth': 9, 'total': 29},
  99         'Ne': {'growth': 6, 'total': 16},
 100         'Pa': {'growth': 11, 'total': 26},
 101         'Re': {'growth': 0, 'total': 11},
 102         'Sp': {'growth': 1, 'total': 9},
 103         'SZ': {'growth': 0, 'total': 20},
 104         'TS': {'growth': 1, 'total': 17},
 105         'TK': {'growth': 3, 'total': 7},
 106         'sum': {'growth': 58, 'total': 216}
 107    },
 108    # Here the growth numbers needed to be reconstructed.
 109    datetime.datetime(2020, 3, 10): {
 110         'CW': {'growth': 2, 'total': 15},
 111         'FK': {'growth': 0, 'total': 12},
 112         'Li': {'growth': 4, 'total': 5},
 113         'MH': {'growth': 1, 'total': 3},
 114         'Mi': {'growth': 0, 'total': 8},
 115         'Ne': {'growth': 2, 'total': 5},
 116         'Pa': {'growth': 2, 'total': 8},
 117         'Re': {'growth': 0, 'total': 3},
 118         'Sp': {'growth': 4, 'total': 6},
 119         'SZ': {'growth': 3, 'total': 6},
 120         'TS': {'growth': 2, 'total': 7},
 121         'TK': {'growth': 3, 'total': 3},
 122         'sum': {'growth': 23, 'total': 81}
 123    },
 124    # Here the totals needed to be reconstructed.
 125    datetime.datetime(2020, 3, 9): {
 126         'CW': {'growth': 4, 'total': 13},
 127         'FK': {'growth': 3, 'total': 12},
 128         'Li': {'growth': 0, 'total': 1},
 129         'MH': {'growth': 1, 'total': 2},
 130         'Mi': {'growth': 0, 'total': 8},
 131         'Ne': {'growth': 1, 'total': 3},
 132         'Pa': {'growth': 1, 'total': 6},
 133         'Re': {'growth': 0, 'total': 3},
 134         'Sp': {'growth': 0, 'total': 2},
 135         'SZ': {'growth': 0, 'total': 3},
 136         'TS': {'growth': 0, 'total': 5},
 137         'TK': {'growth': 0, 'total': 0},
 138         'sum': {'growth': 10, 'total': 58}
 139    },
 140    # Here the growth numbers needed to be reconstructed.
 141    datetime.datetime(2020, 3, 8): {
 142         'CW': {'growth': 0, 'total': 9},
 143         'FK': {'growth': 4, 'total': 9},
 144         'Li': {'growth': 1, 'total': 1},
 145         'MH': {'growth': 0, 'total': 1},
 146         'Mi': {'growth': 0, 'total': 8},
 147         'Ne': {'growth': 0, 'total': 2},
 148         'Pa': {'growth': 0, 'total': 5},
 149         'Re': {'growth': 0, 'total': 3},
 150         'Sp': {'growth': 2, 'total': 2},
 151         'SZ': {'growth': 1, 'total': 3},
 152         'TS': {'growth': 0, 'total': 5},
 153         'TK': {'growth': 0, 'total': 0},
 154         'sum': {'growth': 8, 'total': 48}
 155    },
 156    # Here the growth numbers needed to be reconstructed.
 157    datetime.datetime(2020, 3, 7): {
 158         'CW': {'growth': 6, 'total': 9},
 159         'FK': {'growth': 1, 'total': 5},
 160         'Li': {'growth': 0, 'total': 0},
 161         'MH': {'growth': 0, 'total': 1},
 162         'Mi': {'growth': 1, 'total': 8},
 163         'Ne': {'growth': 0, 'total': 2},
 164         'Pa': {'growth': 1, 'total': 5},
 165         'Re': {'growth': 0, 'total': 3},
 166         'Sp': {'growth': 0, 'total': 0},
 167         'SZ': {'growth': 2, 'total': 2},
 168         'TS': {'growth': 1, 'total': 5},
 169         'TK': {'growth': 0, 'total': 0},
 170         'sum': {'growth': 12, 'total': 40}
 171    },
 172    # Here the growth numbers needed to be reconstructed.
 173    datetime.datetime(2020, 3, 6): {
 174         'CW': {'growth': 1, 'total': 3},
 175         'FK': {'growth': 0, 'total': 4},
 176         'Li': {'growth': 0, 'total': 0},
 177         'MH': {'growth': 0, 'total': 1},
 178         'Mi': {'growth': 4, 'total': 7},
 179         'Ne': {'growth': 1, 'total': 2},
 180         'Pa': {'growth': 1, 'total': 4},
 181         'Re': {'growth': 0, 'total': 3},
 182         'Sp': {'growth': 0, 'total': 0},
 183         'SZ': {'growth': 0, 'total': 0},
 184         'TS': {'growth': 2, 'total': 4},
 185         'TK': {'growth': 0, 'total': 0},
 186         'sum': {'growth': 9, 'total': 28}
 187    },
 188    # Here the growth numbers needed to be reconstructed.
 189    datetime.datetime(2020, 3, 5): {
 190         'CW': {'growth': 2, 'total': 2},
 191         'FK': {'growth': 0, 'total': 4},
 192         'Li': {'growth': 0, 'total': 0},
 193         'MH': {'growth': 0, 'total': 1},
 194         'Mi': {'growth': 0, 'total': 3},
 195         'Ne': {'growth': 0, 'total': 1},
 196         'Pa': {'growth': 1, 'total': 3},
 197         'Re': {'growth': 2, 'total': 3},
 198         'Sp': {'growth': 0, 'total': 0},
 199         'SZ': {'growth': 0, 'total': 0},
 200         'TS': {'growth': 1, 'total': 2},
 201         'TK': {'growth': 0, 'total': 0},
 202         'sum': {'growth': 6, 'total': 19}
 203    },
 204    # Here the growth numbers needed to be reconstructed.
 205    datetime.datetime(2020, 3, 4): {
 206         'CW': {'growth': 0, 'total': 0},
 207         'FK': {'growth': 2, 'total': 4},
 208         'Li': {'growth': 0, 'total': 0},
 209         'MH': {'growth': 0, 'total': 1},
 210         'Mi': {'growth': 0, 'total': 3},
 211         'Ne': {'growth': 0, 'total': 1},
 212         'Pa': {'growth': 1, 'total': 2},
 213         'Re': {'growth': 1, 'total': 1},
 214         'Sp': {'growth': 0, 'total': 0},
 215         'SZ': {'growth': 0, 'total': 0},
 216         'TS': {'growth': 0, 'total': 1},
 217         'TK': {'growth': 0, 'total': 0},
 218         'sum': {'growth': 4, 'total': 13}
 219    },
 220    # Here the growth numbers needed to be reconstructed.
 221    datetime.datetime(2020, 3, 3): {
 222         'CW': {'growth': 0, 'total': 0},
 223         'FK': {'growth': 2, 'total': 2},
 224         'Li': {'growth': 0, 'total': 0},
 225         'MH': {'growth': 0, 'total': 1},
 226         'Mi': {'growth': 0, 'total': 3},
 227         'Ne': {'growth': 0, 'total': 1},
 228         'Pa': {'growth': 1, 'total': 1},
 229         'Re': {'growth': 0, 'total': 0},
 230         'Sp': {'growth': 0, 'total': 0},
 231         'SZ': {'growth': 0, 'total': 0},
 232         'TS': {'growth': 0, 'total': 1},
 233         'TK': {'growth': 0, 'total': 0},
 234         'sum': {'growth': 3, 'total': 9}
 235    },
 236 }
 237 fixes = {
 238    # Here the official total is 215, while the summation of district
 239    # numbers only adds up to 125 – pretty much looks like a mere
 240    # transposition of digits.
 241    datetime.datetime(2020, 3, 27): {
 242        'sum': {
 243            'growth': 125
 244        }
 245    },
 246    # Here the official total is 1937, while the summation of district
 247    # numbers only adds up to 1917; furthermore, the original value for
 248    # SZ is 118 (+18), which makes no sense, as the day before is
 249    # 120 (+15) and the day after is 147 (+15).  The following is a
 250    # compromise to keep as many surrounding numbers stable as possible.
 251    datetime.datetime(2020, 3, 26): {
 252        'SZ': {
 253            'growth': 12
 254        },
 255        'sum': {
 256            'growth': 286
 257        }
 258    },
 259    # Here the official total is 220, while the summation of district
 260    # numbers adds up to 228 – looks like someone misread an 8 as a 0.
 261    datetime.datetime(2020, 3, 25): {
 262        'sum': {
 263            'growth': 220
 264        }
 265    },
 266 }
 267
 268 # Scan navigation bar for maximum pagination value.
 269 url = url_prefix + pm_dir
 270 with urllib.request.urlopen(url) as response:
 271    html = response.read()
 272 soup = bs4.BeautifulSoup(html, 'html.parser')
 273 max_page=0
 274 for link in soup.find_all('a'):
 275     href = link['href']
 276     if str.startswith(href, pm_nav_path):
 277         max_test = int(href.split('=')[1])
 278         max_page = max_test if max_test > max_page else max_page
 279
 280 # Scan paginated press release links for daily Corona number briefing links.
 281 day_urls = []
 282 for i in range(max_page):
 283     url = url_prefix + pm_nav_path + str(i + 1)
 284     with urllib.request.urlopen(url) as response:
 285         html = response.read()
 286     soup = bs4.BeautifulSoup(html, 'html.parser')
 287     for link in soup.find_all('a'):
 288         if (not link.string) or\
 289            (not link.string.startswith('Coronavirus: Derzeit') and
 290             not link.string.startswith('Coronavirus in Berlin: Bestätigte Fälle')):
 291             continue
 292         day_urls += [link['href']]
 293
 294 # Collect infection data.
 295 first_run = True
 296 districts_sorted = []
 297 # TODO: Push limit further back (might need more data fixes for that).
 298 date_limit = datetime.datetime(2020, 3, 12)
 299 for path in day_urls:
 300     url = url_prefix + path
 301     with urllib.request.urlopen(url) as response:
 302         html = response.read()
 303     soup = bs4.BeautifulSoup(html, 'html.parser')
 304     date_title = soup.find('div', class_='pressnumber')
 305     m = re.search('[0-9]+\\.[0-9]+\\.[0-9]+', date_title.string)
 306     date_formatted = m.group(0)
 307     date = datetime.datetime.strptime(date_formatted , '%d.%m.%Y')
 308     if date_limit > date:
 309         break
 310     # On that day, two press releases were released, for that and the prev day.
 311     if date == datetime.datetime(2020, 3, 15) and date in data:
 312        date = datetime.datetime(2020, 3, 14)
 313     # From here on, press releases describe numbers from prev day.
 314     if date <= datetime.datetime(2020, 3, 13):
 315        date = date - datetime.timedelta(days=1)
 316     table = soup.find('table')
 317     if table is None and date in data:
 318         continue
 319     data[date] = {}
 320     for tr in [tr for tr in table.children if type(tr) == bs4.element.Tag][1:]:
 321         printable_tds = []
 322         for td in [td for td in tr.children if type(td) == bs4.element.Tag][:2]:
 323             printable_string = ' '.join([s for s in td.strings])
 324             printable_tds += [printable_string.strip()]
 325         district_long = printable_tds[0]
 326         district_short = [k for k in abbrevs if district_long in abbrevs[k]][0]
 327         if first_run:
 328             districts_sorted += [district_short]
 329         split_char = ' '
 330         if not split_char in printable_tds[1]:
 331             split_char = '('
 332         total_str, growth_str = printable_tds[1].split(split_char)
 333         growth = int(growth_str.replace('(', '').replace(')', '').replace('+', ''))
 334         total = int(total_str.replace('.', ''))
 335         data[date][district_short] = {'growth': growth, 'total': total}
 336     first_run = False
 337 dates_sorted = list(data.keys())
 338 dates_sorted.sort()
 339 dates_sorted.reverse()
 340
 341 # Apply fixes and ensure integrity of results
 342 for date in fixes:
 343     for district in fixes[date]:
 344         for type_ in fixes[date][district]:
 345             data[date][district][type_] = fixes[date][district][type_]
 346 for date in dates_sorted:
 347     if date in fixes:
 348        continue
 349     for district in [d for d in districts_sorted if not d=='sum']:
 350         prev_date = date - datetime.timedelta(days=1)
 351         if prev_date not in dates_sorted:
 352            if prev_date >= date_limit:
 353               raise Exception('Dates not contiguous: %s missing', prev_date)
 354            else:
 355               continue
 356         prev_total = data[prev_date][district]['total']
 357         cur_total = data[date][district]['total']
 358         if cur_total - data[date][district]['growth'] != prev_total:
 359             raise Exception('Questionable district infection total in %s/%s' % (district, date))
 360     day_sum = 0
 361     for district in [d for d in districts_sorted if not d=='sum']:
 362        day_sum += data[date][district]['total']
 363     if day_sum != data[date]['sum']['total']:
 364         raise Exception('Questionable district infection total sum in %s' % date)
 365     day_sum = 0
 366     for district in [d for d in districts_sorted if not d=='sum']:
 367        day_sum += data[date][district]['growth']
 368     if day_sum != data[date]['sum']['growth']:
 369         raise Exception('Questionable district infection growth sum in %s' % date)
 370
 371 # Final output.
 372 print(' '*10, ' '.join(['%3s' % d for d in districts_sorted]))
 373 for date in dates_sorted:
 374     growths = []
 375     for d in districts_sorted:
 376         growths += [data[date][d]['growth']]
 377     print(date.strftime('%Y-%m-%d'), ' '.join(['%3s' % g for g in growths]))