plomlompom.com Git - berlin-corona-table/blob - scrape.py

   1 #!/usr/bin/env python3
   2 import urllib.request
   3 import datetime
   4 import bs4
   5 import re
   6
   7 url_prefix = 'https://www.berlin.de'
   8 pm_dir = '/sen/gpg/service/presse/2020/'
   9 pm_nav_path = pm_dir + '?page_at_1_0='
  10
  11 # Map abbreviations to full names (and their alternate spellings).
  12 abbrevs = {
  13   'CW': {'Charlottenburg-Wilmersdorf'},
  14   'FK': {'Friedrichshain-Kreuzberg'},
  15   'Li': {'Lichtenberg'},
  16   'MH': {'Marzahn-Hellersdorf'},
  17   'Mi': {'Mitte'},
  18   'Ne': {'Neukölln', 'Neuköln'},
  19   'Pa': {'Pankow'},
  20   'Re': {'Reinickendorf'},
  21   'Sp': {'Spandau'},
  22   'SZ': {'Steglitz-Zehlendorf'},
  23   'TS': {'Tempelhof-Schöneberg'},
  24   'TK': {'Treptow-Köpenick'},
  25   'sum': {'Summe', 'Berlin'},
  26 }
  27
  28 # Here only image files are available for the table data.
  29 unparsable_graphics_fallback = {
  30     datetime.datetime(2020, 7, 2): {
  31         'CW': {'growth': 4, 'total': 851},
  32         'FK': {'growth': 10, 'total': 681},
  33         'Li': {'growth': 3, 'total': 427},
  34         'MH': {'growth': 4, 'total': 468},
  35         'Mi': {'growth': 0, 'total': 1202},
  36         'Ne': {'growth': 7, 'total': 1031},
  37         'Pa': {'growth': 3, 'total': 784},
  38         'Re': {'growth': 6, 'total': 660},
  39         'Sp': {'growth': 3, 'total': 450},
  40         'SZ': {'growth': 0, 'total': 591},
  41         'TS': {'growth': 3, 'total': 798},
  42         'TK': {'growth': 0, 'total': 401},
  43         'sum': {'growth': 43, 'total': 8344}
  44     },
  45     datetime.datetime(2020, 4, 5): {
  46         'CW': {'growth': 9, 'total': 462},
  47         'FK': {'growth': 2, 'total': 352},
  48         'Li': {'growth': 0, 'total': 142},
  49         'MH': {'growth': 3, 'total': 127},
  50         'Mi': {'growth': 14, 'total': 537},
  51         'Ne': {'growth': 0, 'total': 392},
  52         'Pa': {'growth': 10, 'total': 378},
  53         'Re': {'growth': 9, 'total': 248},
  54         'Sp': {'growth': 3, 'total': 150},
  55         'SZ': {'growth': 0, 'total': 312},
  56         'TS': {'growth': 8, 'total': 394},
  57         'TK': {'growth': 3, 'total': 193},
  58         'sum': {'growth': 61, 'total': 3687}
  59     },
  60     datetime.datetime(2020, 4, 4): {
  61         'CW': {'growth': 2, 'total': 453},
  62         'FK': {'growth': 7, 'total': 350},
  63         'Li': {'growth': 0, 'total': 142},
  64         'MH': {'growth': 15, 'total': 124},
  65         'Mi': {'growth': 22, 'total': 523},
  66         'Ne': {'growth': 15, 'total': 392},
  67         'Pa': {'growth': 10, 'total': 368},
  68         'Re': {'growth': 5, 'total': 239},
  69         'Sp': {'growth': 21, 'total': 147},
  70         'SZ': {'growth': 12, 'total': 312},
  71         'TS': {'growth': 24, 'total': 386},
  72         'TK': {'growth': 7, 'total': 190},
  73         'sum': {'growth': 140, 'total': 3626}
  74     },
  75     datetime.datetime(2020, 4, 3): {
  76         'CW': {'growth': 44, 'total': 451},
  77         'FK': {'growth': 17, 'total': 343},
  78         'Li': {'growth': 7, 'total': 142},
  79         'MH': {'growth': 4, 'total': 109},
  80         'Mi': {'growth': 4, 'total': 501},
  81         'Ne': {'growth': 40, 'total': 377},
  82         'Pa': {'growth': 39, 'total': 358},
  83         'Re': {'growth': 26, 'total': 234},
  84         'Sp': {'growth': 9, 'total': 126},
  85         'SZ': {'growth': 18, 'total': 300},
  86         'TS': {'growth': 41, 'total': 362},
  87         'TK': {'growth': 14, 'total': 183},
  88         'sum': {'growth': 263, 'total': 3486}
  89     }
  90 }
  91 fixes = {
  92    # Here the official total is 215, while the summation of district
  93    # numbers only adds up to 125 – pretty much looks like a mere
  94    # transposition of digits.
  95    datetime.datetime(2020, 3, 27): {
  96        'sum': {
  97            'growth': 125
  98        }
  99    },
 100    # Here the official total is 1937, while the summation of district
 101    # numbers only adds up to 1917; furthermore, the original value for
 102    # SZ is 118 (+18), which makes no sense, as the day before is
 103    # 120 (+15) and the day after is 147 (+15).  The following is a
 104    # compromise to keep as many surrounding numbers stable as possible.
 105    datetime.datetime(2020, 3, 26): {
 106        'SZ': {
 107            'growth': 12,
 108            'total': 132
 109        },
 110        'sum': {
 111            'growth': 286,
 112            'total': 1931
 113        }
 114    },
 115    # Here the official total is 220, while the summation of district
 116    # numbers adds up to 228 – looks like someone misread an 8 as a 0.
 117    datetime.datetime(2020, 3, 25): {
 118        'sum': {
 119            'growth': 220
 120        }
 121    },
 122 }
 123
 124 # Scan navigation bar for maximum pagination value.
 125 url = url_prefix + pm_dir
 126 with urllib.request.urlopen(url) as response:
 127    html = response.read()
 128 soup = bs4.BeautifulSoup(html, 'html.parser')
 129 max_page=0
 130 for link in soup.find_all('a'):
 131     href = link['href']
 132     if str.startswith(href, pm_nav_path):
 133         max_test = int(href.split('=')[1])
 134         max_page = max_test if max_test > max_page else max_page
 135
 136 # Scan paginated press release links for daily Corona number briefing links.
 137 day_urls = []
 138 for i in range(max_page):
 139     url = url_prefix + pm_nav_path + str(i + 1)
 140     with urllib.request.urlopen(url) as response:
 141         html = response.read()
 142     soup = bs4.BeautifulSoup(html, 'html.parser')
 143     for link in soup.find_all('a'):
 144         if (not link.string) or\
 145            (not link.string.startswith('Coronavirus: Derzeit') and
 146             not link.string.startswith('Coronavirus in Berlin: Bestätigte Fälle')):
 147             continue
 148         day_urls += [link['href']]
 149
 150 # Collect infection data.
 151 data = {}
 152 first_run = True
 153 districts_sorted = []
 154 # TODO: Push limit further back (might need more data fixes for that).
 155 date_limit = datetime.datetime(2020, 3, 16)
 156 for path in day_urls:
 157     url = url_prefix + path
 158     with urllib.request.urlopen(url) as response:
 159         html = response.read()
 160     soup = bs4.BeautifulSoup(html, 'html.parser')
 161     date_title = soup.find('div', class_='pressnumber')
 162     m = re.search('[0-9]+\\.[0-9]+\\.[0-9]+', date_title.string)
 163     date_formatted = m.group(0)
 164     date = datetime.datetime.strptime(date_formatted , '%d.%m.%Y')
 165     if date_limit > date:
 166         break
 167     if date in data:
 168         raise Exception('Double date %s', date)
 169         #date -= datetime.timedelta(days=1)
 170     data[date] = {}
 171     table = soup.find('table')
 172     if table is None:
 173         data[date] = unparsable_graphics_fallback[date]
 174         continue
 175     for tr in [tr for tr in table.children if type(tr) == bs4.element.Tag][1:]:
 176         printable_tds = []
 177         for td in [td for td in tr.children if type(td) == bs4.element.Tag][:2]:
 178             printable_string = ' '.join([s for s in td.strings])
 179             printable_tds += [printable_string.strip()]
 180         district_long = printable_tds[0]
 181         district_short = [k for k in abbrevs if district_long in abbrevs[k]][0]
 182         if first_run:
 183             districts_sorted += [district_short]
 184         split_char = ' '
 185         if not split_char in printable_tds[1]:
 186             split_char = '('
 187         total_str, growth_str = printable_tds[1].split(split_char)
 188         growth = int(growth_str.replace('(', '').replace(')', '').replace('+', ''))
 189         total = int(total_str.replace('.', ''))
 190         data[date][district_short] = {'growth': growth, 'total': total}
 191     first_run = False
 192 dates_sorted = list(data.keys())
 193 dates_sorted.sort()
 194 dates_sorted.reverse()
 195
 196 # Apply fixes and ensure integrity of results
 197 for date in fixes:
 198     for district in fixes[date]:
 199         for type_ in fixes[date][district]:
 200             data[date][district][type_] = fixes[date][district][type_]
 201 for date in dates_sorted:
 202     if date in fixes:
 203        continue
 204     for district in [d for d in districts_sorted if not d=='sum']:
 205         prev_date = date - datetime.timedelta(days=1)
 206         if prev_date not in dates_sorted:
 207             # TODO: ensure dates until end of list are continuous
 208             continue
 209         prev_total = data[date - datetime.timedelta(days=1)][district]['total']
 210         cur_total = data[date][district]['total']
 211         if cur_total - data[date][district]['growth'] != prev_total:
 212             raise Exception('Questionable district infection total in %s/%s' % (district, date))
 213     day_sum = 0
 214     for district in [d for d in districts_sorted if not d=='sum']:
 215        day_sum += data[date][district]['total']
 216     if day_sum != data[date]['sum']['total']:
 217         raise Exception('Questionable district infection total sum in %s' % date)
 218     day_sum = 0
 219     for district in [d for d in districts_sorted if not d=='sum']:
 220        day_sum += data[date][district]['growth']
 221     if day_sum != data[date]['sum']['growth']:
 222         raise Exception('Questionable district infection growth sum in %s' % date)
 223
 224 # Final output.
 225 print(' '*10, ' '.join(['%3s' % d for d in districts_sorted]))
 226 for date in dates_sorted:
 227     growths = []
 228     for d in districts_sorted:
 229         growths += [data[date][d]['growth']]
 230     print(date.strftime('%Y-%m-%d'), ' '.join(['%3s' % g for g in growths]))