Grow scraper date range.

author Christian Heller <c.heller@plomlompom.de>

Sun, 26 Jul 2020 17:52:18 +0000 (19:52 +0200)

committer Christian Heller <c.heller@plomlompom.de>

Sun, 26 Jul 2020 17:52:18 +0000 (19:52 +0200)
author Christian Heller <c.heller@plomlompom.de>
Sun, 26 Jul 2020 17:52:18 +0000 (19:52 +0200)
committer Christian Heller <c.heller@plomlompom.de>
Sun, 26 Jul 2020 17:52:18 +0000 (19:52 +0200)
diff --git a/scrape.py b/scrape.py

index e449b998b2821f4fe1fe9539fc1fbf28b2898b9c..de458f4234a241bff448dc6252df12b6ed2e9636 100755 (executable)
--- a/scrape.py
+++ b/scrape.py
@@ -25,8 +25,9 @@ abbrevs = {
    'sum': {'Summe', 'Berlin'},
  }
  
-# Here only image files are available for the table data.
-unparsable_graphics_fallback = {
+# some pre-filled values
+data = {
+   # For these, only image files are available for the table data.
      datetime.datetime(2020, 7, 2): {
          'CW': {'growth': 4, 'total': 851},
          'FK': {'growth': 10, 'total': 681},
@@ -86,7 +87,24 @@ unparsable_graphics_fallback = {
          'TS': {'growth': 41, 'total': 362},
          'TK': {'growth': 14, 'total': 183},
          'sum': {'growth': 263, 'total': 3486}
-    }
+    },
+   # This one has no press release but can be reconstructed from
+   # the neighbour ones.
+   datetime.datetime(2020, 3, 13): {
+        'CW': {'growth': 16, 'total': 47},
+        'FK': {'growth': 8, 'total': 22},
+        'Li': {'growth': 2, 'total': 8},
+        'MH': {'growth': 1, 'total': 4},
+        'Mi': {'growth': 9, 'total': 29},
+        'Ne': {'growth': 6, 'total': 16},
+        'Pa': {'growth': 11, 'total': 26},
+        'Re': {'growth': 0, 'total': 11},
+        'Sp': {'growth': 1, 'total': 9},
+        'SZ': {'growth': 0, 'total': 20},
+        'TS': {'growth': 1, 'total': 17},
+        'TK': {'growth': 3, 'total': 7},
+        'sum': {'growth': 58, 'total': 216}
+   }
  }
  fixes = {
     # Here the official total is 215, while the summation of district
@@ -146,11 +164,10 @@ for i in range(max_page):
          day_urls += [link['href']]
  
  # Collect infection data.
-data = {}
  first_run = True
  districts_sorted = []
  # TODO: Push limit further back (might need more data fixes for that).
-date_limit = datetime.datetime(2020, 3, 16)
+date_limit = datetime.datetime(2020, 3, 12)
  for path in day_urls:
      url = url_prefix + path
      with urllib.request.urlopen(url) as response:
@@ -162,14 +179,16 @@ for path in day_urls:
      date = datetime.datetime.strptime(date_formatted , '%d.%m.%Y')
      if date_limit > date:
          break
-    if date in data:
-        raise Exception('Double date %s', date)
-        #date -= datetime.timedelta(days=1)
-    data[date] = {}
+    # On 2020-03-15, two press releases were published: one for that day and one for the previous day.
+    if date == datetime.datetime(2020, 3, 15) and date in data:
+       date = datetime.datetime(2020, 3, 14)
+    # Press releases dated 2020-03-13 or earlier describe the previous day's numbers.
+    if date <= datetime.datetime(2020, 3, 13):
+       date = date - datetime.timedelta(days=1)
      table = soup.find('table')
-    if table is None:
-        data[date] = unparsable_graphics_fallback[date]
+    if table is None and date in data:
          continue
+    data[date] = {}
      for tr in [tr for tr in table.children if type(tr) == bs4.element.Tag][1:]:
          printable_tds = []
          for td in [td for td in tr.children if type(td) == bs4.element.Tag][:2]:
@@ -206,7 +225,7 @@ for date in dates_sorted:
                raise Exception('Dates not contiguous: %s missing', prev_date)
             else:
                continue
-        prev_total = data[date - datetime.timedelta(days=1)][district]['total']
+        prev_total = data[prev_date][district]['total']
          cur_total = data[date][district]['total']
          if cur_total - data[date][district]['growth'] != prev_total:
              raise Exception('Questionable district infection total in %s/%s' % (district, date))
author	Christian Heller <c.heller@plomlompom.de>
	Sun, 26 Jul 2020 17:52:18 +0000 (19:52 +0200)
committer	Christian Heller <c.heller@plomlompom.de>
	Sun, 26 Jul 2020 17:52:18 +0000 (19:52 +0200)