plomlompom.com Git - berlin-corona-table/blob - enhance_table.py

   1 #!//usr/bin/env python3
   2
   3 import sys
   4 if len(sys.argv) != 2:
   5     print('Expecting infections table file path as only argument.')
   6     exit(1)
   7 infections_table = sys.argv[1]
   8
# District population numbers as per Wikipedia, keyed by the district
# abbreviations that the infections table has to use as its column headers
# (the header validation compares against exactly these keys).
# 'sum' stands for all districts together; its value equals the total of the
# twelve individual district populations.
district_pops = {
  'CW': 342332,
  'FK': 289762,
  'Li': 291452,
  'MH': 268548,
  'Mi': 384172,
  'Ne': 329691,
  'Pa': 407765,
  'Re': 265225,
  'Sp': 243977,
  'SZ': 308697,
  'TS': 351644,
  'TK': 271153,
  'sum': 3754418,
}
  25
  26 f = open(infections_table, 'r')
  27 lines = f.readlines()
  28 f.close()
  29
  30 # Basic input validation.
  31 import datetime
  32 header_elements = lines[0].split()
  33 if set(header_elements) != district_pops.keys() or \
  34        len(header_elements) != len(district_pops.keys()):
  35     raise Exception('infections table: invalid header')
  36 line_count = 0
  37 for line in lines[1:]:
  38     line_count += 1
  39     fields = line.split()
  40     if len(header_elements) != len(fields) - 1:
  41         raise Exception('infections table: too many elements on line %s',
  42                         line_count)
  43     try:
  44         datetime.date.fromisoformat(fields[0])
  45     except ValueError:
  46         raise Exception('infections table: bad ISO date on line %s',
  47                         line_count)
  48     for field in fields[1:]:
  49         try:
  50             int(field)
  51         except ValueError:
  52             raise Exception('infections table: bad value on line %s',
  53                             line_count)
  54
  55 # Parse first table file line for the names and order of districts.
  56 db = {}
  57 sorted_districts = []
  58 for header in lines[0].split():
  59     sorted_districts += [header]
  60     db[header] = {}
  61
  62 # Seed DB with daily new infections data per district, per date.
  63 sorted_dates = []
  64 for line in lines[1:]:
  65     fields = line.split()
  66     date = fields[0]
  67     sorted_dates += [date]
  68     for i in range(len(sorted_districts)):
  69         district = sorted_districts[i]
  70         district_data = fields[i + 1]
  71         db[district][date] = {'new_infections': int(district_data)}
  72 sorted_dates.sort()
  73
  74 # In LaGeSo's data, the last "district" is actually the sum of all districts /
  75 # the whole of Berlin.
  76 #
  77 # Fail on any day where the "sum" district's new infections are not the proper
  78 # sum of the individual districts new infections.  Yes, sometimes Lageso sends
  79 # data that is troubled in this way.  It will then have to be fixed manually in
  80 # the table file, since we should have a human look at what mistake was
  81 # probably made.
  82 for date in sorted_dates:
  83     sum_district = sorted_districts[-1]
  84     day_sum = 0
  85     for district in sorted_districts[:-1]:
  86         day_sum += db[district][date]['new_infections']
  87     if day_sum != db[sum_district][date]['new_infections']:
  88         raise Exception('Questionable district infection sum in %s' % date)
  89
  90 # Enhance DB with data about weekly sums, averages, incidences per day.  Ignore
  91 # days that have less than 6 predecessors (we can only know a weekly average if
  92 # we have a whole week of data).
  93 for i in range(len(sorted_dates)):
  94     if i < 6:
  95         continue
  96     date = sorted_dates[i]
  97     week_dates = []
  98     for j in range(7):
  99         week_dates += [sorted_dates[i - j]]
 100     for district in sorted_districts:
 101         district_pop = district_pops[district]
 102         week_sum = 0
 103         for week_date in week_dates:
 104             week_sum += db[district][week_date]['new_infections']
 105         db[district][date]['week_sum'] = week_sum
 106         db[district][date]['week_average'] = week_sum / 7
 107         db[district][date]['week_incidence'] = (week_sum / district_pop) * 100000
 108
 109 # Explain what this is.
 110 intro = """Table of Berlin's Corona infection number development by districts.
 111 Updated daily around 9pm.
 112
 113 Abbrevations/explanations:
 114
 115 CW: Charlottenburg-Wilmersdorf
 116 FK: Friedrichshain-Kreuzberg
 117 Li: Lichtenberg
 118 MH: Marzahn-Hellersdorf
 119 Mi: Mitte
 120 Ne: Neukölln
 121 Pa: Pankow
 122 Re: Reinickendorf
 123 Sp: Spandau
 124 SZ: Steglitz-Zehlendorf
 125 TS: Tempelhof-Schöneberg
 126 TK: Treptow-Köpenick
 127 sum: sum for all the districts
 128 wsum: sum for last 7 days
 129 wavg: per-day average of new infections for last 7 days
 130 winc: incidence (x per 100k inhabitants) of new infections for last 7 days
 131
 132 Source code: https://plomlompom.com/repos/?p=berlin-corona-table
 133 """
 134 print(intro)
 135
 136 # Output table of enhanced daily infection data, newest on top, separated into
 137 # 7-day units.
 138 sorted_dates.reverse()
 139 weekday_count = 0
 140 for date in sorted_dates:
 141
 142     # Week table header.
 143     if weekday_count == 0:
 144         print(' '*11, '  '.join(sorted_districts[:-1]),
 145               sorted_districts[-1], 'wsum', ' wavg', 'winc')
 146         week_start_date = date
 147
 148     # Day data line.
 149     new_infections = []
 150     for district in sorted_districts:
 151         new_infections += [db[district][date]['new_infections']]
 152     week_sum = week_avg = week_inc = ''
 153     sum_district = sorted_districts[-1]
 154     sum_district_data = db[sum_district][date]
 155     if 'week_sum' in sum_district_data:
 156         week_sum = '%4s' % sum_district_data['week_sum']
 157     if 'week_average' in sum_district_data:
 158         week_avg = '%5.1f' % sum_district_data['week_average']
 159     if 'week_incidence' in sum_district_data:
 160         week_inc = '%4.1f' % sum_district_data['week_incidence']
 161     print(date, ' '.join(['%3s' % infections for infections in new_infections]),
 162           week_sum, week_avg, week_inc)
 163
 164     # Maintain 7-day cycle.
 165     weekday_count += 1
 166     if weekday_count != 7:
 167         continue
 168     weekday_count = 0
 169
 170     # After each 7 days, print summary for individual districts.
 171     weekly_sums = []
 172     weekly_avgs = []
 173     weekly_incs = []
 174     for district in sorted_districts[:-1]:
 175         weekly_sums += [db[district][week_start_date]['week_sum']]
 176         weekly_avgs += [db[district][week_start_date]['week_average']]
 177         weekly_incs += [db[district][week_start_date]['week_incidence']]
 178     print()
 179     print('district stats for week from %s to %s:' % (date, week_start_date))
 180     print(' '*7, '    '.join(sorted_districts[:-1]))
 181     print('wsum', ' '.join(['%5.1f' % wsum for wsum in weekly_sums]))
 182     print('wavg', ' '.join(['%5.1f' % wavg for wavg in weekly_avgs]))
 183     print('winc', ' '.join(['%5.1f' % winc for winc in weekly_incs]))
 184     print()