forked from cdcepi/Flusight-forecast-data
-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathprocess_predictions.py
More file actions
120 lines (106 loc) · 4.64 KB
/
process_predictions.py
File metadata and controls
120 lines (106 loc) · 4.64 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
"""
This is for parsing from CDC's github repo
Output: score card containing all models
i.e.,
model, forecast_week, ahead, location (as region abbreviation), type, quantile, value
e.g.
GT-FluFNP, 202205, 1, CA, point, NaN, 843
GT-FluFNP, 202205, 1, CA, quantile, 0.01, 338
....
GT-FluFNP, 202205, 2, CA, point, NaN, 900
GT-FluFNP, 202205, 2, CA, quantile, 0.01, 438
"""
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import glob
from epiweeks import Week
import pdb
# death_target = ['1 wk ahead inc death' , '2 wk ahead inc death' , '3 wk ahead inc death' , '4 wk ahead inc death']
data_ew = Week.thisweek(system="CDC") - 1 # -1 because we have data for the previous (ending) week
DIR = './data-forecasts/'
models = [file.split(DIR)[1] for file in glob.glob(DIR + '/*') if ".md" not in file]
models.remove('GT-FluFNP-raw')
models.remove('Umass-ARIMA')
location_df = pd.read_csv('data-locations/locations.csv')
location_dict = {location_df['location'][i]:location_df['abbreviation'][i] for i in range(len(location_df))}
# for each model, get all submissions
df_list = []
model_point = ['CMU-TimeSeries', 'Flusight-ensemble', 'LUcompUncertLab-TEVA',
'LUcompUncertLab-VAR2', 'LUcompUncertLab-VAR2K', 'LUcompUncertLab-VAR2K_plusCOVID', 'LUcompUncertLab-VAR2_plusCOVID',
'LUcompUncertLab-humanjudgment','LosAlamos_NAU-CModel_Flu','UT_FluCast-Voltaire']
print(models)
for model in models:
model_dir = DIR + '/' + model + '/'
all_items_path = np.array(glob.glob(model_dir + '*.csv')) # list all csv files' paths
all_items = [path.replace(model_dir, '') for path in all_items_path] #list of all csv files' names
"""
remove forecasts that were duplicated in a given week (if any)
forecasts file should be unique for each epiweek
"""
subm_dict = {}
for i, item in enumerate(all_items):
date = datetime.strptime(item[:10], '%Y-%m-%d')
epiweek = date.isocalendar()[1]
if epiweek in subm_dict.keys():
if subm_dict[epiweek][0] <= date:
subm_dict[epiweek] = (date, i)
else:
subm_dict[epiweek] = (date, i)
select = [ value[1] for key, value in subm_dict.items()]
select_paths = all_items_path[select]
data_model = []
for path in select_paths:
df = pd.read_csv(path)
"""
create epiweek column
"""
date = path.split('/')[-1][:10]
# epiweek ends on Saturday, but submission is until Monday.
# we can subtract 2 days, thus, submission on Monday will be considered in the prev week
# this also aligns submission week and data
date = datetime.strptime(date, '%Y-%m-%d') - timedelta(days=2)
forecast_week = Week.fromdate(date)
df['forecast_week'] = forecast_week
#pdb.set_trace()
data_model.append(df)
# join all dataframes saved in data_model
"""
select, rename and sort columns
"""
"""
convert location to region abbreviation
"""
print(model, 'predicted', len(data_model), 'weeks')
df = pd.concat(data_model, ignore_index=True, sort=False)
df = df.rename(columns={'target': 'ahead'})
model_list = []
df['location']= df['location'].astype(str)
for i in range(len(df)):
key = df['location'][i]
if len(key) == 1:
key = '0' + key
df.at[i, 'location'] = location_dict[key]
df.at[i, 'ahead'] = df['ahead'][i][0]
model_list.append(model)
df['model'] = model
df = df[['model', 'forecast_week', 'ahead', 'location', 'type', 'quantile', 'value']]
final_row = {'model': [], 'forecast_week': [], 'ahead':[], 'location':[],'type':[],'quantile':[],
'value':[]}
for index, row in df.iterrows():
if row['quantile'] == 0.5 and model in model_point:
final_row['model'].append(row['model'])
final_row['forecast_week'].append(row['forecast_week'])
final_row['ahead'].append(row['ahead'])
final_row['location'].append(row['location'])
final_row['type'].append('point')
final_row['quantile'].append(np.nan)
final_row['value'].append(row['value'])
df2 = pd.DataFrame(final_row)
df3 = pd.concat([df,df2], ignore_index = False)
df3 = df3.sort_values(by=['forecast_week', 'location', 'ahead', 'type'], ascending=[True, True,True,True])
df_list.append(df3)
df = pd.concat(df_list, ignore_index=True, sort=False)
df = df.sort_values(by=['model','forecast_week', 'location', 'ahead', 'type'], ascending=[True,True, True,True,True])
df.to_csv('./predictions.csv',index=False)
print("done")