-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcap_helper.py
More file actions
144 lines (98 loc) · 4.87 KB
/
cap_helper.py
File metadata and controls
144 lines (98 loc) · 4.87 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import pandas as pd
import numpy as np
import re
"""
Notes on time conversion:
Where days or months are missing from the date, parsing defaults to the 1st day/month, so times before the DOD will be overestimated. One option would be to use years rather than months.
Probably missing some patterns below, but this should cover most cases (notably '21 Jan 2020' will leave a hanging '21'). There is one typo written '22/112007' which fails silently (the date is just replaced with a space), and other dates in the format 21-01-2020 which also fail.
"""
def load_data(data_dir):
    """Load the committee-review spreadsheet found under *data_dir*.

    Parameters
    ----------
    data_dir : str
        Directory prefix (including trailing separator) holding the
        Excel export.

    Returns
    -------
    pandas.DataFrame
        Contents of the '_20191028_committee_reviews_nlp' sheet.
    """
    source_path = data_dir + '20191028_committee_reviews_nlp_code.xlsx'
    return pd.read_excel(source_path,
                         sheet_name='_20191028_committee_reviews_nlp',
                         engine='openpyxl')
def concatenate_feature_columns(df, columns=None, remove_nl=True):
if columns is None:
# create column with concatenation of all columns for any case,
# except for the ones we are trying to predict (the last two)
cols = df.columns
cols = cols[2:27] # This excludes 'cp1id', 'cp1vig date completed', 'cp1vig summary'
cols = [c for c in cols if 'palliative' not in c] # remove palliative column because confounded
else:
cols = columns
df['combined'] = df[cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
df['combined'] = df['combined'].replace({'nan': ''}, regex=True)
if remove_nl:
df['combined'] = df['combined'].replace({'_x000d_': ''}, regex=True)
return df
def add_dates(df, data_dir):
    """Merge death/diagnosis dates onto *df* and parse all date columns.

    Reads the extra date-of-death/date-of-diagnosis file plus the id
    lookup from *data_dir*, joins them onto *df* via 'cp1id', and
    converts the three date columns to datetimes (day-first format).

    Returns the merged frame (the input frame is not modified).
    """
    dod = pd.read_csv(data_dir + '20200423_extra_dod_dodx.txt')
    lookup = pd.read_csv(data_dir + '20200429_cap_id_lookup.txt')
    dod = dod.merge(lookup, on='cp1random_id_5_char')
    df = df.merge(dod, on='cp1id')
    for date_col in ('cnr19datedeath', 'cnr_date_pca_diag', 'cp1vig date completed'):
        df[date_col] = pd.to_datetime(df[date_col], dayfirst=True)
    return df
def add_reviewer_ids(df, data_dir):
    """Attach reviewer identifiers by merging the update file on 'cp1id'."""
    reviewer_file = data_dir + '20191119_committee_reviews_nlp_code_update.csv'
    return df.merge(pd.read_csv(reviewer_file), on='cp1id')
def get_reviewer_counts(df):
    """Return a one-column frame of review counts per vignette author."""
    counts = df[['cp1id', 'vig_author']].groupby('vig_author').count()
    return counts.rename(columns={'cp1id': 'number of reviews'})
def get_date_diff_string(date1, date2, months=True):
    """Describe *date1* relative to *date2* as a token like '3moaf'.

    Suffixes: 'moaf'/'mobf' = months after/before, 'yeaf'/'yebf' =
    years after/before.  Months are approximated as 30.417 days and
    years as 365.25 days; the rounded magnitude is always reported as
    a non-negative integer, the suffix carries the sign.
    """
    day_diff = (date1 - date2).days
    if months:
        magnitude = np.round(day_diff / 30.417)
        suffix = "moaf" if magnitude > 0 else "mobf"
    else:
        magnitude = np.round(day_diff / 365.25)
        suffix = "yeaf" if magnitude > 0 else "yebf"
    return str(int(np.abs(magnitude))) + suffix
def replace_single_matches(text, pattern, relative_to, verbose=False):
    """Replace every match of *pattern* in *text* with a relative-time token.

    Each matched date string is parsed day-first and rewritten via
    get_date_diff_string relative to *relative_to*.  Matches that fail
    to parse are replaced with a single space so the search can advance
    past them (this is the silent failure mentioned in the module note).

    Parameters
    ----------
    text : str
        Free text possibly containing dates.
    pattern : str
        Regular expression matching one date format.
    relative_to : datetime-like
        Anchor date (typically the date of death).
    verbose : bool
        When True, report matches that could not be parsed.

    Returns
    -------
    str
        *text* with all matches replaced.
    """
    match = re.search(pattern, text)
    while match is not None:
        try:
            repl = get_date_diff_string(pd.to_datetime(match.group(), dayfirst=True), relative_to)
        except ValueError:
            if verbose:
                # BUG FIX: corrected the misspelled 'Invaldid' in the message.
                print("Invalid date format: ", match)
            repl = ' '
        text = text[0:match.span()[0]] + repl + text[match.span()[1]:]
        match = re.search(pattern, text)
    return text
def replace_dates(text, relative_to=pd.to_datetime('01/01/2010')):
    """Replace every recognisable date in *text* with a relative-time token.

    Patterns are tried from most to least specific so that e.g.
    '21/02/2020' is consumed before the bare-year pattern can grab its
    '2020'.

    Parameters
    ----------
    text : str
        Free text possibly containing dates.
    relative_to : datetime-like
        Anchor date passed through to replace_single_matches.

    Returns
    -------
    str
        *text* with dates rewritten as tokens like '3moaf'.
    """
    patterns = [r'\d{1,2}\/\d{1,2}\/\d{4}',  # format:21/02/2020
                r'\d{1,2}\/\d{4}',  # format:02/2020
                r'(?:January|February|March|April|May|June|July|August|September|October|November|December)[\s-]\d{1,2}[\s-]\d{2,4}',  # format: January 02 2020
                r'(?:January|February|March|April|May|June|July|August|September|October|November|December)[\s-]\d{2,4}',  # format: January 2020
                r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[\s-]\d{2,4}',  # format: Jan 2020
                # BUG FIX: the original list was missing the comma above, so
                # Python implicitly concatenated the two string literals into
                # one broken pattern and bare years were never replaced.
                r'(?:19|20)\d{2}']  # format: 2020
    for pattern in patterns:
        text = replace_single_matches(text, pattern, relative_to)
    return text
def convert_dates_relative(df):
    """Rewrite dates in 'combined' as tokens relative to each row's death date."""
    df['combined'] = [replace_dates(text, death_date)
                      for text, death_date in zip(df['combined'], df['cnr19datedeath'])]
    return df
def get_easy_and_hard_cases(df, subset_x, subset_y):
    """Split a feature/label subset by the cp1do_cod_route difficulty class.

    Routes 1 and 5 are treated as 'easy', routes 2 and 4 as 'hard';
    rows with any other route are reported and skipped.  *subset_y*'s
    index is used to look the route up in *df*.

    Returns
    -------
    tuple
        (easy_x, hard_x, easy_y, hard_y) lists, in that order.
    """
    EASY_ROUTES = (1, 5)
    HARD_ROUTES = (2, 4)
    easy_x, easy_y = [], []
    hard_x, hard_y = [], []
    for pos, features in enumerate(subset_x):
        route = df.loc[subset_y.index[pos]].cp1do_cod_route
        label = subset_y.iloc[pos]
        if route in EASY_ROUTES:
            easy_x.append(features)
            easy_y.append(label)
        elif route in HARD_ROUTES:
            hard_x.append(features)
            hard_y.append(label)
        else:
            print("Not easy or hard: ", pos)
    return easy_x, hard_x, easy_y, hard_y