-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcap_helper.py
More file actions
144 lines (98 loc) · 4.87 KB
/
cap_helper.py
File metadata and controls
144 lines (98 loc) · 4.87 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import pandas as pd
import numpy as np
import re
"""
Notes on time conversion:
Where days or months are missing from the date, parsing defaults to the 1st day/month, so times before the DOD will be overestimated. One option would be to use years rather than months.
Probably missing some patterns below, but this should cover most cases (notably '21 Jan 2020' will leave a hanging '21'). There is one typo written '22/112007' which fails silently (the date is just replaced with a space), and other dates in the format 21-01-2020 which also fail.
"""
def load_data(data_dir):
    """Load the committee-review spreadsheet found under *data_dir*.

    Parameters
    ----------
    data_dir : str
        Directory prefix (including trailing separator) holding the
        Excel export.

    Returns
    -------
    pandas.DataFrame
        Contents of the '_20191028_committee_reviews_nlp' sheet.
    """
    source_path = data_dir + '20191028_committee_reviews_nlp_code.xlsx'
    return pd.read_excel(source_path,
                         sheet_name='_20191028_committee_reviews_nlp',
                         engine='openpyxl')
def concatenate_feature_columns(df, columns=None, remove_nl=True):
if columns is None:
# create column with concatenation of all columns for any case,
# except for the ones we are trying to predict (the last two)
cols = df.columns
cols = cols[2:27] # This excludes 'cp1id', 'cp1vig date completed', 'cp1vig summary'
cols = [c for c in cols if 'palliative' not in c] # remove palliative column because confounded
else:
cols = columns
df['combined'] = df[cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
df['combined'] = df['combined'].replace({'nan': ''}, regex=True)
if remove_nl:
df['combined'] = df['combined'].replace({'_x000d_': ''}, regex=True)
return df
def add_dates(df, data_dir):
    """Merge death/diagnosis dates onto *df* and parse all date columns.

    Reads the extra date-of-death/date-of-diagnosis file plus the id
    lookup from *data_dir*, joins them onto *df* via 'cp1id', and
    converts the three date columns to datetimes (day-first format).

    Returns the merged frame (the input frame is not modified).
    """
    dod = pd.read_csv(data_dir + '20200423_extra_dod_dodx.txt')
    lookup = pd.read_csv(data_dir + '20200429_cap_id_lookup.txt')
    dod = dod.merge(lookup, on='cp1random_id_5_char')
    df = df.merge(dod, on='cp1id')
    for date_col in ('cnr19datedeath', 'cnr_date_pca_diag', 'cp1vig date completed'):
        df[date_col] = pd.to_datetime(df[date_col], dayfirst=True)
    return df
def add_reviewer_ids(df, data_dir):
    """Attach reviewer identifiers by merging the update file on 'cp1id'."""
    reviewer_file = data_dir + '20191119_committee_reviews_nlp_code_update.csv'
    return df.merge(pd.read_csv(reviewer_file), on='cp1id')
def get_reviewer_counts(df):
    """Return a one-column frame of review counts per vignette author."""
    counts = df[['cp1id', 'vig_author']].groupby('vig_author').count()
    return counts.rename(columns={'cp1id': 'number of reviews'})
def get_date_diff_string(date1, date2, months=True):
    """Describe *date1* relative to *date2* as a token like '3moaf'.

    Suffixes: 'moaf'/'mobf' = months after/before, 'yeaf'/'yebf' =
    years after/before.  Months are approximated as 30.417 days and
    years as 365.25 days; the rounded magnitude is always reported as
    a non-negative integer, the suffix carries the sign.
    """
    day_diff = (date1 - date2).days
    if months:
        magnitude = np.round(day_diff / 30.417)
        suffix = "moaf" if magnitude > 0 else "mobf"
    else:
        magnitude = np.round(day_diff / 365.25)
        suffix = "yeaf" if magnitude > 0 else "yebf"
    return str(int(np.abs(magnitude))) + suffix
def replace_single_matches(text, pattern, relative_to, verbose=False):
    """Replace every match of *pattern* in *text* with a relative-time token.

    Each matched date string is parsed day-first and rewritten via
    get_date_diff_string relative to *relative_to*.  Matches that fail
    to parse are replaced with a single space so the search can advance
    past them (this is the silent failure mentioned in the module note).

    Parameters
    ----------
    text : str
        Free text possibly containing dates.
    pattern : str
        Regular expression matching one date format.
    relative_to : datetime-like
        Anchor date (typically the date of death).
    verbose : bool
        When True, report matches that could not be parsed.

    Returns
    -------
    str
        *text* with all matches replaced.
    """
    match = re.search(pattern, text)
    while match is not None:
        try:
            repl = get_date_diff_string(pd.to_datetime(match.group(), dayfirst=True), relative_to)
        except ValueError:
            if verbose:
                # BUG FIX: corrected the misspelled 'Invaldid' in the message.
                print("Invalid date format: ", match)
            repl = ' '
        text = text[0:match.span()[0]] + repl + text[match.span()[1]:]
        match = re.search(pattern, text)
    return text
def replace_dates(text, relative_to=pd.to_datetime('01/01/2010')):
    """Replace every recognisable date in *text* with a relative-time token.

    Patterns are tried from most to least specific so that e.g.
    '21/02/2020' is consumed before the bare-year pattern can grab its
    '2020'.

    Parameters
    ----------
    text : str
        Free text possibly containing dates.
    relative_to : datetime-like
        Anchor date passed through to replace_single_matches.

    Returns
    -------
    str
        *text* with dates rewritten as tokens like '3moaf'.
    """
    patterns = [r'\d{1,2}\/\d{1,2}\/\d{4}',  # format:21/02/2020
                r'\d{1,2}\/\d{4}',  # format:02/2020
                r'(?:January|February|March|April|May|June|July|August|September|October|November|December)[\s-]\d{1,2}[\s-]\d{2,4}',  # format: January 02 2020
                r'(?:January|February|March|April|May|June|July|August|September|October|November|December)[\s-]\d{2,4}',  # format: January 2020
                r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[\s-]\d{2,4}',  # format: Jan 2020
                # BUG FIX: the original list was missing the comma above, so
                # Python implicitly concatenated the two string literals into
                # one broken pattern and bare years were never replaced.
                r'(?:19|20)\d{2}']  # format: 2020
    for pattern in patterns:
        text = replace_single_matches(text, pattern, relative_to)
    return text
def convert_dates_relative(df):
    """Rewrite dates in 'combined' as tokens relative to each row's death date."""
    df['combined'] = [replace_dates(text, death_date)
                      for text, death_date in zip(df['combined'], df['cnr19datedeath'])]
    return df
def get_easy_and_hard_cases(df, subset_x, subset_y):
    """Split a feature/label subset by the cp1do_cod_route difficulty class.

    Routes 1 and 5 are treated as 'easy', routes 2 and 4 as 'hard';
    rows with any other route are reported and skipped.  *subset_y*'s
    index is used to look the route up in *df*.

    Returns
    -------
    tuple
        (easy_x, hard_x, easy_y, hard_y) lists, in that order.
    """
    EASY_ROUTES = (1, 5)
    HARD_ROUTES = (2, 4)
    easy_x, easy_y = [], []
    hard_x, hard_y = [], []
    for pos, features in enumerate(subset_x):
        route = df.loc[subset_y.index[pos]].cp1do_cod_route
        label = subset_y.iloc[pos]
        if route in EASY_ROUTES:
            easy_x.append(features)
            easy_y.append(label)
        elif route in HARD_ROUTES:
            hard_x.append(features)
            hard_y.append(label)
        else:
            print("Not easy or hard: ", pos)
    return easy_x, hard_x, easy_y, hard_y