DS3-Project/dataframe_creation.py at main · deepaldeleena/DS3-Project · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
# All necessary imports
import pandas as pd
from functools import reduce

# Function that, given xpt file and dictionary of old/new column names,
# return DataFrame with those columns
def select_columns(file_path, names):
    data_df = pd.read_sas(file_path)
    data_df = data_df[names.keys()]
    final_df = data_df.rename(columns= names)
    return final_df

# Simple Demographic Dataframe
demographics_df = select_columns("data/DEMO_L.xpt", {"SEQN": "respondent_id",
                                                "RIAGENDR": "gender",
                                                "RIDAGEYR": "age",
                                                "RIDRETH3": "race",
                                                "DMDEDUC2": "education",
                                                "DMDMARTZ": "marital_status",
                                                "WTINT2YR": "weight"
                                                })

# Id Dataframe
id_df = demographics_df[['respondent_id']]

#single lab_data dataframes
cholesterol_df = select_columns("data/TCHOL_L.xpt", {"SEQN": "respondent_id",
"LBXTC": "total_cholesteral"})

glycohemoglobin_df = select_columns("data/GHB_L.xpt", {"SEQN": "respondent_id",
"LBXGH": "glycohemoglobin"
})

insulin_df = select_columns("data/INS_L.xpt", {"SEQN": "respondent_id",
"LBXIN": "insulin"
})

ferritin_df = select_columns("data/FERTIN_L.xpt", {"SEQN": "respondent_id",
"LBXFER": "ferritin"
})

plasma_fasting_glucose = select_columns("data/GLU_L.xpt", {"SEQN": "respondent_id",
"LBXGLU": "fasting_glucose"
})

# merged lab_data dataframe using inner join
data_frames = [id_df, cholesterol_df, glycohemoglobin_df, insulin_df, ferritin_df, plasma_fasting_glucose]
lab_data = reduce(lambda left, right: pd.merge(left, right, on = ['respondent_id'], how= 'left'), data_frames)

# single diabetes_df
diabetes_df = select_columns("data/DIQ_L.xpt", {"SEQN":  "respondent_id",
                                        "DIQ010": "diabetes_diagnosis",
                                        "DID040": "age_of_diabetes_diagnosis",
                                        "DIQ160": "prediabetes_diagnosis",
                                        "DIQ180": "blood_tested",
                                        "DIQ050": "taking_insulin",
                                        "DIQ070": "taking_diabetic_pills"
})

print(diabetes_df)

# Questionaire Dataframe
# Building Smaller Dataframes first then merging

# Does not include physical activity just yet
blood_pressure_df = select_columns("data/BPQ_L.xpt", {'SEQN': 'respondent_id',
                                                    'BPQ150': "high_blood_pressure_meds",
                                                    'BPQ101D': 'cholesterol_meds'}
                                                    )
alcohol_use_df = select_columns("data/ALQ_L.xpt", {'SEQN': 'respondent_id',
                                                    'ALQ121': 'yearly_alc_count'
                                                    })
balance_df = select_columns("data/BAQ_L.xpt", {'SEQN': 'respondent_id',
                                                'BAQ530': 'five_years_fall_count',
                                                'BAQ321D': 'yearly_light_headedness_count',
                                                'BAQ321B': 'yearly_blurry_vision_count',
                                                'BAQ560': 'yearly_fall_injury_count'
                                                })
sleep_disorder_df = select_columns("data/SLQ_L.xpt", {'SEQN': 'respondent_id',
                                                    'SLD012': 'sleep_hours'
                                                    })
cigarette_use_df = select_columns("data/SMQ_L.xpt", {'SEQN': 'respondent_id',
                                                    'SMQ020': 'hundred_cigars'
                                                    })
household_smoke_df = select_columns("data/SMQFAM_L.xpt", {'SEQN': 'respondent_id',
                                                        'SMD470': 'household_smoker_count'
                                                        })
weight_df = select_columns("data/WHQ_L.xpt", {'SEQN': 'respondent_id',
                                            'WHD020': 'current_weight_pounds',
                                            'WHD050': 'weight_lbs_year_ago',
                                            'WHD010': 'height_inch'
                                            })
aspirin_df = select_columns("data/RXQASA_L.xpt", {'SEQN': 'respondent_id',
                                            'RXQ520': 'self_aspirin',
                                            'RXQ515': 'advice_aspirin'
                                                                                        })

exercise_df = select_columns("data/PAQ_L.xpt", {'SEQN': 'respondent_id',
                                                'PAD790Q': 'moderate_frequency',
                                                'PAD790U': 'moderate_units',
                                                'PAD810Q': 'vigorous_frequency',
                                                'PAD810U': 'vigorous_units'
                                                })

# merged questionaire_data dataframe using inner join
data_frames = [id_df, blood_pressure_df, alcohol_use_df, balance_df, sleep_disorder_df, cigarette_use_df, household_smoke_df, weight_df, aspirin_df, exercise_df]
questionaire_data = reduce(lambda left, right: pd.merge(left, right, on = ['respondent_id'], how= 'left'), data_frames)
print(questionaire_data)

# Merging all dataframes
final_dataframe = pd.merge(demographics_df, diabetes_df, on = ['respondent_id'])
final_dataframe = pd.merge(final_dataframe, lab_data, on = ['respondent_id'], how= 'left')
final_dataframe = pd.merge(final_dataframe, questionaire_data, on = ['respondent_id'], how= 'left')
print(final_dataframe)

final_dataframe.to_csv("merged_df.csv", index= False)