-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathdataframe_creation.py
More file actions
116 lines (99 loc) · 5.84 KB
/
dataframe_creation.py
File metadata and controls
116 lines (99 loc) · 5.84 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
# All necessary imports
import pandas as pd
from functools import reduce
# Function that, given xpt file and dictionary of old/new column names,
# return DataFrame with those columns
def select_columns(file_path, names):
data_df = pd.read_sas(file_path)
data_df = data_df[names.keys()]
final_df = data_df.rename(columns= names)
return final_df
# Simple Demographic Dataframe
demographics_df = select_columns("data/DEMO_L.xpt", {"SEQN": "respondent_id",
"RIAGENDR": "gender",
"RIDAGEYR": "age",
"RIDRETH3": "race",
"DMDEDUC2": "education",
"DMDMARTZ": "marital_status",
"WTINT2YR": "weight"
})
# Id Dataframe
id_df = demographics_df[['respondent_id']]
#single lab_data dataframes
cholesterol_df = select_columns("data/TCHOL_L.xpt", {"SEQN": "respondent_id",
"LBXTC": "total_cholesteral"})
glycohemoglobin_df = select_columns("data/GHB_L.xpt", {"SEQN": "respondent_id",
"LBXGH": "glycohemoglobin"
})
insulin_df = select_columns("data/INS_L.xpt", {"SEQN": "respondent_id",
"LBXIN": "insulin"
})
ferritin_df = select_columns("data/FERTIN_L.xpt", {"SEQN": "respondent_id",
"LBXFER": "ferritin"
})
plasma_fasting_glucose = select_columns("data/GLU_L.xpt", {"SEQN": "respondent_id",
"LBXGLU": "fasting_glucose"
})
# merged lab_data dataframe using inner join
data_frames = [id_df, cholesterol_df, glycohemoglobin_df, insulin_df, ferritin_df, plasma_fasting_glucose]
lab_data = reduce(lambda left, right: pd.merge(left, right, on = ['respondent_id'], how= 'left'), data_frames)
# single diabetes_df
diabetes_df = select_columns("data/DIQ_L.xpt", {"SEQN": "respondent_id",
"DIQ010": "diabetes_diagnosis",
"DID040": "age_of_diabetes_diagnosis",
"DIQ160": "prediabetes_diagnosis",
"DIQ180": "blood_tested",
"DIQ050": "taking_insulin",
"DIQ070": "taking_diabetic_pills"
})
print(diabetes_df)
# Questionaire Dataframe
# Building Smaller Dataframes first then merging
# Does not include physical activity just yet
blood_pressure_df = select_columns("data/BPQ_L.xpt", {'SEQN': 'respondent_id',
'BPQ150': "high_blood_pressure_meds",
'BPQ101D': 'cholesterol_meds'}
)
alcohol_use_df = select_columns("data/ALQ_L.xpt", {'SEQN': 'respondent_id',
'ALQ121': 'yearly_alc_count'
})
balance_df = select_columns("data/BAQ_L.xpt", {'SEQN': 'respondent_id',
'BAQ530': 'five_years_fall_count',
'BAQ321D': 'yearly_light_headedness_count',
'BAQ321B': 'yearly_blurry_vision_count',
'BAQ560': 'yearly_fall_injury_count'
})
sleep_disorder_df = select_columns("data/SLQ_L.xpt", {'SEQN': 'respondent_id',
'SLD012': 'sleep_hours'
})
cigarette_use_df = select_columns("data/SMQ_L.xpt", {'SEQN': 'respondent_id',
'SMQ020': 'hundred_cigars'
})
household_smoke_df = select_columns("data/SMQFAM_L.xpt", {'SEQN': 'respondent_id',
'SMD470': 'household_smoker_count'
})
weight_df = select_columns("data/WHQ_L.xpt", {'SEQN': 'respondent_id',
'WHD020': 'current_weight_pounds',
'WHD050': 'weight_lbs_year_ago',
'WHD010': 'height_inch'
})
aspirin_df = select_columns("data/RXQASA_L.xpt", {'SEQN': 'respondent_id',
'RXQ520': 'self_aspirin',
'RXQ515': 'advice_aspirin'
})
exercise_df = select_columns("data/PAQ_L.xpt", {'SEQN': 'respondent_id',
'PAD790Q': 'moderate_frequency',
'PAD790U': 'moderate_units',
'PAD810Q': 'vigorous_frequency',
'PAD810U': 'vigorous_units'
})
# merged questionaire_data dataframe using inner join
data_frames = [id_df, blood_pressure_df, alcohol_use_df, balance_df, sleep_disorder_df, cigarette_use_df, household_smoke_df, weight_df, aspirin_df, exercise_df]
questionaire_data = reduce(lambda left, right: pd.merge(left, right, on = ['respondent_id'], how= 'left'), data_frames)
print(questionaire_data)
# Merging all dataframes
final_dataframe = pd.merge(demographics_df, diabetes_df, on = ['respondent_id'])
final_dataframe = pd.merge(final_dataframe, lab_data, on = ['respondent_id'], how= 'left')
final_dataframe = pd.merge(final_dataframe, questionaire_data, on = ['respondent_id'], how= 'left')
print(final_dataframe)
final_dataframe.to_csv("merged_df.csv", index= False)