-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcds_demo_complete.py
More file actions
109 lines (82 loc) · 2.91 KB
/
cds_demo_complete.py
File metadata and controls
109 lines (82 loc) · 2.91 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# clear all
%reset -f
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.formula.api as sm
import os
# get current directory
os.getcwd()
# set working directory
os.chdir('/Users/jng2/Dropbox/Work/Library/CDS/Python pandas')
dfauto = pd.read_csv('auto.csv')
dfauto
dfauto.head()
dfauto.dtypes
dfauto.columns.values
dfauto.describe()
# split make into two parts, store as new columns
dfauto['make'] = dfauto['vehicle'].str.split().str[0]
dfauto['model'] = dfauto['vehicle'].str.split().str[1:]
dfauto['model'] = dfauto['model'].apply(' '.join)
# move vehicle, make and model to front
cols = list(dfauto.columns)
cols = [cols[-1]] + cols[:-1]
cols = [cols[-1]] + cols[:-1]
dfauto = dfauto[cols]
# compute average price and mpg by make
aveprice = dfauto['price'].groupby(dfauto['make']).mean()
aveprice.name = 'ave_price'
avempg = dfauto['mpg'].groupby(dfauto['make']).mean()
avempg.name = 'ave_mpg'
# store as new dataframe
dfmeans = pd.concat([aveprice, avempg], axis='columns')
dfmeans['make'] = dfmeans.index
# remove row names and commit change in place
dfmeans.reset_index(drop=True, inplace=True)
# scatter plot of ave_price vs ave_mpg
dfmeans.plot(x='ave_price', y='ave_mpg', kind='scatter')
# scatter plot with labeled points
fig, ax=plt.subplots()
ax.scatter( dfmeans['ave_price'], dfmeans['ave_mpg'])
for i, row in dfmeans.iterrows():
ax.annotate( row['make'], xy=(row['ave_price'], row['ave_mpg']))
# save figure to working directory
plt.savefig('scatterplot.png')
# back to original data for more exploration
dfauto['foreign'].value_counts()
dfauto['foreign'].value_counts().plot(kind='bar')
# create dummy variable for good fuel economy
dfauto['goodmpg'] = dfauto['mpg'] >= 30
# cross tabulate
pd.crosstab(dfauto['foreign'], dfauto['goodmpg'])
# identify vehicles with good mpg
dfauto.loc[ dfauto['goodmpg']==True ]
dfauto.loc[ dfauto['goodmpg']==True, 'make' ]
# regression
model = sm.ols(formula="price ~ mpg + foreign", data=dfauto).fit()
print(model.summary())
# reshape data from wide to long
df_bpwide = pd.read_csv('bpwide.csv')
df_bplong = pd.wide_to_long(df_bpwide, 'bp', i='patient', j='when',
sep='_',
suffix='.')
df_bplong.reset_index(inplace=True)
# sort data by patient
df_bplong = df_bplong.sort_values('patient')
# merging two datasets
dfhappy = pd.read_csv('happy_annual.csv')
dfgdp = pd.read_csv('gdp_annual.csv')
df_happy_gdp = pd.merge(dfhappy, dfgdp)
# plot gdp and happiness over time (two y axes)
fig, ax1 = plt.subplots()
line1 = ax1.plot( df_happy_gdp['year'], df_happy_gdp['happy'])
ax2 = ax1.twinx()
line2 = ax2.plot( df_happy_gdp['year'], df_happy_gdp['gdppc'],
linestyle='dashed',
color='green')
# add legend
line1, label1 = ax1.get_legend_handles_labels()
line2, label2 = ax2.get_legend_handles_labels()
ax2.legend(line1 + line2, label1 + label2)
plt.savefig('gdphappytrends.png')