-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathCSV_Data_Processing_Project_C3W4.py
More file actions
332 lines (286 loc) · 12 KB
/
CSV_Data_Processing_Project_C3W4.py
File metadata and controls
332 lines (286 loc) · 12 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
"""
Project for Week 4 of "Python Data Analysis".
Processing CSV files with baseball statistics.
Be sure to read the project description page for further information
about the expected behavior of the program.
"""
import csv
##
## Provided code from Week 3 Project
##
def read_csv_as_list_dict(filename, separator, quote):
    """
    Read a CSV file into a list with one dictionary per data row.

    Inputs:
      filename  - name of CSV file
      separator - character that separates fields
      quote     - character used to optionally quote fields
    Output:
      Returns a list of dictionaries where each item in the list
      corresponds to a row in the CSV file.  The dictionaries in the
      list map the field names (taken from the first row of the file)
      to the field values for that row.  All values are strings.
    """
    # newline='' leaves newline handling to the csv module.
    with open(filename, newline='') as csvfile:
        csvreader = csv.DictReader(csvfile, delimiter=separator, quotechar=quote)
        # Materialize the rows while the file is still open.
        return list(csvreader)
def read_csv_as_nested_dict(filename, keyfield, separator, quote):
    """
    Read a CSV file into a dictionary of row dictionaries keyed by one field.

    Inputs:
      filename  - name of CSV file
      keyfield  - field to use as key for rows
      separator - character that separates fields
      quote     - character used to optionally quote fields
    Output:
      Returns a dictionary of dictionaries where the outer dictionary
      maps the value in the keyfield to the corresponding row in the
      CSV file.  The inner dictionaries map the field names to the
      field values for that row.  If several rows share the same
      keyfield value, the last such row in the file is the one kept.
    """
    # newline='' leaves newline handling to the csv module.
    with open(filename, newline='') as csvfile:
        csvreader = csv.DictReader(csvfile, delimiter=separator, quotechar=quote)
        # Built while the file is open; later duplicates overwrite earlier ones.
        return {row[keyfield]: row for row in csvreader}
##
## Provided formulas for common batting statistics
##
# Typical cutoff used for official statistics
MINIMUM_AB = 500


def batting_average(info, batting_stats):
    """
    Compute the batting average (hits divided by at bats).

    Inputs:
      info          - Baseball data information dictionary; its "hits"
                      and "atbats" entries name the fields to read
                      from batting_stats
      batting_stats - dictionary of batting statistics (values are
                      strings, or anything else float() accepts)
    Output:
      Returns the batting average as a float.  Returns 0.0 when the
      number of at bats is below MINIMUM_AB.
    """
    hits = float(batting_stats[info["hits"]])
    at_bats = float(batting_stats[info["atbats"]])
    if at_bats >= MINIMUM_AB:
        return hits / at_bats
    # Below the cutoff the statistic is reported as zero; a float keeps
    # the return type the same on both paths.
    return 0.0
def onbase_percentage(info, batting_stats):
    """
    Compute the on-base percentage, (hits + walks) / (at bats + walks).

    Inputs:
      info          - Baseball data information dictionary; its "hits",
                      "atbats" and "walks" entries name the fields to
                      read from batting_stats
      batting_stats - dictionary of batting statistics (values are
                      strings, or anything else float() accepts)
    Output:
      Returns the on-base percentage as a float.  Returns 0.0 when the
      number of at bats is below MINIMUM_AB.
    """
    hits = float(batting_stats[info["hits"]])
    at_bats = float(batting_stats[info["atbats"]])
    walks = float(batting_stats[info["walks"]])
    if at_bats >= MINIMUM_AB:
        return (hits + walks) / (at_bats + walks)
    # Below the cutoff the statistic is reported as zero; a float keeps
    # the return type the same on both paths.
    return 0.0
def slugging_percentage(info, batting_stats):
    """
    Compute the slugging percentage (total bases divided by at bats).

    Inputs:
      info          - Baseball data information dictionary; its "hits",
                      "doubles", "triples", "homeruns" and "atbats"
                      entries name the fields to read from batting_stats
      batting_stats - dictionary of batting statistics (values are
                      strings, or anything else float() accepts)
    Output:
      Returns the slugging percentage as a float.  Returns 0.0 when the
      number of at bats is below MINIMUM_AB.
    """
    hits = float(batting_stats[info["hits"]])
    doubles = float(batting_stats[info["doubles"]])
    triples = float(batting_stats[info["triples"]])
    home_runs = float(batting_stats[info["homeruns"]])
    # Singles are not read from a field; they are the hits left over
    # after removing the extra-base hits.
    singles = hits - doubles - triples - home_runs
    at_bats = float(batting_stats[info["atbats"]])
    if at_bats >= MINIMUM_AB:
        return (singles + 2 * doubles + 3 * triples + 4 * home_runs) / at_bats
    # Below the cutoff the statistic is reported as zero; a float keeps
    # the return type the same on both paths.
    return 0.0
##
## Part 1: Functions to compute top batting statistics by year
##
def filter_by_year(statistics, year, yearid):
    """
    Select the batting statistics rows that belong to one year.

    Inputs:
      statistics - List of batting statistics dictionaries
      year       - Year to filter by (compared via its str() form)
      yearid     - Year ID field in statistics
    Outputs:
      Returns a list of batting statistics dictionaries that
      are from the input year.  The returned list holds the same
      dictionary objects as the input, in their original order.
    """
    # Convert once: the rows store the year as a string.
    wanted_year = str(year)
    return [row for row in statistics if row[yearid] == wanted_year]
def top_player_ids(info, statistics, formula, numplayers):
    """
    Rank players by a computed statistic and keep the best ones.

    Inputs:
      info       - Baseball data information dictionary
      statistics - List of batting statistics dictionaries
      formula    - function that takes an info dictionary and a
                   batting statistics dictionary as input and
                   computes a compound statistic
      numplayers - Number of top players to return
    Outputs:
      Returns a list of tuples, player ID and compound statistic
      computed by formula, of the top numplayers players sorted in
      decreasing order of the computed statistic.  Players with equal
      statistics keep their relative order from the input list.
    """
    scored = [(row[info["playerid"]], formula(info, row)) for row in statistics]
    # sorted() is stable even with reverse=True, so ties stay in input order.
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
    return ranked[:numplayers]
def lookup_player_names(info, top_ids_and_stats):
    """
    Turn (player ID, statistic) pairs into printable lines with names.

    Inputs:
      info              - Baseball data information dictionary
      top_ids_and_stats - list of tuples containing player IDs and
                          computed statistics
    Outputs:
      List of strings of the form "x.xxx --- FirstName LastName",
      where "x.xxx" is a string conversion of the float stat in
      the input and "FirstName LastName" is the name of the player
      corresponding to the player ID in the input.

    The master file named in info is read on every call, and a
    KeyError is raised for a player ID it does not contain.
    """
    master = read_csv_as_nested_dict(info["masterfile"], info["playerid"],
                                     info["separator"], info["quote"])
    names = []
    for player_id, stat in top_ids_and_stats:
        details = master[player_id]
        names.append("{:.3f} --- {} {}".format(stat,
                                               details[info["firstname"]],
                                               details[info["lastname"]]))
    return names
def compute_top_stats_year(info, formula, numplayers, year):
    """
    Produce the printable top-player list for a single year.

    Inputs:
      info       - Baseball data information dictionary
      formula    - function that takes an info dictionary and a
                   batting statistics dictionary as input and
                   computes a compound statistic
      numplayers - Number of top players to return
      year       - Year to filter by
    Outputs:
      Returns a list of strings for the top numplayers in the given year
      according to the given formula.
    """
    # Pipeline: load batting rows -> keep the year -> rank -> attach names.
    all_rows = read_csv_as_list_dict(info["battingfile"], info["separator"], info["quote"])
    year_rows = filter_by_year(all_rows, year, info["yearid"])
    top_ids_and_stats = top_player_ids(info, year_rows, formula, numplayers)
    return lookup_player_names(info, top_ids_and_stats)
##
## Part 2: Functions to compute top batting statistics by career
##
def aggregate_by_player_id(statistics, playerid, fields):
    """
    Sum selected numeric fields over all rows of each player.

    Inputs:
      statistics - List of batting statistics dictionaries
      playerid   - Player ID field name
      fields     - List of fields to aggregate
    Output:
      Returns a nested dictionary whose keys are player IDs and whose values
      are dictionaries of aggregated stats.  Only the fields from the fields
      input will be aggregated in the aggregated stats dictionaries; each
      inner dictionary also carries the player ID under the playerid key.
      Aggregated values are ints (each row value is converted with int()).
    """
    aggregate = {}
    for row in statistics:
        player_key = row[playerid]
        # Look up the result dict directly instead of scanning a list of
        # seen IDs, which cost O(n) for every row.
        totals = aggregate.get(player_key)
        if totals is None:
            # First row for this player: record the ID and zeroed totals.
            totals = {playerid: player_key}
            for field in fields:
                totals[field] = 0
            aggregate[player_key] = totals
        for field in fields:
            totals[field] += int(row[field])
    return aggregate
def compute_top_stats_career(info, formula, numplayers):
    """
    Produce the printable top-player list over whole careers.

    Inputs:
      info       - Baseball data information dictionary
      formula    - function that takes an info dictionary and a
                   batting statistics dictionary as input and
                   computes a compound statistic
      numplayers - Number of top players to return
    Outputs:
      Returns a list of strings for the top numplayers according to
      the given formula, applied to each player's statistics summed
      over every row of the batting file.
    """
    all_rows = read_csv_as_list_dict(info["battingfile"], info["separator"], info["quote"])
    fields = [info["atbats"], info["hits"], info["doubles"],
              info["triples"], info["homeruns"], info["walks"]]
    career_totals = aggregate_by_player_id(all_rows, info["playerid"], fields)
    # top_player_ids works on a list of per-player dictionaries.
    career_rows = list(career_totals.values())
    top_ids_and_stats = top_player_ids(info, career_rows, formula, numplayers)
    return lookup_player_names(info, top_ids_and_stats)
##
## Provided testing code
##
def test_baseball_statistics():
    """
    Simple testing code.

    Prints several top-player tables computed from the CSV files named
    in the info dictionary below (Master_2016.csv and Batting_2016.csv).
    Returns nothing.
    """
    #
    # Dictionary containing information needed to access baseball statistics
    # This information is all tied to the format and contents of the CSV files
    #
    baseballdatainfo = {"masterfile": "Master_2016.csv",   # Name of Master CSV file
                        "battingfile": "Batting_2016.csv", # Name of Batting CSV file
                        "separator": ",",                  # Separator character in CSV files
                        "quote": '"',                      # Quote character in CSV files
                        "playerid": "playerID",            # Player ID field name
                        "firstname": "nameFirst",          # First name field name
                        "lastname": "nameLast",            # Last name field name
                        "yearid": "yearID",                # Year field name
                        "atbats": "AB",                    # At bats field name
                        "hits": "H",                       # Hits field name
                        "doubles": "2B",                   # Doubles field name
                        "triples": "3B",                   # Triples field name
                        "homeruns": "HR",                  # Home runs field name
                        "walks": "BB",                     # Walks field name
                        "battingfields": ["AB", "H", "2B", "3B", "HR", "BB"]}

    def _print_table(title, compute, *args):
        # Print the title first, then compute and print one line per
        # player, then a blank separator line.
        print(title)
        for player in compute(baseballdatainfo, *args):
            print(player)
        print("")

    _print_table("Top 5 batting averages in 1923",
                 compute_top_stats_year, batting_average, 5, 1923)
    _print_table("Top 10 batting averages in 2010",
                 compute_top_stats_year, batting_average, 10, 2010)
    _print_table("Top 10 on-base percentage in 2010",
                 compute_top_stats_year, onbase_percentage, 10, 2010)
    _print_table("Top 10 slugging percentage in 2010",
                 compute_top_stats_year, slugging_percentage, 10, 2010)

    # You can also use lambdas for the formula
    # This one computes onbase plus slugging percentage
    _print_table("Top 10 OPS in 2010",
                 compute_top_stats_year,
                 lambda info, stats: (onbase_percentage(info, stats) +
                                      slugging_percentage(info, stats)),
                 10, 2010)

    _print_table("Top 20 career batting averages",
                 compute_top_stats_career, batting_average, 20)
# Make sure the following call to test_baseball_statistics is
# commented out when submitting to OwlTest/CourseraTest.
#test_baseball_statistics()