## This script uses the box score URLs from the game link scraper
## and collects metadata from each box score page
import requests
import time
import random
from bs4 import BeautifulSoup
import pandas as pd
import numpy
data_folder = 'file path to folder where all data will be held...no trailing slash'
## Pull in URLs by turning the data frame into a list ##
url_file = '{0}/game_links_1960_to_2018.csv'.format(data_folder)
url_df = pd.read_csv(url_file)
filtered_df = url_df[url_df['Season'] >= 1990] ## hasn't been tested before 1990, but would work in theory ##
urls = filtered_df['Box Score Link'].tolist()
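## Each entry in urls is expected to be a full PFR box score link collected by the game link
## scraper, i.e. something of the form https://www.pro-football-reference.com/boxscores/<game id>.htm ##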
## helper data structures ##
month_translation = {
'Jan' : 1,
'Feb' : 2,
'Mar' : 3,
'Apr' : 4,
'May' : 5,
'Jun' : 6,
'Jul' : 7,
'Aug' : 8,
'Sep' : 9,
'Oct' : 10,
'Nov' : 11,
'Dec' : 12,
}
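## Illustrative usage: month_translation['Sep'] -> 9, which the date-building code below zero-pads to '09' ##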
## helper functions for scraping ##
## These are separate mainly for readability ##
def get_meta_data_points(score_box_div):
sub_divs = score_box_div.find_all('div', recursive=False)
del sub_divs[-1] ## last div is a citation ##
game_day = sub_divs[0].text.split(' ')[0]
game_year = sub_divs[0].text.split(' ')[3]
game_month = month_translation[sub_divs[0].text.split(' ')[1]]
game_day_num = sub_divs[0].text.split(' ')[2].split(',')[0]
if len(str(game_month)) == 1:
game_month = '0{0}'.format(game_month)
else:
game_month = str(game_month)
if len(str(game_day_num)) == 1:
game_day_num = '0{0}'.format(game_day_num)
else:
game_day_num = str(game_day_num)
game_date = '{0}-{1}-{2}'.format(game_year,game_month,game_day_num)
local_start_time = sub_divs[1].text.split(': ')[1]
stadium = sub_divs[2].text.split(': ')[1].strip()
stadium_link = sub_divs[2].find('a').get('href')
if len(sub_divs) < 4:
game_length = numpy.nan
attendance = numpy.nan
elif len(sub_divs) < 5:
game_length = numpy.nan
attendance = sub_divs[3].text.split(': ')[1]
else:
game_length_unformat = sub_divs[4].text.split(': ')[1]
game_length = int(game_length_unformat.split(':')[0]) * 60 + int(game_length_unformat.split(':')[1])
attendance = sub_divs[3].text.split(': ')[1]
return game_day, game_date, local_start_time, game_length, stadium, stadium_link, attendance
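## Illustrative example (assumed scorebox text, not taken from a specific game): a first meta div
## reading 'Sunday Sep 9, 2018' with a listed game length of '3:05' would come back as
## game_day='Sunday', game_date='2018-09-09', and game_length=185 (minutes). ##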
def get_game_info(game_info_div):
won_toss = numpy.nan
won_toss_ot = numpy.nan
roof = numpy.nan
surface = numpy.nan
weather = numpy.nan
vegas_line = numpy.nan
over_under = numpy.nan
    if game_info_div is not None:
for row in game_info_div.find_all('tr'):
try:
stat_name = row.find('th').text
stat_value = row.find('td').text
except:
stat_name = None
stat_value = None
if stat_name == 'Won Toss':
won_toss = stat_value
elif stat_name == 'Roof':
roof = stat_value
elif stat_name == 'Surface':
surface = stat_value
elif stat_name == 'Weather':
weather = stat_value
elif stat_name == 'Vegas Line':
vegas_line = stat_value
elif stat_name == 'Over/Under':
over_under = stat_value
return won_toss, won_toss_ot, roof, surface, weather, vegas_line, over_under
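## The stat names checked above ('Won Toss', 'Roof', 'Surface', 'Weather', 'Vegas Line',
## 'Over/Under') are the Game Info row labels this script looks for; any label that is missing
## stays numpy.nan. won_toss_ot is never assigned here, so 'Won Toss (OT)' is always NaN. ##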
def get_qb_info(starter_div):
qb = numpy.nan
qb_link = numpy.nan
    if starter_div is not None:
for row in starter_div.find_all('tr'):
try:
player_name = row.find('th').find('a')
player_position = row.find('td').text
except:
player_name = None
player_position = None
if player_position == 'QB':
qb = player_name.text
qb_link = player_name.get('href')
else:
pass
return qb, qb_link
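## If a starters table lists more than one QB row, the last one encountered wins, since the loop
## keeps overwriting qb and qb_link. ##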
def get_officials_info(officials_div):
referee = numpy.nan
umpire = numpy.nan
down_judge = numpy.nan
line_judge = numpy.nan
back_judge = numpy.nan
side_judge = numpy.nan
field_judge = numpy.nan
    if officials_div is not None:
for row in officials_div.find_all('tr'):
try:
official_pos = row.find('th').text
official_name = row.find('td').find('a').text
except:
official_pos = None
official_name = None
if official_pos == 'Referee':
referee = official_name
elif official_pos == 'Umpire':
umpire = official_name
elif official_pos == 'Down Judge' or official_pos == 'Head Linesman':
down_judge = official_name
elif official_pos == 'Line Judge':
line_judge = official_name
elif official_pos == 'Back Judge':
back_judge = official_name
elif official_pos == 'Side Judge':
side_judge = official_name
elif official_pos == 'Field Judge':
field_judge = official_name
return referee, umpire, down_judge, line_judge, back_judge, side_judge, field_judge
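## 'Down Judge' and 'Head Linesman' map to the same column because the NFL renamed the Head
## Linesman position to Down Judge; older box scores still carry the old label. ##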
game_data_rows = []
broken_box_list = []
for url in urls:
    time.sleep(.75 + random.random() * .5) ## polite random delay (~0.75-1.25s) between requests ##
try:
game_data_points = {
'Game Link' : None,
'Game Date' : None,
'Game Day' : None,
'Local Start Time' : None,
'Game Length' : None,
'Stadium' : None,
'Stadium Link' : None,
'Attendance' : None,
'Season': None,
'Week' : None,
'Home Team' : None,
'Away Team' : None,
'Home Record' : None,
'Away Record' : None,
'Home Score' : None,
'Away Score' : None,
'Home Coach' : None,
'Away Coach' : None,
'Home Coach Link' : None,
'Away Coach Link' : None,
'Home Starting QB' : None,
'Away Starting QB' : None,
'Home Starting QB Link' : None,
'Away Starting QB Link' : None,
'Won Toss' : None,
'Won Toss (OT)' : None,
'Roof' : None,
'Surface' : None,
'Weather' : None,
'Vegas Line' : None,
'Over/Under' : None,
'Referee' : None,
'Umpire' : None,
'Head Linesman / Down Judge' : None,
'Line Judge' : None,
'Back Judge' : None,
'Side Judge' : None,
'Field Judge' : None,
}
        raw = requests.get(url, timeout=30) ## timeout (added) so a stalled request cannot hang the whole run ##
parsed = BeautifulSoup(raw.content, 'html.parser')
score_board_divs = parsed.find('div', {'class' : 'scorebox'}).find_all('div', recursive=False)
home_div = score_board_divs[0]
away_div = score_board_divs[1]
meta_div = score_board_divs[2]
away_div_divs = away_div.find_all('div', recursive=False)
away_team = away_div_divs[0].find('a', {'itemprop' : 'name'}).text
try:
away_score = int(away_div_divs[1].find('div').text)
except:
away_score = int(away_div_divs[1].text)
away_record = away_div_divs[2].text
away_coach = away_div_divs[4].find('a').text
away_coach_link = away_div_divs[4].find('a').get('href')
home_div_divs = home_div.find_all('div', recursive=False)
home_team = home_div_divs[0].find('a', {'itemprop' : 'name'}).text
try:
home_score = int(home_div_divs[1].find('div').text)
except:
home_score = int(home_div_divs[1].text)
home_record = home_div_divs[2].text
home_coach = home_div_divs[4].find('a').text
home_coach_link = home_div_divs[4].find('a').get('href')
        try: ## PFR wraps these tables in HTML comments, which breaks bs4's parsing, so the relevant chunk is pulled out as text and re-parsed ##
game_info_div_effed = str(parsed.find('div', {'id': 'all_game_info'}))
game_info_div = BeautifulSoup(game_info_div_effed.split('<!--')[1].split('-->')[0], 'html.parser')
except:
game_info_div = None
try:
home_starter_div_effed = str(parsed.find('div', {'id' : 'all_home_starters'}))
home_starter_div = BeautifulSoup(home_starter_div_effed.split('<!--')[1].split('-->')[0], 'html.parser')
except:
home_starter_div = None
try:
away_starter_div_effed = str(parsed.find('div', {'id' : 'all_vis_starters'}))
away_starter_div = BeautifulSoup(away_starter_div_effed.split('<!--')[1].split('-->')[0], 'html.parser')
except:
away_starter_div = None
try:
officials_div_effed = str(parsed.find('div', {'id' : 'all_officials'}))
officials_div = BeautifulSoup(officials_div_effed.split('<!--')[1].split('-->')[0], 'html.parser')
except:
officials_div = None
game_day, game_date, local_start_time, game_length, stadium, stadium_link, attendance = get_meta_data_points(meta_div)
won_toss, won_toss_ot, roof, surface, weather, vegas_line, over_under = get_game_info(game_info_div)
home_qb, home_qb_link = get_qb_info(home_starter_div)
away_qb, away_qb_link = get_qb_info(away_starter_div)
referee, umpire, down_judge, line_judge, back_judge, side_judge, field_judge = get_officials_info(officials_div)
game_data_points['Game Link'] = url
game_data_points['Game Date'] = game_date
game_data_points['Game Day'] = game_day
game_data_points['Local Start Time'] = local_start_time
game_data_points['Game Length'] = game_length
game_data_points['Stadium'] = stadium
game_data_points['Stadium Link'] = stadium_link
game_data_points['Attendance'] = attendance
game_data_points['Season'] = filtered_df[filtered_df['Box Score Link'] == url].iloc[0]['Season']
game_data_points['Week'] = filtered_df[filtered_df['Box Score Link'] == url].iloc[0]['Week Number']
game_data_points['Home Team'] = home_team
game_data_points['Away Team'] = away_team
game_data_points['Home Record'] = home_record
game_data_points['Away Record'] = away_record
game_data_points['Home Score'] = home_score
game_data_points['Away Score'] = away_score
game_data_points['Home Coach'] = home_coach
game_data_points['Away Coach'] = away_coach
game_data_points['Home Coach Link'] = home_coach_link
game_data_points['Away Coach Link'] = away_coach_link
game_data_points['Home Starting QB'] = home_qb
game_data_points['Away Starting QB'] = away_qb
game_data_points['Home Starting QB Link'] = home_qb_link
game_data_points['Away Starting QB Link'] = away_qb_link
game_data_points['Won Toss'] = won_toss
game_data_points['Won Toss (OT)'] = won_toss_ot
game_data_points['Roof'] = roof
game_data_points['Surface'] = surface
game_data_points['Weather'] = weather
game_data_points['Vegas Line'] = vegas_line
game_data_points['Over/Under'] = over_under
game_data_points['Referee'] = referee
game_data_points['Umpire'] = umpire
game_data_points['Head Linesman / Down Judge'] = down_judge
game_data_points['Line Judge'] = line_judge
game_data_points['Back Judge'] = back_judge
game_data_points['Side Judge'] = side_judge
game_data_points['Field Judge'] = field_judge
game_data_rows.append(game_data_points)
    except Exception: ## log the failing URL and keep scraping ##
broken_row = {
'Season' : None,
'Week' : None,
'URL' : None,
}
broken_row['Season'] = filtered_df[filtered_df['Box Score Link'] == url].iloc[0]['Season']
        broken_row['Week'] = filtered_df[filtered_df['Box Score Link'] == url].iloc[0]['Week Number']
broken_row['URL'] = url
broken_box_list.append(broken_row)
print('ROW BROKEN {0}'.format(broken_row))
df = pd.DataFrame(game_data_rows)
df_two = pd.DataFrame(broken_box_list) ## box scores that failed to parse, kept so they can be retried ##
headers = [
'Game Link',
'Game Date',
'Game Day',
'Local Start Time',
'Game Length',
'Stadium',
'Stadium Link',
'Attendance',
'Season',
'Week',
'Home Team',
'Away Team',
'Home Record',
'Away Record',
'Home Score',
'Away Score',
'Home Coach',
'Away Coach',
'Home Coach Link',
'Away Coach Link',
'Home Starting QB',
'Away Starting QB',
'Home Starting QB Link',
'Away Starting QB Link',
'Won Toss',
'Won Toss (OT)',
'Roof',
'Surface',
'Weather',
'Vegas Line',
'Over/Under',
'Referee',
'Umpire',
'Head Linesman / Down Judge',
'Line Judge',
'Back Judge',
'Side Judge',
'Field Judge'
]
df = df[headers]
df.to_csv('{0}/game_meta_data.csv'.format(data_folder))
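## A minimal follow-up sketch (the output filename is an assumption, not part of the original
## workflow): persist the broken links so the failed pages can be re-scraped later. ##
if len(df_two) > 0:
    df_two.to_csv('{0}/broken_game_links.csv'.format(data_folder))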