-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathtracker_scraper.py
More file actions
397 lines (330 loc) · 15.3 KB
/
tracker_scraper.py
File metadata and controls
397 lines (330 loc) · 15.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
import requests
from bs4 import BeautifulSoup
import json
import time
import random
import re
import os
from urllib.parse import urljoin
import cloudscraper # Added for bypassing Cloudflare protection
class TrackerScraper:
    """Scrape clip guides from tracker.gg and collect them as dictionaries.

    Listing pages are fetched through a cloudscraper session, every
    ``guide-tile`` element on a page is converted into a dict by
    :meth:`parse_clip`, and the accumulated dicts can be written to disk
    with :meth:`save_to_json`.

    Attributes:
        base_url: Listing URL; ``?page=N`` is appended for each request.
        scraper: cloudscraper session used for every HTTP request.
        headers: Browser-like headers sent with every request.
        agent_ids: Agent display name -> ID string emitted as ``agentID``.
        map_ids: Map display name -> ID string emitted as ``mapID``.
        team_ids: Side name ("Attack"/"Defense") -> ID emitted as ``teamID``.
        results: Clip dicts collected so far by :meth:`scrape`.
        skipped_entries: Clips dropped (no video URL, or a parse error).
        total_entries: Clip elements handed to :meth:`parse_clip`.
    """

    # Seconds to wait on any single HTTP request before giving up, so one
    # stalled connection cannot hang the whole run.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        """Create the HTTP session, the lookup tables and the run counters."""
        self.base_url = "https://tracker.gg/valorant/guides/clips"
        # Use cloudscraper to bypass Cloudflare protection
        self.scraper = cloudscraper.create_scraper(
            browser={
                'browser': 'chrome',
                'platform': 'windows',
                'desktop': True
            },
            delay=10
        )
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Accept-Language": "en-US,en;q=0.9",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Referer": "https://tracker.gg/",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "Cache-Control": "max-age=0"
        }
        # Agent ID mapping (badge text -> ID written to the output)
        self.agent_ids = {
            "Astra": "41fb69c1-4189-7b37-f117-bcaf1e96f1bf",
            "Breach": "5f8d3a7f-467b-97f3-062c-13acf203c006",
            "Brimstone": "9f0d8ba9-4140-b941-57d3-a7ad57c6b417",
            "Chamber": "22697a3d-45bf-8dd7-4fec-84a9e28c69d7",
            "Clove": "1dbf2edd-4729-0984-3115-daa5eed44993",
            "Cypher": "117ed9e3-49f3-6512-3ccf-0cada7e3823b",
            "Deadlock": "cc8b64c8-4b25-4ff9-6e7f-37b4da43d235",
            "Fade": "dade69b4-4f5a-8528-247b-219e5a1facd6",
            "Gekko": "e370fa57-4757-3604-3648-499e1f642d3f",
            "Harbor": "95b78ed7-4637-86d9-7e41-71ba8c293152",
            "Iso": "0e38b510-41a8-5780-5e8f-568b2a4f2d6c",
            "Jett": "add6443a-41bd-e414-f6ad-e58d267f4e95",
            "KAY/O": "601dbbe7-43ce-be57-2a40-4abd24953621",
            "Killjoy": "1e58de9c-4950-5125-93e9-a0aee9f98746",
            "Neon": "bb2a4828-46eb-8cd1-e765-15848195d751",
            "Omen": "8e253930-4c05-31dd-1b6c-968525494517",
            "Phoenix": "eb93336a-449b-9c1b-0a54-a891f7921d69",
            "Raze": "f94c3b30-42be-e959-889c-5aa313dba261",
            "Reyna": "a3bfb853-43b2-7238-a4f1-ad90e9e46bcc",
            "Sage": "569fdd95-4d10-43ab-ca70-79becc718b46",
            "Skye": "6f2a04ca-43e0-be17-7f36-b3908627744d",
            "Sova": "320b2a48-4d9b-a075-30f1-1f93a9b638fa",
            "Tejo": "b444168c-4e35-8076-db47-ef9bf368f384",
            "Viper": "707eab51-4836-f488-046a-cda6bf494859",
            "Vyse": "efba5359-4016-a1e5-7626-b1ae76895940",
            "Yoru": "7f94d92c-4234-0a36-9646-3a87eb8b5c89"
        }
        # Map ID mapping (badge text -> ID written to the output;
        # presumably the game's internal map codenames - TODO confirm)
        self.map_ids = {
            "Abyss": "Infinity",
            "Ascent": "Ascent",
            "Bind": "Duality",
            "Breeze": "Foxtrot",
            "Fracture": "Canyon",
            "Haven": "Triad",
            "Icebox": "Port",
            "Lotus": "Jam",
            "Sunset": "Juliett",
            "Split": "Bonsai",
            "Pearl": "Pitt"
        }
        # Team ID mapping
        self.team_ids = {
            "Attack": "Red",
            "Defense": "Blue"
        }
        self.results = []
        self.skipped_entries = 0
        self.total_entries = 0

    def get_agent_id(self, agent_name):
        """Return the ID for ``agent_name``, or "" when the name is unknown."""
        return self.agent_ids.get(agent_name, "")

    def get_map_id(self, map_name):
        """Return the ID for ``map_name``, or "" when the name is unknown."""
        return self.map_ids.get(map_name, "")

    def get_team_id(self, team_name):
        """Return the ID for ``team_name``, or "Neutral" when it is unknown."""
        return self.team_ids.get(team_name, "Neutral")

    def clean_tags(self, tags):
        """Return ``tags`` without map, team and agent names.

        Map and agent names are matched case-sensitively against the lookup
        tables; "attack"/"defense" are matched case-insensitively. The input
        order of the remaining tags is preserved.
        """
        return [
            tag
            for tag in tags
            if tag not in self.map_ids
            and tag.lower() not in ("attack", "defense")
            and tag not in self.agent_ids
        ]

    def fetch_page(self, page_num):
        """Fetch listing page ``page_num`` and return its HTML.

        Returns:
            The response body as text, or ``None`` when the request fails
            with a ``requests`` exception (the error is printed).
        """
        url = f"{self.base_url}?page={page_num}"
        try:
            # Use cloudscraper instead of requests
            response = self.scraper.get(
                url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
            )
            response.raise_for_status()
            return response.text
        except requests.exceptions.RequestException as e:
            print(f"Error fetching page {page_num}: {e}")
            # A Response object is falsy for 4xx/5xx statuses, so it has to
            # be compared against None or the details are never printed for
            # exactly the failures they are meant to explain.
            failed_response = getattr(e, 'response', None)
            if failed_response is not None:
                print(f"Response status code: {failed_response.status_code}")
                print(f"Response headers: {failed_response.headers}")
            return None

    def _find_video_url(self, source_url, thumbnail_url):
        """Return the video URL for a clip, or "" when none can be found.

        Tries, in order: a ``videodelivery.net`` iframe on the detail page,
        a ``cloudflarestream`` video element on the detail page, and finally
        an iframe URL rebuilt from the video id inside ``thumbnail_url``.
        """
        if source_url:
            try:
                # Fetch the clip detail page to get the iframe
                detail_response = self.scraper.get(
                    source_url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
                )
                if detail_response.status_code == 200:
                    detail_soup = BeautifulSoup(detail_response.text, 'html.parser')
                    # Look for iframe with videodelivery.net URL
                    iframe = detail_soup.find(
                        "iframe", src=lambda s: s and "videodelivery.net" in s
                    )
                    if iframe and iframe.get("src"):
                        return iframe["src"]
                    # If iframe not found, look for a video element
                    video_element = detail_soup.find(
                        "video", src=lambda s: s and "cloudflarestream" in s
                    )
                    if video_element and video_element.get("src"):
                        return video_element["src"]
            except Exception as e:
                # Best effort: a broken detail page must not abort the clip,
                # the thumbnail fallback below may still succeed.
                print(f"Error fetching detail page: {e}")

        # Last resort: derive the iframe URL from the thumbnail's video id
        if thumbnail_url:
            video_id_match = re.search(r'cloudflarestream\.com/([^/]+)/', thumbnail_url)
            if video_id_match:
                return f"https://iframe.videodelivery.net/{video_id_match.group(1)}"
        return ""

    def parse_clip(self, clip_element):
        """Convert one ``guide-tile`` element into a clip dictionary.

        Increments ``total_entries`` for every call and ``skipped_entries``
        when the clip is dropped.

        Returns:
            A dict that always has ``title`` and ``tags`` and, when a value
            was found, ``mapID``, ``teamID``, ``agentID``, ``videoURL``,
            ``thumbnailURL``, ``author`` and ``sourceURL``. ``None`` when no
            video URL could be determined or parsing raised an error.
        """
        try:
            self.total_entries += 1

            # Title text and link live in the same anchor; look it up once
            title_element = clip_element.find("p", class_="guide-tile__title")
            title_link = title_element.find("a") if title_element else None
            title = title_link.text.strip() if title_link else ""

            # The listing exposes no description, so it is always left empty
            description = ""

            # Badges carry the map, side and agent plus free-form tags
            badges_element = clip_element.find("div", class_="guide-tile__badges")
            tags = []
            if badges_element:
                tags = [
                    badge.text.strip()
                    for badge in badges_element.find_all("span", class_="badge")
                ]

            # First badge naming a known map / agent wins
            map_id = next(
                (self.get_map_id(tag) for tag in tags if tag in self.map_ids), ""
            )
            agent_id = next(
                (self.get_agent_id(tag) for tag in tags if tag in self.agent_ids), ""
            )

            # First "attack"/"defense" badge (any casing) decides the side
            team_id = "Neutral"
            for tag in tags:
                side = tag.lower()
                if side in ("attack", "defense"):
                    team_id = self.get_team_id(side.capitalize())
                    break

            # Clean tags (remove map, team, agent)
            clean_tags = self.clean_tags(tags)

            # urljoin copes with absolute hrefs and hrefs lacking a leading
            # slash, both of which plain concatenation would corrupt
            source_url = ""
            if title_link and title_link.get("href"):
                source_url = urljoin("https://tracker.gg", title_link["href"])

            # Thumbnail comes from the img inside guide-tile__video
            thumbnail_url = ""
            video_div = clip_element.find("div", class_="guide-tile__video")
            if video_div:
                img_element = video_div.find("img")
                if img_element and img_element.get("src"):
                    thumbnail_url = img_element["src"]

            # Without a playable video the entry is useless: skip it
            video_url = self._find_video_url(source_url, thumbnail_url)
            if not video_url:
                self.skipped_entries += 1
                return None

            # Author: prefer the anchor text, else the text after "By"
            author = ""
            author_element = clip_element.find("span", class_="guide-tile__author")
            if author_element:
                author_text = author_element.text.strip()
                if "By" in author_text:
                    # Strip only a leading "By"; replacing every occurrence
                    # would mangle names that contain "By" themselves
                    author = re.sub(r"^By\b\s*", "", author_text).strip()
                author_link = author_element.find("a")
                if author_link:
                    author = author_link.text.strip()

            clip_data = {
                "title": title,
                "tags": clean_tags
            }
            # Only add non-empty fields ("Neutral" counts as no team)
            optional_fields = (
                ("description", description),
                ("mapID", map_id),
                ("teamID", "" if team_id == "Neutral" else team_id),
                ("agentID", agent_id),
                ("videoURL", video_url),
                ("thumbnailURL", thumbnail_url),
                ("author", author),
                ("sourceURL", source_url),
            )
            clip_data.update((key, value) for key, value in optional_fields if value)
            return clip_data
        except Exception as e:
            # Per-clip boundary: one malformed tile must not stop the page
            print(f"Error parsing clip: {e}")
            self.skipped_entries += 1
            return None

    def parse_page(self, html_content):
        """Parse a listing page and return the clip dicts found on it.

        Returns an empty list when ``html_content`` is empty or ``None``.
        Clips for which :meth:`parse_clip` returns ``None`` are left out.
        """
        if not html_content:
            return []
        soup = BeautifulSoup(html_content, 'html.parser')
        # Updated class name from guide-card to guide-tile
        clip_elements = soup.find_all("div", class_="guide-tile")
        # Debug information
        print(f"Found {len(clip_elements)} clip elements on the page")
        parsed = (self.parse_clip(clip_element) for clip_element in clip_elements)
        return [clip_data for clip_data in parsed if clip_data]

    def check_next_page_exists(self, html_content):
        """Return True when the page has an enabled "Next" button."""
        if not html_content:
            return False
        soup = BeautifulSoup(html_content, 'html.parser')
        next_button = soup.find("button", string=re.compile("Next"))
        # A bare `disabled` attribute is stored as an empty string, so its
        # presence - not its truthiness - is what marks the button disabled
        return next_button is not None and not next_button.has_attr("disabled")

    def scrape(self, start_page=1, max_pages=None):
        """Scrape listing pages starting at ``start_page``.

        Stops at the first page that yields no clips, or after
        ``max_pages`` pages when ``max_pages`` is truthy (``None`` or 0
        means no limit). Sleeps 1-3 seconds between pages.

        Returns:
            ``self.results``, extended with every clip found.
        """
        page_num = start_page
        while True:
            print(f"Scraping page {page_num}...")
            html_content = self.fetch_page(page_num)
            page_results = self.parse_page(html_content)
            self.results.extend(page_results)
            print(f"Found {len(page_results)} clips on page {page_num}")

            # If no results were found on this page, we've reached the end
            if not page_results:
                print("No more clips found, stopping.")
                break
            # Check if we should stop based on max_pages
            if max_pages and page_num >= start_page + max_pages - 1:
                break

            page_num += 1
            # Add a delay to avoid rate limiting
            time.sleep(random.uniform(1, 3))

        # Print statistics
        if self.skipped_entries > 0:
            print(f"Skipped {self.skipped_entries} entries out of {self.total_entries} total entries due to missing video URLs")
        return self.results

    def save_to_json(self, filename="tracker_clips.json"):
        """Write the results to ``filename`` as ``{"version", "data"}``.

        The parent directory is created when the path has one. If the file
        already exists and holds a ``version`` value, the new file gets that
        version plus one; otherwise the version is 1.
        """
        # A bare filename has an empty dirname and os.makedirs("") raises
        # FileNotFoundError, so only create a directory when there is one
        output_dir = os.path.dirname(filename)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # Check if the file already exists and get the current version
        version = 1
        if os.path.exists(filename):
            try:
                with open(filename, "r", encoding="utf-8") as f:
                    existing_data = json.load(f)
                if isinstance(existing_data, dict) and "version" in existing_data:
                    version = existing_data["version"] + 1
            except (OSError, ValueError, TypeError) as e:
                # Unreadable or malformed file: start over at version 1
                print(f"Error reading existing file: {e}")

        # Create the structured output with version and data
        output_data = {
            "version": version,
            "data": self.results
        }
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(output_data, f, indent=4, ensure_ascii=False)
        print(f"Saved {len(self.results)} clips to {filename}")
if __name__ == "__main__":
    # Make sure the destination folder is present before any work starts
    os.makedirs("output", exist_ok=True)

    # Build the scraper and walk every available listing page; to cap the
    # run, call scraper.scrape(max_pages=5) instead
    scraper = TrackerScraper()
    scraper.scrape()

    # Persist everything that was collected
    scraper.save_to_json("output/tracker_clips.json")