-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdebug_scraping.py
More file actions
62 lines (49 loc) · 2.35 KB
/
debug_scraping.py
File metadata and controls
62 lines (49 loc) · 2.35 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
#!/usr/bin/env python3
import requests
from bs4 import BeautifulSoup
import logging
logging.basicConfig(level=logging.INFO)
def debug_yahoo_page():
    """Fetch the Yahoo Finance AAPL quote page and print a structural summary.

    One-off diagnostic for tuning a scraper's selectors: dumps headline
    candidates (h3 elements), news-related anchors, and stream containers.
    Performs network I/O only; returns None.

    Raises:
        requests.HTTPError: if Yahoo responds with a 4xx/5xx status.
    """
    session = requests.Session()
    # A browser-like User-Agent avoids Yahoo blocking the default requests UA.
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    })
    response = session.get('https://finance.yahoo.com/quote/AAPL', timeout=15)
    # Fail fast on an error status instead of silently parsing an error page
    # and reporting misleading "0 elements found" results.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    print('Examining page structure...')
    print(f'Page title: {soup.title.text if soup.title else "No title"}')

    # Headline candidates: Yahoo renders article titles inside h3 tags.
    h3_elements = soup.find_all('h3')
    print(f'\nFound {len(h3_elements)} h3 elements')
    for i, h3 in enumerate(h3_elements[:10]):
        text = h3.get_text(strip=True)[:100]
        parent = h3.parent
        parent_tag = parent.name if parent else 'None'
        # Hoist the child-anchor search: the original re-ran h3.find('a')
        # up to three times per element.
        inner_link = h3.find('a')
        parent_is_link = bool(parent and parent.name == 'a')
        has_link = bool(inner_link) or parent_is_link
        href = ''
        if inner_link:
            href = inner_link.get('href', '')[:50]
        elif parent_is_link:
            href = parent.get('href', '')[:50]
        print(f'{i}: [{parent_tag}] {text} - Has link: {has_link} - href: {href}')

    # Anchors whose href mentions "news" — likely article links.
    news_links = soup.find_all('a', href=lambda x: x and 'news' in x.lower())
    print(f'\nFound {len(news_links)} news links')
    for i, link in enumerate(news_links[:5]):
        href = link.get('href')
        text = link.get_text(strip=True)[:100]
        print(f'{i}: {href} - {text}')

    # Containers with a "stream"-like data-testid (Yahoo's news feed widget).
    stream_divs = soup.find_all('div', attrs={'data-testid': lambda x: x and 'stream' in str(x).lower()})
    print(f'\nFound {len(stream_divs)} stream divs')

    # Broad sweep: any anchor whose href hints at news/article content.
    all_links = soup.find_all('a', href=True)
    finance_links = [link for link in all_links if any(kw in link.get('href', '').lower() for kw in ['news', 'story', 'article', 'press'])]
    print(f'\nFound {len(finance_links)} potential finance news links')
    for i, link in enumerate(finance_links[:10]):
        href = link.get('href')
        text = link.get_text(strip=True)[:100]
        print(f'{i}: {href} - {text}')
if __name__ == "__main__":
debug_yahoo_page()