-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdebug_scraping.py
More file actions
62 lines (49 loc) · 2.35 KB
/
debug_scraping.py
File metadata and controls
62 lines (49 loc) · 2.35 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
#!/usr/bin/env python3
import requests
from bs4 import BeautifulSoup
import logging
logging.basicConfig(level=logging.INFO)
def debug_yahoo_page():
    """Fetch the Yahoo Finance AAPL quote page and print a structural summary.

    One-off diagnostic for tuning a scraper's selectors: dumps headline
    candidates (h3 elements), news-related anchors, and stream containers.
    Performs network I/O only; returns None.

    Raises:
        requests.HTTPError: if Yahoo responds with a 4xx/5xx status.
    """
    session = requests.Session()
    # A browser-like User-Agent avoids Yahoo blocking the default requests UA.
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    })
    response = session.get('https://finance.yahoo.com/quote/AAPL', timeout=15)
    # Fail fast on an error status instead of silently parsing an error page
    # and reporting misleading "0 elements found" results.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    print('Examining page structure...')
    print(f'Page title: {soup.title.text if soup.title else "No title"}')

    # Headline candidates: Yahoo renders article titles inside h3 tags.
    h3_elements = soup.find_all('h3')
    print(f'\nFound {len(h3_elements)} h3 elements')
    for i, h3 in enumerate(h3_elements[:10]):
        text = h3.get_text(strip=True)[:100]
        parent = h3.parent
        parent_tag = parent.name if parent else 'None'
        # Hoist the child-anchor search: the original re-ran h3.find('a')
        # up to three times per element.
        inner_link = h3.find('a')
        parent_is_link = bool(parent and parent.name == 'a')
        has_link = bool(inner_link) or parent_is_link
        href = ''
        if inner_link:
            href = inner_link.get('href', '')[:50]
        elif parent_is_link:
            href = parent.get('href', '')[:50]
        print(f'{i}: [{parent_tag}] {text} - Has link: {has_link} - href: {href}')

    # Anchors whose href mentions "news" — likely article links.
    news_links = soup.find_all('a', href=lambda x: x and 'news' in x.lower())
    print(f'\nFound {len(news_links)} news links')
    for i, link in enumerate(news_links[:5]):
        href = link.get('href')
        text = link.get_text(strip=True)[:100]
        print(f'{i}: {href} - {text}')

    # Containers with a "stream"-like data-testid (Yahoo's news feed widget).
    stream_divs = soup.find_all('div', attrs={'data-testid': lambda x: x and 'stream' in str(x).lower()})
    print(f'\nFound {len(stream_divs)} stream divs')

    # Broad sweep: any anchor whose href hints at news/article content.
    all_links = soup.find_all('a', href=True)
    finance_links = [link for link in all_links if any(kw in link.get('href', '').lower() for kw in ['news', 'story', 'article', 'press'])]
    print(f'\nFound {len(finance_links)} potential finance news links')
    for i, link in enumerate(finance_links[:10]):
        href = link.get('href')
        text = link.get_text(strip=True)[:100]
        print(f'{i}: {href} - {text}')
if __name__ == "__main__":
debug_yahoo_page()