Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 19 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,22 @@ Fetch only from Radio New Zealand:
python forcible.py fetch --source rnz
```

### Fetch Full Article HTML

Fetch full HTML content for articles that don't have it yet:

```bash
python forcible.py fetch-html
```

Fetch HTML for a limited number of articles:

```bash
python forcible.py fetch-html --limit 10
```

This command fetches article content from URLs and extracts only the essential text (paragraphs, headings, links) to minimize token usage for LLM processing. Navigation, ads, and other non-content elements are removed.

### List Articles

List recent articles:
Expand Down Expand Up @@ -150,6 +166,7 @@ This displays the article headline, content, and structured LLM analysis includi
- **config.py**: Configuration management (supports both INI and JSON formats)
- **database.py**: SQLite database interface for storing articles
- **rnz_ingester.py**: Radio New Zealand RSS feed ingester
- **html_fetcher.py**: HTML content fetcher for retrieving full article content
- **llm_processor.py**: LLM-based article analysis with structured outputs
- **forcible.py**: Command-line interface

Expand All @@ -162,7 +179,8 @@ This displays the article headline, content, and structured LLM analysis includi
- `headline`: Article headline
- `published_date`: Publication date (ISO format)
- `fetched_date`: Date fetched from source
- `content`: Article content/summary
- `content`: Article content/summary from RSS feed
- `raw_html`: Extracted article content (text, headings, links) from article URL
- `data`: JSON field for LLM analysis results (facts, relevance, PR probability, etc.)
- `created_at`: Record creation timestamp
- `updated_at`: Last update timestamp
Expand Down
56 changes: 56 additions & 0 deletions database.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ def _init_schema(self):
published_date TEXT,
fetched_date TEXT NOT NULL,
content TEXT,
raw_html TEXT,
data TEXT,
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
updated_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
Expand Down Expand Up @@ -140,6 +141,61 @@ def update_article_data(self, article_id: int, data: Dict[str, Any]):

self.conn.commit()

def update_article_html(self, article_id: int, raw_html: str):
"""
Update article raw HTML content.
Args:
article_id: Article ID
raw_html: Raw HTML content of the article
"""
cursor = self.conn.cursor()
updated_at = datetime.now(UTC).isoformat()

cursor.execute('''
UPDATE articles
SET raw_html = ?, updated_at = ?
WHERE id = ?
''', (raw_html, updated_at, article_id))

self.conn.commit()

def get_articles_without_html(self, limit: Optional[int] = None) -> List[Dict[str, Any]]:
    """
    Get articles whose ``raw_html`` column is still NULL.

    Rows are returned newest first (by ``published_date``). The ``data``
    column is decoded from JSON when present; undecodable JSON is
    replaced with ``None`` rather than raising.

    Args:
        limit: Maximum number of articles to return. ``None`` (default)
            means no limit; ``0`` returns an empty list.

    Returns:
        List of article dictionaries
    """
    cursor = self.conn.cursor()

    query = '''
        SELECT * FROM articles
        WHERE raw_html IS NULL
        ORDER BY published_date DESC
    '''
    params = ()

    # Compare against None explicitly: a plain truthiness test would treat
    # limit=0 as "no limit" and return every row.
    if limit is not None:
        query += ' LIMIT ?'
        params = (limit,)

    cursor.execute(query, params)

    articles = []
    for row in cursor.fetchall():
        article = dict(row)
        if article['data']:
            try:
                article['data'] = json.loads(article['data'])
            except json.JSONDecodeError:
                # Best effort: a corrupt analysis blob should not hide the article.
                article['data'] = None
        articles.append(article)

    return articles

def get_last_scrape_time(self, source_name: str) -> Optional[str]:
"""
Get the last scrape time for a source.
Expand Down
52 changes: 51 additions & 1 deletion forcible.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,15 @@
import argparse
import sys
import json
import traceback
from pathlib import Path
from datetime import datetime, UTC

from config import Config
from database import Database
from rnz_ingester import RNZIngester
from llm_processor import LLMProcessor
from html_fetcher import HTMLFetcher


def cmd_fetch(args):
Expand Down Expand Up @@ -43,6 +45,46 @@ def cmd_fetch(args):
sys.exit(1)


def cmd_fetch_html(args):
    """Fetch full article content for stored articles that do not have it yet.

    Args:
        args: Parsed command-line arguments; reads ``args.config`` (path
            passed to ``Config``) and ``args.limit`` (maximum number of
            articles to fetch, or ``None`` for all).

    Exits the process with status 1 if the run fails. The database
    connection is closed on every path, including failures.
    """
    db = None
    try:
        config = Config(args.config)
        db = Database(config.get_database_path())

        fetcher = HTMLFetcher(db)

        # Queried up front only so the user sees the batch size (or a
        # "nothing to do" message) before any network request is made.
        articles_to_fetch = db.get_articles_without_html(limit=args.limit)

        if not articles_to_fetch:
            print("No articles need HTML fetching.")
            return

        print(f"Fetching HTML for {len(articles_to_fetch)} article(s)...\n")

        def progress_callback(current, total, headline):
            # Headlines are truncated to keep each progress line short.
            print(f"[{current}/{total}] Fetching: {headline[:60]}...")

        success_count = fetcher.fetch_all_missing_html(
            limit=args.limit,
            progress_callback=progress_callback
        )

        print(f"\nFetch complete! Successfully fetched {success_count} article(s).")

    except FileNotFoundError as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        print(f"Error during HTML fetch: {e}", file=sys.stderr)
        traceback.print_exc()
        sys.exit(1)
    finally:
        # Runs for normal returns, errors and sys.exit alike, so the
        # connection is never left open. db is None if setup failed early.
        if db is not None:
            db.close()


def cmd_list(args):
"""List articles from the database."""
try:
Expand Down Expand Up @@ -244,7 +286,6 @@ def progress_callback(current, total, headline):
sys.exit(1)
except Exception as e:
print(f"Error during processing: {e}", file=sys.stderr)
import traceback
traceback.print_exc()
sys.exit(1)

Expand Down Expand Up @@ -345,6 +386,15 @@ def main():
)
parser_fetch.set_defaults(func=cmd_fetch)

# fetch-html command
parser_fetch_html = subparsers.add_parser('fetch-html', help='Fetch full HTML content for articles')
parser_fetch_html.add_argument(
'--limit',
type=int,
help='Maximum number of articles to fetch (default: all without HTML)'
)
parser_fetch_html.set_defaults(func=cmd_fetch_html)

# list command
parser_list = subparsers.add_parser('list', help='List articles')
parser_list.add_argument('--source', help='Filter by source')
Expand Down
182 changes: 182 additions & 0 deletions html_fetcher.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
"""
HTML content fetcher for news articles.
"""
import requests
from typing import Optional
from bs4 import BeautifulSoup


class HTMLFetcher:
    """Handles fetching and extracting article content from URLs.

    Extracts essential content (paragraphs, headings, lists, quotes, links)
    while removing navigation, ads, and other non-content elements to
    minimize token usage. The result is a Markdown-like plain-text string.
    """

    # Tags removed from the document before extraction.
    _UNWANTED_TAGS = ('script', 'style', 'nav', 'header', 'footer',
                      'aside', 'iframe', 'noscript', 'form')

    # Selectors tried in order to locate the main article container.
    _ARTICLE_SELECTORS = ('article', 'main', '[role="main"]', '.article-content',
                          '.post-content', '.entry-content', '#content')

    _HEADING_TAGS = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')

    # Tags emitted as a single unit. Anything nested inside one of these is
    # already part of its text, so nested matches are skipped to avoid
    # emitting the same text twice.
    _CONTAINER_TAGS = ('ul', 'ol', 'blockquote')

    def __init__(self, database):
        """
        Initialize the HTML fetcher.

        Args:
            database: Database instance; must provide
                ``get_articles_without_html`` and ``update_article_html``
        """
        self.db = database
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        }

    def fetch_html(self, url: str) -> Optional[str]:
        """
        Fetch and extract article content from a URL.

        Extracts only the main article content (paragraphs, headings, lists,
        quotes, links) to minimize token usage for LLM processing.

        Args:
            url: Article URL

        Returns:
            Extracted article content, or None if the fetch fails or the
            page yields no content. Errors are reported with ``print`` and
            never raised, so one bad URL cannot abort a batch.
        """
        try:
            response = requests.get(url, headers=self.headers, timeout=30, allow_redirects=True)
            response.raise_for_status()
            return self._extract_content(response.text)

        except requests.RequestException as e:
            print(f"Error fetching HTML from {url}: {e}")
            return None
        except Exception as e:
            print(f"Unexpected error fetching HTML from {url}: {e}")
            return None

    @staticmethod
    def _clean_text(node) -> str:
        """Return the node's text with every whitespace run collapsed to one space.

        ``get_text(strip=True)`` is deliberately avoided: it strips each text
        fragment before joining them, which glues words together across
        inline tags ("Hello <b>world</b>" becomes "Helloworld").
        """
        return " ".join(node.get_text().split())

    @staticmethod
    def _collapse_blank_lines(text: str) -> str:
        """Reduce any run of three or more newlines to exactly two."""
        while "\n\n\n" in text:
            text = text.replace("\n\n\n", "\n\n")
        return text

    def _is_nested_in_container(self, element, root) -> bool:
        """Return True if a list or blockquote sits between ``element`` and ``root``."""
        for parent in element.parents:
            if parent is root:
                return False
            if parent.name in self._CONTAINER_TAGS:
                return True
        return False

    def _extract_content(self, html: str) -> Optional[str]:
        """Convert an HTML document into compact Markdown-like text.

        Args:
            html: Full HTML source of the page

        Returns:
            The extracted text, or None if nothing was extracted
        """
        soup = BeautifulSoup(html, 'html.parser')

        # Remove unwanted elements
        for element in soup(list(self._UNWANTED_TAGS)):
            element.decompose()

        # Try the common article containers; fall back to <body>, then to
        # the whole document.
        article_content = None
        for selector in self._ARTICLE_SELECTORS:
            article_content = soup.select_one(selector)
            if article_content is not None:
                break
        if article_content is None:
            article_content = soup.body if soup.body else soup

        extracted = []
        seen_title = False

        for element in article_content.find_all(
                ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'ol', 'blockquote']):
            # A <p> inside a <blockquote>/<li>, or a nested list, is already
            # covered by its container's text.
            if self._is_nested_in_container(element, article_content):
                continue

            text = self._clean_text(element)
            if not text:  # Skip empty elements
                continue

            tag = element.name

            if tag == 'p':
                extracted.append(text)
                extracted.append("")  # Blank line after paragraph
            elif tag in self._HEADING_TAGS:
                # Only the first h1 is kept; later ones are treated as
                # duplicate titles.
                if tag == 'h1':
                    if seen_title:
                        continue
                    seen_title = True

                level = int(tag[1])
                # Add blank line before heading (except first one)
                if extracted:
                    extracted.append("")
                extracted.append(f"{'#' * level} {text}")
                extracted.append("")  # Blank line after heading
            elif tag in ('ul', 'ol'):
                # Direct children only; nested lists are folded into the
                # text of their parent item.
                for li in element.find_all('li', recursive=False):
                    li_text = self._clean_text(li)
                    if li_text:
                        extracted.append(f"- {li_text}")
                extracted.append("")  # Blank line after list
            elif tag == 'blockquote':
                extracted.append(f"> {text}")
                extracted.append("")  # Blank line after quote

        # Collect absolute http(s) links; relative and non-web hrefs
        # (mailto:, javascript:, #anchors) are ignored.
        links = []
        for a in article_content.find_all('a', href=True):
            href = a['href']
            link_text = self._clean_text(a)
            if href.startswith(('http://', 'https://')) and link_text:
                links.append(f"[{link_text}]({href})")

        content = self._collapse_blank_lines("\n".join(extracted).strip())

        # Add links section if there are any; avoid leading blank lines when
        # the page produced links but no body text.
        if links:
            if content:
                content += "\n\n"
            content += "## External Links\n" + "\n".join(links)

        return content if content else None

    def fetch_article_html(self, article_id: int, url: str) -> bool:
        """
        Fetch content for a specific article and store it in the database.

        Args:
            article_id: Article ID
            url: Article URL

        Returns:
            True if content was fetched and stored, False otherwise
        """
        html = self.fetch_html(url)
        if html:
            self.db.update_article_html(article_id, html)
            return True
        return False

    def fetch_all_missing_html(self, limit: Optional[int] = None, progress_callback: Optional[callable] = None) -> int:
        """
        Fetch content for all articles that don't have it yet.

        Args:
            limit: Maximum number of articles to process (optional)
            progress_callback: Optional callable invoked before each fetch
                as ``progress_callback(current, total, headline)`` with a
                1-based ``current``

        Returns:
            Number of articles successfully fetched
        """
        articles = self.db.get_articles_without_html(limit=limit)

        if not articles:
            return 0

        success_count = 0
        total = len(articles)

        for position, article in enumerate(articles, start=1):
            if progress_callback:
                progress_callback(position, total, article['headline'])

            if self.fetch_article_html(article['id'], article['url']):
                success_count += 1

        return success_count
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ requests>=2.31.0
openai>=1.3.0
python-dateutil>=2.8.2
pydantic>=2.0.0
beautifulsoup4>=4.12.0