njt · Copilot · Dec 14, 2025 · Dec 14, 2025 · Dec 14, 2025 · Dec 14, 2025
diff --git a/README.md b/README.md
@@ -74,6 +74,22 @@ Fetch only from Radio New Zealand:
 python forcible.py fetch --source rnz
 ```
 
+### Fetch Full Article HTML
+
+Fetch full HTML content for articles that don't have it yet:
+
+```bash
+python forcible.py fetch-html
+```
+
+Fetch HTML for a limited number of articles:
+
+```bash
+python forcible.py fetch-html --limit 10
+```
+
+This command downloads each article's page from its URL and stores only the essential content (paragraphs, headings, lists, block quotes, and external links) in the `raw_html` column, to minimize token usage for LLM processing. Navigation, ads, and other non-content elements are removed.
+
 ### List Articles
 
 List recent articles:
@@ -150,6 +166,7 @@ This displays the article headline, content, and structured LLM analysis includi
 - **config.py**: Configuration management (supports both INI and JSON formats)
 - **database.py**: SQLite database interface for storing articles
 - **rnz_ingester.py**: Radio New Zealand RSS feed ingester
+- **html_fetcher.py**: HTML content fetcher for retrieving full article content
 - **llm_processor.py**: LLM-based article analysis with structured outputs
 - **forcible.py**: Command-line interface
 
@@ -162,7 +179,8 @@ This displays the article headline, content, and structured LLM analysis includi
 - `headline`: Article headline
 - `published_date`: Publication date (ISO format)
 - `fetched_date`: Date fetched from source
-- `content`: Article content/summary
+- `content`: Article content/summary from RSS feed
+- `raw_html`: Extracted article content (text, headings, links) from article URL
 - `data`: JSON field for LLM analysis results (facts, relevance, PR probability, etc.)
 - `created_at`: Record creation timestamp
 - `updated_at`: Last update timestamp

diff --git a/database.py b/database.py
@@ -36,6 +36,7 @@ def _init_schema(self):
                 published_date TEXT,
                 fetched_date TEXT NOT NULL,
                 content TEXT,
+                raw_html TEXT,
                 data TEXT,
                 created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
                 updated_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
@@ -140,6 +141,61 @@ def update_article_data(self, article_id: int, data: Dict[str, Any]):
 
         self.conn.commit()
 
+    def update_article_html(self, article_id: int, raw_html: str):
+        """
+        Store fetched content for an article and refresh its ``updated_at``.
+
+        Args:
+            article_id: ID of the article row to update
+            raw_html: Content to write into the ``raw_html`` column
+        """
+        now_iso = datetime.now(UTC).isoformat()
+        params = (raw_html, now_iso, article_id)
+
+        self.conn.cursor().execute('''
+            UPDATE articles 
+            SET raw_html = ?, updated_at = ?
+            WHERE id = ?
+        ''', params)
+
+        self.conn.commit()
+
+    def get_articles_without_html(self, limit: Optional[int] = None) -> List[Dict[str, Any]]:
+        """
+        Get articles whose ``raw_html`` column is still NULL, newest first.
+
+        Args:
+            limit: Maximum number of articles to return (None = no limit, 0 = none)
+
+        Returns:
+            List of article dictionaries (``data`` JSON-decoded, or None if invalid)
+        """
+        cursor = self.conn.cursor()
+
+        query = '''
+            SELECT * FROM articles 
+            WHERE raw_html IS NULL
+            ORDER BY published_date DESC
+        '''
+
+        # Compare against None so an explicit limit of 0 is not read as "no limit"
+        if limit is not None:
+            query += ' LIMIT ?'
+            cursor.execute(query, (limit,))
+        else:
+            cursor.execute(query)
+
+        articles = []
+        for row in cursor.fetchall():
+            article = dict(row)
+            if article['data']:
+                try:
+                    article['data'] = json.loads(article['data'])
+                except json.JSONDecodeError:
+                    article['data'] = None
+            articles.append(article)
+        return articles
+
     def get_last_scrape_time(self, source_name: str) -> Optional[str]:
         """
         Get the last scrape time for a source.

diff --git a/forcible.py b/forcible.py
@@ -7,13 +7,15 @@
 import argparse
 import sys
 import json
+import traceback
 from pathlib import Path
 from datetime import datetime, UTC
 
 from config import Config
 from database import Database
 from rnz_ingester import RNZIngester
 from llm_processor import LLMProcessor
+from html_fetcher import HTMLFetcher
 
 
 def cmd_fetch(args):
@@ -43,6 +45,46 @@ def cmd_fetch(args):
         sys.exit(1)
 
 
+def cmd_fetch_html(args):
+    """Fetch and store extracted content for articles that lack it.
+
+    Exits with status 1 on failure; the database is closed on every path.
+    """
+    db = None
+    try:
+        config = Config(args.config)
+        db = Database(config.get_database_path())
+        fetcher = HTMLFetcher(db)
+
+        # Queried here only to report the count; the fetcher runs its own query
+        articles_to_fetch = db.get_articles_without_html(limit=args.limit)
+        if not articles_to_fetch:
+            print("No articles need HTML fetching.")
+            return
+
+        print(f"Fetching HTML for {len(articles_to_fetch)} article(s)...\n")
+
+        def progress_callback(current, total, headline):
+            # Tolerate a missing headline rather than aborting the whole run
+            print(f"[{current}/{total}] Fetching: {(headline or '')[:60]}...")
+
+        success_count = fetcher.fetch_all_missing_html(
+            limit=args.limit,
+            progress_callback=progress_callback
+        )
+        print(f"\nFetch complete! Successfully fetched {success_count} article(s).")
+    except FileNotFoundError as e:
+        print(f"Error: {e}", file=sys.stderr)
+        sys.exit(1)
+    except Exception as e:
+        print(f"Error during HTML fetch: {e}", file=sys.stderr)
+        traceback.print_exc()
+        sys.exit(1)
+    finally:
+        if db is not None:
+            db.close()
+
+
 def cmd_list(args):
     """List articles from the database."""
     try:
@@ -244,7 +286,6 @@ def progress_callback(current, total, headline):
         sys.exit(1)
     except Exception as e:
         print(f"Error during processing: {e}", file=sys.stderr)
-        import traceback
         traceback.print_exc()
         sys.exit(1)
 
@@ -345,6 +386,15 @@ def main():
     )
     parser_fetch.set_defaults(func=cmd_fetch)
 
+    # fetch-html command
+    parser_fetch_html = subparsers.add_parser('fetch-html', help='Fetch full HTML content for articles')
+    parser_fetch_html.add_argument(
+        '--limit',
+        type=int,
+        help='Maximum number of articles to fetch (default: all without HTML)'
+    )
+    parser_fetch_html.set_defaults(func=cmd_fetch_html)
+
     # list command
     parser_list = subparsers.add_parser('list', help='List articles')
     parser_list.add_argument('--source', help='Filter by source')

diff --git a/html_fetcher.py b/html_fetcher.py
@@ -0,0 +1,182 @@
+"""
+HTML content fetcher for news articles.
+"""
+from typing import Callable, Optional
+import requests
+from bs4 import BeautifulSoup
+
+
+class HTMLFetcher:
+    """Fetches article pages and extracts their readable content.
+
+    Extracts essential content (text, headings, links) while removing
+    navigation, ads, and other non-content elements to minimize token usage.
+    """
+
+    def __init__(self, database):
+        """
+        Initialize the HTML fetcher.
+
+        Args:
+            database: Database used to look up pending articles and store results
+        """
+        self.db = database
+        self.headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
+        }
+
+    def fetch_html(self, url: str) -> Optional[str]:
+        """
+        Fetch and extract article content from a URL.
+
+        Extracts only the main article content (paragraphs, headings, links)
+        to minimize token usage for LLM processing.
+
+        Args:
+            url: Article URL
+
+        Returns:
+            Extracted Markdown-like text, or None if the fetch fails or is empty
+        """
+        try:
+            response = requests.get(url, headers=self.headers, timeout=30, allow_redirects=True)
+            response.raise_for_status()
+
+            # Parse HTML
+            soup = BeautifulSoup(response.text, 'html.parser')
+
+            # Remove unwanted elements
+            for element in soup(['script', 'style', 'nav', 'header', 'footer', 
+                                'aside', 'iframe', 'noscript', 'form']):
+                element.decompose()
+
+            # Try to find the main article content among common containers
+            article_content = None
+            for selector in ['article', 'main', '[role="main"]', '.article-content', 
+                           '.post-content', '.entry-content', '#content']:
+                article_content = soup.select_one(selector)
+                if article_content:
+                    break
+
+            # If no article container found, use body
+            if not article_content:
+                article_content = soup.body if soup.body else soup
+
+            # Extract text content with structure
+            extracted = []
+            seen_title = False
+
+            # Extract paragraphs, headings, lists and quotes in document order
+            for element in article_content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'ol', 'blockquote']):
+                # Blocks nested in a list or quote are already part of that
+                # container's text; emitting them again would duplicate content
+                if element.find_parent(['ul', 'ol', 'blockquote']):
+                    continue
+
+                # Collapse whitespace rather than get_text(strip=True), which glues
+                # adjacent text nodes together ("seethis" for "see <a>this</a>")
+                text = ' '.join(element.get_text().split())
+                if not text:  # Skip empty elements
+                    continue
+                tag = element.name
+
+                if tag == 'p':
+                    extracted.append(text)
+                    extracted.append("")  # Blank line after paragraph
+                elif tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
+                    # Skip duplicate h1 (title) if we've already seen one
+                    if tag == 'h1' and seen_title:
+                        continue
+                    if tag == 'h1':
+                        seen_title = True
+                    level = int(tag[1])
+                    # Add blank line before heading (except first one)
+                    if extracted:
+                        extracted.append("")
+                    extracted.append(f"{'#' * level} {text}")
+                    extracted.append("")  # Blank line after heading
+                elif tag in ['ul', 'ol']:
+                    for li in element.find_all('li', recursive=False):
+                        li_text = ' '.join(li.get_text().split())
+                        if li_text:
+                            extracted.append(f"- {li_text}")
+                    extracted.append("")  # Blank line after list
+                elif tag == 'blockquote':
+                    extracted.append(f"> {text}")
+                    extracted.append("")  # Blank line after quote
+
+            # Collect absolute http(s) links; other schemes and relative URLs are skipped
+            links = []
+            for a in article_content.find_all('a', href=True):
+                href = a['href']
+                link_text = ' '.join(a.get_text().split())
+                if href.startswith(('http://', 'https://')) and link_text:
+                    links.append(f"[{link_text}]({href})")
+
+            # Combine content
+            content = "\n".join(extracted).strip()
+            # Collapse runs of blank lines into a single blank line
+            while "\n\n\n" in content:
+                content = content.replace("\n\n\n", "\n\n")
+
+            # Add links section if there are external links
+            if links:
+                content += "\n\n## External Links\n" + "\n".join(links)
+
+            return content if content else None
+
+        except requests.RequestException as e:
+            print(f"Error fetching HTML from {url}: {e}")
+            return None
+        except Exception as e:
+            print(f"Unexpected error fetching HTML from {url}: {e}")
+            return None
+
+    def fetch_article_html(self, article_id: int, url: str) -> bool:
+        """
+        Fetch content for a specific article and store it in the database.
+
+        Args:
+            article_id: Article ID
+            url: Article URL
+
+        Returns:
+            True if content was extracted and stored, False otherwise
+        """
+        html = self.fetch_html(url)
+        if html:
+            self.db.update_article_html(article_id, html)
+            return True
+        return False
+
+    def fetch_all_missing_html(self, limit: Optional[int] = None, progress_callback: Optional[Callable[[int, int, str], None]] = None) -> int:
+        """
+        Fetch content for all articles that don't have it yet.
+
+        Args:
+            limit: Maximum number of articles to process (optional)
+            progress_callback: Called as (current, total, headline) before each fetch
+
+        Returns:
+            Number of articles successfully fetched
+        """
+        articles = self.db.get_articles_without_html(limit=limit)
+
+        if not articles:
+            return 0
+
+        success_count = 0
+        total = len(articles)
+
+        for i, article in enumerate(articles):
+            article_id = article['id']
+            url = article['url']
+            headline = article['headline']
+
+            if progress_callback:
+                progress_callback(i + 1, total, headline)
+
+            if self.fetch_article_html(article_id, url):
+                success_count += 1
+
+        return success_count
diff --git a/requirements.txt b/requirements.txt
@@ -3,3 +3,4 @@ requests>=2.31.0
 openai>=1.3.0
 python-dateutil>=2.8.2
 pydantic>=2.0.0
+beautifulsoup4>=4.12.0