update_medium.py
#!/usr/bin/env python3
"""
Scrape Medium profile for new articles and update index.html automatically.
"""
import os
import json
import re
import time
import random
from datetime import datetime
from urllib.parse import urlparse
import requests
from bs4 import BeautifulSoup
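
# medium-config.json is expected to sit next to this script; the only key this
# script reads is "medium_username", e.g. {"medium_username": "some-handle"}.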
def load_config():
    """Load configuration from medium-config.json"""
    config_path = os.path.join(os.path.dirname(__file__), 'medium-config.json')
    with open(config_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def scrape_medium_profile(username):
    """
    Scrape Medium RSS feed for article listings.
    Returns list of dicts: [{'title': ..., 'url': ..., 'date': 'YYYY-MM'}, ...]
    """
    # Medium RSS feed is more reliable than scraping the profile page
    rss_url = f"https://medium.com/feed/@{username}"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'Referer': 'https://medium.com/',
        'Connection': 'keep-alive',
        'DNT': '1',
        'Upgrade-Insecure-Requests': '1'
    }
    print(f"Fetching {rss_url}...")

    # Retry logic with exponential backoff for rate limiting
    max_retries = 3
    for attempt in range(max_retries):
        try:
            response = requests.get(rss_url, headers=headers, timeout=30)
            response.raise_for_status()
            break  # Success, exit retry loop
        except requests.exceptions.HTTPError as e:
            if e.response.status_code == 429:  # Rate limited
                if attempt < max_retries - 1:
                    wait_time = (2 ** attempt) * 5  # 5s, 10s, 20s
                    print(f"Rate limited (429). Waiting {wait_time}s before retry {attempt + 2}/{max_retries}...")
                    time.sleep(wait_time)
                else:
                    print(f"ERROR: Rate limited after {max_retries} attempts. Medium may be blocking requests.")
                    print("The workflow will try again tomorrow. No changes made.")
                    return []
            else:
                raise  # Re-raise other HTTP errors
        except requests.exceptions.RequestException as e:
            print(f"ERROR: Failed to fetch RSS feed: {e}")
            return []

    soup = BeautifulSoup(response.text, 'xml')
    articles = []

    # Parse RSS items
    items = soup.find_all('item')
    print(f"Found {len(items)} items in RSS feed")
    for item in items:
        title_elem = item.find('title')
        link_elem = item.find('link')
        pubdate_elem = item.find('pubDate')
        if not title_elem or not link_elem:
            continue
        title = title_elem.get_text(strip=True)
        url = link_elem.get_text(strip=True)

        # Parse publication date
        article_date = None
        if pubdate_elem:
            pubdate_str = pubdate_elem.get_text(strip=True)
            try:
                # RSS pubDate format: "Wed, 23 Oct 2025 12:34:56 GMT"
                dt = datetime.strptime(pubdate_str, '%a, %d %b %Y %H:%M:%S %Z')
                article_date = dt.strftime('%Y-%m')
            except Exception as e:
                print(f"  Could not parse date '{pubdate_str}': {e}")

        # Default to current month if no date
        if not article_date:
            article_date = datetime.now().strftime('%Y-%m')

        articles.append({
            'title': title,
            'url': url,
            'date': article_date
        })

    print(f"Parsed {len(articles)} articles from RSS feed")
    return articles

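# Fallback helper for when the RSS feed lacks a usable pubDate: fetches the article
# page directly and reads the date from its markup. Currently not called from main().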
def extract_date_from_article(url, headers):
    """Fetch individual article page to extract publication date"""
    try:
        resp = requests.get(url, headers=headers, timeout=15)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, 'html.parser')

        # Look for <time> with datetime attribute
        time_elem = soup.find('time')
        if time_elem:
            datetime_attr = time_elem.get('datetime')
            if datetime_attr:
                dt = datetime.fromisoformat(datetime_attr.replace('Z', '+00:00'))
                return dt.strftime('%Y-%m')

        # Look for meta property with publication date
        meta_date = soup.find('meta', property='article:published_time')
        if meta_date:
            content = meta_date.get('content')
            if content:
                dt = datetime.fromisoformat(content.replace('Z', '+00:00'))
                return dt.strftime('%Y-%m')
    except Exception as e:
        print(f"  Could not extract date from {url}: {e}")
    return None

def normalize_medium_url(url):
    """
    Normalize Medium URL by removing query params and extracting the article slug.
    Returns just the article slug for comparison (e.g., "article-name-abc123")
    """
    # Remove query parameters
    url = url.split('?')[0]
    # Extract the article slug (last part after final /)
    # Medium URLs end with: username/article-title-hexid or just article-title-hexid
    parts = url.rstrip('/').split('/')
    if parts:
        slug = parts[-1]
        # Return the slug for comparison
        return slug
    return url

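# Example (illustrative URL): "https://medium.com/@user/my-post-ab12cd34?source=rss"
# normalizes to the slug "my-post-ab12cd34", so the same article matches regardless
# of tracking parameters.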
def parse_existing_articles(html_content):
    """Parse index.html to find existing Medium articles"""
    existing = set()
    # Match Medium article rows: [M] type + Medium URL
    pattern = r'<tr><td class="type">\[M\]</td><td class="title"><a href="(https://[^"]*medium\.com[^"]*)"'
    matches = re.finditer(pattern, html_content)
    for match in matches:
        url = match.group(1)
        normalized = normalize_medium_url(url)
        existing.add(normalized)
    print(f"Found {len(existing)} existing Medium articles in index.html")
    return existing

def format_article_row(article):
    """Format an article dict as an HTML table row"""
    # Escape HTML special characters in the title (& first, so it is not double-escaped)
    title = article['title'].replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;')
    url = article['url']
    date_str = article['date']
    # Parse date for display (e.g., "2025-10" -> "Oct 2025")
    try:
        dt = datetime.strptime(date_str, '%Y-%m')
        display_date = dt.strftime('%b %Y')
    except ValueError:
        display_date = date_str
    row = f' <tr><td class="type">[M]</td><td class="title"><a href="{url}" target="_blank" rel="noopener">{title}</a></td><td class="date" data-date="{date_str}">{display_date}</td></tr>'
    return row

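# Example of a generated row (illustrative values): a [M]-typed row whose href is the
# Medium URL and whose date cell carries data-date="2025-10" and the text "Oct 2025".
# This is the same shape that parse_existing_articles() matches with its regex.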
def update_index_html(new_articles):
    """Insert new articles into index.html after the first Medium article"""
    index_path = os.path.join(os.path.dirname(__file__), 'index.html')
    with open(index_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Find insertion point: after the first [M] row in writing section
    # We'll insert right after the first Medium article row
    pattern = r'(<tr><td class="type">\[M\]</td><td class="title">.*?</tr>)'
    match = re.search(pattern, content, re.DOTALL)
    if not match:
        print("ERROR: Could not find Medium article insertion point in index.html")
        return False
    insertion_point = match.end()

    # Build new rows
    new_rows = '\n'.join([format_article_row(a) for a in new_articles])

    # Insert
    updated_content = content[:insertion_point] + '\n' + new_rows + content[insertion_point:]
    with open(index_path, 'w', encoding='utf-8') as f:
        f.write(updated_content)
    print(f"✓ Added {len(new_articles)} new article(s) to index.html")
    return True

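# main() returns a process exit status: 0 when the update succeeded or nothing needed
# to change, 1 on configuration or insertion errors.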
def main():
    """Main entry point"""
    # Add a random delay (0-10s) to avoid all GitHub Actions runs hitting Medium simultaneously
    delay = random.randint(0, 10)
    print(f"Waiting {delay}s to avoid rate limiting...")
    time.sleep(delay)

    config = load_config()
    username = config.get('medium_username')
    if not username:
        print("ERROR: medium_username not found in medium-config.json")
        return 1

    # Scrape Medium
    articles = scrape_medium_profile(username)
    if not articles:
        print("No articles found on Medium profile")
        return 0

    # Read existing index.html
    index_path = os.path.join(os.path.dirname(__file__), 'index.html')
    with open(index_path, 'r', encoding='utf-8') as f:
        html_content = f.read()
    existing_urls = parse_existing_articles(html_content)

    # Filter for new articles (normalize URLs for comparison)
    new_articles = []
    for article in articles:
        normalized_url = normalize_medium_url(article['url'])
        if normalized_url not in existing_urls:
            new_articles.append(article)

    if not new_articles:
        print("✓ No new articles to add")
        return 0

    print("\nNew articles to add:")
    for article in new_articles:
        print(f"  - {article['title']} ({article['date']})")

    # Update index.html
    success = update_index_html(new_articles)
    return 0 if success else 1

if __name__ == '__main__':
    raise SystemExit(main())