From 091b3ab3264cc9a7e82c19025764101bea7387a1 Mon Sep 17 00:00:00 2001 From: ReenigneArcher <42013603+ReenigneArcher@users.noreply.github.com> Date: Mon, 18 May 2026 20:57:28 -0400 Subject: [PATCH 1/6] Refactor GitHub commit activity fetching Replace the previous ThreadPoolExecutor-based timeout approach with direct REST calls for repo commit activity. Add status constants (ready/pending/failed), helper functions (_commit_activity_url, _write_commit_activity, _fetch_commit_activity) to handle GitHub 202/204/200 responses, and a two-pass _collect_commit_activity to trigger and then collect pending stats. Update update_github to prefetch commit activity for active repos, include proper headers, and iterate only non-archived repos. Update tests to cover the new fetch/collect behavior and remove the previous timeout-based tests. --- src/updater.py | 146 ++++++++++++++++++++++++++++++------- tests/unit/test_updater.py | 122 ++++++++++++++++++++++--------- 2 files changed, 208 insertions(+), 60 deletions(-) diff --git a/src/updater.py b/src/updater.py index 3cbefedcc..a02c1cded 100644 --- a/src/updater.py +++ b/src/updater.py @@ -2,7 +2,6 @@ import json import math import os -from concurrent.futures import ThreadPoolExecutor, TimeoutError as FuturesTimeout from datetime import datetime, timezone from threading import Thread @@ -18,6 +17,10 @@ from src import helpers from src.logger import log +COMMIT_ACTIVITY_READY = 'ready' +COMMIT_ACTIVITY_PENDING = 'pending' +COMMIT_ACTIVITY_FAILED = 'failed' + def update_aur(aur_repos: list): """ @@ -193,30 +196,90 @@ def update_fb(): helpers.write_json_files(file_path=file_path, data=data) -def _get_stats_with_timeout(repo, timeout=60): +def _commit_activity_url(repo) -> str: """ - Fetch commit activity for a repo, capping total wait time. + Build the GitHub REST URL for a repository's weekly commit activity. Parameters ---------- repo : PyGithub Repository object. 
- timeout : int - Maximum seconds to wait before giving up (GitHub may return 202 while - computing stats, causing PyGithub to retry indefinitely without this guard). Returns ------- - list or None - Weekly commit-activity objects, or None on timeout. + str + GitHub REST API URL. """ - with ThreadPoolExecutor(max_workers=1) as pool: - future = pool.submit(repo.get_stats_commit_activity) - try: - return future.result(timeout=timeout) - except FuturesTimeout: - log.warning(f'Timeout fetching commit activity for {repo.name}, skipping.') - return None + return f'https://api.github.com/repos/{repo.owner.login}/{repo.name}/stats/commit_activity' + + +def _write_commit_activity(repo, commit_activity: list) -> None: + """ + Write weekly commit activity for a repository. + + Parameters + ---------- + repo : + PyGithub Repository object. + commit_activity : list + Weekly commit activity records from GitHub's REST API. + """ + file_path = os.path.join(BASE_DIR, 'github', 'commitActivity', repo.name) + helpers.write_json_files(file_path=file_path, data=commit_activity) + + +def _fetch_commit_activity(repo, headers: dict) -> str: + """ + Fetch or trigger weekly commit activity for a repository. + + GitHub returns ``202 Accepted`` while it computes repository statistics. + This function treats that response as a successful trigger instead of + waiting in-place. + + Parameters + ---------- + repo : + PyGithub Repository object. + headers : dict + HTTP headers including the GitHub authorisation token. + + Returns + ------- + str + One of ``COMMIT_ACTIVITY_READY``, ``COMMIT_ACTIVITY_PENDING``, or + ``COMMIT_ACTIVITY_FAILED``. 
+ """ + url = _commit_activity_url(repo) + try: + response = helpers.s.get(url=url, headers=headers) + except requests.exceptions.RequestException as e: + log.warning(f'Error fetching commit activity for {repo.name}: {e}') + return COMMIT_ACTIVITY_FAILED + + if response.status_code == 202: + return COMMIT_ACTIVITY_PENDING + + if response.status_code == 204: + _write_commit_activity(repo, []) + return COMMIT_ACTIVITY_READY + + try: + data = response.json() + except Exception as e: + log.warning(f'Error parsing commit activity for {repo.name}: {e}') + return COMMIT_ACTIVITY_FAILED + + if response.status_code != 200: + message = data.get('message', response.text) if isinstance(data, dict) else response.text + log.warning(f'Error fetching commit activity for {repo.name}: {message}') + return COMMIT_ACTIVITY_FAILED + + if not isinstance(data, list): + log.warning(f'Unexpected commit activity response for {repo.name}: {data}') + return COMMIT_ACTIVITY_FAILED + + _write_commit_activity(repo, data) + return COMMIT_ACTIVITY_READY def _seed_star_history(repo, total: int, initial_samples: int) -> list[dict]: @@ -428,6 +491,43 @@ def _build_code_scanning_history(alerts: list) -> list[dict]: ] +def _collect_commit_activity(repos: list, headers: dict) -> None: + """ + Trigger and collect weekly commit activity for active repositories. + + GitHub may return ``202 Accepted`` for the stats endpoint while it starts + its server-side calculation. The first pass gives every repository a chance + to start that work. The second pass only revisits repositories that were + still pending after the first request. + + Parameters + ---------- + repos : list + Active PyGithub Repository objects. + headers : dict + HTTP headers including the GitHub authorisation token. 
+ """ + pending_repos = [] + for repo in tqdm( + iterable=repos, + desc='Triggering GitHub commit activity', + ): + status = _fetch_commit_activity(repo, headers) + if status == COMMIT_ACTIVITY_PENDING: + pending_repos.append(repo) + + if not pending_repos: + return + + for repo in tqdm( + iterable=pending_repos, + desc='Collecting GitHub commit activity', + ): + status = _fetch_commit_activity(repo, headers) + if status == COMMIT_ACTIVITY_PENDING: + log.warning(f'Commit activity for {repo.name} is still being calculated by GitHub, skipping.') + + def _process_github_repo(repo, headers: dict, graphql_url: str) -> None: """ Collect and cache all per-repository data for a single GitHub repo. @@ -446,13 +546,6 @@ def _process_github_repo(repo, headers: dict, graphql_url: str) -> None: file_path = os.path.join(BASE_DIR, 'github', 'languages', repo.name) helpers.write_json_files(file_path=file_path, data=languages) - # commit activity (last year, weekly buckets) - commit_activity = _get_stats_with_timeout(repo) - if commit_activity: - commits = [week.raw_data for week in commit_activity] - file_path = os.path.join(BASE_DIR, 'github', 'commitActivity', repo.name) - helpers.write_json_files(file_path=file_path, data=commits) - # open pull requests pulls_data = [] for pr in repo.get_pulls(state='open'): @@ -542,16 +635,19 @@ def update_github(): # GraphQL query still uses direct requests headers = { + 'Accept': 'application/vnd.github+json', 'Authorization': f'token {os.environ["GITHUB_TOKEN"]}', + 'X-GitHub-Api-Version': '2022-11-28', } graphql_url = 'https://api.github.com/graphql' + active_repos = [repo for repo in repos if not repo.archived] + _collect_commit_activity(active_repos, headers) + for repo in tqdm( - iterable=repos, + iterable=active_repos, desc='Updating GitHub data', ): - if repo.archived: - continue _process_github_repo(repo, headers, graphql_url) diff --git a/tests/unit/test_updater.py b/tests/unit/test_updater.py index 42759f87a..43ef3f0b7 100644 --- 
a/tests/unit/test_updater.py +++ b/tests/unit/test_updater.py @@ -1,6 +1,5 @@ # standard imports import json -from concurrent.futures import TimeoutError as FuturesTimeout from datetime import datetime, timezone from types import SimpleNamespace @@ -26,11 +25,6 @@ def json(self): return self._payload -class FakeWeek: - def __init__(self, week, total): - self.raw_data = {'week': week, 'total': total} - - class FakePull: def __init__(self, number=1): self.number = number @@ -71,9 +65,6 @@ def __init__(self, name='repo1', archived=False, stars=4): def get_languages(self): return {'Python': 100} - def get_stats_commit_activity(self): - return [FakeWeek(1, 1)] - def get_pulls(self, state='open'): assert state == 'open' return [FakePull(3)] @@ -229,37 +220,88 @@ def fake_get(url): assert 'paging' not in writes[0][1] -def test_get_stats_with_timeout_success_and_timeout(monkeypatch): - class FutureOk: - def result(self, timeout): - return [1] +def test_fetch_commit_activity(monkeypatch, tmp_path): + monkeypatch.setattr(updater, 'BASE_DIR', str(tmp_path / 'gh-pages')) + + writes = [] + monkeypatch.setattr(updater.helpers, 'write_json_files', lambda file_path, data: writes.append((file_path, data))) + + responses = [ + FakeResponse([{'week': 1, 'total': 2}], status=200), + FakeResponse(status=202), + FakeResponse(status=204), + ] + monkeypatch.setattr(updater.helpers.s, 'get', lambda url, headers: responses.pop(0)) + + repo = FakeRepo(name='demo') + headers = {'Authorization': 'token'} + + assert updater._fetch_commit_activity(repo, headers) == updater.COMMIT_ACTIVITY_READY + assert updater._fetch_commit_activity(repo, headers) == updater.COMMIT_ACTIVITY_PENDING + assert updater._fetch_commit_activity(repo, headers) == updater.COMMIT_ACTIVITY_READY + assert len(writes) == 2 + assert writes[0][1] == [{'week': 1, 'total': 2}] + assert writes[1][1] == [] + assert '/repos/owner/demo/stats/commit_activity' in updater._commit_activity_url(repo) + + +def 
test_fetch_commit_activity_errors(monkeypatch): + repo = FakeRepo(name='demo') + headers = {'Authorization': 'token'} + warnings = [] + monkeypatch.setattr(updater.log, 'warning', lambda msg: warnings.append(msg)) + + monkeypatch.setattr( + updater.helpers.s, + 'get', + lambda url, headers: (_ for _ in ()).throw(requests.exceptions.Timeout('timeout')), + ) + assert updater._fetch_commit_activity(repo, headers) == updater.COMMIT_ACTIVITY_FAILED + + monkeypatch.setattr( + updater.helpers.s, + 'get', + lambda url, headers: FakeResponse(status=500, raises=ValueError('bad')), + ) + assert updater._fetch_commit_activity(repo, headers) == updater.COMMIT_ACTIVITY_FAILED - class FutureTimeout: - def result(self, timeout): - raise FuturesTimeout() + monkeypatch.setattr( + updater.helpers.s, + 'get', + lambda url, headers: FakeResponse({'message': 'rate limit'}, status=403), + ) + assert updater._fetch_commit_activity(repo, headers) == updater.COMMIT_ACTIVITY_FAILED - class Pool: - def __init__(self, future): - self.future = future + monkeypatch.setattr(updater.helpers.s, 'get', lambda url, headers: FakeResponse({'bad': 1}, status=200)) + assert updater._fetch_commit_activity(repo, headers) == updater.COMMIT_ACTIVITY_FAILED + assert len(warnings) == 4 - def __enter__(self): - return self - def __exit__(self, *args): - return False +def test_collect_commit_activity(monkeypatch): + ready = FakeRepo('ready') + pending = FakeRepo('pending') + stuck = FakeRepo('stuck') - def submit(self, func): - return self.future + statuses = { + 'ready': [updater.COMMIT_ACTIVITY_READY, updater.COMMIT_ACTIVITY_READY], + 'pending': [updater.COMMIT_ACTIVITY_PENDING, updater.COMMIT_ACTIVITY_READY], + 'stuck': [updater.COMMIT_ACTIVITY_PENDING, updater.COMMIT_ACTIVITY_PENDING], + } + calls = [] - monkeypatch.setattr(updater, 'ThreadPoolExecutor', lambda max_workers: Pool(FutureOk())) - repo = SimpleNamespace(name='x', get_stats_commit_activity=lambda: [1]) - assert 
updater._get_stats_with_timeout(repo) == [1] + def fake_fetch(repo, headers): + calls.append(repo.name) + return statuses[repo.name].pop(0) warnings = [] + monkeypatch.setattr(updater, '_fetch_commit_activity', fake_fetch) monkeypatch.setattr(updater.log, 'warning', lambda msg: warnings.append(msg)) - monkeypatch.setattr(updater, 'ThreadPoolExecutor', lambda max_workers: Pool(FutureTimeout())) - assert updater._get_stats_with_timeout(repo) is None - assert warnings + + updater._collect_commit_activity([ready], {}) + updater._collect_commit_activity([ready, pending, stuck], {}) + + assert calls == ['ready', 'ready', 'pending', 'stuck', 'pending', 'stuck'] + assert warnings == ['Commit activity for stuck is still being calculated by GitHub, skipping.'] def test_seed_star_history(monkeypatch): @@ -331,7 +373,6 @@ def test_process_github_repo(monkeypatch, tmp_path): 'save_image_from_url', lambda **kwargs: writes.append(('img', kwargs['file_path'])) ) - monkeypatch.setattr(updater, '_get_stats_with_timeout', lambda repo: [FakeWeek(1, 1)]) monkeypatch.setattr(updater, '_collect_star_history', lambda repo: [{'date': '2026-01-01', 'stars': 1}]) monkeypatch.setattr(updater, '_fetch_code_scanning_alerts', lambda repo: []) monkeypatch.setattr( @@ -357,7 +398,6 @@ def post_ok(url, json, headers): def test_process_github_repo_error_and_avatar_skip(monkeypatch, tmp_path): monkeypatch.setattr(updater, 'BASE_DIR', str(tmp_path / 'gh-pages')) monkeypatch.setattr(updater.helpers, 'write_json_files', lambda **kwargs: None) - monkeypatch.setattr(updater, '_get_stats_with_timeout', lambda repo: None) monkeypatch.setattr(updater, '_collect_star_history', lambda repo: []) monkeypatch.setattr(updater, '_fetch_code_scanning_alerts', lambda repo: []) monkeypatch.setattr(updater, '_build_code_scanning_history', lambda alerts: []) @@ -387,9 +427,10 @@ def test_update_github(monkeypatch): monkeypatch.setenv('GITHUB_REPOSITORY_OWNER', 'owner') repo_active = FakeRepo('active', archived=False) + 
repo_pending = FakeRepo('pending', archived=False) repo_archived = FakeRepo('archived', archived=True) - owner = SimpleNamespace(get_repos=lambda: [repo_active, repo_archived]) + owner = SimpleNamespace(get_repos=lambda: [repo_active, repo_pending, repo_archived]) class FakeGithub: def __init__(self, auth, timeout): @@ -403,6 +444,16 @@ def get_user(self, name): writes = [] monkeypatch.setattr(updater.helpers, 'write_json_files', lambda file_path, data: writes.append((file_path, data))) + commit_calls = [] + commit_statuses = { + 'active': [updater.COMMIT_ACTIVITY_READY], + 'pending': [updater.COMMIT_ACTIVITY_PENDING, updater.COMMIT_ACTIVITY_READY], + } + monkeypatch.setattr( + updater, + '_fetch_commit_activity', + lambda repo, headers: commit_calls.append(repo.name) or commit_statuses[repo.name].pop(0), + ) processed = [] monkeypatch.setattr(updater, '_process_github_repo', lambda repo, headers, graphql_url: processed.append(repo.name)) monkeypatch.setattr(updater, 'BASE_DIR', 'base') @@ -410,7 +461,8 @@ def get_user(self, name): updater.update_github() assert any(path.endswith('github\\repos') or path.endswith('github/repos') for path, _ in writes) - assert processed == ['active'] + assert commit_calls == ['active', 'pending', 'pending'] + assert processed == ['active', 'pending'] def test_update_patreon(monkeypatch): From af10df708c43ff2663c8aa23744af761bb963425 Mon Sep 17 00:00:00 2001 From: ReenigneArcher <42013603+ReenigneArcher@users.noreply.github.com> Date: Mon, 18 May 2026 21:21:45 -0400 Subject: [PATCH 2/6] Add timeout wrapper for GitHub repo steps Introduce a per-repository GitHub step runner with timeouts (GITHUB_REPO_STEP_TIMEOUT=90s) to guard against slow or failing API calls. Adds _run_github_repo_step which runs callables in a daemon thread, returns a default on timeout/error, and logs warnings. 
Extract helper functions _collect_open_pulls and _fetch_open_graph_image_url and integrate the timeout wrapper into _process_github_repo for languages, pulls, code scanning alerts, star history, and OpenGraph image fetch/download (image download uses a 30s timeout). Update tests: add test_run_github_repo_step_timeout and adjust an existing test to assert warning behavior instead of raising SystemExit. --- src/updater.py | 212 +++++++++++++++++++++++++++---------- tests/unit/test_updater.py | 27 ++++- 2 files changed, 183 insertions(+), 56 deletions(-) diff --git a/src/updater.py b/src/updater.py index a02c1cded..1c9dbeae2 100644 --- a/src/updater.py +++ b/src/updater.py @@ -2,6 +2,7 @@ import json import math import os +from queue import Queue from datetime import datetime, timezone from threading import Thread @@ -20,6 +21,7 @@ COMMIT_ACTIVITY_READY = 'ready' COMMIT_ACTIVITY_PENDING = 'pending' COMMIT_ACTIVITY_FAILED = 'failed' +GITHUB_REPO_STEP_TIMEOUT = 90 def update_aur(aur_repos: list): @@ -228,6 +230,58 @@ def _write_commit_activity(repo, commit_activity: list) -> None: helpers.write_json_files(file_path=file_path, data=commit_activity) +def _run_github_repo_step(repo, step: str, func: callable, default=None, timeout: int = GITHUB_REPO_STEP_TIMEOUT): + """ + Run an optional per-repository GitHub step with a total timeout. + + Parameters + ---------- + repo : + PyGithub Repository object. + step : str + Human-readable step name for logs. + func : callable + Function to run. + default : + Value returned when the step errors or times out. + timeout : int + Maximum seconds to wait for the step. + + Returns + ------- + any + The callable result, or ``default`` when the step fails. 
+ """ + tqdm.write(f'GitHub {repo.name}: {step}...') + + result_queue = Queue(maxsize=1) + + def runner(): + try: + result_queue.put((True, func())) + except BaseException as e: + result_queue.put((False, e)) + + thread = Thread(target=runner, daemon=True) + thread.start() + thread.join(timeout=timeout) + + if thread.is_alive(): + message = f'Timeout after {timeout}s while running GitHub {step} for {repo.name}, skipping.' + log.warning(message) + tqdm.write(message) + return default + + success, value = result_queue.get() + if success: + return value + + message = f'Error running GitHub {step} for {repo.name}: {value}' + log.warning(message) + tqdm.write(message) + return default + + def _fetch_commit_activity(repo, headers: dict) -> str: """ Fetch or trigger weekly commit activity for a repository. @@ -491,6 +545,70 @@ def _build_code_scanning_history(alerts: list) -> list[dict]: ] +def _collect_open_pulls(repo) -> list[dict]: + """ + Fetch open pull request summary data for a repository. + + Parameters + ---------- + repo : + PyGithub Repository object. + + Returns + ------- + list + Pull request summary dictionaries. + """ + pulls_data = [] + for pr in repo.get_pulls(state='open'): + pulls_data.append({ + 'number': pr.number, + 'title': pr.title, + 'author': pr.user.login, + 'labels': [label.name for label in pr.labels], + 'assignees': [assignee.login for assignee in pr.assignees], + 'created_at': pr.created_at.isoformat(), + 'updated_at': pr.updated_at.isoformat(), + 'draft': pr.draft, + 'milestone': pr.milestone.title if pr.milestone else None, + }) + return pulls_data + + +def _fetch_open_graph_image_url(repo, headers: dict, graphql_url: str) -> str: + """ + Fetch a repository's OpenGraph image URL from GitHub GraphQL. + + Parameters + ---------- + repo : + PyGithub Repository object. + headers : dict + HTTP headers including the GitHub authorisation token. + graphql_url : str + GitHub GraphQL endpoint URL. + + Returns + ------- + str + OpenGraph image URL. 
+ """ + query = """ + { + repository(owner: "%s", name: "%s") { + openGraphImageUrl + } + } + """ % (repo.owner.login, repo.name) + + response = helpers.s.post(url=graphql_url, json={'query': query}, headers=headers) + repo_data = response.json() + try: + return repo_data['data']['repository']['openGraphImageUrl'] + except KeyError: + raise RuntimeError(f'Error: update_github: {repo_data}') from None + + def _collect_commit_activity(repos: list, headers: dict) -> None: """ Trigger and collect weekly commit activity for active repositories. @@ -542,73 +660,59 @@ def _process_github_repo(repo, headers: dict, graphql_url: str) -> None: GitHub GraphQL endpoint URL. """ # languages - languages = repo.get_languages() - file_path = os.path.join(BASE_DIR, 'github', 'languages', repo.name) - helpers.write_json_files(file_path=file_path, data=languages) + languages = _run_github_repo_step(repo, 'languages', repo.get_languages) + if languages is not None: + file_path = os.path.join(BASE_DIR, 'github', 'languages', repo.name) + helpers.write_json_files(file_path=file_path, data=languages) # open pull requests - pulls_data = [] - for pr in repo.get_pulls(state='open'): - pulls_data.append({ - 'number': pr.number, - 'title': pr.title, - 'author': pr.user.login, - 'labels': [label.name for label in pr.labels], - 'assignees': [assignee.login for assignee in pr.assignees], - 'created_at': pr.created_at.isoformat(), - 'updated_at': pr.updated_at.isoformat(), - 'draft': pr.draft, - 'milestone': pr.milestone.title if pr.milestone else None, - }) - file_path = os.path.join(BASE_DIR, 'github', 'pulls', repo.name) - helpers.write_json_files(file_path=file_path, data=pulls_data) + pulls_data = _run_github_repo_step(repo, 'pull requests', lambda: _collect_open_pulls(repo)) + if pulls_data is not None: + file_path = os.path.join(BASE_DIR, 'github', 'pulls', repo.name) + helpers.write_json_files(file_path=file_path, data=pulls_data) # open code scanning alerts and per-day history - alerts = 
_fetch_code_scanning_alerts(repo) - open_alert_count = sum( - 1 for a in alerts if getattr(a, 'state', None) == 'open' - ) - file_path = os.path.join(BASE_DIR, 'github', 'codeScanning', repo.name) - helpers.write_json_files(file_path=file_path, data={ - 'repo': repo.name, - 'open': open_alert_count, - 'updated_at': datetime.now(tz=timezone.utc).isoformat(), - }) + alerts = _run_github_repo_step(repo, 'code scanning alerts', lambda: _fetch_code_scanning_alerts(repo)) + if alerts is not None: + open_alert_count = sum( + 1 for a in alerts if getattr(a, 'state', None) == 'open' + ) + file_path = os.path.join(BASE_DIR, 'github', 'codeScanning', repo.name) + helpers.write_json_files(file_path=file_path, data={ + 'repo': repo.name, + 'open': open_alert_count, + 'updated_at': datetime.now(tz=timezone.utc).isoformat(), + }) - code_scanning_history = _build_code_scanning_history(alerts) - file_path = os.path.join(BASE_DIR, 'github', 'codeScanningHistory', repo.name) - helpers.write_json_files(file_path=file_path, data=code_scanning_history) + code_scanning_history = _build_code_scanning_history(alerts) + file_path = os.path.join(BASE_DIR, 'github', 'codeScanningHistory', repo.name) + helpers.write_json_files(file_path=file_path, data=code_scanning_history) # star history (sampled to cap API calls) - star_history = _collect_star_history(repo) + star_history = _run_github_repo_step(repo, 'star history', lambda: _collect_star_history(repo)) if star_history: file_path = os.path.join(BASE_DIR, 'github', 'starHistory', repo.name) helpers.write_json_files(file_path=file_path, data=star_history) # openGraphImages - uses GraphQL - query = """ - { - repository(owner: "%s", name: "%s") { - openGraphImageUrl - } - } - """ % (repo.owner.login, repo.name) - - response = helpers.s.post(url=graphql_url, json={'query': query}, headers=headers) - repo_data = response.json() - try: - image_url = repo_data['data']['repository']['openGraphImageUrl'] - except KeyError: - log.error(f'Error: 
update_github: {repo_data}') - raise SystemExit('"GITHUB_TOKEN" is invalid.') - if 'avatars' not in image_url: + image_url = _run_github_repo_step( + repo, + 'OpenGraph image URL', + lambda: _fetch_open_graph_image_url(repo, headers, graphql_url), + ) + if image_url and 'avatars' not in image_url: file_path = os.path.join(BASE_DIR, 'github', 'openGraphImages', repo.name) - helpers.save_image_from_url( - file_path=file_path, - file_extension='png', - image_url=image_url, - size_x=624, - size_y=312, + _run_github_repo_step( + repo, + 'OpenGraph image download', + lambda: helpers.save_image_from_url( + file_path=file_path, + file_extension='png', + image_url=image_url, + size_x=624, + size_y=312, + ), + timeout=30, ) diff --git a/tests/unit/test_updater.py b/tests/unit/test_updater.py index 43ef3f0b7..81985c5f2 100644 --- a/tests/unit/test_updater.py +++ b/tests/unit/test_updater.py @@ -1,5 +1,6 @@ # standard imports import json +import time from datetime import datetime, timezone from types import SimpleNamespace @@ -277,6 +278,26 @@ def test_fetch_commit_activity_errors(monkeypatch): assert len(warnings) == 4 +def test_run_github_repo_step_timeout(monkeypatch): + repo = FakeRepo(name='demo') + warnings = [] + messages = [] + monkeypatch.setattr(updater.log, 'warning', lambda msg: warnings.append(msg)) + monkeypatch.setattr(updater.tqdm, 'write', lambda msg: messages.append(msg)) + + result = updater._run_github_repo_step( + repo, + 'slow step', + lambda: time.sleep(0.05), + default='fallback', + timeout=0.001, + ) + + assert result == 'fallback' + assert warnings == ['Timeout after 0.001s while running GitHub slow step for demo, skipping.'] + assert messages[-1] == warnings[0] + + def test_collect_commit_activity(monkeypatch): ready = FakeRepo('ready') pending = FakeRepo('pending') @@ -401,6 +422,8 @@ def test_process_github_repo_error_and_avatar_skip(monkeypatch, tmp_path): monkeypatch.setattr(updater, '_collect_star_history', lambda repo: []) 
monkeypatch.setattr(updater, '_fetch_code_scanning_alerts', lambda repo: []) monkeypatch.setattr(updater, '_build_code_scanning_history', lambda alerts: []) + warnings = [] + monkeypatch.setattr(updater.log, 'warning', lambda msg: warnings.append(msg)) monkeypatch.setattr( updater.helpers.s, @@ -418,8 +441,8 @@ def test_process_github_repo_error_and_avatar_skip(monkeypatch, tmp_path): updater._process_github_repo(FakeRepo(name='demo'), {'Authorization': 'x'}, 'https://api.github.com/graphql') monkeypatch.setattr(updater.helpers.s, 'post', lambda url, json, headers: FakeResponse({'bad': 1})) - with pytest.raises(SystemExit): - updater._process_github_repo(FakeRepo(name='demo'), {'Authorization': 'x'}, 'https://api.github.com/graphql') + updater._process_github_repo(FakeRepo(name='demo'), {'Authorization': 'x'}, 'https://api.github.com/graphql') + assert any('OpenGraph image URL' in warning for warning in warnings) def test_update_github(monkeypatch): From f91583749d992b2dfa6ccec26aa0de1df72aaa6b Mon Sep 17 00:00:00 2001 From: ReenigneArcher <42013603+ReenigneArcher@users.noreply.github.com> Date: Mon, 18 May 2026 21:35:26 -0400 Subject: [PATCH 3/6] Poll GitHub commit activity until ready Add retry polling to _collect_commit_activity to handle GitHub's 202/async stats calculation. Introduce COMMIT_ACTIVITY_POLL_ATTEMPTS and COMMIT_ACTIVITY_POLL_INTERVAL (defaults 6 and 15s) and allow callers to override poll_attempts and poll_interval. Between attempts the function logs and writes progress messages, sleeps for the configured interval, and retries only pending repos; when repos remain pending after all attempts it logs a consolidated warning. Update unit tests to pass poll parameters, add a test that verifies repeated polls and sleeps, and ensure existing tests monkeypatch time.sleep where needed. 
--- src/updater.py | 57 +++++++++++++++++++++++++++++--------- tests/unit/test_updater.py | 37 +++++++++++++++++++++++-- 2 files changed, 78 insertions(+), 16 deletions(-) diff --git a/src/updater.py b/src/updater.py index 1c9dbeae2..a76cfb160 100644 --- a/src/updater.py +++ b/src/updater.py @@ -5,6 +5,7 @@ from queue import Queue from datetime import datetime, timezone from threading import Thread +import time # lib imports from github import Auth, Github @@ -21,6 +22,8 @@ COMMIT_ACTIVITY_READY = 'ready' COMMIT_ACTIVITY_PENDING = 'pending' COMMIT_ACTIVITY_FAILED = 'failed' +COMMIT_ACTIVITY_POLL_ATTEMPTS = 6 +COMMIT_ACTIVITY_POLL_INTERVAL = 15 GITHUB_REPO_STEP_TIMEOUT = 90 @@ -609,14 +612,19 @@ def _fetch_open_graph_image_url(repo, headers: dict, graphql_url: str) -> str: raise RuntimeError(f'Error: update_github: {repo_data}') from None -def _collect_commit_activity(repos: list, headers: dict) -> None: +def _collect_commit_activity( + repos: list, + headers: dict, + poll_attempts: int = COMMIT_ACTIVITY_POLL_ATTEMPTS, + poll_interval: int = COMMIT_ACTIVITY_POLL_INTERVAL, +) -> None: """ Trigger and collect weekly commit activity for active repositories. GitHub may return ``202 Accepted`` for the stats endpoint while it starts its server-side calculation. The first pass gives every repository a chance - to start that work. The second pass only revisits repositories that were - still pending after the first request. + to start that work. Later passes only revisit repositories that were still + pending after the previous request. Parameters ---------- @@ -624,6 +632,10 @@ def _collect_commit_activity(repos: list, headers: dict) -> None: Active PyGithub Repository objects. headers : dict HTTP headers including the GitHub authorisation token. + poll_attempts : int + Maximum number of follow-up passes for pending repositories. + poll_interval : int + Seconds to wait between follow-up passes. 
""" pending_repos = [] for repo in tqdm( @@ -634,16 +646,35 @@ def _collect_commit_activity(repos: list, headers: dict) -> None: if status == COMMIT_ACTIVITY_PENDING: pending_repos.append(repo) - if not pending_repos: - return - - for repo in tqdm( - iterable=pending_repos, - desc='Collecting GitHub commit activity', - ): - status = _fetch_commit_activity(repo, headers) - if status == COMMIT_ACTIVITY_PENDING: - log.warning(f'Commit activity for {repo.name} is still being calculated by GitHub, skipping.') + for attempt in range(1, poll_attempts + 1): + if not pending_repos: + return + + if poll_interval > 0: + message = ( + f'Waiting {poll_interval}s for GitHub commit activity calculation ' + f'({len(pending_repos)} repos pending, attempt {attempt}/{poll_attempts}).' + ) + log.info(message) + tqdm.write(message) + time.sleep(poll_interval) + + next_pending = [] + for repo in tqdm( + iterable=pending_repos, + desc=f'Collecting GitHub commit activity ({attempt}/{poll_attempts})', + ): + status = _fetch_commit_activity(repo, headers) + if status == COMMIT_ACTIVITY_PENDING: + next_pending.append(repo) + + pending_repos = next_pending + + if pending_repos: + repo_names = ', '.join(repo.name for repo in pending_repos) + message = f'GitHub commit activity is still being calculated for: {repo_names}' + log.warning(message) + tqdm.write(message) def _process_github_repo(repo, headers: dict, graphql_url: str) -> None: diff --git a/tests/unit/test_updater.py b/tests/unit/test_updater.py index 81985c5f2..c6ff65853 100644 --- a/tests/unit/test_updater.py +++ b/tests/unit/test_updater.py @@ -318,11 +318,41 @@ def fake_fetch(repo, headers): monkeypatch.setattr(updater, '_fetch_commit_activity', fake_fetch) monkeypatch.setattr(updater.log, 'warning', lambda msg: warnings.append(msg)) - updater._collect_commit_activity([ready], {}) - updater._collect_commit_activity([ready, pending, stuck], {}) + updater._collect_commit_activity([ready], {}, poll_attempts=1, poll_interval=0) + 
updater._collect_commit_activity([ready, pending, stuck], {}, poll_attempts=1, poll_interval=0) assert calls == ['ready', 'ready', 'pending', 'stuck', 'pending', 'stuck'] - assert warnings == ['Commit activity for stuck is still being calculated by GitHub, skipping.'] + assert warnings == ['GitHub commit activity is still being calculated for: stuck'] + + +def test_collect_commit_activity_polls_until_ready(monkeypatch): + repo = FakeRepo('repo') + + statuses = [ + updater.COMMIT_ACTIVITY_PENDING, + updater.COMMIT_ACTIVITY_PENDING, + updater.COMMIT_ACTIVITY_READY, + ] + calls = [] + sleeps = [] + infos = [] + messages = [] + + def fake_fetch(repo, headers): + calls.append(repo.name) + return statuses.pop(0) + + monkeypatch.setattr(updater, '_fetch_commit_activity', fake_fetch) + monkeypatch.setattr(updater.time, 'sleep', lambda seconds: sleeps.append(seconds)) + monkeypatch.setattr(updater.log, 'info', lambda msg: infos.append(msg)) + monkeypatch.setattr(updater.tqdm, 'write', lambda msg: messages.append(msg)) + + updater._collect_commit_activity([repo], {}, poll_attempts=2, poll_interval=1) + + assert calls == ['repo', 'repo', 'repo'] + assert sleeps == [1, 1] + assert len(infos) == 2 + assert messages == infos def test_seed_star_history(monkeypatch): @@ -477,6 +507,7 @@ def get_user(self, name): '_fetch_commit_activity', lambda repo, headers: commit_calls.append(repo.name) or commit_statuses[repo.name].pop(0), ) + monkeypatch.setattr(updater.time, 'sleep', lambda seconds: None) processed = [] monkeypatch.setattr(updater, '_process_github_repo', lambda repo, headers, graphql_url: processed.append(repo.name)) monkeypatch.setattr(updater, 'BASE_DIR', 'base') From a80f26f887c6d51c83ce93586747c8855c852308 Mon Sep 17 00:00:00 2001 From: ReenigneArcher <42013603+ReenigneArcher@users.noreply.github.com> Date: Mon, 18 May 2026 22:08:19 -0400 Subject: [PATCH 4/6] Use participation stats and cache commit activity Switch commit-activity collection to use /stats/participation 
(weekly totals) instead of /stats/commit_activity to avoid long 202 responses in CI. Add cache helper paths and functions to read/write commitActivity and commitActivityHashes (by default-branch SHA), compute commitActivity-shaped records from participation totals, and only refresh stats when the default-branch SHA changes. Remove polling logic/constants and simplify collection to skip repos with up-to-date cached data. Also add a GH Actions step to restore the generated data cache before collection. Update unit tests to cover the new caching behaviour, participation conversion, and workflow changes. --- .github/workflows/update-pages.yml | 10 ++ src/updater.py | 242 ++++++++++++++++++++--------- tests/unit/test_updater.py | 140 +++++++++-------- 3 files changed, 260 insertions(+), 132 deletions(-) diff --git a/.github/workflows/update-pages.yml b/.github/workflows/update-pages.yml index 2c9c44b3e..062f15690 100644 --- a/.github/workflows/update-pages.yml +++ b/.github/workflows/update-pages.yml @@ -35,6 +35,16 @@ jobs: python -m pip install --upgrade pip setuptools wheel python -m pip install . 
+ - name: Restore generated data cache + shell: bash + run: | + mkdir -p gh-pages + git fetch --depth=1 origin gh-pages + git archive origin/gh-pages github/commitActivity | tar -x -C gh-pages + if git cat-file -e origin/gh-pages:github/commitActivityHashes; then + git archive origin/gh-pages github/commitActivityHashes | tar -x -C gh-pages + fi + - name: Collect data env: DASHBOARD_AUR_REPOS: sunshine,sunshine-bin,sunshine-git diff --git a/src/updater.py b/src/updater.py index a76cfb160..119209ece 100644 --- a/src/updater.py +++ b/src/updater.py @@ -3,9 +3,8 @@ import math import os from queue import Queue -from datetime import datetime, timezone +from datetime import datetime, timedelta, timezone from threading import Thread -import time # lib imports from github import Auth, Github @@ -22,8 +21,6 @@ COMMIT_ACTIVITY_READY = 'ready' COMMIT_ACTIVITY_PENDING = 'pending' COMMIT_ACTIVITY_FAILED = 'failed' -COMMIT_ACTIVITY_POLL_ATTEMPTS = 6 -COMMIT_ACTIVITY_POLL_INTERVAL = 15 GITHUB_REPO_STEP_TIMEOUT = 90 @@ -201,9 +198,9 @@ def update_fb(): helpers.write_json_files(file_path=file_path, data=data) -def _commit_activity_url(repo) -> str: +def _commit_participation_url(repo) -> str: """ - Build the GitHub REST URL for a repository's weekly commit activity. + Build the GitHub REST URL for a repository's weekly commit participation. Parameters ---------- @@ -215,10 +212,106 @@ def _commit_activity_url(repo) -> str: str GitHub REST API URL. """ - return f'https://api.github.com/repos/{repo.owner.login}/{repo.name}/stats/commit_activity' + return f'https://api.github.com/repos/{repo.owner.login}/{repo.name}/stats/participation' -def _write_commit_activity(repo, commit_activity: list) -> None: +def _commit_activity_cache_path(repo) -> str: + """ + Return the cache path for a repository's weekly commit activity. + + Parameters + ---------- + repo : + PyGithub Repository object. + + Returns + ------- + str + File path without the ``.json`` extension. 
+ """ + return os.path.join(BASE_DIR, 'github', 'commitActivity', repo.name) + + +def _commit_activity_hash_cache_path(repo) -> str: + """ + Return the cache path for a repository's commit-activity source SHA. + + Parameters + ---------- + repo : + PyGithub Repository object. + + Returns + ------- + str + File path without the ``.json`` extension. + """ + return os.path.join(BASE_DIR, 'github', 'commitActivityHashes', repo.name) + + +def _has_cached_commit_activity(repo) -> bool: + """ + Return whether cached weekly commit activity exists for a repository. + + Parameters + ---------- + repo : + PyGithub Repository object. + + Returns + ------- + bool + True when a valid cached stats file exists. + """ + try: + with open(f'{_commit_activity_cache_path(repo)}.json') as f: + return isinstance(json.load(f), list) + except Exception: + return False + + +def _cached_commit_activity_sha(repo) -> str | None: + """ + Return the cached default-branch SHA for a repository's commit activity. + + Parameters + ---------- + repo : + PyGithub Repository object. + + Returns + ------- + str or None + Cached SHA when available. + """ + try: + with open(f'{_commit_activity_hash_cache_path(repo)}.json') as f: + data = json.load(f) + except Exception: + return None + + sha = data.get('sha') if isinstance(data, dict) else None + return sha if isinstance(sha, str) else None + + +def _default_branch_sha(repo) -> str: + """ + Return the current default-branch commit SHA for a repository. + + Parameters + ---------- + repo : + PyGithub Repository object. + + Returns + ------- + str + Default-branch commit SHA. + """ + return repo.get_branch(repo.default_branch).commit.sha + + +def _write_commit_activity(repo, commit_activity: list, sha: str | None = None) -> None: """ Write weekly commit activity for a repository. @@ -228,9 +321,12 @@ def _write_commit_activity(repo, commit_activity: list) -> None: PyGithub Repository object. 
commit_activity : list Weekly commit activity records from GitHub's REST API. + sha : str or None + Default-branch commit SHA that produced the stats. """ - file_path = os.path.join(BASE_DIR, 'github', 'commitActivity', repo.name) - helpers.write_json_files(file_path=file_path, data=commit_activity) + helpers.write_json_files(file_path=_commit_activity_cache_path(repo), data=commit_activity) + if sha: + helpers.write_json_files(file_path=_commit_activity_hash_cache_path(repo), data={'sha': sha}) def _run_github_repo_step(repo, step: str, func: callable, default=None, timeout: int = GITHUB_REPO_STEP_TIMEOUT): @@ -285,13 +381,53 @@ def runner(): return default -def _fetch_commit_activity(repo, headers: dict) -> str: +def _participation_to_commit_activity(participation: dict) -> list[dict]: """ - Fetch or trigger weekly commit activity for a repository. + Convert GitHub participation stats into commit-activity-shaped records. - GitHub returns ``202 Accepted`` while it computes repository statistics. - This function treats that response as a successful trigger instead of - waiting in-place. + Parameters + ---------- + participation : dict + Response body from ``/stats/participation``. + + Returns + ------- + list + Weekly commit activity records with ``week`` and ``total`` keys. + """ + totals = participation.get('all', []) + if not isinstance(totals, list): + return [] + + today = datetime.now(tz=timezone.utc).date() + days_since_sunday = (today.weekday() + 1) % 7 + newest_week = today - timedelta(days=days_since_sunday) + + return [ + { + 'days': [0, 0, 0, 0, 0, 0, 0], + 'total': total, + 'week': int( + datetime.combine( + newest_week - timedelta(weeks=len(totals) - index - 1), + datetime.min.time(), + tzinfo=timezone.utc, + ).timestamp() + ), + } + for index, total in enumerate(totals) + if isinstance(total, int) + ] + + +def _fetch_commit_activity(repo, headers: dict, sha: str | None = None) -> str: + """ + Fetch weekly total commit counts for a repository. 
+ + GitHub's ``/stats/commit_activity`` endpoint can return ``202`` for a long + time in CI. The dashboard only charts weekly totals, so use + ``/stats/participation`` and keep writing the existing ``commitActivity`` + cache files for builder compatibility. Parameters ---------- @@ -299,6 +435,8 @@ def _fetch_commit_activity(repo, headers: dict) -> str: PyGithub Repository object. headers : dict HTTP headers including the GitHub authorisation token. + sha : str or None + Default-branch commit SHA that produced the stats. Returns ------- @@ -306,7 +444,9 @@ def _fetch_commit_activity(repo, headers: dict) -> str: One of ``COMMIT_ACTIVITY_READY``, ``COMMIT_ACTIVITY_PENDING``, or ``COMMIT_ACTIVITY_FAILED``. """ - url = _commit_activity_url(repo) + # Use participation instead of commit_activity because the dashboard only + # needs weekly totals, and commit_activity can remain at 202 in CI. + url = _commit_participation_url(repo) try: response = helpers.s.get(url=url, headers=headers) except requests.exceptions.RequestException as e: @@ -316,10 +456,6 @@ def _fetch_commit_activity(repo, headers: dict) -> str: if response.status_code == 202: return COMMIT_ACTIVITY_PENDING - if response.status_code == 204: - _write_commit_activity(repo, []) - return COMMIT_ACTIVITY_READY - try: data = response.json() except Exception as e: @@ -331,11 +467,12 @@ def _fetch_commit_activity(repo, headers: dict) -> str: log.warning(f'Error fetching commit activity for {repo.name}: {message}') return COMMIT_ACTIVITY_FAILED - if not isinstance(data, list): + commit_activity = _participation_to_commit_activity(data) + if not commit_activity: log.warning(f'Unexpected commit activity response for {repo.name}: {data}') return COMMIT_ACTIVITY_FAILED - _write_commit_activity(repo, data) + _write_commit_activity(repo, commit_activity, sha) return COMMIT_ACTIVITY_READY @@ -612,19 +749,13 @@ def _fetch_open_graph_image_url(repo, headers: dict, graphql_url: str) -> str: raise RuntimeError(f'Error: 
update_github: {repo_data}') from None -def _collect_commit_activity( - repos: list, - headers: dict, - poll_attempts: int = COMMIT_ACTIVITY_POLL_ATTEMPTS, - poll_interval: int = COMMIT_ACTIVITY_POLL_INTERVAL, -) -> None: +def _collect_commit_activity(repos: list, headers: dict) -> None: """ - Trigger and collect weekly commit activity for active repositories. + Collect weekly commit totals for active repositories. - GitHub may return ``202 Accepted`` for the stats endpoint while it starts - its server-side calculation. The first pass gives every repository a chance - to start that work. Later passes only revisit repositories that were still - pending after the previous request. + GitHub caches repository stats by the current default-branch SHA. Reuse + cached files while the SHA matches, and refresh only when the SHA changes + or when no cached stats file exists. Parameters ---------- @@ -632,49 +763,20 @@ def _collect_commit_activity( Active PyGithub Repository objects. headers : dict HTTP headers including the GitHub authorisation token. - poll_attempts : int - Maximum number of follow-up passes for pending repositories. - poll_interval : int - Seconds to wait between follow-up passes. """ - pending_repos = [] for repo in tqdm( iterable=repos, - desc='Triggering GitHub commit activity', + desc='Updating GitHub commit activity', ): - status = _fetch_commit_activity(repo, headers) + sha = _run_github_repo_step(repo, 'default branch SHA', lambda: _default_branch_sha(repo)) + if sha and _has_cached_commit_activity(repo) and _cached_commit_activity_sha(repo) == sha: + continue + + status = _fetch_commit_activity(repo, headers, sha) if status == COMMIT_ACTIVITY_PENDING: - pending_repos.append(repo) - - for attempt in range(1, poll_attempts + 1): - if not pending_repos: - return - - if poll_interval > 0: - message = ( - f'Waiting {poll_interval}s for GitHub commit activity calculation ' - f'({len(pending_repos)} repos pending, attempt {attempt}/{poll_attempts}).' 
- ) - log.info(message) + message = f'GitHub commit activity is still being calculated for: {repo.name}' + log.warning(message) tqdm.write(message) - time.sleep(poll_interval) - - next_pending = [] - for repo in tqdm( - iterable=pending_repos, - desc=f'Collecting GitHub commit activity ({attempt}/{poll_attempts})', - ): - status = _fetch_commit_activity(repo, headers) - if status == COMMIT_ACTIVITY_PENDING: - next_pending.append(repo) - - pending_repos = next_pending - - if pending_repos: - repo_names = ', '.join(repo.name for repo in pending_repos) - message = f'GitHub commit activity is still being calculated for: {repo_names}' - log.warning(message) - tqdm.write(message) def _process_github_repo(repo, headers: dict, graphql_url: str) -> None: diff --git a/tests/unit/test_updater.py b/tests/unit/test_updater.py index c6ff65853..cd801c4b6 100644 --- a/tests/unit/test_updater.py +++ b/tests/unit/test_updater.py @@ -56,16 +56,22 @@ def get_page(self, idx): class FakeRepo: - def __init__(self, name='repo1', archived=False, stars=4): + def __init__(self, name='repo1', archived=False, stars=4, sha=None): self.name = name self.archived = archived self.owner = SimpleNamespace(login='owner') self.stargazers_count = stars + self.default_branch = 'master' + self.sha = sha or f'sha-{name}' self.raw_data = {'name': name, 'archived': archived} def get_languages(self): return {'Python': 100} + def get_branch(self, branch): + assert branch == self.default_branch + return SimpleNamespace(commit=SimpleNamespace(sha=self.sha)) + def get_pulls(self, state='open'): assert state == 'open' return [FakePull(3)] @@ -224,26 +230,40 @@ def fake_get(url): def test_fetch_commit_activity(monkeypatch, tmp_path): monkeypatch.setattr(updater, 'BASE_DIR', str(tmp_path / 'gh-pages')) + fixed_today = datetime(2026, 5, 19, tzinfo=timezone.utc) + + class FixedDatetime(datetime): + @classmethod + def now(cls, tz=None): + return fixed_today + + monkeypatch.setattr(updater, 'datetime', FixedDatetime) + 
writes = [] monkeypatch.setattr(updater.helpers, 'write_json_files', lambda file_path, data: writes.append((file_path, data))) - responses = [ - FakeResponse([{'week': 1, 'total': 2}], status=200), - FakeResponse(status=202), - FakeResponse(status=204), - ] - monkeypatch.setattr(updater.helpers.s, 'get', lambda url, headers: responses.pop(0)) + urls = [] + monkeypatch.setattr( + updater.helpers.s, + 'get', + lambda url, headers: urls.append(url) or FakeResponse({'all': [0, 2]}, status=200), + ) repo = FakeRepo(name='demo') headers = {'Authorization': 'token'} - assert updater._fetch_commit_activity(repo, headers) == updater.COMMIT_ACTIVITY_READY - assert updater._fetch_commit_activity(repo, headers) == updater.COMMIT_ACTIVITY_PENDING - assert updater._fetch_commit_activity(repo, headers) == updater.COMMIT_ACTIVITY_READY + assert updater._fetch_commit_activity(repo, headers, sha='abc') == updater.COMMIT_ACTIVITY_READY + assert urls == ['https://api.github.com/repos/owner/demo/stats/participation'] assert len(writes) == 2 - assert writes[0][1] == [{'week': 1, 'total': 2}] - assert writes[1][1] == [] - assert '/repos/owner/demo/stats/commit_activity' in updater._commit_activity_url(repo) + assert writes[0][0].endswith('commitActivity\\demo') or writes[0][0].endswith('commitActivity/demo') + assert writes[0][1] == [ + {'days': [0, 0, 0, 0, 0, 0, 0], 'total': 0, 'week': 1778371200}, + {'days': [0, 0, 0, 0, 0, 0, 0], 'total': 2, 'week': 1778976000}, + ] + assert writes[1][0].endswith('commitActivityHashes\\demo') or writes[1][0].endswith( + 'commitActivityHashes/demo' + ) + assert writes[1][1] == {'sha': 'abc'} def test_fetch_commit_activity_errors(monkeypatch): @@ -252,6 +272,8 @@ def test_fetch_commit_activity_errors(monkeypatch): warnings = [] monkeypatch.setattr(updater.log, 'warning', lambda msg: warnings.append(msg)) + assert updater._participation_to_commit_activity({'all': 'bad'}) == [] + monkeypatch.setattr( updater.helpers.s, 'get', @@ -259,6 +281,9 @@ def 
test_fetch_commit_activity_errors(monkeypatch): ) assert updater._fetch_commit_activity(repo, headers) == updater.COMMIT_ACTIVITY_FAILED + monkeypatch.setattr(updater.helpers.s, 'get', lambda url, headers: FakeResponse(status=202)) + assert updater._fetch_commit_activity(repo, headers) == updater.COMMIT_ACTIVITY_PENDING + monkeypatch.setattr( updater.helpers.s, 'get', @@ -273,7 +298,7 @@ def test_fetch_commit_activity_errors(monkeypatch): ) assert updater._fetch_commit_activity(repo, headers) == updater.COMMIT_ACTIVITY_FAILED - monkeypatch.setattr(updater.helpers.s, 'get', lambda url, headers: FakeResponse({'bad': 1}, status=200)) + monkeypatch.setattr(updater.helpers.s, 'get', lambda url, headers: FakeResponse({'all': []}, status=200)) assert updater._fetch_commit_activity(repo, headers) == updater.COMMIT_ACTIVITY_FAILED assert len(warnings) == 4 @@ -298,61 +323,57 @@ def test_run_github_repo_step_timeout(monkeypatch): assert messages[-1] == warnings[0] -def test_collect_commit_activity(monkeypatch): - ready = FakeRepo('ready') - pending = FakeRepo('pending') - stuck = FakeRepo('stuck') +def test_commit_activity_cache_helpers(tmp_path, monkeypatch): + monkeypatch.setattr(updater, 'BASE_DIR', str(tmp_path / 'gh-pages')) + repo = FakeRepo(name='demo') - statuses = { - 'ready': [updater.COMMIT_ACTIVITY_READY, updater.COMMIT_ACTIVITY_READY], - 'pending': [updater.COMMIT_ACTIVITY_PENDING, updater.COMMIT_ACTIVITY_READY], - 'stuck': [updater.COMMIT_ACTIVITY_PENDING, updater.COMMIT_ACTIVITY_PENDING], - } - calls = [] + assert not updater._has_cached_commit_activity(repo) + assert updater._cached_commit_activity_sha(repo) is None - def fake_fetch(repo, headers): - calls.append(repo.name) - return statuses[repo.name].pop(0) + stats_path = tmp_path / 'gh-pages' / 'github' / 'commitActivity' / 'demo.json' + hash_path = tmp_path / 'gh-pages' / 'github' / 'commitActivityHashes' / 'demo.json' + stats_path.parent.mkdir(parents=True) + hash_path.parent.mkdir(parents=True) - 
warnings = [] - monkeypatch.setattr(updater, '_fetch_commit_activity', fake_fetch) - monkeypatch.setattr(updater.log, 'warning', lambda msg: warnings.append(msg)) + stats_path.write_text('{bad', encoding='utf-8') + hash_path.write_text('[]', encoding='utf-8') + assert not updater._has_cached_commit_activity(repo) + assert updater._cached_commit_activity_sha(repo) is None - updater._collect_commit_activity([ready], {}, poll_attempts=1, poll_interval=0) - updater._collect_commit_activity([ready, pending, stuck], {}, poll_attempts=1, poll_interval=0) + stats_path.write_text('[{"total": 1}]', encoding='utf-8') + hash_path.write_text('{"sha": "abc"}', encoding='utf-8') + assert updater._has_cached_commit_activity(repo) + assert updater._cached_commit_activity_sha(repo) == 'abc' - assert calls == ['ready', 'ready', 'pending', 'stuck', 'pending', 'stuck'] - assert warnings == ['GitHub commit activity is still being calculated for: stuck'] +def test_collect_commit_activity_uses_sha_cache(monkeypatch, tmp_path): + monkeypatch.setattr(updater, 'BASE_DIR', str(tmp_path / 'gh-pages')) -def test_collect_commit_activity_polls_until_ready(monkeypatch): - repo = FakeRepo('repo') + cached = FakeRepo('cached', sha='same') + changed = FakeRepo('changed', sha='new') + missing = FakeRepo('missing', sha='missing') + pending = FakeRepo('pending', sha='pending') + + updater._write_commit_activity(cached, [{'total': 1}], 'same') + updater._write_commit_activity(changed, [{'total': 1}], 'old') - statuses = [ - updater.COMMIT_ACTIVITY_PENDING, - updater.COMMIT_ACTIVITY_PENDING, - updater.COMMIT_ACTIVITY_READY, - ] calls = [] - sleeps = [] - infos = [] + warnings = [] messages = [] - def fake_fetch(repo, headers): - calls.append(repo.name) - return statuses.pop(0) + def fake_fetch(repo, headers, sha=None): + calls.append((repo.name, sha)) + return updater.COMMIT_ACTIVITY_PENDING if repo.name == 'pending' else updater.COMMIT_ACTIVITY_READY monkeypatch.setattr(updater, '_fetch_commit_activity', 
fake_fetch) - monkeypatch.setattr(updater.time, 'sleep', lambda seconds: sleeps.append(seconds)) - monkeypatch.setattr(updater.log, 'info', lambda msg: infos.append(msg)) + monkeypatch.setattr(updater.log, 'warning', lambda msg: warnings.append(msg)) monkeypatch.setattr(updater.tqdm, 'write', lambda msg: messages.append(msg)) - updater._collect_commit_activity([repo], {}, poll_attempts=2, poll_interval=1) + updater._collect_commit_activity([cached, changed, missing, pending], {}) - assert calls == ['repo', 'repo', 'repo'] - assert sleeps == [1, 1] - assert len(infos) == 2 - assert messages == infos + assert calls == [('changed', 'new'), ('missing', 'missing'), ('pending', 'pending')] + assert warnings == ['GitHub commit activity is still being calculated for: pending'] + assert messages[-1] == warnings[0] def test_seed_star_history(monkeypatch): @@ -497,17 +518,12 @@ def get_user(self, name): writes = [] monkeypatch.setattr(updater.helpers, 'write_json_files', lambda file_path, data: writes.append((file_path, data))) - commit_calls = [] - commit_statuses = { - 'active': [updater.COMMIT_ACTIVITY_READY], - 'pending': [updater.COMMIT_ACTIVITY_PENDING, updater.COMMIT_ACTIVITY_READY], - } + commit_repos = [] monkeypatch.setattr( updater, - '_fetch_commit_activity', - lambda repo, headers: commit_calls.append(repo.name) or commit_statuses[repo.name].pop(0), + '_collect_commit_activity', + lambda repos, headers: commit_repos.extend(repo.name for repo in repos), ) - monkeypatch.setattr(updater.time, 'sleep', lambda seconds: None) processed = [] monkeypatch.setattr(updater, '_process_github_repo', lambda repo, headers, graphql_url: processed.append(repo.name)) monkeypatch.setattr(updater, 'BASE_DIR', 'base') @@ -515,7 +531,7 @@ def get_user(self, name): updater.update_github() assert any(path.endswith('github\\repos') or path.endswith('github/repos') for path, _ in writes) - assert commit_calls == ['active', 'pending', 'pending'] + assert commit_repos == ['active', 
'pending'] assert processed == ['active', 'pending'] From 712781d6a686e5a7307a7afdf6fa665c1d645f4d Mon Sep 17 00:00:00 2001 From: ReenigneArcher <42013603+ReenigneArcher@users.noreply.github.com> Date: Mon, 18 May 2026 22:44:16 -0400 Subject: [PATCH 5/6] style: sonar fixes --- src/updater.py | 4 ++-- tests/unit/test_updater.py | 25 ++++++++++++------------- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/src/updater.py b/src/updater.py index 119209ece..24dd50b50 100644 --- a/src/updater.py +++ b/src/updater.py @@ -358,7 +358,7 @@ def _run_github_repo_step(repo, step: str, func: callable, default=None, timeout def runner(): try: result_queue.put((True, func())) - except BaseException as e: + except Exception as e: result_queue.put((False, e)) thread = Thread(target=runner, daemon=True) @@ -768,7 +768,7 @@ def _collect_commit_activity(repos: list, headers: dict) -> None: iterable=repos, desc='Updating GitHub commit activity', ): - sha = _run_github_repo_step(repo, 'default branch SHA', lambda: _default_branch_sha(repo)) + sha = _run_github_repo_step(repo, 'default branch SHA', lambda repo=repo: _default_branch_sha(repo)) if sha and _has_cached_commit_activity(repo) and _cached_commit_activity_sha(repo) == sha: continue diff --git a/tests/unit/test_updater.py b/tests/unit/test_updater.py index cd801c4b6..2c56d4acf 100644 --- a/tests/unit/test_updater.py +++ b/tests/unit/test_updater.py @@ -255,14 +255,12 @@ def now(cls, tz=None): assert updater._fetch_commit_activity(repo, headers, sha='abc') == updater.COMMIT_ACTIVITY_READY assert urls == ['https://api.github.com/repos/owner/demo/stats/participation'] assert len(writes) == 2 - assert writes[0][0].endswith('commitActivity\\demo') or writes[0][0].endswith('commitActivity/demo') + assert writes[0][0].endswith(('commitActivity\\demo', 'commitActivity/demo')) assert writes[0][1] == [ {'days': [0, 0, 0, 0, 0, 0, 0], 'total': 0, 'week': 1778371200}, {'days': [0, 0, 0, 0, 0, 0, 0], 'total': 2, 'week': 
1778976000}, ] - assert writes[1][0].endswith('commitActivityHashes\\demo') or writes[1][0].endswith( - 'commitActivityHashes/demo' - ) + assert writes[1][0].endswith(('commitActivityHashes\\demo', 'commitActivityHashes/demo')) assert writes[1][1] == {'sha': 'abc'} @@ -274,11 +272,10 @@ def test_fetch_commit_activity_errors(monkeypatch): assert updater._participation_to_commit_activity({'all': 'bad'}) == [] - monkeypatch.setattr( - updater.helpers.s, - 'get', - lambda url, headers: (_ for _ in ()).throw(requests.exceptions.Timeout('timeout')), - ) + def raise_timeout(url, headers): + raise requests.exceptions.Timeout('timeout') + + monkeypatch.setattr(updater.helpers.s, 'get', raise_timeout) assert updater._fetch_commit_activity(repo, headers) == updater.COMMIT_ACTIVITY_FAILED monkeypatch.setattr(updater.helpers.s, 'get', lambda url, headers: FakeResponse(status=202)) @@ -318,9 +315,10 @@ def test_run_github_repo_step_timeout(monkeypatch): timeout=0.001, ) + expected_warning = 'Timeout after 0.001s while running GitHub slow step for demo, skipping.' 
assert result == 'fallback' - assert warnings == ['Timeout after 0.001s while running GitHub slow step for demo, skipping.'] - assert messages[-1] == warnings[0] + assert warnings == [expected_warning] + assert expected_warning in messages def test_commit_activity_cache_helpers(tmp_path, monkeypatch): @@ -371,9 +369,10 @@ def fake_fetch(repo, headers, sha=None): updater._collect_commit_activity([cached, changed, missing, pending], {}) + expected_warning = 'GitHub commit activity is still being calculated for: pending' assert calls == [('changed', 'new'), ('missing', 'missing'), ('pending', 'pending')] - assert warnings == ['GitHub commit activity is still being calculated for: pending'] - assert messages[-1] == warnings[0] + assert warnings == [expected_warning] + assert expected_warning in messages def test_seed_star_history(monkeypatch): From 04e9065b0fcbc13594ffee5c0c013fd521276eda Mon Sep 17 00:00:00 2001 From: ReenigneArcher <42013603+ReenigneArcher@users.noreply.github.com> Date: Tue, 19 May 2026 17:15:19 -0400 Subject: [PATCH 6/6] Improve GitHub step logging and activity collection Consolidate logging in _run_github_repo_step to use single log.warning calls (remove tqdm.write duplicates) and keep returning the default on timeout or error. Revamp _collect_commit_activity to use a two-pass approach: a priming pass to trigger GitHub stats calculation and a second pass to re-check only pending repositories, aggregating pending repo names into a single warning. Update unit tests to match the new logging/behavior (add test_run_github_repo_step_error, adjust timeout test expectations, replace the pending test with deterministic status sequences, and add a test ensuring early return when all repos are ready). 
--- src/updater.py | 39 +++++++++++++++-------- tests/unit/test_updater.py | 65 ++++++++++++++++++++++++++++++-------- 2 files changed, 78 insertions(+), 26 deletions(-) diff --git a/src/updater.py b/src/updater.py index 24dd50b50..c1b2884c0 100644 --- a/src/updater.py +++ b/src/updater.py @@ -351,8 +351,6 @@ def _run_github_repo_step(repo, step: str, func: callable, default=None, timeout any The callable result, or ``default`` when the step fails. """ - tqdm.write(f'GitHub {repo.name}: {step}...') - result_queue = Queue(maxsize=1) def runner(): @@ -366,18 +364,14 @@ def runner(): thread.join(timeout=timeout) if thread.is_alive(): - message = f'Timeout after {timeout}s while running GitHub {step} for {repo.name}, skipping.' - log.warning(message) - tqdm.write(message) + log.warning(f'Timeout after {timeout}s while running GitHub {step} for {repo.name}, skipping.') return default success, value = result_queue.get() if success: return value - message = f'Error running GitHub {step} for {repo.name}: {value}' - log.warning(message) - tqdm.write(message) + log.warning(f'Error running GitHub {step} for {repo.name}: {value}') return default @@ -755,7 +749,10 @@ def _collect_commit_activity(repos: list, headers: dict) -> None: GitHub caches repository stats by the current default-branch SHA. Reuse cached files while the SHA matches, and refresh only when the SHA changes - or when no cached stats file exists. + or when no cached stats file exists. The first pass gives GitHub a chance + to calculate participation stats for every changed repository; the second + pass revisits only repositories that returned ``202`` during the first + request. Parameters ---------- @@ -764,9 +761,11 @@ def _collect_commit_activity(repos: list, headers: dict) -> None: headers : dict HTTP headers including the GitHub authorisation token. 
""" + pending_repos = [] + for repo in tqdm( iterable=repos, - desc='Updating GitHub commit activity', + desc='Priming GitHub commit activity', ): sha = _run_github_repo_step(repo, 'default branch SHA', lambda repo=repo: _default_branch_sha(repo)) if sha and _has_cached_commit_activity(repo) and _cached_commit_activity_sha(repo) == sha: @@ -774,9 +773,23 @@ def _collect_commit_activity(repos: list, headers: dict) -> None: status = _fetch_commit_activity(repo, headers, sha) if status == COMMIT_ACTIVITY_PENDING: - message = f'GitHub commit activity is still being calculated for: {repo.name}' - log.warning(message) - tqdm.write(message) + pending_repos.append((repo, sha)) + + if not pending_repos: + return + + still_pending = [] + for repo, sha in tqdm( + iterable=pending_repos, + desc='Collecting GitHub commit activity', + ): + status = _fetch_commit_activity(repo, headers, sha) + if status == COMMIT_ACTIVITY_PENDING: + still_pending.append(repo.name) + + if still_pending: + repo_names = ', '.join(still_pending) + log.warning(f'GitHub commit activity is still being calculated for: {repo_names}') def _process_github_repo(repo, headers: dict, graphql_url: str) -> None: diff --git a/tests/unit/test_updater.py b/tests/unit/test_updater.py index 2c56d4acf..25c2453e6 100644 --- a/tests/unit/test_updater.py +++ b/tests/unit/test_updater.py @@ -300,12 +300,25 @@ def raise_timeout(url, headers): assert len(warnings) == 4 +def test_run_github_repo_step_error(monkeypatch): + repo = FakeRepo(name='demo') + warnings = [] + monkeypatch.setattr(updater.log, 'warning', lambda msg: warnings.append(msg)) + + def raise_error(): + raise RuntimeError('boom') + + result = updater._run_github_repo_step(repo, 'broken step', raise_error, default='fallback') + + assert updater._run_github_repo_step(repo, 'normal step', lambda: 'ok') == 'ok' + assert result == 'fallback' + assert warnings == ['Error running GitHub broken step for demo: boom'] + + def 
test_run_github_repo_step_timeout(monkeypatch): repo = FakeRepo(name='demo') warnings = [] - messages = [] monkeypatch.setattr(updater.log, 'warning', lambda msg: warnings.append(msg)) - monkeypatch.setattr(updater.tqdm, 'write', lambda msg: messages.append(msg)) result = updater._run_github_repo_step( repo, @@ -315,10 +328,8 @@ def test_run_github_repo_step_timeout(monkeypatch): timeout=0.001, ) - expected_warning = 'Timeout after 0.001s while running GitHub slow step for demo, skipping.' assert result == 'fallback' - assert warnings == [expected_warning] - assert expected_warning in messages + assert warnings == ['Timeout after 0.001s while running GitHub slow step for demo, skipping.'] def test_commit_activity_cache_helpers(tmp_path, monkeypatch): @@ -350,29 +361,57 @@ def test_collect_commit_activity_uses_sha_cache(monkeypatch, tmp_path): cached = FakeRepo('cached', sha='same') changed = FakeRepo('changed', sha='new') missing = FakeRepo('missing', sha='missing') - pending = FakeRepo('pending', sha='pending') + stuck = FakeRepo('stuck', sha='stuck') updater._write_commit_activity(cached, [{'total': 1}], 'same') updater._write_commit_activity(changed, [{'total': 1}], 'old') calls = [] warnings = [] - messages = [] + statuses = { + 'changed': [updater.COMMIT_ACTIVITY_READY], + 'missing': [updater.COMMIT_ACTIVITY_PENDING, updater.COMMIT_ACTIVITY_READY], + 'stuck': [updater.COMMIT_ACTIVITY_PENDING, updater.COMMIT_ACTIVITY_PENDING], + } def fake_fetch(repo, headers, sha=None): calls.append((repo.name, sha)) - return updater.COMMIT_ACTIVITY_PENDING if repo.name == 'pending' else updater.COMMIT_ACTIVITY_READY + return statuses[repo.name].pop(0) monkeypatch.setattr(updater, '_fetch_commit_activity', fake_fetch) monkeypatch.setattr(updater.log, 'warning', lambda msg: warnings.append(msg)) - monkeypatch.setattr(updater.tqdm, 'write', lambda msg: messages.append(msg)) - updater._collect_commit_activity([cached, changed, missing, pending], {}) + 
updater._collect_commit_activity([cached, changed, missing, stuck], {}) - expected_warning = 'GitHub commit activity is still being calculated for: pending' - assert calls == [('changed', 'new'), ('missing', 'missing'), ('pending', 'pending')] + expected_warning = 'GitHub commit activity is still being calculated for: stuck' + assert calls == [ + ('changed', 'new'), + ('missing', 'missing'), + ('stuck', 'stuck'), + ('missing', 'missing'), + ('stuck', 'stuck'), + ] assert warnings == [expected_warning] - assert expected_warning in messages + + +def test_collect_commit_activity_returns_when_all_ready(monkeypatch, tmp_path): + monkeypatch.setattr(updater, 'BASE_DIR', str(tmp_path / 'gh-pages')) + + repo = FakeRepo('ready', sha='new') + calls = [] + warnings = [] + + def fake_fetch(repo, headers, sha=None): + calls.append((repo.name, sha)) + return updater.COMMIT_ACTIVITY_READY + + monkeypatch.setattr(updater, '_fetch_commit_activity', fake_fetch) + monkeypatch.setattr(updater.log, 'warning', lambda msg: warnings.append(msg)) + + updater._collect_commit_activity([repo], {}) + + assert calls == [('ready', 'new')] + assert warnings == [] def test_seed_star_history(monkeypatch):