Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
188 changes: 156 additions & 32 deletions src/updater.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
import json
import math
import os
from concurrent.futures import ThreadPoolExecutor, TimeoutError as FuturesTimeout
from datetime import datetime, timezone
import time
from datetime import datetime, timedelta, timezone
from threading import Thread

# lib imports
Expand Down Expand Up @@ -193,30 +193,153 @@ def update_fb():
helpers.write_json_files(file_path=file_path, data=data)


def _prime_commit_activity(repo, headers: dict) -> None:
    """
    Start GitHub's async commit-activity stats job for a repo.

    This is a fire-and-forget warm-up request: the response body is discarded
    and only the status code is inspected. It uses the shared HTTP session
    with a short timeout so that every repo can be started before any single
    repo blocks; the results are collected later by polling the same endpoint.

    Parameters
    ----------
    repo :
        PyGithub Repository object. Only ``repo.url`` and ``repo.name`` are read.
    headers : dict
        HTTP headers sent with the request.
    """
    url = f'{repo.url}/stats/commit_activity'
    try:
        response = helpers.s.get(url=url, headers=headers, timeout=5)
    except requests.exceptions.RequestException as e:
        # Best effort only: a failed warm-up must not abort the whole update.
        log.warning(f'Could not prime commit activity for {repo.name}: {e}')
        return

    # 200 (stats already available) and 202 (accepted) are the expected answers.
    if response.status_code not in (200, 202):
        log.warning(f'Could not prime commit activity for {repo.name}: HTTP {response.status_code}')


def _get_commit_activity(repo, headers: dict) -> tuple[list | None, int | str]:
    """
    Fetch commit activity for a repo.

    Parameters
    ----------
    repo :
        PyGithub Repository object. Only ``repo.url`` and ``repo.name`` are read.
    headers : dict
        HTTP headers sent with the request.

    Returns
    -------
    tuple
        ``(commit_activity, status)``.

        ``commit_activity`` is ``None`` when the caller should try again
        (request failure, or HTTP 202, 204, 403, 429 or 5xx), an empty list
        when there is nothing usable (any other non-200 status, or an empty or
        unparsable body), and the decoded JSON payload otherwise.

        ``status`` is the HTTP status code, or the string ``'error'`` /
        ``'parse_error'`` when no meaningful status code applies.
    """
    url = f'{repo.url}/stats/commit_activity'
    try:
        response = helpers.s.get(url=url, headers=headers, timeout=10)
    except requests.exceptions.RequestException as e:
        log.warning(f'Could not fetch commit activity for {repo.name}: {e}')
        return None, 'error'

    status = response.status_code

    # Not ready yet; reported as "pending" without logging.
    # NOTE(review): 204 is treated as retryable here - confirm it is not a
    # permanent "no content" answer for empty repos, which would be polled
    # until the caller's deadline.
    if status in (202, 204):
        return None, status

    if status != 200:
        log.warning(f'Could not fetch commit activity for {repo.name}: HTTP {status}')
        # Rate limiting and server errors are retried; anything else is final.
        retryable = status in (403, 429) or status >= 500
        return (None if retryable else []), status

    try:
        return response.json() or [], status
    except requests.exceptions.JSONDecodeError as e:
        log.warning(f'Could not parse commit activity for {repo.name}: {e}')
        return [], 'parse_error'


def _build_commit_activity_from_commits(repo) -> list:
"""
Build commit activity from the commits API when GitHub stats do not become ready.
"""
today = datetime.now(tz=timezone.utc).replace(hour=0, minute=0, second=0, microsecond=0)
latest_week_start = today - timedelta(days=(today.weekday() + 1) % 7)
first_week_start = latest_week_start - timedelta(weeks=51)
end = latest_week_start + timedelta(days=7)

commit_activity = [
{
'days': [0, 0, 0, 0, 0, 0, 0],
'total': 0,
'week': int((first_week_start + timedelta(weeks=i)).timestamp()),
}
for i in range(52)
]

try:
commits = repo.get_commits(since=first_week_start)
for commit in commits:
if len(getattr(commit, 'parents', [])) > 1:
continue

commit_data = getattr(commit, 'commit', None)
author = getattr(commit_data, 'author', None)
committer = getattr(commit_data, 'committer', None)
commit_date = getattr(author, 'date', None) or getattr(committer, 'date', None)
if commit_date is None:
continue
if commit_date.tzinfo is None:
commit_date = commit_date.replace(tzinfo=timezone.utc)
else:
commit_date = commit_date.astimezone(timezone.utc)

if commit_date < first_week_start or commit_date >= end:
continue

days_since_start = (commit_date.date() - first_week_start.date()).days
week_index, day_index = divmod(days_since_start, 7)
commit_activity[week_index]['days'][day_index] += 1
commit_activity[week_index]['total'] += 1
except GithubException as e:
log.warning(f'Could not build commit activity for {repo.name}: {e}')
return []

return commit_activity


def _write_commit_activity(repo, commit_activity: list) -> None:
    """
    Cache commit activity for a repo.

    Nothing is written when ``commit_activity`` is empty or ``None``.

    Parameters
    ----------
    repo :
        Repository object; ``repo.name`` is used as the cache file name.
    commit_activity : list
        Weekly entries. Items exposing a ``raw_data`` attribute are unwrapped
        to that attribute; all other items are stored as they are.
    """
    if not commit_activity:
        return

    weeks = []
    for entry in commit_activity:
        if hasattr(entry, 'raw_data'):
            weeks.append(entry.raw_data)
        else:
            weeks.append(entry)

    target = os.path.join(BASE_DIR, 'github', 'commitActivity', repo.name)
    helpers.write_json_files(file_path=target, data=weeks)


def _update_commit_activity(repos: list, headers: dict, max_wait: int = 1200, poll_interval: int = 15) -> None:
    """
    Poll commit activity for all repos until ready or timeout.

    Every pending repo is queried once per round. Repos that return data (even
    an empty list) are written to the cache and dropped from the pending set;
    repos that return ``None`` are retried in the next round. Repos still
    pending when the time budget runs out are rebuilt from the commits API.

    Parameters
    ----------
    repos : list
        Repository objects to process.
    headers : dict
        HTTP headers passed through to each stats request.
    max_wait : int
        Total polling budget in seconds.
    poll_interval : int
        Pause between polling rounds in seconds.
    """
    pending = list(repos)
    deadline = time.monotonic() + max_wait

    with tqdm(total=len(pending), desc='Updating GitHub commit activity') as progress:
        while pending and time.monotonic() < deadline:
            remaining = []
            statuses = {}  # status -> number of repos that reported it this round
            for repo in pending:
                commit_activity, status = _get_commit_activity(repo, headers)
                statuses[status] = statuses.get(status, 0) + 1
                if commit_activity is None:
                    remaining.append(repo)
                    continue

                _write_commit_activity(repo, commit_activity)
                progress.update(1)

            pending = remaining
            if not pending:
                break

            # Statuses mix ints and strings, so sort on their string form.
            status_summary = ', '.join(
                f'{status}: {count}' for status, count in sorted(statuses.items(), key=lambda item: str(item[0]))
            )
            progress.set_postfix_str(f'{len(pending)} pending')
            progress.write(
                f'Waiting for GitHub commit activity: {len(pending)} repos pending ({status_summary})'
            )
            progress.refresh()

            # If the remaining budget does not outlast one interval, no further
            # round can start before the deadline; sleeping would only delay
            # the fallback below without changing its outcome.
            if deadline - time.monotonic() <= poll_interval:
                break
            time.sleep(poll_interval)

    for repo in pending:
        log.warning(f'Timeout fetching commit activity stats for {repo.name}, using commits API fallback.')
        commit_activity = _build_commit_activity_from_commits(repo)
        _write_commit_activity(repo, commit_activity)


def _seed_star_history(repo, total: int, initial_samples: int) -> list[dict]:
Expand Down Expand Up @@ -446,13 +569,6 @@ def _process_github_repo(repo, headers: dict, graphql_url: str) -> None:
file_path = os.path.join(BASE_DIR, 'github', 'languages', repo.name)
helpers.write_json_files(file_path=file_path, data=languages)

# commit activity (last year, weekly buckets)
commit_activity = _get_stats_with_timeout(repo)
if commit_activity:
commits = [week.raw_data for week in commit_activity]
file_path = os.path.join(BASE_DIR, 'github', 'commitActivity', repo.name)
helpers.write_json_files(file_path=file_path, data=commits)

# open pull requests
pulls_data = []
for pr in repo.get_pulls(state='open'):
Expand Down Expand Up @@ -546,14 +662,22 @@ def update_github():
}
graphql_url = 'https://api.github.com/graphql'

active_repos = [repo for repo in repos if not repo.archived]

for repo in tqdm(
iterable=repos,
iterable=active_repos,
desc='Priming GitHub commit activity',
):
_prime_commit_activity(repo, headers)

for repo in tqdm(
iterable=active_repos,
desc='Updating GitHub data',
):
if repo.archived:
continue
_process_github_repo(repo, headers, graphql_url)

_update_commit_activity(active_repos, headers)


def update_patreon():
"""
Expand Down
Loading