diff --git a/docker-compose.yml b/docker-compose.yml index 4e0c0f7..44a9196 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -32,7 +32,6 @@ services: environment: # GitHub Configuration GITHUB_REPOS: "mozilla-firefox/firefox" - GITHUB_TOKEN: "" # Not needed for mock API # Use mock GitHub API instead of real API GITHUB_API_URL: "http://mock-github-api:5000" diff --git a/main.py b/main.py index e73ad42..135ec7d 100755 --- a/main.py +++ b/main.py @@ -26,6 +26,14 @@ logger = logging.getLogger(__name__) +_RETRYABLE_STATUS_CODES: frozenset[int] = frozenset({500, 502, 503, 504}) +_MAX_RETRIES: int = 5 +_RETRY_BASE_DELAY: float = 1.0 +_RETRY_MAX_DELAY: float = 60.0 +_RETRY_MULTIPLIER: float = 2.0 +_MAX_AUTH_RETRIES: int = 2 +_REQUEST_TIMEOUT: float = 30.0 + @dataclass(frozen=True) class AccessToken: @@ -213,7 +221,7 @@ def extract_pull_requests( while True: if refresh_auth: refresh_auth() - resp = github_get(session, base_url, params=params) + resp = github_get(session, base_url, params=params, refresh_auth=refresh_auth) batch = resp.json() pages += 1 @@ -229,13 +237,13 @@ def extract_pull_requests( if not pr_number: continue pr["commit_data"] = extract_commits( - session, repo, pr_number, github_api_url + session, repo, pr_number, github_api_url, refresh_auth=refresh_auth ) pr["reviewer_data"] = extract_reviewers( - session, repo, pr_number, github_api_url + session, repo, pr_number, github_api_url, refresh_auth=refresh_auth ) pr["comment_data"] = extract_comments( - session, repo, pr_number, github_api_url + session, repo, pr_number, github_api_url, refresh_auth=refresh_auth ) yield batch @@ -276,6 +284,7 @@ def extract_commits( repo: str, pr_number: int, github_api_url: str = "https://api.github.com", + refresh_auth: Optional[Callable[[], None]] = None, ) -> list[dict]: """ Extract commits and files for a specific pull request. 
@@ -294,13 +303,13 @@ def extract_commits( logger.info(f"Commits URL: {commits_url}") - resp = github_get(session, commits_url) + resp = github_get(session, commits_url, refresh_auth=refresh_auth) commits = resp.json() for commit in commits: commit_sha = commit.get("sha") commit_url = f"{github_api_url}/repos/{repo}/commits/{commit_sha}" - commit_data = github_get(session, commit_url).json() + commit_data = github_get(session, commit_url, refresh_auth=refresh_auth).json() commit["files"] = commit_data.get("files", []) logger.info(f"Extracted {len(commits)} commits for PR #{pr_number}") @@ -312,6 +321,7 @@ def extract_reviewers( repo: str, pr_number: int, github_api_url: str = "https://api.github.com", + refresh_auth: Optional[Callable[[], None]] = None, ) -> list[dict]: """ Extract reviewers for a specific pull request. @@ -330,7 +340,7 @@ def extract_reviewers( logger.info(f"Reviewers URL: {reviewers_url}") - reviewers = github_get(session, reviewers_url).json() + reviewers = github_get(session, reviewers_url, refresh_auth=refresh_auth).json() filtered = [r for r in reviewers if r.get("user") is not None] skipped = len(reviewers) - len(filtered) @@ -346,6 +356,7 @@ def extract_comments( repo: str, pr_number: int, github_api_url: str = "https://api.github.com", + refresh_auth: Optional[Callable[[], None]] = None, ) -> list[dict]: """ Extract comments for a specific pull request. 
@@ -364,7 +375,7 @@ def extract_comments( logger.info(f"Comments URL: {comments_url}") - comments = github_get(session, comments_url).json() + comments = github_get(session, comments_url, refresh_auth=refresh_auth).json() filtered = [c for c in comments if c.get("user") is not None and c.get("body")] skipped = len(comments) - len(filtered) @@ -389,35 +400,109 @@ def sleep_for_rate_limit(resp: requests.Response) -> None: time.sleep(sleep_time) +def _is_html_error_page(resp: requests.Response) -> bool: + """Return True when GitHub returns an HTML error page instead of JSON.""" + content_type = resp.headers.get("Content-Type", "") + return "text/html" in content_type and resp.status_code >= 400 + + def github_get( session: requests.Session, url: str, params: Optional[dict] = None, + refresh_auth: Optional[Callable[[], None]] = None, ) -> requests.Response: """ - Make a GitHub API GET request, retrying in a loop on rate limit. + Make a GitHub API GET request, retrying on transient errors and expired tokens. + + Retry behaviour: + - 403 rate-limit: sleeps until reset, then retries (unbounded, existing behaviour). + - 401 bad credentials: calls refresh_auth() then retries, up to + _MAX_AUTH_RETRIES times. If refresh_auth is None the error is + treated as non-retryable. + - 5xx / HTML error page: exponential backoff up to _MAX_RETRIES attempts. + - Network-level errors (Timeout, ConnectionError): same exponential backoff. + - All other non-200 responses (404, 422 ...): raises SystemExit immediately. Args: session: Authenticated requests session url: URL to fetch params: Optional query parameters + refresh_auth: Optional callable that refreshes the session's Authorization + header. Called automatically when a 401 response is received. Returns: Successful response (status 200) Raises: - SystemExit: On non-200, non-rate-limit errors + SystemExit: When all retries are exhausted or a non-retryable error occurs. 
""" + auth_retries = _MAX_AUTH_RETRIES + transient_retries = _MAX_RETRIES + backoff = _RETRY_BASE_DELAY + while True: - resp = session.get(url, params=params) + try: + resp = session.get(url, params=params, timeout=_REQUEST_TIMEOUT) + except ( + requests.exceptions.Timeout, + requests.exceptions.ConnectionError, + ) as exc: + if transient_retries > 0: + logger.warning( + f"Network error for {url}: {exc}. " + f"Retrying in {backoff:.0f}s ({transient_retries} retries left)" + ) + time.sleep(backoff) + backoff = min(backoff * _RETRY_MULTIPLIER, _RETRY_MAX_DELAY) + transient_retries -= 1 + continue + raise SystemExit( + f"GitHub API request failed after retries for {url}: {exc}" + ) + if resp.status_code == 200: return resp + if ( resp.status_code == 403 and int(resp.headers.get("X-RateLimit-Remaining", "1")) == 0 ): sleep_for_rate_limit(resp) continue + + if resp.status_code == 401: + if auth_retries > 0 and refresh_auth is not None: + logger.warning( + f"401 for {url}, refreshing auth token ({auth_retries} retries left)" + ) + refresh_auth() + auth_retries -= 1 + continue + if refresh_auth is None: + auth_error_detail = "with no refresh_auth configured" + else: + auth_error_detail = f"after {_MAX_AUTH_RETRIES} refresh attempts" + raise SystemExit( + f"GitHub API auth error 401 for {url} {auth_error_detail}: " + f"{resp.text or 'No response text'}" + ) + + if resp.status_code in _RETRYABLE_STATUS_CODES or _is_html_error_page(resp): + if transient_retries > 0: + logger.warning( + f"Transient error {resp.status_code} for {url}. 
" + f"Retrying in {backoff:.0f}s ({transient_retries} retries left)" + ) + time.sleep(backoff) + backoff = min(backoff * _RETRY_MULTIPLIER, _RETRY_MAX_DELAY) + transient_retries -= 1 + continue + raise SystemExit( + f"GitHub API error {resp.status_code} for {url} after {_MAX_RETRIES} retries: " + f"{resp.text or 'No response text'}" + ) + raise SystemExit( f"GitHub API error {resp.status_code} for {url}: {resp.text or 'No response text'}" ) @@ -826,81 +911,95 @@ def _main() -> int: total_processed = 0 snapshot_date = datetime.now(timezone.utc).strftime("%Y-%m-%d") + failed_repos: list[str] = [] + for repo in github_repos: - # Delete any existing rows for this (repo, snapshot_date) before loading. - # This makes every run idempotent: if a previous run crashed mid-way and left - # partial data, a rerun will clean up the partial write and reload cleanly. - if snapshot_exists(bigquery_client, bigquery_dataset, repo, snapshot_date): - logger.info( - f"Deleting partial/existing snapshot for {repo} on {snapshot_date} before reload" - ) - delete_existing_snapshot( - bigquery_client, bigquery_dataset, repo, snapshot_date - ) + try: + # Delete any existing rows for this (repo, snapshot_date) before loading. + # This makes every run idempotent: if a previous run crashed mid-way and left + # partial data, a rerun will clean up the partial write and reload cleanly. + if snapshot_exists(bigquery_client, bigquery_dataset, repo, snapshot_date): + logger.info( + f"Deleting partial/existing snapshot for {repo} on {snapshot_date} before reload" + ) + delete_existing_snapshot( + bigquery_client, bigquery_dataset, repo, snapshot_date + ) - # Build a per-repo token refresh callable. It is called by the generator - # before each page fetch, so every API request (PRs + commits + reviewers + - # comments) uses a valid token. The access_token_cache means this only hits - # the GitHub API when the cached token has <60 seconds remaining. 
- refresh_auth: Optional[Callable[[], None]] = None - if github_app_id and github_private_key: - - def _make_refresh( - _repo: str = repo, - ) -> Callable[[], None]: - def _refresh() -> None: - try: - app_jwt = generate_github_jwt(github_app_id, github_private_key) - access_token = get_installation_access_token( - app_jwt, _repo, github_api_url - ) - except Exception as e: - raise RuntimeError( - f"Failed to obtain GitHub App access token for {_repo}: {e}. " - "Check that GITHUB_APP_ID is correct and GITHUB_PRIVATE_KEY " - "is a valid PEM-encoded RSA private key." - ) from e - session.headers["Authorization"] = f"Bearer {access_token}" - - return _refresh - - refresh_auth = _make_refresh() - # Set the token immediately so the first generator page is authenticated. - refresh_auth() + # Build a per-repo token refresh callable. It is called by the generator + # before each page fetch, so every API request (PRs + commits + reviewers + + # comments) uses a valid token. The access_token_cache means this only hits + # the GitHub API when the cached token has <60 seconds remaining. + refresh_auth: Optional[Callable[[], None]] = None + if github_app_id and github_private_key: + + def _make_refresh( + _repo: str = repo, + ) -> Callable[[], None]: + def _refresh() -> None: + try: + app_jwt = generate_github_jwt( + github_app_id, github_private_key + ) + access_token = get_installation_access_token( + app_jwt, _repo, github_api_url + ) + except Exception as e: + raise RuntimeError( + f"Failed to obtain GitHub App access token for {_repo}: {e}. " + "Check that GITHUB_APP_ID is correct and GITHUB_PRIVATE_KEY " + "is a valid PEM-encoded RSA private key." + ) from e + session.headers["Authorization"] = f"Bearer {access_token}" + + return _refresh + + refresh_auth = _make_refresh() + # Set the token immediately so the first generator page is authenticated. 
+ refresh_auth() + + for chunk_count, chunk in enumerate( + extract_pull_requests( + session, + repo, + chunk_size=100, + github_api_url=github_api_url, + refresh_auth=refresh_auth, + ), + start=1, + ): + logger.info(f"Processing chunk {chunk_count} with {len(chunk)} PRs") + + # Transform + transformed_data = transform_data(chunk, repo) + + # Load + load_data( + bigquery_client, + bigquery_dataset, + transformed_data, + snapshot_date, + use_streaming_insert=bool(emulator_host), + ) - for chunk_count, chunk in enumerate( - extract_pull_requests( - session, - repo, - chunk_size=100, - github_api_url=github_api_url, - refresh_auth=refresh_auth, - ), - start=1, - ): - logger.info(f"Processing chunk {chunk_count} with {len(chunk)} PRs") - - # Transform - transformed_data = transform_data(chunk, repo) - - # Load - load_data( - bigquery_client, - bigquery_dataset, - transformed_data, - snapshot_date, - use_streaming_insert=bool(emulator_host), - ) + total_processed += len(chunk) + logger.info( + f"Completed chunk {chunk_count}. Total PRs processed: {total_processed}" + ) + except (SystemExit, RuntimeError) as exc: + logger.error(f"Failed to process repo {repo}: {exc}") + failed_repos.append(repo) + continue - total_processed += len(chunk) - logger.info( - f"Completed chunk {chunk_count}. Total PRs processed: {total_processed}" - ) + if failed_repos: + logger.error( + f"ETL completed with failures. Failed repos: {', '.join(failed_repos)}" + ) + return 1 logger.info( f"GitHub ETL process completed successfully. 
Total PRs processed: {total_processed}" ) return 0 diff --git a/tests/test_extract_comments.py b/tests/test_extract_comments.py index 155036b..43c47b1 100644 --- a/tests/test_extract_comments.py +++ b/tests/test_extract_comments.py @@ -28,6 +28,7 @@ def test_api_error_comments(mock_session): error_response = Mock() error_response.status_code = 404 error_response.text = "Not Found" + error_response.headers = {} mock_session.get.return_value = error_response diff --git a/tests/test_extract_commits.py b/tests/test_extract_commits.py index b56f4a0..07a32b6 100644 --- a/tests/test_extract_commits.py +++ b/tests/test_extract_commits.py @@ -66,11 +66,13 @@ def test_rate_limit_on_commits_list(mock_sleep, mock_session): assert result == [] -def test_api_error_on_commits_list(mock_session): - """Test API error handling when fetching commits list.""" +@patch("time.sleep") +def test_api_error_on_commits_list(mock_sleep, mock_session): + """Test that extract_commits retries on 500, then raises SystemExit.""" error_response = Mock() error_response.status_code = 500 error_response.text = "Internal Server Error" + error_response.headers = {} mock_session.get.return_value = error_response @@ -78,6 +80,7 @@ def test_api_error_on_commits_list(mock_session): main.extract_commits(mock_session, "mozilla/firefox", 123) assert "GitHub API error 500" in str(exc_info.value) + assert mock_session.get.call_count == main._MAX_RETRIES + 1 def test_api_error_on_individual_commit(mock_session): @@ -89,6 +92,7 @@ def test_api_error_on_individual_commit(mock_session): commit_error = Mock() commit_error.status_code = 404 commit_error.text = "Commit not found" + commit_error.headers = {} mock_session.get.side_effect = [commits_response, commit_error] diff --git a/tests/test_extract_pull_requests.py b/tests/test_extract_pull_requests.py index b950f44..7f9b7f3 100644 --- a/tests/test_extract_pull_requests.py +++ b/tests/test_extract_pull_requests.py @@ -173,6 +173,7 @@ def
test_handles_api_error_404(mock_session): mock_response = Mock() mock_response.status_code = 404 mock_response.text = "Not Found" + mock_response.headers = {} mock_session.get.return_value = mock_response @@ -182,11 +183,13 @@ def test_handles_api_error_404(mock_session): assert "GitHub API error 404" in str(exc_info.value) -def test_handles_api_error_500(mock_session): - """Test that extract_pull_requests raises SystemExit on 500.""" +@patch("time.sleep") +def test_handles_api_error_500(mock_sleep, mock_session): + """Test that extract_pull_requests retries on 500, then raises SystemExit.""" mock_response = Mock() mock_response.status_code = 500 mock_response.text = "Internal Server Error" + mock_response.headers = {} mock_session.get.return_value = mock_response @@ -194,6 +197,7 @@ def test_handles_api_error_500(mock_session): list(main.extract_pull_requests(mock_session, "mozilla/firefox")) assert "GitHub API error 500" in str(exc_info.value) + assert mock_session.get.call_count == main._MAX_RETRIES + 1 def test_stops_on_empty_batch(mock_session): diff --git a/tests/test_extract_reviewers.py b/tests/test_extract_reviewers.py index 0e2ca42..32cb575 100644 --- a/tests/test_extract_reviewers.py +++ b/tests/test_extract_reviewers.py @@ -23,11 +23,13 @@ def test_rate_limit_handling(mock_sleep, mock_session): mock_sleep.assert_called_once() -def test_api_error(mock_session): - """Test API error handling when fetching reviewers.""" +@patch("time.sleep") +def test_api_error(mock_sleep, mock_session): + """Test that extract_reviewers retries on 500, then raises SystemExit.""" error_response = Mock() error_response.status_code = 500 error_response.text = "Internal Server Error" + error_response.headers = {} mock_session.get.return_value = error_response @@ -35,3 +37,4 @@ def test_api_error(mock_session): main.extract_reviewers(mock_session, "mozilla/firefox", 123) assert "GitHub API error 500" in str(exc_info.value) + assert mock_session.get.call_count == 
main._MAX_RETRIES + 1 diff --git a/tests/test_github_get.py b/tests/test_github_get.py new file mode 100644 index 0000000..822ae3f --- /dev/null +++ b/tests/test_github_get.py @@ -0,0 +1,120 @@ +from unittest.mock import Mock, patch + +import pytest +import requests + +import main + + +@pytest.fixture +def mock_session(): + session = Mock(spec=requests.Session) + session.headers = {} + return session + + +def _make_response( + status_code: int, text: str = "", headers: dict | None = None +) -> Mock: + resp = Mock() + resp.status_code = status_code + resp.text = text + resp.headers = headers or {} + return resp + + +@patch("time.sleep") +def test_retries_on_500_then_succeeds(mock_sleep, mock_session): + """Transient 500 errors are retried with backoff; success is returned.""" + fail = _make_response(500, "Internal Server Error") + fail2 = _make_response(500, "Internal Server Error") + ok = _make_response(200) + ok.json.return_value = {} + mock_session.get.side_effect = [fail, fail2, ok] + + result = main.github_get(mock_session, "https://api.github.com/repos/test") + + assert result is ok + assert mock_session.get.call_count == 3 + assert mock_sleep.call_count == 2 + + +@patch("time.sleep") +def test_retries_on_connection_error(mock_sleep, mock_session): + """Network-level ConnectionError triggers retry with backoff.""" + ok = _make_response(200) + mock_session.get.side_effect = [ + requests.exceptions.ConnectionError("connection refused"), + ok, + ] + + result = main.github_get(mock_session, "https://api.github.com/repos/test") + + assert result is ok + assert mock_session.get.call_count == 2 + mock_sleep.assert_called_once() + + +def test_refreshes_auth_on_401(mock_session): + """401 response triggers refresh_auth then retries.""" + unauthorized = _make_response(401, "Bad credentials") + ok = _make_response(200) + mock_session.get.side_effect = [unauthorized, ok] + + refresh_auth = Mock() + result = main.github_get( + mock_session, 
"https://api.github.com/repos/test", refresh_auth=refresh_auth + ) + + assert result is ok + refresh_auth.assert_called_once() + assert mock_session.get.call_count == 2 + + +def test_401_without_refresh_auth_fails_immediately(mock_session): + """401 with no refresh_auth callable raises SystemExit without retrying.""" + unauthorized = _make_response(401, "Bad credentials") + mock_session.get.return_value = unauthorized + + with pytest.raises(SystemExit) as exc_info: + main.github_get(mock_session, "https://api.github.com/repos/test") + + assert "401" in str(exc_info.value) + assert mock_session.get.call_count == 1 + + +@patch("time.sleep") +def test_401_exhausts_auth_retries(mock_sleep, mock_session): + """Persistent 401 responses exhaust the auth retry budget and raise SystemExit.""" + unauthorized = _make_response(401, "Bad credentials") + mock_session.get.return_value = unauthorized + + refresh_auth = Mock() + + with pytest.raises(SystemExit) as exc_info: + main.github_get( + mock_session, + "https://api.github.com/repos/test", + refresh_auth=refresh_auth, + ) + + assert "401" in str(exc_info.value) + assert mock_session.get.call_count == main._MAX_AUTH_RETRIES + 1 + assert refresh_auth.call_count == main._MAX_AUTH_RETRIES + + +@patch("time.sleep") +def test_html_error_page_retried(mock_sleep, mock_session): + """GitHub HTML error pages (e.g. 503 timeouts) are detected and retried.""" + html_body = "
We couldn't respond to your request in time.
" + html_error = _make_response( + 503, html_body, headers={"Content-Type": "text/html; charset=utf-8"} + ) + ok = _make_response(200) + mock_session.get.side_effect = [html_error, ok] + + result = main.github_get(mock_session, "https://api.github.com/repos/test") + + assert result is ok + assert mock_session.get.call_count == 2 + mock_sleep.assert_called_once() diff --git a/tests/test_logging.py b/tests/test_logging.py new file mode 100644 index 0000000..10730d1 --- /dev/null +++ b/tests/test_logging.py @@ -0,0 +1,25 @@ +#!/usr/bin/env python3 +""" +Tests for setup_logging function. + +Tests logging configuration including log level and handler setup. +""" + +import logging + +import main + + +def test_setup_logging(): + """Test that setup_logging configures logging correctly.""" + main.setup_logging() + + root_logger = logging.getLogger() + assert root_logger.level == logging.INFO + assert len(root_logger.handlers) > 0 + + # Check that at least one handler is a StreamHandler + has_stream_handler = any( + isinstance(handler, logging.StreamHandler) for handler in root_logger.handlers + ) + assert has_stream_handler diff --git a/tests/test_main.py b/tests/test_main.py index 700f671..520f350 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -61,10 +61,10 @@ def test_requires_bigquery_dataset( @patch("main.setup_logging") @patch("main.bigquery.Client") @patch("requests.Session") -def test_github_token_optional_with_warning( +def test_runs_without_auth_credentials( mock_session_class, mock_bq_client, mock_setup_logging ): - """Test that GITHUB_TOKEN is optional but warns if missing.""" + """Test that auth credentials are optional; script runs with a warning.""" with ( patch.dict( os.environ, @@ -96,7 +96,6 @@ def test_splits_github_repos_by_comma( "GITHUB_REPOS": "mozilla/firefox,mozilla/gecko-dev", "BIGQUERY_PROJECT": "test", "BIGQUERY_DATASET": "test", - "GITHUB_TOKEN": "token", }, clear=True, ), @@ -120,7 +119,6 @@ def 
test_honors_github_api_url(mock_session_class, mock_bq_client, mock_setup_lo "GITHUB_REPOS": "mozilla/firefox", "BIGQUERY_PROJECT": "test", "BIGQUERY_DATASET": "test", - "GITHUB_TOKEN": "token", "GITHUB_API_URL": "https://custom-api.example.com", }, clear=True, @@ -147,7 +145,6 @@ def test_honors_bigquery_emulator_host( "GITHUB_REPOS": "mozilla/firefox", "BIGQUERY_PROJECT": "test", "BIGQUERY_DATASET": "test", - "GITHUB_TOKEN": "token", "BIGQUERY_EMULATOR_HOST": "http://localhost:9050", }, clear=True, @@ -157,6 +154,37 @@ def test_honors_bigquery_emulator_host( main.main() +@patch("main.setup_logging") +@patch("main.bigquery.Client") +@patch("requests.Session") +def test_creates_session_with_headers( + mock_session_class, mock_bq_client, mock_setup_logging +): + """Test that session is created with Accept and User-Agent headers.""" + mock_session = MagicMock() + mock_session_class.return_value = mock_session + + with ( + patch.dict( + os.environ, + { + "GITHUB_REPOS": "mozilla/firefox", + "BIGQUERY_PROJECT": "test", + "BIGQUERY_DATASET": "test", + }, + clear=True, + ), + patch("main.extract_pull_requests", return_value=iter([])), + ): + main.main() + + # Verify session headers were set + assert mock_session.headers.update.called + call_args = mock_session.headers.update.call_args[0][0] + assert "Accept" in call_args + assert "User-Agent" in call_args + + @patch("main.setup_logging") @patch("main.bigquery.Client") @patch("requests.Session") @@ -186,7 +214,6 @@ def test_single_repo_successful_etl( "GITHUB_REPOS": "mozilla/firefox", "BIGQUERY_PROJECT": "test", "BIGQUERY_DATASET": "test", - "GITHUB_TOKEN": "token", }, clear=True, ): @@ -225,7 +252,6 @@ def test_multiple_repos_processing( "GITHUB_REPOS": "mozilla/firefox,mozilla/gecko-dev,mozilla/addons", "BIGQUERY_PROJECT": "test", "BIGQUERY_DATASET": "test", - "GITHUB_TOKEN": "token", }, clear=True, ): @@ -272,7 +298,6 @@ def test_processes_chunks_iteratively( "GITHUB_REPOS": "mozilla/firefox", "BIGQUERY_PROJECT": 
"test", "BIGQUERY_DATASET": "test", - "GITHUB_TOKEN": "token", }, clear=True, ): @@ -298,7 +323,6 @@ def test_returns_zero_on_success( "GITHUB_REPOS": "mozilla/firefox", "BIGQUERY_PROJECT": "test", "BIGQUERY_DATASET": "test", - "GITHUB_TOKEN": "token", }, clear=True, ), @@ -347,7 +371,6 @@ def test_full_etl_flow_transforms_data_correctly( "GITHUB_REPOS": "mozilla/firefox", "BIGQUERY_PROJECT": "test", "BIGQUERY_DATASET": "test", - "GITHUB_TOKEN": "token", }, clear=True, ): @@ -402,7 +425,6 @@ def test_bug_id_extraction_through_pipeline( "GITHUB_REPOS": "mozilla/firefox", "BIGQUERY_PROJECT": "test", "BIGQUERY_DATASET": "test", - "GITHUB_TOKEN": "token", }, clear=True, ): @@ -460,7 +482,6 @@ def test_pagination_through_full_flow( "GITHUB_REPOS": "mozilla/firefox", "BIGQUERY_PROJECT": "test", "BIGQUERY_DATASET": "test", - "GITHUB_TOKEN": "token", }, clear=True, ): @@ -468,3 +489,49 @@ def test_pagination_through_full_flow( # Should be called twice (once per chunk/page) assert mock_load.call_count == 2 + + +@patch("main.setup_logging") +@patch("main.bigquery.Client") +@patch("requests.Session") +@patch("main.extract_pull_requests") +@patch("main.transform_data") +@patch("main.load_data") +def test_repo_failure_continues_to_next_repo( + mock_load, + mock_transform, + mock_extract, + mock_session_class, + mock_bq_client, + mock_setup_logging, +): + """A fatal error on one repo should not prevent other repos from being processed.""" + + def extract_side_effect(*args, **kwargs): + repo = args[1] + if repo == "mozilla/firefox": + raise SystemExit("GitHub API error 502 for https://api.github.com/...") + return iter([[{"number": 1}]]) + + mock_extract.side_effect = extract_side_effect + mock_transform.return_value = { + "pull_requests": [{"pull_request_id": 1}], + "commits": [], + "reviewers": [], + "comments": [], + } + + with patch.dict( + os.environ, + { + "GITHUB_REPOS": "mozilla/firefox,mozilla/gecko-dev", + "BIGQUERY_PROJECT": "test", + "BIGQUERY_DATASET": "test", + }, + 
clear=True, + ): + result = main.main() + + assert result == 1 # partial failure + assert mock_extract.call_count == 2 # both repos were attempted + mock_load.assert_called_once() # only the successful repo loaded data