Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ on:
pull_request:
branches: [main]

permissions:
contents: read

jobs:
test:
runs-on: ubuntu-latest
Expand All @@ -19,3 +22,14 @@ jobs:
pip install -e ".[dev]"
- name: Run all tests
run: pytest

integration-test:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Run integration test with docker compose
run: |
docker compose up --build --abort-on-container-exit --exit-code-from github-etl
- name: Cleanup
if: always()
run: docker compose down -v --remove-orphans
2 changes: 1 addition & 1 deletion Dockerfile.mock
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Dockerfile for mock GitHub API service
FROM python:3.11-slim
FROM python:3.14.2-slim

WORKDIR /app

Expand Down
22 changes: 22 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,28 @@ This setup includes:
- **BigQuery Emulator**: Local BigQuery instance for testing
- **ETL Service**: Configured to use both mock services

### Running Tests

The project includes a test suite using pytest. Tests are in the `tests/` directory.

Install development dependencies:

```bash
pip install -e ".[dev]"
```

Run tests:

```bash
pytest
```

Run with coverage:

```bash
pytest --cov=. --cov-report=html
```

### Adding Dependencies

Add new Python packages to `requirements.txt` and rebuild the Docker image.
Expand Down
22 changes: 12 additions & 10 deletions main.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
#!/usr/bin/env python3
"""
GitHub ETL for Mozilla Organization Firefox repositories

Expand Down Expand Up @@ -276,7 +275,7 @@ def extract_commits(
session: requests.Session,
repo: str,
pr_number: int,
github_api_url: str,
github_api_url: str = "https://api.github.com",
) -> list[dict]:
"""
Extract commits and files for a specific pull request.
Expand Down Expand Up @@ -312,7 +311,7 @@ def extract_reviewers(
session: requests.Session,
repo: str,
pr_number: int,
github_api_url: str,
github_api_url: str = "https://api.github.com",
) -> list[dict]:
"""
Extract reviewers for a specific pull request.
Expand Down Expand Up @@ -346,7 +345,7 @@ def extract_comments(
session: requests.Session,
repo: str,
pr_number: int,
github_api_url: str,
github_api_url: str = "https://api.github.com",
) -> list[dict]:
"""
Extract comments for a specific pull request.
Expand Down Expand Up @@ -383,7 +382,7 @@ def sleep_for_rate_limit(resp: requests.Response) -> None:
remaining = int(resp.headers.get("X-RateLimit-Remaining", 1))
reset = int(resp.headers.get("X-RateLimit-Reset", 0))
if remaining == 0:
sleep_time = max(0, reset - int(time.time())) + 5
sleep_time = max(0, reset - int(time.time()))
print(
f"Rate limit exceeded. Sleeping for {sleep_time} seconds.", file=sys.stderr
)
Expand Down Expand Up @@ -688,7 +687,7 @@ def load_data(
client: bigquery.Client,
dataset_id: str,
transformed_data: dict,
snapshot_date: str,
snapshot_date: Optional[str] = None,
use_streaming_insert: bool = False,
) -> None:
"""
Expand All @@ -707,6 +706,9 @@ def load_data(
mutable (no streaming buffer restriction).
"""

if snapshot_date is None:
snapshot_date = datetime.now(timezone.utc).strftime("%Y-%m-%d")

if not transformed_data:
logger.warning("No data to load, skipping")
return
Expand Down Expand Up @@ -780,9 +782,9 @@ def _main() -> int:
bigquery_dataset = os.environ.get("BIGQUERY_DATASET")

if not bigquery_project:
raise RuntimeError("Environment variable BIGQUERY_PROJECT is required")
raise SystemExit("Environment variable BIGQUERY_PROJECT is required")
if not bigquery_dataset:
raise RuntimeError("Environment variable BIGQUERY_DATASET is required")
raise SystemExit("Environment variable BIGQUERY_DATASET is required")

# Setup GitHub session; the Authorization header is updated before each repo using
# an installation access token (which may be cached)
Expand Down Expand Up @@ -815,9 +817,9 @@ def _main() -> int:
github_repos = []
github_repos_str = os.getenv("GITHUB_REPOS")
if github_repos_str:
github_repos = github_repos_str.split(",")
github_repos = [r.strip() for r in github_repos_str.split(",") if r.strip()]
else:
raise RuntimeError(
raise SystemExit(
"Environment variable GITHUB_REPOS is required (format: 'owner/repo,owner/repo')"
)

Expand Down
1 change: 0 additions & 1 deletion mock_github_api.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
#!/usr/bin/env python3
"""
Mock GitHub API server for testing the ETL pipeline locally.

Expand Down
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ dependencies = [
[project.optional-dependencies]
dev = [
"pytest>=7.0.0",
"pytest-cov>=7.0.0",
"pytest-mock>=3.10.0",
"ruff>=0.14.14",
"black>=24.0.0",
]
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.14
# by the following command:
#
# pip-compile --generate-hashes pyproject.toml
# pip-compile --generate-hashes --output-file=requirements.txt pyproject.toml
#
certifi==2026.1.4 \
--hash=sha256:9943707519e4add1115f44c2bc244f782c0249876bf51b6599fee1ffbedd685c \
Expand Down
20 changes: 20 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from unittest.mock import MagicMock, Mock

import pytest
import requests
from google.cloud import bigquery


@pytest.fixture
def mock_session() -> Mock:
session = Mock(spec=requests.Session)
session.headers = {}
return session


@pytest.fixture
def mock_bigquery_client() -> Mock:
client = Mock(spec=bigquery.Client)
client.project = "test-project"
client.insert_rows_json = MagicMock(return_value=[]) # Empty list = no errors
return client
37 changes: 37 additions & 0 deletions tests/test_extract_comments.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
from unittest.mock import Mock, patch
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We install pytest-mock but we don't seem to use the provided test fixtures anywhere. We're instead importing from unittest.mock directly.

We should get rid of pytest-mock, or switch occurrences to use the mocker pytest fixtures in tests.


import pytest

import main


@patch("main.sleep_for_rate_limit")
def test_rate_limit_handling_comments(mock_sleep, mock_session):
"""Test rate limit handling when fetching comments."""
rate_limit_response = Mock()
rate_limit_response.status_code = 403
rate_limit_response.headers = {"X-RateLimit-Remaining": "0"}

success_response = Mock()
success_response.status_code = 200
success_response.json.return_value = []

mock_session.get.side_effect = [rate_limit_response, success_response]

main.extract_comments(mock_session, "mozilla/firefox", 123)

mock_sleep.assert_called_once()


def test_api_error_comments(mock_session):
"""Test API error handling when fetching comments."""
error_response = Mock()
error_response.status_code = 404
error_response.text = "Not Found"

mock_session.get.return_value = error_response

with pytest.raises(SystemExit) as exc_info:
main.extract_comments(mock_session, "mozilla/firefox", 123)

assert "GitHub API error 404" in str(exc_info.value)
98 changes: 98 additions & 0 deletions tests/test_extract_commits.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
from unittest.mock import Mock, patch

import pytest

import main


def test_extract_commits_with_files(mock_session):
"""Test that file details are fetched per commit and merged into commit data."""
commits_response = Mock()
commits_response.status_code = 200
commits_response.json.return_value = [
{"sha": "abc123"},
{"sha": "def456"},
]

commit_detail_1 = Mock()
commit_detail_1.status_code = 200
commit_detail_1.json.return_value = {
"sha": "abc123",
"files": [{"filename": "file1.py", "additions": 10}],
}

commit_detail_2 = Mock()
commit_detail_2.status_code = 200
commit_detail_2.json.return_value = {
"sha": "def456",
"files": [{"filename": "file2.py", "deletions": 5}],
}

mock_session.get.side_effect = [
commits_response,
commit_detail_1,
commit_detail_2,
]

result = main.extract_commits(mock_session, "mozilla/firefox", 123)

# Verify the individual commit detail endpoints were fetched
assert mock_session.get.call_count == 3
calls = mock_session.get.call_args_list
assert "commits/abc123" in calls[1][0][0]
assert "commits/def456" in calls[2][0][0]

# Verify file data from detail responses is merged into each commit
assert result[0]["files"][0]["filename"] == "file1.py"
assert result[1]["files"][0]["filename"] == "file2.py"


@patch("main.sleep_for_rate_limit")
def test_rate_limit_on_commits_list(mock_sleep, mock_session):
"""Test rate limit handling when fetching commits list."""
rate_limit_response = Mock()
rate_limit_response.status_code = 403
rate_limit_response.headers = {"X-RateLimit-Remaining": "0"}

success_response = Mock()
success_response.status_code = 200
success_response.json.return_value = []

mock_session.get.side_effect = [rate_limit_response, success_response]

result = main.extract_commits(mock_session, "mozilla/firefox", 123)

mock_sleep.assert_called_once()
assert result == []


def test_api_error_on_commits_list(mock_session):
"""Test API error handling when fetching commits list."""
error_response = Mock()
error_response.status_code = 500
error_response.text = "Internal Server Error"

mock_session.get.return_value = error_response

with pytest.raises(SystemExit) as exc_info:
main.extract_commits(mock_session, "mozilla/firefox", 123)

assert "GitHub API error 500" in str(exc_info.value)


def test_api_error_on_individual_commit(mock_session):
"""Test API error when fetching individual commit details."""
commits_response = Mock()
commits_response.status_code = 200
commits_response.json.return_value = [{"sha": "abc123"}]

commit_error = Mock()
commit_error.status_code = 404
commit_error.text = "Commit not found"

mock_session.get.side_effect = [commits_response, commit_error]

with pytest.raises(SystemExit) as exc_info:
main.extract_commits(mock_session, "mozilla/firefox", 123)

assert "GitHub API error 404" in str(exc_info.value)
Loading
Loading