
Commit 87a62fd

Merge pull request #1 from formula-code/offline_commits
add offline commit scraping
2 parents 054a316 + b7cbfea commit 87a62fd

12 files changed

Lines changed: 729 additions & 33 deletions

README.md

Lines changed: 2 additions & 2 deletions
@@ -146,19 +146,19 @@ The scraper can be run using the following command:
 ```bash
 $ python scripts/scrape_repositories.py \
     --outfile artifacts/raw/repos_discovered.csv \
+    --min-stars 500 \
     --filtered-outfile artifacts/raw/repos_valid.csv
 # Writes artifacts/raw/repos_discovered.csv and artifacts/raw/repos_valid.csv
 ```
 
-The `artifacts/raw/repos_valid.csv` file contains a subset of the repositories that aren't forks / reuploads / pass other sanity checks. We found ~700 filtered repositories for this dataset.
+The `artifacts/raw/repos_valid.csv` file contains the subset of repositories that aren't forks or reuploads, have at least 500 stars, and pass other sanity checks. We found ~700 filtered repositories for this dataset.
 
 
 ### 4. Collect relevant commits for all repositories
 
 Given the list of repositories, we find the subset of commits that have already been closed and merged into the main branch (the top 5000 PRs, sorted by popularity). We use the `collect_commits.py` script to do this. The `filter_commits.py` script then filters out those commits that primarily modified the benchmarking files (e.g. `asv.conf.json`) or were not relevant to the benchmarks (e.g. documentation changes). The script also limits the number of repositories to a maximum of 350 to ensure we don't burden the GitHub API with too many requests. The scripts can be run as follows:
 
 ```bash
-# 50 pages * 100 (PRs per page) = 5000 PRs max per repo.
 $ python scripts/collect_commits.py \
     --dashboards artifacts/raw/repos_valid.csv \
     --outfile artifacts/raw/commits_all.jsonl \

scripts/collect_commits.py

Lines changed: 2 additions & 1 deletion
@@ -2,7 +2,8 @@
 
 import pandas as pd
 
-from datasmith.execution.collect_commits import search_commits
+# from datasmith.execution.collect_commits import search_commits
+from datasmith.execution.collect_commits_offline import search_commits
 from datasmith.logging_config import configure_logging
 
 # Configure logging for the script
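
The offline `search_commits` keeps the signature of the API-backed version (including the now-ignored `max_pages`/`per_page`), so the script only needed this import swap. A minimal usage sketch, assuming the module layout added in this PR; the repository name and query string are illustrative, and the first call clones the repository's history, so it needs network access:

```python
from datasmith.execution.collect_commits_offline import search_commits

# `base=<branch>` is the only part of the query string the offline
# implementation honours; everything else is parsed but ignored.
shas = search_commits("numpy/numpy", query="base=main&state=closed")
print(f"{len(shas)} PR-merge commits found")
```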

scripts/filter_commits.py

Lines changed: 42 additions & 15 deletions
@@ -3,13 +3,15 @@
 import argparse
 import json
 import re
+import tempfile
 from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
 from pathlib import Path
 
 import pandas as pd
+from git import Repo
 from tqdm.auto import tqdm
 
-from datasmith.execution.utils import _get_commit_info, find_file_in_tree
+from datasmith.execution.utils import _get_commit_info_offline, find_file_in_tree
 from datasmith.logging_config import configure_logging
 
 # Configure logging for the script
@@ -37,10 +39,11 @@ def _asv_conf_worker(repo_name: str) -> str | None:
     return find_file_in_tree(repo_name, "asv.conf.json")
 
 
-def _commit_info_worker(arg_tuple) -> dict | None:
+def _commit_info_worker(arg_tuple: tuple[Repo, str]) -> dict | None:
     """Wrapper for ProcessPool: arg_tuple = (repo_name, sha)."""
     repo, sha = arg_tuple
-    return _get_commit_info(repo, sha)
+    # return _get_commit_info(repo, sha)
+    return _get_commit_info_offline(repo, sha)
 
 
 NON_CORE_PATTERNS = re.compile(
@@ -107,27 +110,51 @@ def main() -> None:
     commits = commits.merge(benchmarks, how="right", on="repo_name")
     commits = commits.dropna(subset=["commit_sha"])
 
-    with ProcessPoolExecutor(max_workers=args.procs) as pp:
-        commits["commit_info"] = list(
-            tqdm(
-                pp.map(_commit_info_worker, commits[["repo_name", "commit_sha"]].itertuples(index=False, name=None)),
-                total=len(commits),
-                desc="Fetching commit metadata",
+    all_repo_names = set(commits["repo_name"])
+
+    # download all repos to a temp dir
+    with tempfile.TemporaryDirectory(prefix="gh-repos-") as td:
+        all_repos = {}
+        for repo_name in tqdm(all_repo_names, desc="Cloning repos"):
+            repo_name = repo_name.strip("/")
+            owner, name = repo_name.split("/", 1)
+            path = Path(td) / f"{owner}__{name}.git"
+            repo = Repo.clone_from(
+                f"https://github.com/{repo_name}.git",
+                path,
+                bare=True,
+                # multi_options=["--filter=tree:0"],
+                multi_options=["--filter=blob:none"],
+                quiet=True,
+            )
+            all_repos[repo_name] = repo
+
+        commit_info_args: list[tuple[Repo, str]] = []
+        for repo_name, commit_sha in commits[["repo_name", "commit_sha"]].itertuples(index=False, name=None):
+            repo = all_repos[repo_name]
+            commit_info_args.append((repo, commit_sha))
+
+        with ProcessPoolExecutor(max_workers=args.procs) as pp:
+            commits["commit_info"] = list(
+                tqdm(
+                    pp.map(_commit_info_worker, commit_info_args),
+                    total=len(commits),
+                    desc="Fetching commit metadata",
+                )
             )
-        )
 
-    commit_meta = pd.json_normalize(commits.pop("commit_info"))
-    commits = pd.concat([commits, commit_meta], axis=1)
-    commits = commits.dropna(subset=["asv_conf_path", "sha", "date", "message"])
-    commits = commits[commits["files_changed"].apply(has_core_file)].reset_index(drop=True)
+        commit_meta = pd.json_normalize(commits.pop("commit_info"))
+        commits = pd.concat([commits, commit_meta], axis=1)
+        commits = commits.dropna(subset=["asv_conf_path", "sha", "date", "message"])
+        commits = commits[commits["files_changed"].apply(has_core_file)].reset_index(drop=True)
 
     out_path = Path(args.output_pth)
     if not out_path.parent.exists():
         out_path.parent.mkdir(parents=True, exist_ok=True)
     # commits.to_csv(out_path, index=False)
     commits.to_json(out_path, orient="records", lines=True, index=False)
 
-    logger.info(f"✔ Wrote {len(commits):,} rows → {out_path}")
+    logger.info("✔ Wrote %s rows → %s", len(commits), out_path)
 
 
 if __name__ == "__main__":
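
The rewritten `main()` replaces per-commit GitHub API calls with bare, blob-less clones shared across workers. For context, a minimal sketch of the same partial-clone pattern using GitPython directly (the repository URL is illustrative): `--filter=blob:none` transfers commits and trees up front and fetches file contents only when they are first touched.

```python
import tempfile
from pathlib import Path

from git import Repo

with tempfile.TemporaryDirectory(prefix="gh-repos-") as td:
    repo = Repo.clone_from(
        "https://github.com/airspeed-velocity/asv.git",
        Path(td) / "asv.git",
        bare=True,  # history only; no working tree is needed
        multi_options=["--filter=blob:none"],  # blobs are fetched lazily
    )
    head = repo.commit("HEAD")
    print(head.hexsha, head.summary)
```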

scripts/scrape_repositories.py

Lines changed: 2 additions & 0 deletions
@@ -56,6 +56,7 @@ def parse_args() -> argparse.Namespace:
         default=0.3,
         help="Random extra delay (0-JITTER's) after each call",
     )
+    p.add_argument("--min-stars", type=int, default=500, help="Minimum number of stars to consider a repository")
     return p.parse_args()
 
 
@@ -83,6 +84,7 @@ def main() -> None:
     filtered_df = filter_dashboards(df, url_col="url")
     # remove airspeed-velocity/asv
     filtered_df = filtered_df[filtered_df.repo_name != "airspeed-velocity/asv"]
+    filtered_df = filtered_df[filtered_df.stars >= args.min_stars]
     if filtered_df.empty:
         raise ValueError("No dashboards found in the repositories.")  # noqa: TRY003
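
The new flag feeds a one-line pandas filter. A toy demonstration of the same predicate (column names match the script; the data is made up):

```python
import pandas as pd

filtered_df = pd.DataFrame(
    {"repo_name": ["a/small", "b/popular"], "stars": [120, 950]}
)
min_stars = 500  # the --min-stars default
filtered_df = filtered_df[filtered_df.stars >= min_stars]
print(filtered_df)  # only b/popular survives
```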

src/datasmith/benchmark/collection.py

Lines changed: 3 additions & 2 deletions
@@ -76,8 +76,9 @@ def save(self, path: str | Path) -> Path:
     """
     self.modified_at = datetime.now(timezone.utc)
     path = Path(path)
-    if any(suffix not in [".fc", ".pkl"] for suffix in path.suffixes):
-        path = path.with_suffix(".fc.pkl")
+    # Ensure the filename ends with the exact `.fc.pkl` suffix
+    if not path.name.endswith(".fc.pkl"):
+        path = path.with_name(path.name + ".fc.pkl")
     with open(path, "wb") as fh:
         pickle.dump(self, fh, protocol=pickle.HIGHEST_PROTOCOL)
     return path
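
The old `path.suffixes` check misfired on edge cases: a path with no extension produced an empty `suffixes` list, so the `any()` test was false and the file was saved without the `.fc.pkl` suffix at all. A quick sketch of the new behaviour (file names are illustrative):

```python
from pathlib import Path

def normalize(path: Path) -> Path:
    # Ensure the filename ends with the exact `.fc.pkl` suffix
    if not path.name.endswith(".fc.pkl"):
        path = path.with_name(path.name + ".fc.pkl")
    return path

for name in ["run", "run.pkl", "run.fc.pkl", "v1.2/run"]:
    print(normalize(Path(name)))
# run.fc.pkl  run.pkl.fc.pkl  run.fc.pkl  v1.2/run.fc.pkl
```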

src/datasmith/detection/detect_breakpoints.py

Lines changed: 6 additions & 8 deletions
@@ -92,12 +92,10 @@ def detect_all_breakpoints(summary_df: pd.DataFrame, method: str = "rbf") -> pd.DataFrame:
     if missing := needed - set(summary_df.columns):
         raise ValueError(str(missing))
 
-    breakpoints: pd.DataFrame = (
-        summary_df.groupby("benchmark", sort=False)
-        .apply(detection_method)
-        .dropna()
-        .explode()
-        .apply(pd.Series)
-        .reset_index(drop=True)
-    )
+    detected = summary_df.groupby("benchmark", sort=False).apply(detection_method, include_groups=False).dropna()
+
+    if detected.empty:
+        return pd.DataFrame()
+
+    breakpoints: pd.DataFrame = detected.explode().apply(pd.Series).reset_index(drop=True)
     return breakpoints
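
The guard matters because the old chain assumed at least one group produced a detection, and `include_groups=False` addresses the pandas ≥ 2.2 deprecation about applying to the grouping column. A synthetic sketch of the empty path, with a stub standing in for `detection_method`:

```python
import pandas as pd

def detection_method(group: pd.DataFrame):
    return None  # stub: pretend no breakpoint was found

summary_df = pd.DataFrame({"benchmark": ["b1", "b1", "b2"], "value": [1.0, 2.0, 3.0]})
detected = (
    summary_df.groupby("benchmark", sort=False)
    .apply(detection_method, include_groups=False)
    .dropna()
)
print(detected.empty)  # True -> detect_all_breakpoints now returns pd.DataFrame()
```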
src/datasmith/execution/collect_commits_offline.py

Lines changed: 105 additions & 0 deletions
@@ -0,0 +1,105 @@
+from __future__ import annotations
+
+import os
+import re
+import tempfile
+import urllib.parse
+from pathlib import Path
+
+from git import GitCommandError, Repo
+
+from datasmith import logger
+from datasmith.utils import CACHE_LOCATION, _get_github_metadata, cache_completion
+
+_PR_MERGE_PATTERNS: tuple[re.Pattern[str], ...] = (
+    # standard "Merge pull request #123 ..."
+    re.compile(r"Merge pull request #(\d+)\b"),
+    # squash-merge style "... (#[0-9]+)" on the last line
+    re.compile(r"\(#(\d+)\)"),
+)
+
+
+def _default_branch(repo: Repo) -> str:
+    """
+    Resolve the remote's default branch (origin/HEAD -> "main" / "master" / ...).
+    """
+    try:
+        # “origin/main”
+        full_ref: str = repo.git.symbolic_ref("--quiet", "--short", "refs/remotes/origin/HEAD")
+        return full_ref.split("/", 1)[1]  # keep text after "origin/"
+    except Exception:
+        # Fallback if symbolic-ref is missing (rare).
+        return repo.head.reference.name
+
+
+def _is_pr_merge(message: str) -> bool:
+    """
+    True iff *message* matches one of our PR-closing patterns.
+    """
+    return any(p.search(message) for p in _PR_MERGE_PATTERNS)
+
+
+def _is_public(repo_name: str) -> bool:
+    """
+    Check if a repo is public.
+    """
+    return _get_github_metadata(f"/repos/{repo_name}") is not None
+
+
+@cache_completion(CACHE_LOCATION, "search_commits_offline")
+def search_commits(
+    repo_name: str,
+    query: str,
+    max_pages: int = 100,  # ignored (kept for compatibility)
+    per_page: int = 100,  # ignored (kept for compatibility)
+) -> list[str]:
+    """
+    Return a list of commit SHAs that closed pull requests, **without**
+    calling any GitHub API endpoints. Internally:
+
+    • clones the repo (metadata-only) into a tmp dir
+    • walks the commit history
+    • selects commits whose message looks like a PR merge
+
+    The only element of *query* we still honour is `base=<branch>`.
+    """
+    qs = urllib.parse.parse_qs(query, keep_blank_values=True)
+    base_branch: str | None = qs.get("base", [None])[0]
+
+    with tempfile.TemporaryDirectory(prefix="gh-history-") as workdir:
+        workdir_path = Path(workdir)
+        url = f"https://github.com/{repo_name}.git"
+
+        # Clone *just* the commit / tree metadata (no blobs).
+        clone_kwargs: dict = {
+            "multi_options": ["--filter=tree:0"],
+            "no_checkout": True,
+        }
+        if base_branch:
+            clone_kwargs["branch"] = base_branch
+
+        # ignore if repo is not public
+        try:
+            repo = Repo.clone_from(
+                url,
+                workdir_path,
+                env={"GIT_TERMINAL_PROMPT": "0", **os.environ},
+                **clone_kwargs,
+            )
+        except GitCommandError as e:
+            if e.status == 128:
+                msg = e.stderr.strip() or "authentication failed or repository not found"
+                logger.warning("Cannot clone %s: %s", url, msg)
+                return []
+            raise
+
+        # Figure out which ref to walk.
+        branch = base_branch or _default_branch(repo)
+        ref_to_walk = f"origin/{branch}"
+
+        merge_shas: set[str] = set()
+        for commit in repo.iter_commits(ref_to_walk):
+            if _is_pr_merge(str(commit.message)):
+                merge_shas.add(commit.hexsha)
+
+    return sorted(merge_shas)
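
The two patterns in `_PR_MERGE_PATTERNS` cover GitHub's merge-commit and squash-merge subject conventions; anything else is treated as a non-PR commit. A quick check on made-up messages:

```python
import re

_PR_MERGE_PATTERNS = (
    re.compile(r"Merge pull request #(\d+)\b"),
    re.compile(r"\(#(\d+)\)"),
)

def _is_pr_merge(message: str) -> bool:
    return any(p.search(message) for p in _PR_MERGE_PATTERNS)

for msg in (
    "Merge pull request #42 from user/feature",
    "ENH: speed up groupby (#1337)",
    "Fix typo in docs",
):
    print(f"{msg!r}: {_is_pr_merge(msg)}")
# -> True, True, False
```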

src/datasmith/execution/utils.py

Lines changed: 46 additions & 1 deletion
@@ -1,7 +1,10 @@
+from typing import Any
+
+from git import BadName, GitCommandError, Repo
 from requests.exceptions import HTTPError
 
 from datasmith.logging_config import get_logger
-from datasmith.utils import _get_github_metadata
+from datasmith.utils import CACHE_LOCATION, _get_github_metadata, cache_completion
 
 logger = get_logger("execution.utils")
 
@@ -47,6 +50,48 @@ def _get_commit_info(repo_name: str, commit_sha: str) -> dict:
     }
 
 
+@cache_completion(CACHE_LOCATION, "get_commit_info_offline")
+def _get_commit_info_offline(repo: Repo, commit_sha: str) -> dict[str, Any]:
+    """
+    Return commit metadata and diff stats *without* the GitHub REST API.
+
+    The function creates a temporary **treeless** clone
+    (`git clone --filter=tree:0 …`) so it transfers only commit objects.
+    When we later call `commit.stats`, Git will lazily grab just the blobs
+    needed to compute line-level stats - still far cheaper than an API call.
+    """
+    try:
+        commit = repo.commit(commit_sha)
+
+    except (BadName, ValueError):
+        logger.exception("Maybe commit not found: %s", commit_sha)
+        repo.git.fetch("--no-filter", "--quiet", "origin", commit_sha)
+        commit = repo.commit(commit_sha)  # retry after fetching
+    except GitCommandError:
+        logger.exception("Error fetching commit info: %s", commit_sha)
+        return {
+            "sha": commit_sha,
+            "date": None,
+            "message": None,
+            "total_additions": 0,
+            "total_deletions": 0,
+            "total_files_changed": 0,
+            "files_changed": "",
+        }
+
+    stats = commit.stats
+
+    return {
+        "sha": commit.hexsha,
+        "date": commit.committed_datetime.isoformat(),
+        "message": commit.message,
+        "total_additions": stats.total["insertions"],
+        "total_deletions": stats.total["deletions"],
+        "total_files_changed": stats.total["files"],
+        "files_changed": "\n".join(str(k) for k in stats.files),
+    }
+
+
 def find_file_in_tree(repo: str, filename: str, branch: str | None = None) -> list[str] | None:
     if branch is None:
         repo_info = _get_github_metadata(endpoint=f"/repos/{repo}")
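
A minimal end-to-end sketch of calling `_get_commit_info_offline` against a partial clone (the URL is illustrative, and the `cache_completion` wrapper is assumed to accept a `Repo` argument; reading `commit.stats` triggers lazy blob fetches, so network access is required):

```python
import tempfile

from git import Repo

from datasmith.execution.utils import _get_commit_info_offline

with tempfile.TemporaryDirectory() as td:
    repo = Repo.clone_from(
        "https://github.com/airspeed-velocity/asv.git",
        td,
        no_checkout=True,
        multi_options=["--filter=tree:0"],  # commit objects only
    )
    sha = repo.commit("HEAD").hexsha
    info = _get_commit_info_offline(repo, sha)
    print(info["sha"][:8], info["total_files_changed"], "files changed")
```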

src/datasmith/logging_config.py

Lines changed: 2 additions & 2 deletions
@@ -8,14 +8,14 @@
 
 import logging
 import sys
-from typing import Optional
+from typing import Optional, TextIO
 
 
 def configure_logging(
     level: int = logging.INFO,
     format_string: Optional[str] = None,
     date_format: str = "%H:%M:%S",
-    stream: Optional[object] = None,
+    stream: Optional[TextIO] = None,
 ) -> logging.Logger:
     """
     Configure logging for the datasmith package.
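
Narrowing `stream` from `object` to `TextIO` lets type checkers reject non-file-like arguments while leaving call sites unchanged. A small usage sketch:

```python
import logging
import sys

from datasmith.logging_config import configure_logging

# `stream` must now be a TextIO such as sys.stderr, not an arbitrary object.
logger = configure_logging(level=logging.DEBUG, stream=sys.stderr)
logger.debug("log output goes to stderr")
```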

src/datasmith/scrape/scrape_dashboards.py

Lines changed: 2 additions & 2 deletions
@@ -93,7 +93,7 @@ def make_benchmark_from_html(base_url: str, html_dir: str, force: bool) -> BenchmarkCollection:
         df["date"] = df["revision"].astype(str).map(index_data["revision_to_date"])
         frames.append(df)
 
-    all_benchmarks = pd.concat(frames, ignore_index=True)
+    all_benchmarks = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
     logger.info("Collected %s rows from %s benchmark files.", f"{len(all_benchmarks):,}", f"{len(frames):,}")
 
     all_summaries = []
@@ -114,7 +114,7 @@ def make_benchmark_from_html(base_url: str, html_dir: str, force: bool) -> BenchmarkCollection:
         df["benchmark"] = benchmark_name
         all_summaries.append(df)
 
-    all_summaries_df = pd.concat(all_summaries, ignore_index=True)
+    all_summaries_df = pd.concat(all_summaries, ignore_index=True) if all_summaries else pd.DataFrame()
 
     collection = BenchmarkCollection(
         base_url=base_url,
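
The `if frames else pd.DataFrame()` guards are needed because `pd.concat` raises on an empty list instead of returning an empty frame. A two-line demonstration:

```python
import pandas as pd

frames: list[pd.DataFrame] = []
try:
    pd.concat(frames, ignore_index=True)
except ValueError as exc:
    print(exc)  # "No objects to concatenate"

all_benchmarks = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
print(len(all_benchmarks))  # 0
```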
