Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -146,19 +146,19 @@ The scraper can be run using the following command:
```bash
$ python scripts/scrape_repositories.py \
--outfile artifacts/raw/repos_discovered.csv \
--min-stars 500 \
--filtered-outfile artifacts/raw/repos_valid.csv
# Writes artifacts/raw/repos_discovered.csv and artifacts/raw/repos_valid.csv
```

The `artifacts/raw/repos_valid.csv` file contains a subset of the repositories that aren't forks / reuploads / pass other sanity checks. We found ~700 filtered repositories for this dataset.
The `artifacts/raw/repos_valid.csv` file contains a subset of the repositories that aren't forks / reuploads / have at least 500 stars / pass other sanity checks. We found ~700 filtered repositories for this dataset.


### 4. Collect relevant commits for all repositories

Given the list of repositories, we find the subset of commits that have already been closed and merged into the main branch (the top 5000 PRs, sorted by popularity). We use the `collect_commits.py` script to do this. The `filter_commits.py` script then filters out those commits that primarily modified the benchmarking files (e.g. `asv.conf.json`) or were not relevant to the benchmarks (e.g. documentation changes). The script also limits the number of repositories to a maximum of 350 to ensure we don't burden the GitHub API with too many requests. The scripts can be run as follows:

```bash
# 50 pages * 100 (PRs per page) = 5000 PRs max per repo.
$ python scripts/collect_commits.py \
--dashboards artifacts/raw/repos_valid.csv \
--outfile artifacts/raw/commits_all.jsonl \
Expand Down
3 changes: 2 additions & 1 deletion scripts/collect_commits.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@

import pandas as pd

from datasmith.execution.collect_commits import search_commits
# from datasmith.execution.collect_commits import search_commits
from datasmith.execution.collect_commits_offline import search_commits
from datasmith.logging_config import configure_logging

# Configure logging for the script
Expand Down
57 changes: 42 additions & 15 deletions scripts/filter_commits.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,15 @@
import argparse
import json
import re
import tempfile
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from pathlib import Path

import pandas as pd
from git import Repo
from tqdm.auto import tqdm

from datasmith.execution.utils import _get_commit_info, find_file_in_tree
from datasmith.execution.utils import _get_commit_info_offline, find_file_in_tree
from datasmith.logging_config import configure_logging

# Configure logging for the script
Expand Down Expand Up @@ -37,10 +39,11 @@ def _asv_conf_worker(repo_name: str) -> str | None:
return find_file_in_tree(repo_name, "asv.conf.json")


def _commit_info_worker(arg_tuple) -> dict | None:
def _commit_info_worker(arg_tuple: tuple[Repo, str]) -> dict | None:
    """Wrapper for ProcessPool: arg_tuple = (repo, sha).

    ``repo`` is an already-cloned ``git.Repo`` handle and ``sha`` is the commit
    to inspect; delegates to the offline (no GitHub API) metadata reader.
    """
    repo, sha = arg_tuple
    return _get_commit_info_offline(repo, sha)


NON_CORE_PATTERNS = re.compile(
Expand Down Expand Up @@ -107,27 +110,51 @@ def main() -> None:
commits = commits.merge(benchmarks, how="right", on="repo_name")
commits = commits.dropna(subset=["commit_sha"])

with ProcessPoolExecutor(max_workers=args.procs) as pp:
commits["commit_info"] = list(
tqdm(
pp.map(_commit_info_worker, commits[["repo_name", "commit_sha"]].itertuples(index=False, name=None)),
total=len(commits),
desc="Fetching commit metadata",
all_repo_names = set(commits["repo_name"])

# download all repos to a temp dir
with tempfile.TemporaryDirectory(prefix="gh-repos-") as td:
all_repos = {}
for repo_name in tqdm(all_repo_names, desc="Cloning repos"):
repo_name = repo_name.strip("/")
owner, name = repo_name.split("/", 1)
path = Path(td) / f"{owner}__{name}.git"
repo = Repo.clone_from(
f"https://github.com/{repo_name}.git",
path,
bare=True,
# multi_options=["--filter=tree:0"],
multi_options=["--filter=blob:none"],
quiet=True,
)
all_repos[repo_name] = repo

commit_info_args: list[tuple[Repo, str]] = []
for repo_name, commit_sha in commits[["repo_name", "commit_sha"]].itertuples(index=False, name=None):
repo = all_repos[repo_name]
commit_info_args.append((repo, commit_sha))

with ProcessPoolExecutor(max_workers=args.procs) as pp:
commits["commit_info"] = list(
tqdm(
pp.map(_commit_info_worker, commit_info_args),
total=len(commits),
desc="Fetching commit metadata",
)
)
)

commit_meta = pd.json_normalize(commits.pop("commit_info"))
commits = pd.concat([commits, commit_meta], axis=1)
commits = commits.dropna(subset=["asv_conf_path", "sha", "date", "message"])
commits = commits[commits["files_changed"].apply(has_core_file)].reset_index(drop=True)
commit_meta = pd.json_normalize(commits.pop("commit_info"))
commits = pd.concat([commits, commit_meta], axis=1)
commits = commits.dropna(subset=["asv_conf_path", "sha", "date", "message"])
commits = commits[commits["files_changed"].apply(has_core_file)].reset_index(drop=True)

out_path = Path(args.output_pth)
if not out_path.parent.exists():
out_path.parent.mkdir(parents=True, exist_ok=True)
# commits.to_csv(out_path, index=False)
commits.to_json(out_path, orient="records", lines=True, index=False)

logger.info(f"✔ Wrote {len(commits):,} rows → {out_path}")
logger.info("✔ Wrote %s rows → %s", len(commits), out_path)


if __name__ == "__main__":
Expand Down
2 changes: 2 additions & 0 deletions scripts/scrape_repositories.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ def parse_args() -> argparse.Namespace:
default=0.3,
help="Random extra delay (0-JITTER's) after each call",
)
p.add_argument("--min-stars", type=int, default=500, help="Minimum number of stars to consider a repository")
return p.parse_args()


Expand Down Expand Up @@ -83,6 +84,7 @@ def main() -> None:
filtered_df = filter_dashboards(df, url_col="url")
# remove airspeed-velocity/asv
filtered_df = filtered_df[filtered_df.repo_name != "airspeed-velocity/asv"]
filtered_df = filtered_df[filtered_df.stars >= args.min_stars]
if filtered_df.empty:
raise ValueError("No dashboards found in the repositories.") # noqa: TRY003

Expand Down
5 changes: 3 additions & 2 deletions src/datasmith/benchmark/collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,9 @@ def save(self, path: str | Path) -> Path:
"""
self.modified_at = datetime.now(timezone.utc)
path = Path(path)
if any(suffix not in [".fc", ".pkl"] for suffix in path.suffixes):
path = path.with_suffix(".fc.pkl")
# Ensure the filename ends with the exact `.fc.pkl` suffix
if not path.name.endswith(".fc.pkl"):
path = path.with_name(path.name + ".fc.pkl")
with open(path, "wb") as fh:
pickle.dump(self, fh, protocol=pickle.HIGHEST_PROTOCOL)
return path
Expand Down
14 changes: 6 additions & 8 deletions src/datasmith/detection/detect_breakpoints.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,12 +92,10 @@ def detect_all_breakpoints(summary_df: pd.DataFrame, method: str = "rbf") -> pd.
if missing := needed - set(summary_df.columns):
raise ValueError(str(missing))

breakpoints: pd.DataFrame = (
summary_df.groupby("benchmark", sort=False)
.apply(detection_method)
.dropna()
.explode()
.apply(pd.Series)
.reset_index(drop=True)
)
detected = summary_df.groupby("benchmark", sort=False).apply(detection_method, include_groups=False).dropna()

if detected.empty:
return pd.DataFrame()

breakpoints: pd.DataFrame = detected.explode().apply(pd.Series).reset_index(drop=True)
return breakpoints
105 changes: 105 additions & 0 deletions src/datasmith/execution/collect_commits_offline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
from __future__ import annotations

import os
import re
import tempfile
import urllib.parse
from pathlib import Path

from git import GitCommandError, Repo

from datasmith import logger
from datasmith.utils import CACHE_LOCATION, _get_github_metadata, cache_completion

_PR_MERGE_PATTERNS: tuple[re.Pattern[str], ...] = (
    # standard "Merge pull request #123 ..."
    re.compile(r"Merge pull request #(\d+)\b"),
    # squash-merge style "... (#123)" — matched anywhere in the message (the
    # caller uses .search() on the full text, not only the last line)
    re.compile(r"\(#(\d+)\)"),
)


def _default_branch(repo: Repo) -> str:
    """Return the name of origin's default branch ("main", "master", ...)."""
    try:
        # `git symbolic-ref --quiet --short refs/remotes/origin/HEAD`
        # yields e.g. "origin/main".
        remote_ref: str = repo.git.symbolic_ref("--quiet", "--short", "refs/remotes/origin/HEAD")
        # Drop the leading "origin/" remote prefix and return the branch name.
        _, branch = remote_ref.split("/", 1)
        return branch
    except Exception:
        # origin/HEAD may be unset (rare) — fall back to the local HEAD's branch.
        return repo.head.reference.name


def _is_pr_merge(message: str) -> bool:
    """
    Return True when *message* matches one of our PR-closing patterns.
    """
    for pattern in _PR_MERGE_PATTERNS:
        if pattern.search(message):
            return True
    return False


def _is_public(repo_name: str) -> bool:
    """
    Check if a repo is public.
    """
    # The metadata lookup yields None for private / missing repositories.
    metadata = _get_github_metadata(f"/repos/{repo_name}")
    return metadata is not None


@cache_completion(CACHE_LOCATION, "search_commits_offline")
def search_commits(
    repo_name: str,
    query: str,
    max_pages: int = 100,  # ignored (kept for compatibility)
    per_page: int = 100,  # ignored (kept for compatibility)
) -> list[str]:
    """
    Return a sorted list of commit SHAs that closed pull requests, **without**
    calling any GitHub API endpoints. Internally:

    • clones the repo (metadata-only) into a tmp dir
    • walks the commit history
    • selects commits whose message looks like a PR merge

    The only element of *query* we still honour is `base=<branch>`.

    Parameters
    ----------
    repo_name : "<owner>/<name>" GitHub slug.
    query     : URL-style query string; only ``base=<branch>`` is used.
    max_pages, per_page : unused; retained so callers of the API-backed
        implementation need not change.

    Returns
    -------
    Sorted list of merge-commit SHAs, or ``[]`` when the repo cannot be cloned
    (private, deleted, or auth failure).
    """
    qs = urllib.parse.parse_qs(query, keep_blank_values=True)
    base_branch: str | None = qs.get("base", [None])[0]

    with tempfile.TemporaryDirectory(prefix="gh-history-") as workdir:
        workdir_path = Path(workdir)
        url = f"https://github.com/{repo_name}.git"

        # Clone *just* the commit / tree metadata (no blobs).
        clone_kwargs: dict = {
            "multi_options": ["--filter=tree:0"],
            "no_checkout": True,
        }
        if base_branch:
            clone_kwargs["branch"] = base_branch

        # ignore if repo is not public
        try:
            repo = Repo.clone_from(
                url,
                workdir_path,
                # Put the override AFTER the unpack so it always wins:
                # with the old {"GIT_TERMINAL_PROMPT": "0", **os.environ}
                # ordering, a pre-existing GIT_TERMINAL_PROMPT in the
                # environment silently re-enabled interactive prompts.
                env={**os.environ, "GIT_TERMINAL_PROMPT": "0"},
                **clone_kwargs,
            )
        except GitCommandError as e:
            # git exits with 128 for "not found" / auth failures — treat as
            # "repo unavailable" and return no commits; anything else is a bug.
            if e.status == 128:
                msg = e.stderr.strip() or "authentication failed or repository not found"
                logger.warning("Cannot clone %s: %s", url, msg)
                return []
            raise

        # Figure out which ref to walk (requested base, else origin's default).
        branch = base_branch or _default_branch(repo)
        ref_to_walk = f"origin/{branch}"

        # Collect unique SHAs whose message looks like a PR merge.
        merge_shas: set[str] = set()
        for commit in repo.iter_commits(ref_to_walk):
            if _is_pr_merge(str(commit.message)):
                merge_shas.add(commit.hexsha)

        return sorted(merge_shas)
47 changes: 46 additions & 1 deletion src/datasmith/execution/utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
from typing import Any

from git import BadName, GitCommandError, Repo
from requests.exceptions import HTTPError

from datasmith.logging_config import get_logger
from datasmith.utils import _get_github_metadata
from datasmith.utils import CACHE_LOCATION, _get_github_metadata, cache_completion

logger = get_logger("execution.utils")

Expand Down Expand Up @@ -47,6 +50,48 @@ def _get_commit_info(repo_name: str, commit_sha: str) -> dict:
}


@cache_completion(CACHE_LOCATION, "get_commit_info_offline")
def _get_commit_info_offline(repo: Repo, commit_sha: str) -> dict[str, Any]:
    """
    Return commit metadata and diff stats *without* the GitHub REST API.

    Operates on an already-cloned (typically blob-less / treeless) *repo*.
    When we call ``commit.stats``, Git will lazily grab just the blobs
    needed to compute line-level stats - still far cheaper than an API call.

    Returns a dict with keys ``sha``, ``date``, ``message``,
    ``total_additions``, ``total_deletions``, ``total_files_changed`` and
    ``files_changed`` (newline-joined paths). On failure the input SHA is
    echoed back with None / 0 / "" placeholders.
    """
    # NOTE(review): the cache decorator receives a `Repo` object as an
    # argument — confirm cache_completion keys non-primitive args sensibly.
    failure: dict[str, Any] = {
        "sha": commit_sha,
        "date": None,
        "message": None,
        "total_additions": 0,
        "total_deletions": 0,
        "total_files_changed": 0,
        "files_changed": "",
    }
    try:
        commit = repo.commit(commit_sha)

    except (BadName, ValueError):
        # Partial clones may lack the object; fetch it explicitly and retry.
        logger.exception("Maybe commit not found: %s", commit_sha)
        try:
            repo.git.fetch("--no-filter", "--quiet", "origin", commit_sha)
            commit = repo.commit(commit_sha)  # retry after fetching
        except (GitCommandError, BadName, ValueError):
            # Previously a failed fetch/retry propagated out of the worker;
            # degrade to the placeholder payload like the handler below.
            logger.exception("Retry after fetch failed for commit: %s", commit_sha)
            return failure
    except GitCommandError:
        logger.exception("Error fetching commit info: %s", commit_sha)
        return failure

    stats = commit.stats

    return {
        "sha": commit.hexsha,
        "date": commit.committed_datetime.isoformat(),
        "message": commit.message,
        "total_additions": stats.total["insertions"],
        "total_deletions": stats.total["deletions"],
        "total_files_changed": stats.total["files"],
        "files_changed": "\n".join(str(k) for k in stats.files),
    }


def find_file_in_tree(repo: str, filename: str, branch: str | None = None) -> list[str] | None:
if branch is None:
repo_info = _get_github_metadata(endpoint=f"/repos/{repo}")
Expand Down
4 changes: 2 additions & 2 deletions src/datasmith/logging_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,14 @@

import logging
import sys
from typing import Optional
from typing import Optional, TextIO


def configure_logging(
level: int = logging.INFO,
format_string: Optional[str] = None,
date_format: str = "%H:%M:%S",
stream: Optional[object] = None,
stream: Optional[TextIO] = None,
) -> logging.Logger:
"""
Configure logging for the datasmith package.
Expand Down
4 changes: 2 additions & 2 deletions src/datasmith/scrape/scrape_dashboards.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ def make_benchmark_from_html(base_url: str, html_dir: str, force: bool) -> Bench
df["date"] = df["revision"].astype(str).map(index_data["revision_to_date"])
frames.append(df)

all_benchmarks = pd.concat(frames, ignore_index=True)
all_benchmarks = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
logger.info("Collected %s rows from %s benchmark files.", f"{len(all_benchmarks):,}", f"{len(frames):,}")

all_summaries = []
Expand All @@ -114,7 +114,7 @@ def make_benchmark_from_html(base_url: str, html_dir: str, force: bool) -> Bench
df["benchmark"] = benchmark_name
all_summaries.append(df)

all_summaries_df = pd.concat(all_summaries, ignore_index=True)
all_summaries_df = pd.concat(all_summaries, ignore_index=True) if all_summaries else pd.DataFrame()

collection = BenchmarkCollection(
base_url=base_url,
Expand Down
Loading
Loading