
Commit 87a62fd

Merge pull request #1 from formula-code/offline_commits
add offline commit scraping
2 parents 054a316 + b7cbfea commit 87a62fd

12 files changed

Lines changed: 729 additions & 33 deletions

README.md

Lines changed: 2 additions & 2 deletions
@@ -146,19 +146,19 @@ The scraper can be run using the following command:
 ```bash
 $ python scripts/scrape_repositories.py \
     --outfile artifacts/raw/repos_discovered.csv \
+    --min-stars 500 \
     --filtered-outfile artifacts/raw/repos_valid.csv
 # Writes artifacts/raw/repos_discovered.csv and artifacts/raw/repos_valid.csv
 ```
 
-The `artifacts/raw/repos_valid.csv` file contains a subset of the repositories that aren't forks / reuploads / pass other sanity checks. We found ~700 filtered repositories for this dataset.
+The `artifacts/raw/repos_valid.csv` file contains the subset of repositories that aren't forks or reuploads, have at least 500 stars, and pass other sanity checks. We found ~700 filtered repositories for this dataset.
 
 
 ### 4. Collect relevant commits for all repositories
 
 Given the list of repositories, we find the subset of commits that have already been closed and merged into the main branch (the top 5000 PRs, sorted by popularity). We use the `collect_commits.py` script to do this. The `filter_commits.py` script then filters out those commits that primarily modified the benchmarking files (e.g. `asv.conf.json`) or were not relevant to the benchmarks (e.g. documentation changes). The script also limits the number of repositories to a maximum of 350 to ensure we don't burden the GitHub API with too many requests. The scripts can be run as follows:
 
 ```bash
-# 50 pages * 100 (PRs per page) = 5000 PRs max per repo.
 $ python scripts/collect_commits.py \
     --dashboards artifacts/raw/repos_valid.csv \
     --outfile artifacts/raw/commits_all.jsonl \

scripts/collect_commits.py

Lines changed: 2 additions & 1 deletion
@@ -2,7 +2,8 @@
 
 import pandas as pd
 
-from datasmith.execution.collect_commits import search_commits
+# from datasmith.execution.collect_commits import search_commits
+from datasmith.execution.collect_commits_offline import search_commits
 from datasmith.logging_config import configure_logging
 
 # Configure logging for the script
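
The offline `search_commits` keeps the signature of the API-backed version (including the now-ignored `max_pages`/`per_page`), so the script only needed this import swap. A minimal usage sketch, assuming the module layout added in this PR; the repository name and query string are illustrative, and the first call clones the repository's history, so it needs network access:

```python
from datasmith.execution.collect_commits_offline import search_commits

# `base=<branch>` is the only part of the query string the offline
# implementation honours; everything else is parsed but ignored.
shas = search_commits("numpy/numpy", query="base=main&state=closed")
print(f"{len(shas)} PR-merge commits found")
```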

scripts/filter_commits.py

Lines changed: 42 additions & 15 deletions
@@ -3,13 +3,15 @@
 import argparse
 import json
 import re
+import tempfile
 from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
 from pathlib import Path
 
 import pandas as pd
+from git import Repo
 from tqdm.auto import tqdm
 
-from datasmith.execution.utils import _get_commit_info, find_file_in_tree
+from datasmith.execution.utils import _get_commit_info_offline, find_file_in_tree
 from datasmith.logging_config import configure_logging
 
 # Configure logging for the script
@@ -37,10 +39,11 @@ def _asv_conf_worker(repo_name: str) -> str | None:
     return find_file_in_tree(repo_name, "asv.conf.json")
 
 
-def _commit_info_worker(arg_tuple) -> dict | None:
+def _commit_info_worker(arg_tuple: tuple[Repo, str]) -> dict | None:
     """Wrapper for ProcessPool: arg_tuple = (repo_name, sha)."""
     repo, sha = arg_tuple
-    return _get_commit_info(repo, sha)
+    # return _get_commit_info(repo, sha)
+    return _get_commit_info_offline(repo, sha)
 
 
 NON_CORE_PATTERNS = re.compile(
@@ -107,27 +110,51 @@ def main() -> None:
     commits = commits.merge(benchmarks, how="right", on="repo_name")
     commits = commits.dropna(subset=["commit_sha"])
 
-    with ProcessPoolExecutor(max_workers=args.procs) as pp:
-        commits["commit_info"] = list(
-            tqdm(
-                pp.map(_commit_info_worker, commits[["repo_name", "commit_sha"]].itertuples(index=False, name=None)),
-                total=len(commits),
-                desc="Fetching commit metadata",
+    all_repo_names = set(commits["repo_name"])
+
+    # download all repos to a temp dir
+    with tempfile.TemporaryDirectory(prefix="gh-repos-") as td:
+        all_repos = {}
+        for repo_name in tqdm(all_repo_names, desc="Cloning repos"):
+            repo_name = repo_name.strip("/")
+            owner, name = repo_name.split("/", 1)
+            path = Path(td) / f"{owner}__{name}.git"
+            repo = Repo.clone_from(
+                f"https://github.com/{repo_name}.git",
+                path,
+                bare=True,
+                # multi_options=["--filter=tree:0"],
+                multi_options=["--filter=blob:none"],
+                quiet=True,
+            )
+            all_repos[repo_name] = repo
+
+        commit_info_args: list[tuple[Repo, str]] = []
+        for repo_name, commit_sha in commits[["repo_name", "commit_sha"]].itertuples(index=False, name=None):
+            repo = all_repos[repo_name]
+            commit_info_args.append((repo, commit_sha))
+
+        with ProcessPoolExecutor(max_workers=args.procs) as pp:
+            commits["commit_info"] = list(
+                tqdm(
+                    pp.map(_commit_info_worker, commit_info_args),
+                    total=len(commits),
+                    desc="Fetching commit metadata",
+                )
             )
-        )
 
-    commit_meta = pd.json_normalize(commits.pop("commit_info"))
-    commits = pd.concat([commits, commit_meta], axis=1)
-    commits = commits.dropna(subset=["asv_conf_path", "sha", "date", "message"])
-    commits = commits[commits["files_changed"].apply(has_core_file)].reset_index(drop=True)
+        commit_meta = pd.json_normalize(commits.pop("commit_info"))
+        commits = pd.concat([commits, commit_meta], axis=1)
+        commits = commits.dropna(subset=["asv_conf_path", "sha", "date", "message"])
+        commits = commits[commits["files_changed"].apply(has_core_file)].reset_index(drop=True)
 
     out_path = Path(args.output_pth)
     if not out_path.parent.exists():
         out_path.parent.mkdir(parents=True, exist_ok=True)
     # commits.to_csv(out_path, index=False)
     commits.to_json(out_path, orient="records", lines=True, index=False)
 
-    logger.info(f"✔ Wrote {len(commits):,} rows → {out_path}")
+    logger.info("✔ Wrote %s rows → %s", len(commits), out_path)
 
 
 if __name__ == "__main__":
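
The rewritten `main()` replaces per-commit GitHub API calls with bare, blob-less clones shared across workers. For context, a minimal sketch of the same partial-clone pattern using GitPython directly (the repository URL is illustrative): `--filter=blob:none` transfers commits and trees up front and fetches file contents only when they are first touched.

```python
import tempfile
from pathlib import Path

from git import Repo

with tempfile.TemporaryDirectory(prefix="gh-repos-") as td:
    repo = Repo.clone_from(
        "https://github.com/airspeed-velocity/asv.git",
        Path(td) / "asv.git",
        bare=True,  # history only; no working tree is needed
        multi_options=["--filter=blob:none"],  # blobs are fetched lazily
    )
    head = repo.commit("HEAD")
    print(head.hexsha, head.summary)
```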

scripts/scrape_repositories.py

Lines changed: 2 additions & 0 deletions
@@ -56,6 +56,7 @@ def parse_args() -> argparse.Namespace:
         default=0.3,
         help="Random extra delay (0-JITTER's) after each call",
     )
+    p.add_argument("--min-stars", type=int, default=500, help="Minimum number of stars to consider a repository")
     return p.parse_args()
 
 
@@ -83,6 +84,7 @@ def main() -> None:
     filtered_df = filter_dashboards(df, url_col="url")
     # remove airspeed-velocity/asv
     filtered_df = filtered_df[filtered_df.repo_name != "airspeed-velocity/asv"]
+    filtered_df = filtered_df[filtered_df.stars >= args.min_stars]
     if filtered_df.empty:
         raise ValueError("No dashboards found in the repositories.")  # noqa: TRY003
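
The new flag feeds a one-line pandas filter. A toy demonstration of the same predicate (column names match the script; the data is made up):

```python
import pandas as pd

filtered_df = pd.DataFrame(
    {"repo_name": ["a/small", "b/popular"], "stars": [120, 950]}
)
min_stars = 500  # the --min-stars default
filtered_df = filtered_df[filtered_df.stars >= min_stars]
print(filtered_df)  # only b/popular survives
```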

src/datasmith/benchmark/collection.py

Lines changed: 3 additions & 2 deletions
@@ -76,8 +76,9 @@ def save(self, path: str | Path) -> Path:
     """
     self.modified_at = datetime.now(timezone.utc)
     path = Path(path)
-    if any(suffix not in [".fc", ".pkl"] for suffix in path.suffixes):
-        path = path.with_suffix(".fc.pkl")
+    # Ensure the filename ends with the exact `.fc.pkl` suffix
+    if not path.name.endswith(".fc.pkl"):
+        path = path.with_name(path.name + ".fc.pkl")
     with open(path, "wb") as fh:
         pickle.dump(self, fh, protocol=pickle.HIGHEST_PROTOCOL)
     return path
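
The old `path.suffixes` check misfired on edge cases: a path with no extension produced an empty `suffixes` list, so the `any()` test was false and the file was saved without the `.fc.pkl` suffix at all. A quick sketch of the new behaviour (file names are illustrative):

```python
from pathlib import Path

def normalize(path: Path) -> Path:
    # Ensure the filename ends with the exact `.fc.pkl` suffix
    if not path.name.endswith(".fc.pkl"):
        path = path.with_name(path.name + ".fc.pkl")
    return path

for name in ["run", "run.pkl", "run.fc.pkl", "v1.2/run"]:
    print(normalize(Path(name)))
# run.fc.pkl  run.pkl.fc.pkl  run.fc.pkl  v1.2/run.fc.pkl
```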

src/datasmith/detection/detect_breakpoints.py

Lines changed: 6 additions & 8 deletions
@@ -92,12 +92,10 @@ def detect_all_breakpoints(summary_df: pd.DataFrame, method: str = "rbf") -> pd.DataFrame:
     if missing := needed - set(summary_df.columns):
         raise ValueError(str(missing))
 
-    breakpoints: pd.DataFrame = (
-        summary_df.groupby("benchmark", sort=False)
-        .apply(detection_method)
-        .dropna()
-        .explode()
-        .apply(pd.Series)
-        .reset_index(drop=True)
-    )
+    detected = summary_df.groupby("benchmark", sort=False).apply(detection_method, include_groups=False).dropna()
+
+    if detected.empty:
+        return pd.DataFrame()
+
+    breakpoints: pd.DataFrame = detected.explode().apply(pd.Series).reset_index(drop=True)
     return breakpoints
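
The guard matters because the old chain assumed at least one group produced a detection, and `include_groups=False` addresses the pandas ≥ 2.2 deprecation about applying to the grouping column. A synthetic sketch of the empty path, with a stub standing in for `detection_method`:

```python
import pandas as pd

def detection_method(group: pd.DataFrame):
    return None  # stub: pretend no breakpoint was found

summary_df = pd.DataFrame({"benchmark": ["b1", "b1", "b2"], "value": [1.0, 2.0, 3.0]})
detected = (
    summary_df.groupby("benchmark", sort=False)
    .apply(detection_method, include_groups=False)
    .dropna()
)
print(detected.empty)  # True -> detect_all_breakpoints now returns pd.DataFrame()
```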
src/datasmith/execution/collect_commits_offline.py

Lines changed: 105 additions & 0 deletions
@@ -0,0 +1,105 @@
+from __future__ import annotations
+
+import os
+import re
+import tempfile
+import urllib.parse
+from pathlib import Path
+
+from git import GitCommandError, Repo
+
+from datasmith import logger
+from datasmith.utils import CACHE_LOCATION, _get_github_metadata, cache_completion
+
+_PR_MERGE_PATTERNS: tuple[re.Pattern[str], ...] = (
+    # standard "Merge pull request #123 ..."
+    re.compile(r"Merge pull request #(\d+)\b"),
+    # squash-merge style "... (#[0-9]+)" on the last line
+    re.compile(r"\(#(\d+)\)"),
+)
+
+
+def _default_branch(repo: Repo) -> str:
+    """
+    Resolve the remote's default branch (origin/HEAD -> "main" / "master" / ...).
+    """
+    try:
+        # “origin/main”
+        full_ref: str = repo.git.symbolic_ref("--quiet", "--short", "refs/remotes/origin/HEAD")
+        return full_ref.split("/", 1)[1]  # keep text after "origin/"
+    except Exception:
+        # Fallback if symbolic-ref is missing (rare).
+        return repo.head.reference.name
+
+
+def _is_pr_merge(message: str) -> bool:
+    """
+    True iff *message* matches one of our PR-closing patterns.
+    """
+    return any(p.search(message) for p in _PR_MERGE_PATTERNS)
+
+
+def _is_public(repo_name: str) -> bool:
+    """
+    Check if a repo is public.
+    """
+    return _get_github_metadata(f"/repos/{repo_name}") is not None
+
+
+@cache_completion(CACHE_LOCATION, "search_commits_offline")
+def search_commits(
+    repo_name: str,
+    query: str,
+    max_pages: int = 100,  # ignored (kept for compatibility)
+    per_page: int = 100,  # ignored (kept for compatibility)
+) -> list[str]:
+    """
+    Return a list of commit SHAs that closed pull requests, **without**
+    calling any GitHub API endpoints. Internally:
+
+    • clones the repo (metadata-only) into a tmp dir
+    • walks the commit history
+    • selects commits whose message looks like a PR merge
+
+    The only element of *query* we still honour is `base=<branch>`.
+    """
+    qs = urllib.parse.parse_qs(query, keep_blank_values=True)
+    base_branch: str | None = qs.get("base", [None])[0]
+
+    with tempfile.TemporaryDirectory(prefix="gh-history-") as workdir:
+        workdir_path = Path(workdir)
+        url = f"https://github.com/{repo_name}.git"
+
+        # Clone *just* the commit / tree metadata (no blobs).
+        clone_kwargs: dict = {
+            "multi_options": ["--filter=tree:0"],
+            "no_checkout": True,
+        }
+        if base_branch:
+            clone_kwargs["branch"] = base_branch
+
+        # ignore if repo is not public
+        try:
+            repo = Repo.clone_from(
+                url,
+                workdir_path,
+                env={"GIT_TERMINAL_PROMPT": "0", **os.environ},
+                **clone_kwargs,
+            )
+        except GitCommandError as e:
+            if e.status == 128:
+                msg = e.stderr.strip() or "authentication failed or repository not found"
+                logger.warning("Cannot clone %s: %s", url, msg)
+                return []
+            raise
+
+        # Figure out which ref to walk.
+        branch = base_branch or _default_branch(repo)
+        ref_to_walk = f"origin/{branch}"
+
+        merge_shas: set[str] = set()
+        for commit in repo.iter_commits(ref_to_walk):
+            if _is_pr_merge(str(commit.message)):
+                merge_shas.add(commit.hexsha)
+
+    return sorted(merge_shas)
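
The two patterns in `_PR_MERGE_PATTERNS` cover GitHub's merge-commit and squash-merge subject conventions; anything else is treated as a non-PR commit. A quick check on made-up messages:

```python
import re

_PR_MERGE_PATTERNS = (
    re.compile(r"Merge pull request #(\d+)\b"),
    re.compile(r"\(#(\d+)\)"),
)

def _is_pr_merge(message: str) -> bool:
    return any(p.search(message) for p in _PR_MERGE_PATTERNS)

for msg in (
    "Merge pull request #42 from user/feature",
    "ENH: speed up groupby (#1337)",
    "Fix typo in docs",
):
    print(f"{msg!r}: {_is_pr_merge(msg)}")
# -> True, True, False
```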

src/datasmith/execution/utils.py

Lines changed: 46 additions & 1 deletion
@@ -1,7 +1,10 @@
+from typing import Any
+
+from git import BadName, GitCommandError, Repo
 from requests.exceptions import HTTPError
 
 from datasmith.logging_config import get_logger
-from datasmith.utils import _get_github_metadata
+from datasmith.utils import CACHE_LOCATION, _get_github_metadata, cache_completion
 
 logger = get_logger("execution.utils")
 
@@ -47,6 +50,48 @@ def _get_commit_info(repo_name: str, commit_sha: str) -> dict:
     }
 
 
+@cache_completion(CACHE_LOCATION, "get_commit_info_offline")
+def _get_commit_info_offline(repo: Repo, commit_sha: str) -> dict[str, Any]:
+    """
+    Return commit metadata and diff stats *without* the GitHub REST API.
+
+    The function creates a temporary **treeless** clone
+    (`git clone --filter=tree:0 …`) so it transfers only commit objects.
+    When we later call `commit.stats`, Git will lazily grab just the blobs
+    needed to compute line-level stats - still far cheaper than an API call.
+    """
+    try:
+        commit = repo.commit(commit_sha)
+
+    except (BadName, ValueError):
+        logger.exception("Maybe commit not found: %s", commit_sha)
+        repo.git.fetch("--no-filter", "--quiet", "origin", commit_sha)
+        commit = repo.commit(commit_sha)  # retry after fetching
+    except GitCommandError:
+        logger.exception("Error fetching commit info: %s", commit_sha)
+        return {
+            "sha": commit_sha,
+            "date": None,
+            "message": None,
+            "total_additions": 0,
+            "total_deletions": 0,
+            "total_files_changed": 0,
+            "files_changed": "",
+        }
+
+    stats = commit.stats
+
+    return {
+        "sha": commit.hexsha,
+        "date": commit.committed_datetime.isoformat(),
+        "message": commit.message,
+        "total_additions": stats.total["insertions"],
+        "total_deletions": stats.total["deletions"],
+        "total_files_changed": stats.total["files"],
+        "files_changed": "\n".join(str(k) for k in stats.files),
+    }
+
+
 def find_file_in_tree(repo: str, filename: str, branch: str | None = None) -> list[str] | None:
     if branch is None:
         repo_info = _get_github_metadata(endpoint=f"/repos/{repo}")
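
A minimal end-to-end sketch of calling `_get_commit_info_offline` against a partial clone (the URL is illustrative, and the `cache_completion` wrapper is assumed to accept a `Repo` argument; reading `commit.stats` triggers lazy blob fetches, so network access is required):

```python
import tempfile

from git import Repo

from datasmith.execution.utils import _get_commit_info_offline

with tempfile.TemporaryDirectory() as td:
    repo = Repo.clone_from(
        "https://github.com/airspeed-velocity/asv.git",
        td,
        no_checkout=True,
        multi_options=["--filter=tree:0"],  # commit objects only
    )
    sha = repo.commit("HEAD").hexsha
    info = _get_commit_info_offline(repo, sha)
    print(info["sha"][:8], info["total_files_changed"], "files changed")
```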

src/datasmith/logging_config.py

Lines changed: 2 additions & 2 deletions
@@ -8,14 +8,14 @@
 
 import logging
 import sys
-from typing import Optional
+from typing import Optional, TextIO
 
 
 def configure_logging(
     level: int = logging.INFO,
     format_string: Optional[str] = None,
     date_format: str = "%H:%M:%S",
-    stream: Optional[object] = None,
+    stream: Optional[TextIO] = None,
 ) -> logging.Logger:
     """
     Configure logging for the datasmith package.
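
Narrowing `stream` from `object` to `TextIO` lets type checkers reject non-file-like arguments while leaving call sites unchanged. A small usage sketch:

```python
import logging
import sys

from datasmith.logging_config import configure_logging

# `stream` must now be a TextIO such as sys.stderr, not an arbitrary object.
logger = configure_logging(level=logging.DEBUG, stream=sys.stderr)
logger.debug("log output goes to stderr")
```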

src/datasmith/scrape/scrape_dashboards.py

Lines changed: 2 additions & 2 deletions
@@ -93,7 +93,7 @@ def make_benchmark_from_html(base_url: str, html_dir: str, force: bool) -> BenchmarkCollection:
         df["date"] = df["revision"].astype(str).map(index_data["revision_to_date"])
         frames.append(df)
 
-    all_benchmarks = pd.concat(frames, ignore_index=True)
+    all_benchmarks = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
     logger.info("Collected %s rows from %s benchmark files.", f"{len(all_benchmarks):,}", f"{len(frames):,}")
 
     all_summaries = []
@@ -114,7 +114,7 @@ def make_benchmark_from_html(base_url: str, html_dir: str, force: bool) -> BenchmarkCollection:
         df["benchmark"] = benchmark_name
         all_summaries.append(df)
 
-    all_summaries_df = pd.concat(all_summaries, ignore_index=True)
+    all_summaries_df = pd.concat(all_summaries, ignore_index=True) if all_summaries else pd.DataFrame()
 
     collection = BenchmarkCollection(
         base_url=base_url,
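
The `if frames else pd.DataFrame()` guards are needed because `pd.concat` raises on an empty list instead of returning an empty frame. A two-line demonstration:

```python
import pandas as pd

frames: list[pd.DataFrame] = []
try:
    pd.concat(frames, ignore_index=True)
except ValueError as exc:
    print(exc)  # "No objects to concatenate"

all_benchmarks = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
print(len(all_benchmarks))  # 0
```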
