From 38c99aafc5b217c23e182c2ab258a69f5d7d3dee Mon Sep 17 00:00:00 2001 From: Atharva Sehgal Date: Thu, 17 Jul 2025 01:26:05 +0000 Subject: [PATCH 1/3] add offline commit scraping --- README.md | 4 +- scripts/collect_commits.py | 3 +- scripts/filter_commits.py | 57 +++++++--- scripts/scrape_repositories.py | 2 + .../execution/collect_commits_offline.py | 105 ++++++++++++++++++ src/datasmith/execution/utils.py | 47 +++++++- src/datasmith/logging_config.py | 4 +- 7 files changed, 201 insertions(+), 21 deletions(-) create mode 100644 src/datasmith/execution/collect_commits_offline.py diff --git a/README.md b/README.md index 1afcafd..b0ef975 100644 --- a/README.md +++ b/README.md @@ -146,11 +146,12 @@ The scraper can be run using the following command: ```bash $ python scripts/scrape_repositories.py \ --outfile artifacts/raw/repos_discovered.csv \ + --min-stars 500 \ --filtered-outfile artifacts/raw/repos_valid.csv # Writes artifacts/raw/repos_discovered.csv and artifacts/raw/repos_valid.csv ``` -The `artifacts/raw/repos_valid.csv` file contains a subset of the repositories that aren't forks / reuploads / pass other sanity checks. We found ~700 filtered repositories for this dataset. +The `artifacts/raw/repos_valid.csv` file contains a subset of the repositories that aren't forks / reuploads / have at least 500 stars / pass other sanity checks. We found ~700 filtered repositories for this dataset. ### 4. Collect relevant commits for all repositories @@ -158,7 +159,6 @@ The `artifacts/raw/repos_valid.csv` file contains a subset of the repositories t Given the list of repositories, we find the subset of commits that have already been closed and merged into the main branch (the top 5000 PRs, sorted by popularity). We use the `collect_commits.py` script to do this. The `filter_commits.py` script then filters out those commits that primarily modified the benchmarking files (e.g. `asv.conf.json`) or were not relevant to the benchmarks (e.g. documentation changes). 
The script also limits the number of repositories to a maximum of 350 to ensure we don't burden the GitHub API with too many requests. The scripts can be run as follows: ```bash -# 50 pages * 100 (PRs per page) = 5000 PRs max per repo. $ python scripts/collect_commits.py \ --dashboards artifacts/raw/repos_valid.csv \ --outfile artifacts/raw/commits_all.jsonl \ diff --git a/scripts/collect_commits.py b/scripts/collect_commits.py index d5116b9..741dc48 100644 --- a/scripts/collect_commits.py +++ b/scripts/collect_commits.py @@ -2,7 +2,8 @@ import pandas as pd -from datasmith.execution.collect_commits import search_commits +# from datasmith.execution.collect_commits import search_commits +from datasmith.execution.collect_commits_offline import search_commits from datasmith.logging_config import configure_logging # Configure logging for the script diff --git a/scripts/filter_commits.py b/scripts/filter_commits.py index 790fa68..2ed9f8b 100644 --- a/scripts/filter_commits.py +++ b/scripts/filter_commits.py @@ -3,13 +3,15 @@ import argparse import json import re +import tempfile from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor from pathlib import Path import pandas as pd +from git import Repo from tqdm.auto import tqdm -from datasmith.execution.utils import _get_commit_info, find_file_in_tree +from datasmith.execution.utils import _get_commit_info_offline, find_file_in_tree from datasmith.logging_config import configure_logging # Configure logging for the script @@ -37,10 +39,11 @@ def _asv_conf_worker(repo_name: str) -> str | None: return find_file_in_tree(repo_name, "asv.conf.json") -def _commit_info_worker(arg_tuple) -> dict | None: +def _commit_info_worker(arg_tuple: tuple[Repo, str]) -> dict | None: """Wrapper for ProcessPool: arg_tuple = (repo_name, sha).""" repo, sha = arg_tuple - return _get_commit_info(repo, sha) + # return _get_commit_info(repo, sha) + return _get_commit_info_offline(repo, sha) NON_CORE_PATTERNS = re.compile( @@ -107,19 
+110,43 @@ def main() -> None: commits = commits.merge(benchmarks, how="right", on="repo_name") commits = commits.dropna(subset=["commit_sha"]) - with ProcessPoolExecutor(max_workers=args.procs) as pp: - commits["commit_info"] = list( - tqdm( - pp.map(_commit_info_worker, commits[["repo_name", "commit_sha"]].itertuples(index=False, name=None)), - total=len(commits), - desc="Fetching commit metadata", + all_repo_names = set(commits["repo_name"]) + + # download all repos to a temp dir + with tempfile.TemporaryDirectory(prefix="gh-repos-") as td: + all_repos = {} + for repo_name in tqdm(all_repo_names, desc="Cloning repos"): + repo_name = repo_name.strip("/") + owner, name = repo_name.split("/", 1) + path = Path(td) / f"{owner}__{name}.git" + repo = Repo.clone_from( + f"https://github.com/{repo_name}.git", + path, + bare=True, + # multi_options=["--filter=tree:0"], + multi_options=["--filter=blob:none"], + quiet=True, + ) + all_repos[repo_name] = repo + + commit_info_args: list[tuple[Repo, str]] = [] + for repo_name, commit_sha in commits[["repo_name", "commit_sha"]].itertuples(index=False, name=None): + repo = all_repos[repo_name] + commit_info_args.append((repo, commit_sha)) + + with ProcessPoolExecutor(max_workers=args.procs) as pp: + commits["commit_info"] = list( + tqdm( + pp.map(_commit_info_worker, commit_info_args), + total=len(commits), + desc="Fetching commit metadata", + ) ) - ) - commit_meta = pd.json_normalize(commits.pop("commit_info")) - commits = pd.concat([commits, commit_meta], axis=1) - commits = commits.dropna(subset=["asv_conf_path", "sha", "date", "message"]) - commits = commits[commits["files_changed"].apply(has_core_file)].reset_index(drop=True) + commit_meta = pd.json_normalize(commits.pop("commit_info")) + commits = pd.concat([commits, commit_meta], axis=1) + commits = commits.dropna(subset=["asv_conf_path", "sha", "date", "message"]) + commits = commits[commits["files_changed"].apply(has_core_file)].reset_index(drop=True) out_path = 
Path(args.output_pth) if not out_path.parent.exists(): @@ -127,7 +154,7 @@ def main() -> None: # commits.to_csv(out_path, index=False) commits.to_json(out_path, orient="records", lines=True, index=False) - logger.info(f"✔ Wrote {len(commits):,} rows → {out_path}") + logger.info("✔ Wrote %s rows → %s", len(commits), out_path) if __name__ == "__main__": diff --git a/scripts/scrape_repositories.py b/scripts/scrape_repositories.py index 615d7a7..444c30c 100644 --- a/scripts/scrape_repositories.py +++ b/scripts/scrape_repositories.py @@ -56,6 +56,7 @@ def parse_args() -> argparse.Namespace: default=0.3, help="Random extra delay (0-JITTER's) after each call", ) + p.add_argument("--min-stars", type=int, default=500, help="Minimum number of stars to consider a repository") return p.parse_args() @@ -83,6 +84,7 @@ def main() -> None: filtered_df = filter_dashboards(df, url_col="url") # remove airspeed-velocity/asv filtered_df = filtered_df[filtered_df.repo_name != "airspeed-velocity/asv"] + filtered_df = filtered_df[filtered_df.stars >= args.min_stars] if filtered_df.empty: raise ValueError("No dashboards found in the repositories.") # noqa: TRY003 diff --git a/src/datasmith/execution/collect_commits_offline.py b/src/datasmith/execution/collect_commits_offline.py new file mode 100644 index 0000000..8593a9c --- /dev/null +++ b/src/datasmith/execution/collect_commits_offline.py @@ -0,0 +1,105 @@ +from __future__ import annotations + +import os +import re +import tempfile +import urllib.parse +from pathlib import Path + +from git import GitCommandError, Repo + +from datasmith import logger +from datasmith.utils import CACHE_LOCATION, _get_github_metadata, cache_completion + +_PR_MERGE_PATTERNS: tuple[re.Pattern[str], ...] = ( + # standard "Merge pull request #123 ..." + re.compile(r"Merge pull request #(\d+)\b"), + # squash-merge style "... 
(#[0-9]+)" on the last line + re.compile(r"\(#(\d+)\)"), +) + + +def _default_branch(repo: Repo) -> str: + """ + Resolve the remote's default branch (origin/HEAD -> "main" / "master" / ...). + """ + try: + # “origin/main” + full_ref: str = repo.git.symbolic_ref("--quiet", "--short", "refs/remotes/origin/HEAD") + return full_ref.split("/", 1)[1] # keep text after "origin/" + except Exception: + # Fallback if symbolic-ref is missing (rare). + return repo.head.reference.name + + +def _is_pr_merge(message: str) -> bool: + """ + True iff *message* matches one of our PR-closing patterns. + """ + return any(p.search(message) for p in _PR_MERGE_PATTERNS) + + +def _is_public(repo_name: str) -> bool: + """ + Check if a repo is public. + """ + return _get_github_metadata(f"/repos/{repo_name}") is not None + + +@cache_completion(CACHE_LOCATION, "search_commits_offline") +def search_commits( + repo_name: str, + query: str, + max_pages: int = 100, # ignored (kept for compatibility) + per_page: int = 100, # ignored (kept for compatibility) +) -> list[str]: + """ + Return a list of commit SHAs that closed pull requests, **without** + calling any GitHub API endpoints. Internally: + + • clones the repo (metadata-only) into a tmp dir + • walks the commit history + • selects commits whose message looks like a PR merge + + The only element of *query* we still honour is `base=`. + """ + qs = urllib.parse.parse_qs(query, keep_blank_values=True) + base_branch: str | None = qs.get("base", [None])[0] + + with tempfile.TemporaryDirectory(prefix="gh-history-") as workdir: + workdir_path = Path(workdir) + url = f"https://github.com/{repo_name}.git" + + # Clone *just* the commit / tree metadata (no blobs). 
+ clone_kwargs: dict = { + "multi_options": ["--filter=tree:0"], + "no_checkout": True, + } + if base_branch: + clone_kwargs["branch"] = base_branch + + # ignore if repo is not public + try: + repo = Repo.clone_from( + url, + workdir_path, + env={"GIT_TERMINAL_PROMPT": "0", **os.environ}, + **clone_kwargs, + ) + except GitCommandError as e: + if e.status == 128: + msg = e.stderr.strip() or "authentication failed or repository not found" + logger.warning("Cannot clone %s: %s", url, msg) + return [] + raise + + # Figure out which ref to walk. + branch = base_branch or _default_branch(repo) + ref_to_walk = f"origin/{branch}" + + merge_shas: set[str] = set() + for commit in repo.iter_commits(ref_to_walk): + if _is_pr_merge(str(commit.message)): + merge_shas.add(commit.hexsha) + + return sorted(merge_shas) diff --git a/src/datasmith/execution/utils.py b/src/datasmith/execution/utils.py index 4a57a47..acb0724 100644 --- a/src/datasmith/execution/utils.py +++ b/src/datasmith/execution/utils.py @@ -1,7 +1,10 @@ +from typing import Any + +from git import BadName, GitCommandError, Repo from requests.exceptions import HTTPError from datasmith.logging_config import get_logger -from datasmith.utils import _get_github_metadata +from datasmith.utils import CACHE_LOCATION, _get_github_metadata, cache_completion logger = get_logger("execution.utils") @@ -47,6 +50,48 @@ def _get_commit_info(repo_name: str, commit_sha: str) -> dict: } + +@cache_completion(CACHE_LOCATION, "get_commit_info_offline") +def _get_commit_info_offline(repo: Repo, commit_sha: str) -> dict[str, Any]: + """ + Return commit metadata and diff stats *without* the GitHub REST API. + + The function creates a temporary **treeless** clone + (`git clone --filter=tree:0 …`) so it transfers only commit objects. + When we later call `commit.stats`, Git will lazily grab just the blobs + needed to compute line-level stats - still far cheaper than an API call. 
+ """ + try: + commit = repo.commit(commit_sha) + + except (BadName, ValueError): + logger.exception("Maybe commit not found: %s", commit_sha) + repo.git.fetch("--no-filter", "--quiet", "origin", commit_sha) + commit = repo.commit(commit_sha) # retry after fetching + except GitCommandError: + logger.exception("Error fetching commit info: %s", commit_sha) + return { + "sha": commit_sha, + "date": None, + "message": None, + "total_additions": 0, + "total_deletions": 0, + "total_files_changed": 0, + "files_changed": "", + } + + stats = commit.stats + + return { + "sha": commit.hexsha, + "date": commit.committed_datetime.isoformat(), + "message": commit.message, + "total_additions": stats.total["insertions"], + "total_deletions": stats.total["deletions"], + "total_files_changed": stats.total["files"], + "files_changed": "\n".join(str(k) for k in stats.files), + } + + def find_file_in_tree(repo: str, filename: str, branch: str | None = None) -> list[str] | None: if branch is None: repo_info = _get_github_metadata(endpoint=f"/repos/{repo}") diff --git a/src/datasmith/logging_config.py b/src/datasmith/logging_config.py index 3f72878..d5ecbb3 100644 --- a/src/datasmith/logging_config.py +++ b/src/datasmith/logging_config.py @@ -8,14 +8,14 @@ import logging import sys -from typing import Optional +from typing import Optional, TextIO def configure_logging( level: int = logging.INFO, format_string: Optional[str] = None, date_format: str = "%H:%M:%S", - stream: Optional[object] = None, + stream: Optional[TextIO] = None, ) -> logging.Logger: """ Configure logging for the datasmith package. 
From efcb24d7711291b56581b1b98e1a6a5de227ae4a Mon Sep 17 00:00:00 2001 From: Atharva Sehgal Date: Thu, 14 Aug 2025 20:21:50 +0000 Subject: [PATCH 2/3] adding some test cases for the scraper and for the benchmark collections package --- src/datasmith/benchmark/collection.py | 5 +- src/datasmith/detection/detect_breakpoints.py | 14 +- src/datasmith/scrape/scrape_dashboards.py | 4 +- tests/test_benchmark_collection.py | 102 +++++ tests/test_scraper.py | 415 ++++++++++++++++++ 5 files changed, 528 insertions(+), 12 deletions(-) create mode 100644 tests/test_benchmark_collection.py create mode 100644 tests/test_scraper.py diff --git a/src/datasmith/benchmark/collection.py b/src/datasmith/benchmark/collection.py index 9a62907..ab2606e 100644 --- a/src/datasmith/benchmark/collection.py +++ b/src/datasmith/benchmark/collection.py @@ -76,8 +76,9 @@ def save(self, path: str | Path) -> Path: """ self.modified_at = datetime.now(timezone.utc) path = Path(path) - if any(suffix not in [".fc", ".pkl"] for suffix in path.suffixes): - path = path.with_suffix(".fc.pkl") + # Ensure the filename ends with the exact `.fc.pkl` suffix + if not path.name.endswith(".fc.pkl"): + path = path.with_name(path.name + ".fc.pkl") with open(path, "wb") as fh: pickle.dump(self, fh, protocol=pickle.HIGHEST_PROTOCOL) return path diff --git a/src/datasmith/detection/detect_breakpoints.py b/src/datasmith/detection/detect_breakpoints.py index defc100..0d25f44 100644 --- a/src/datasmith/detection/detect_breakpoints.py +++ b/src/datasmith/detection/detect_breakpoints.py @@ -92,12 +92,10 @@ def detect_all_breakpoints(summary_df: pd.DataFrame, method: str = "rbf") -> pd. 
if missing := needed - set(summary_df.columns): raise ValueError(str(missing)) - breakpoints: pd.DataFrame = ( - summary_df.groupby("benchmark", sort=False) - .apply(detection_method) - .dropna() - .explode() - .apply(pd.Series) - .reset_index(drop=True) - ) + detected = summary_df.groupby("benchmark", sort=False).apply(detection_method, include_groups=False).dropna() + + if detected.empty: + return pd.DataFrame() + + breakpoints: pd.DataFrame = detected.explode().apply(pd.Series).reset_index(drop=True) return breakpoints diff --git a/src/datasmith/scrape/scrape_dashboards.py b/src/datasmith/scrape/scrape_dashboards.py index 862ce22..72aa49b 100644 --- a/src/datasmith/scrape/scrape_dashboards.py +++ b/src/datasmith/scrape/scrape_dashboards.py @@ -93,7 +93,7 @@ def make_benchmark_from_html(base_url: str, html_dir: str, force: bool) -> Bench df["date"] = df["revision"].astype(str).map(index_data["revision_to_date"]) frames.append(df) - all_benchmarks = pd.concat(frames, ignore_index=True) + all_benchmarks = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame() logger.info("Collected %s rows from %s benchmark files.", f"{len(all_benchmarks):,}", f"{len(frames):,}") all_summaries = [] @@ -114,7 +114,7 @@ def make_benchmark_from_html(base_url: str, html_dir: str, force: bool) -> Bench df["benchmark"] = benchmark_name all_summaries.append(df) - all_summaries_df = pd.concat(all_summaries, ignore_index=True) + all_summaries_df = pd.concat(all_summaries, ignore_index=True) if all_summaries else pd.DataFrame() collection = BenchmarkCollection( base_url=base_url, diff --git a/tests/test_benchmark_collection.py b/tests/test_benchmark_collection.py new file mode 100644 index 0000000..e1258d4 --- /dev/null +++ b/tests/test_benchmark_collection.py @@ -0,0 +1,102 @@ +""" +This file implements many test cases pertaining to the use of the BenchmarkCollection class. + +- `BenchmarkCollection.save` and `load` round-trip integrity. 
+- `save` enforces `.fc.pkl` suffix; updates `modified_at`; `load` returns correct type. +""" + +from __future__ import annotations + +from datetime import datetime, timezone +from pathlib import Path + +import pandas as pd + +from datasmith.benchmark.collection import BenchmarkCollection + + +def _make_collection() -> BenchmarkCollection: + now = datetime.now(timezone.utc) + benchmarks = pd.DataFrame({ + "revision": [1], + "time": [1.0], + "hash": ["abc1234"], + "benchmark": ["bench.fn"], + "machine": ["docker"], + "date": ["2024-01-01T00:00:00Z"], + }) + summaries = pd.DataFrame({ + "revision": [1], + "time": [1.0], + "hash": ["abc1234"], + "benchmark": ["bench.fn"], + "date": ["2024-01-01T00:00:00Z"], + }) + index_data = { + "project": "demo", + "project_url": "https://example.com", + "show_commit_url": "https://github.com/org/repo/commit", + "hash_length": 7, + "revision_to_hash": {"1": "abc1234"}, + "revision_to_date": {"1": "2024-01-01T00:00:00Z"}, + "params": ["machine"], + "graph_param_list": [{"machine": "docker"}], + "benchmarks": ["bench.fn"], + "machines": ["docker"], + "tags": [], + "pages": [], + } + return BenchmarkCollection( + base_url="file:///tmp/html", + collected_at=now, + modified_at=now, + param_keys=["machine"], + index_data=index_data, + benchmarks=benchmarks, + summaries=summaries, + ) + + +def test_save_load_roundtrip(tmp_path: Path) -> None: + c = _make_collection() + out = c.save(tmp_path / "dashboard.fc.pkl") + loaded = BenchmarkCollection.load(out) + + assert isinstance(loaded, BenchmarkCollection) + assert loaded.base_url == c.base_url + assert loaded.param_keys == c.param_keys + assert loaded.index_data == c.index_data + assert loaded.benchmarks.equals(c.benchmarks) + assert loaded.summaries.equals(c.summaries) + + +def test_save_enforces_fc_pkl_suffix(tmp_path: Path) -> None: + c = _make_collection() + candidates = [ + tmp_path / "a", + tmp_path / "a.pkl", + tmp_path / "a.fc", + tmp_path / "a.txt", + tmp_path / "a.fc.pkl", + ] + 
for p in candidates: + written = c.save(p) + # Expect exact .fc.pkl suffix convention + assert Path(written).suffixes[-2:] == [".fc", ".pkl"] + assert Path(written).exists() + + +def test_modified_at_updates_on_save(tmp_path: Path) -> None: + c = _make_collection() + before = c.modified_at + out = c.save(tmp_path / "x.fc.pkl") + loaded = BenchmarkCollection.load(out) + assert loaded.modified_at >= before + assert loaded.modified_at != before + + +def test_load_returns_correct_type(tmp_path: Path) -> None: + c = _make_collection() + out = c.save(tmp_path / "y.fc.pkl") + obj = BenchmarkCollection.load(out) + assert isinstance(obj, BenchmarkCollection) diff --git a/tests/test_scraper.py b/tests/test_scraper.py new file mode 100644 index 0000000..3817056 --- /dev/null +++ b/tests/test_scraper.py @@ -0,0 +1,415 @@ +""" +This file implements many test cases pertaining to the use of the scraper. + + +scrape.scrape_dashboards.make_benchmark_from_html: + - `make_benchmark_from_html` handles both remote URLs and local folders; correct path joining for graphs and summaries. + - Proper parsing of `index.json`, `graphs/*/*.json` including invalid JSON and missing files (log + skip). + - DataFrames have required columns: `benchmarks` and `summaries` contain `revision`, `time`, `hash`, `benchmark`, `machine`/`date`. + - `force` propagation works (note: CLI currently doesn't pass `force` through; test should expose this gap). + + +detection.detect_breakpoints.detect_all_breakpoints` (+ `get_detection_method`): +scrape.code_coverage.generate_coverage_dataframe +scrape.build_reports.breakpoints_scrape_comments + - `get_detection_method`: returns callable for `"rbf"` and `"asv"`, errors for invalid method. + - `detect_all_breakpoints`: on synthetic series detects negative deltas; validates required columns; grouping per `benchmark`. 
+ - Coverage generation builds commit URLs from `index_data["show_commit_url"]`, de-duplicates, respects `only` filter; handles missing coverage gracefully. + - Reports: `breakpoints_scrape_comments` merges coverage, computes `n_tokens`, produces stable `report` text per commit; robust to PR-less commits. + - End-to-end: loading collection, attaching `breakpoints/coverage/comments/enriched_breakpoints`, saving to expected path. + + +scrape.detect_dashboards.scrape_github` (uses `search_pages` + `_request_with_backoff`), `scrape.filter_dashboards.filter_dashboards` (+ `enrich_repos`) + - `search_pages`: unique repo names, pagination stop conditions, jitter/backoff; correct query assembly. + - `_request_with_backoff`: retry on 403/429 with backoff; success path returns JSON. + - `filter_dashboards`/`enrich_repos`: adds `is_accessible/is_fork/is_archived/watchers/stars`; filtering logic; empty-input error. + + + + +""" + +from __future__ import annotations + +import json +from pathlib import Path +from typing import ClassVar +from unittest.mock import Mock, patch + +import pandas as pd +import pytest +import requests + +from datasmith.detection.detect_breakpoints import detect_all_breakpoints, get_detection_method +from datasmith.scrape.code_coverage import generate_coverage_dataframe +from datasmith.scrape.detect_dashboards import search_pages +from datasmith.scrape.filter_dashboards import enrich_repos, filter_dashboards +from datasmith.scrape.scrape_dashboards import make_benchmark_from_html +from datasmith.utils import _request_with_backoff + + +class TestMakeBenchmarkFromHtml: + def _create_mock_dashboard(self, tmp_path: Path) -> Path: + """Create a minimal mock ASV dashboard structure.""" + html_dir = tmp_path / "html" + html_dir.mkdir() + + # Create index.json + index_data = { + "params": ["machine"], + "benchmarks": ["bench.func1", "bench.func2"], + "graph_param_list": [{"machine": "docker"}], + "revision_to_hash": {"1": "abc123", "2": "def456"}, + 
"revision_to_date": {"1": "2024-01-01T00:00:00Z", "2": "2024-01-02T00:00:00Z"}, + } + (html_dir / "index.json").write_text(json.dumps(index_data)) + + # Create graphs directory structure + graphs_dir = html_dir / "graphs" / "machine-docker" + graphs_dir.mkdir(parents=True) + + # Create benchmark JSON files + bench_data = [["1", 1.5], ["2", 1.2]] + (graphs_dir / "bench.func1.json").write_text(json.dumps(bench_data)) + (graphs_dir / "bench.func2.json").write_text(json.dumps(bench_data)) + + # Create summary files + summary_dir = html_dir / "graphs" / "summary" + summary_dir.mkdir() + (summary_dir / "bench.func1.json").write_text(json.dumps(bench_data)) + (summary_dir / "bench.func2.json").write_text(json.dumps(bench_data)) + + return html_dir + + def test_local_dashboard_parsing(self, tmp_path: Path) -> None: + """Test parsing a local ASV dashboard.""" + html_dir = self._create_mock_dashboard(tmp_path) + + collection = make_benchmark_from_html(base_url=str(html_dir), html_dir=str(tmp_path / "output"), force=False) + + assert collection is not None + assert collection.base_url == str(html_dir) + assert "machine" in collection.param_keys + + # Check benchmarks DataFrame + assert not collection.benchmarks.empty + required_cols = {"revision", "time", "hash", "benchmark", "machine", "date"} + assert required_cols.issubset(set(collection.benchmarks.columns)) + + # Check summaries DataFrame + assert not collection.summaries.empty + summary_cols = {"revision", "time", "hash", "benchmark", "date"} + assert summary_cols.issubset(set(collection.summaries.columns)) + + def test_missing_index_json(self, tmp_path: Path) -> None: + """Test behavior when index.json is missing.""" + html_dir = tmp_path / "empty" + html_dir.mkdir() + + collection = make_benchmark_from_html(base_url=str(html_dir), html_dir=str(tmp_path / "output"), force=False) + + assert collection is None + + def test_invalid_json_handling(self, tmp_path: Path) -> None: + """Test graceful handling of invalid JSON 
files.""" + html_dir = tmp_path / "html" + html_dir.mkdir() + + # Create valid index.json + index_data = { + "params": ["machine"], + "benchmarks": ["bench.func1"], + "graph_param_list": [{"machine": "docker"}], + "revision_to_hash": {"1": "abc123"}, + "revision_to_date": {"1": "2024-01-01T00:00:00Z"}, + } + (html_dir / "index.json").write_text(json.dumps(index_data)) + + # Create invalid benchmark JSON + graphs_dir = html_dir / "graphs" / "machine-docker" + graphs_dir.mkdir(parents=True) + (graphs_dir / "bench.func1.json").write_text("invalid json") + + # Create valid summary + summary_dir = html_dir / "graphs" / "summary" + summary_dir.mkdir() + (summary_dir / "bench.func1.json").write_text(json.dumps([["1", 1.5]])) + + collection = make_benchmark_from_html(base_url=str(html_dir), html_dir=str(tmp_path / "output"), force=False) + + assert collection is not None + # Should have summaries but benchmarks may be empty due to invalid JSON + # The function should handle empty frames gracefully + assert not collection.summaries.empty + + @patch("requests.get") + def test_remote_url_handling(self, mock_get: Mock, tmp_path: Path) -> None: + """Test handling of remote HTTP URLs.""" + # Mock successful HTTP responses + index_response = Mock() + index_response.status_code = 200 + index_response.content = json.dumps({ + "params": ["machine"], + "benchmarks": ["bench.func1"], + "graph_param_list": [{"machine": "docker"}], + "revision_to_hash": {"1": "abc123"}, + "revision_to_date": {"1": "2024-01-01T00:00:00Z"}, + }).encode() + + bench_response = Mock() + bench_response.status_code = 200 + bench_response.content = json.dumps([["1", 1.5]]).encode() + + mock_get.side_effect = [index_response, bench_response, bench_response] + + collection = make_benchmark_from_html( + base_url="https://example.com/dashboard", html_dir=str(tmp_path / "output"), force=True + ) + + assert collection is not None + assert collection.base_url == "https://example.com/dashboard" + + +class 
TestDetectionMethods: + def test_get_detection_method_valid(self) -> None: + """Test get_detection_method returns correct callables.""" + rbf_method = get_detection_method("rbf") + asv_method = get_detection_method("asv") + + assert callable(rbf_method) + assert callable(asv_method) + assert rbf_method != asv_method + + def test_get_detection_method_invalid(self) -> None: + """Test get_detection_method raises error for invalid method.""" + with pytest.raises(ValueError, match="Unknown method: invalid"): + get_detection_method("invalid") + + def test_detect_all_breakpoints_required_columns(self) -> None: + """Test detect_all_breakpoints validates required columns.""" + df_missing_cols = pd.DataFrame({"benchmark": ["test"], "time": [1.0]}) + + with pytest.raises(ValueError): + detect_all_breakpoints(df_missing_cols) + + def test_detect_all_breakpoints_synthetic_data(self) -> None: + """Test detect_all_breakpoints on synthetic performance regression.""" + # Create synthetic data with clear performance improvement + data = [] + for i in range(10): + # Performance gets better at index 5 + time_val = 2.0 if i < 5 else 1.0 + data.append({ + "benchmark": "test.func", + "time": time_val, + "hash": f"hash{i:02d}", + }) + + df = pd.DataFrame(data) + breakpoints = detect_all_breakpoints(df, method="rbf") + + # Should detect breakpoints or return empty DataFrame gracefully + assert isinstance(breakpoints, pd.DataFrame) + + +class TestCoverageGeneration: + @patch("datasmith.scrape.code_coverage._iter_commit_coverage") + def test_generate_coverage_dataframe(self, mock_iter: Mock) -> None: + """Test coverage dataframe generation.""" + # Mock commit coverage iteration + mock_iter.return_value = [("file1.py", 85.0), ("file2.py", None)] + + breakpoints_df = pd.DataFrame({"hash": ["abc123"], "gt_hash": ["def456"]}) + + index_data = {"show_commit_url": "https://github.com/org/repo/commit/"} + + coverage_df = generate_coverage_dataframe(breakpoints_df, index_data, only=None) + + assert 
not coverage_df.empty + expected_cols = {"typ", "url", "path", "coverage"} + assert expected_cols.issubset(set(coverage_df.columns)) + + +class TestRepositoryFiltering: + MOCK_RESPONSES: ClassVar[dict[str, dict]] = { + "org/repo1": { + "fork": False, + "archived": False, + "disabled": False, + "subscribers_count": 100, + "stargazers_count": 500, + }, + "org/repo2": { + "fork": True, + "archived": False, + "disabled": False, + "subscribers_count": 200, + "stargazers_count": 1000, + }, + "org/repo3": { + "fork": False, + "archived": True, + "disabled": False, + "subscribers_count": 300, + "stargazers_count": 1500, + }, + } + + @patch("datasmith.scrape.filter_dashboards._get_repo_metadata") + def test_enrich_repos(self, mock_metadata: Mock) -> None: + """Test repository enrichment with GitHub metadata.""" + # Mock GitHub API responses + mock_metadata.return_value = { + "fork": False, + "archived": False, + "disabled": False, + "subscribers_count": 100, + "stargazers_count": 500, + } + + df = pd.DataFrame({"repo_name": ["https://github.com/org/repo"]}) + enriched = enrich_repos(df, url_col="repo_name", show_progress=False) + + expected_cols = {"is_accessible", "is_fork", "is_archived", "fork_parent", "forked_at", "watchers", "stars"} + assert expected_cols.issubset(set(enriched.columns)) + assert enriched.iloc[0]["is_accessible"] is True + assert enriched.iloc[0]["is_fork"] is False + assert enriched.iloc[0]["stars"] == 500 + + def test_filter_dashboards_empty_input(self) -> None: + """Test filter_dashboards handles empty input.""" + df = pd.DataFrame({"repo_name": []}) + + with pytest.raises(ValueError, match="Dataframe empty"): + filter_dashboards(df, url_col="repo_name", show_progress=False) + + @patch("datasmith.scrape.filter_dashboards._get_repo_metadata") + def test_filter_dashboards_filtering_logic(self, mock_metadata: Mock) -> None: + """Test filtering logic removes forks and archived repos.""" + + # Mock responses for different repo types + def 
mock_response(full_name: str) -> dict | None: + repo_info = self.MOCK_RESPONSES.get(full_name) + if repo_info is None: + return None + return repo_info + + mock_metadata.side_effect = mock_response + + df = pd.DataFrame({ + "repo_name": [ + "https://github.com/org/repo1", + "https://github.com/org/repo2", + "https://github.com/org/repo3", + ] + }) + + filtered = filter_dashboards(df, url_col="repo_name", show_progress=False) + + # Should only keep repo1 (not fork, not archived) + assert len(filtered) == 1 + assert "repo1" in filtered.iloc[0]["repo_name"] + + +class TestRequestWithBackoff: + @patch("time.sleep") + @patch("datasmith.utils._build_headers") + def test_request_success(self, mock_headers: Mock, mock_sleep: Mock) -> None: + """Test successful request without retries.""" + mock_headers.return_value = {"Authorization": "Bearer token"} + + with patch.object(requests.Session, "get") as mock_get: + mock_response = Mock() + mock_response.status_code = 200 + mock_response.json.return_value = {"data": "test"} + mock_get.return_value = mock_response + + session = requests.Session() + response = _request_with_backoff(url="https://api.github.com/test", site_name="github", session=session) + + assert response.status_code == 200 + mock_sleep.assert_called() # Should still throttle + + @patch("time.sleep") + @patch("time.time") + @patch("datasmith.utils._build_headers") + def test_request_rate_limited(self, mock_headers: Mock, mock_time: Mock, mock_sleep: Mock) -> None: + """Test retry behavior on rate limiting.""" + mock_headers.return_value = {"Authorization": "Bearer token"} + mock_time.return_value = 1000.0 + + with patch.object(requests.Session, "get") as mock_get: + # First call returns 429, second succeeds + rate_limited_response = Mock() + rate_limited_response.status_code = 429 + rate_limited_response.headers = {"X-RateLimit-Reset": "1010", "X-RateLimit-Remaining": "0"} + + success_response = Mock() + success_response.status_code = 200 + 
success_response.json.return_value = {"data": "test"} + + mock_get.side_effect = [rate_limited_response, success_response] + + session = requests.Session() + response = _request_with_backoff( + url="https://api.github.com/test", site_name="github", session=session, max_retries=2 + ) + + assert response.status_code == 200 + assert mock_get.call_count == 2 + + +class TestSearchPages: + @patch.dict("os.environ", {"GH_TOKEN": ""}, clear=False) + @patch("time.sleep") + @patch("requests.Session.get") + def test_search_pages_pagination(self, mock_get: Mock, mock_sleep: Mock) -> None: + """Test search_pages handles pagination correctly.""" + # Mock responses for multiple pages + page1_response = Mock() + page1_response.status_code = 200 + page1_response.json.return_value = { + "items": [{"repository": {"full_name": "org/repo1"}}, {"repository": {"full_name": "org/repo2"}}], + "incomplete_results": False, + } + + page2_response = Mock() + page2_response.status_code = 200 + page2_response.json.return_value = { + "items": [{"repository": {"full_name": "org/repo3"}}, {"repository": {"full_name": "org/repo4"}}], + "incomplete_results": False, + } + + # Empty page signals end + page3_response = Mock() + page3_response.status_code = 200 + page3_response.json.return_value = {"items": []} + + mock_get.side_effect = [page1_response, page2_response, page3_response] + + repos = list(search_pages(max_pages=5, per_page=2, query="filename:asv.conf.json")) + + assert repos == ["org/repo1", "org/repo2", "org/repo3", "org/repo4"] + # Now all 3 calls should be made + assert mock_get.call_count == 3 + + @patch.dict("os.environ", {"GH_TOKEN": ""}, clear=False) + @patch("time.sleep") + @patch("requests.Session.get") + def test_search_pages_deduplication(self, mock_get: Mock, mock_sleep: Mock) -> None: + """Test search_pages deduplicates repository names.""" + response = Mock() + response.status_code = 200 + response.json.return_value = { + "items": [ + {"repository": {"full_name": 
"org/repo1"}}, + {"repository": {"full_name": "org/repo1"}}, # Duplicate + {"repository": {"full_name": "org/repo2"}}, + ] + } + + mock_get.return_value = response + + repos = list(search_pages(max_pages=1, per_page=10, query="filename:asv.conf.json")) + + assert repos == ["org/repo1", "org/repo2"] From b7cbfeae0c843ddc521fdd8bc3ba509ce368b717 Mon Sep 17 00:00:00 2001 From: Atharva Sehgal Date: Thu, 14 Aug 2025 20:43:22 +0000 Subject: [PATCH 3/3] resolve test case type issues --- tests/test_scraper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_scraper.py b/tests/test_scraper.py index 3817056..6e056b7 100644 --- a/tests/test_scraper.py +++ b/tests/test_scraper.py @@ -272,8 +272,8 @@ def test_enrich_repos(self, mock_metadata: Mock) -> None: expected_cols = {"is_accessible", "is_fork", "is_archived", "fork_parent", "forked_at", "watchers", "stars"} assert expected_cols.issubset(set(enriched.columns)) - assert enriched.iloc[0]["is_accessible"] is True - assert enriched.iloc[0]["is_fork"] is False + assert bool(enriched.iloc[0]["is_accessible"]) is True + assert bool(enriched.iloc[0]["is_fork"]) is False assert enriched.iloc[0]["stars"] == 500 def test_filter_dashboards_empty_input(self) -> None: