
Commit e563629

adding scripts for analyzing benchmarks
1 parent bbe1aae commit e563629

10 files changed: 370 additions & 24 deletions

Makefile

Lines changed: 16 additions & 0 deletions
@@ -4,6 +4,22 @@ install: ## Install the virtual environment and install the pre-commit hooks
 	@uv sync
 	@uv run pre-commit install
 
+.PHONY: backup
+backup: ## Create a backup of the datasets, results, and analysis directories
+	@echo "🚀 Creating backup archive"
+	@/usr/bin/env bash -euo pipefail -c '\
+		if [ ! -f tokens.env ]; then \
+			echo "❌ Error: tokens.env file not found"; exit 1; \
+		fi; \
+		BACKUP_DIR=$$(grep -E "^BACKUP_DIR=" tokens.env | head -n1 | cut -d "=" -f2-); \
+		if [ -z "$$BACKUP_DIR" ]; then \
+			echo "❌ Error: BACKUP_DIR not defined in tokens.env"; exit 1; \
+		fi; \
+		mkdir -p "$$BACKUP_DIR"; \
+		zip -qr "$$BACKUP_DIR/datasmith.bckp" raw_datasets results analysis; \
+		cp -f cache.db "$$BACKUP_DIR/datasmith.cache.bckp"; \
+	'
+
 .PHONY: check
 check: ## Run code quality tools.
 	@echo "🚀 Checking lock file consistency with 'pyproject.toml'"

README.md

Lines changed: 3 additions & 2 deletions
@@ -126,7 +126,7 @@ Given the list of repositories, we find the subset of commits that have already
 
 ```bash
 $ python scripts/collect_commits.py --dashboards raw_datasets/asv_benchmarks_filtered.csv --outfile raw_datasets/benchmark_commits_merged.jsonl
-$ python scripts/filter_commits.py --filtered-benchmarks-pth raw_datasets/asv_benchmarks_filtered.csv --merged-commits-pth raw_datasets/benchmark_commits_merged.jsonl --output-pth raw_datasets/benchmark_commits_filtered.jsonl --max-repos 150 --threads 8 --procs 8
+$ python scripts/filter_commits.py --filtered-benchmarks-pth raw_datasets/asv_benchmarks_filtered.csv --merged-commits-pth raw_datasets/benchmark_commits_merged.jsonl --output-pth raw_datasets/benchmark_commits_filtered.jsonl --max-repos 350 --threads 8 --procs 8
 ```
 ### 5. Benchmark all commits
 

@@ -136,6 +136,7 @@ $ python scripts/filter_commits.py --filtered-benchmarks-pth raw_datasets/asv_be
 Once we've collected the relevant commits, we can benchmark their performance using `asv`. `asv` includes many quality-of-life features to ensure that benchmarks are robust to noise and that the results are reproducible. Our script benchmarks multiple commits in parallel. Proper benchmarking requires some system tuning. Refer to the [asv tuning guidelines](https://asv.readthedocs.io/en/latest/tuning.html) for more details.
 
 ```bash
+# in a root shell:
 (sudo) $ export OPENBLAS_NUM_THREADS=1
 (sudo) $ export MKL_NUM_THREADS=1
 (sudo) $ export OMP_NUM_THREADS=1

@@ -149,7 +150,7 @@ Generally, each benchmark takes ~2 minutes to run, so benchmarking 70,000 commit
 ### 6. Analyze benchmark results
 
 ```bash
-$ python scripts/analyze_benchmark_results.py --benchmark-results benchmark_results/ --output-dir analysis
+$ python scripts/analyze_benchmark_results.py --results-dir benchmark_results/results --output-dir analysis/ --commit-metadata raw_datasets/benchmark_commits_filtered.jsonl
 ```
 
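
Judging from the new analysis script added in this commit, the analyze step should leave roughly this layout under `analysis/` (a sketch; the per-repo directory name is the repo's `org/name` with `/` replaced by `_`):

```bash
# Output tree written by the analyze step (sketch; repo names illustrative):
#   analysis/
#   ├── results/<org>_<repo>/   # merged benchmarks.json, asv.conf.json, per-machine runs
#   ├── repos/<org>_<repo>/     # clones used when publishing
#   └── html/<org>_<repo>/      # static HTML report from `asv publish`
```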

pyproject.toml

Lines changed: 18 additions & 16 deletions
@@ -18,15 +18,16 @@ classifiers = [
     "Topic :: Software Development :: Libraries :: Python Modules",
 ]
 dependencies = [
-    "pandas",
-    "tqdm",
-    "simple-useragent",
-    "requests",
-    "docker",
     "asv",
+    "docker",
+    "gitpython",
     "numpy",
+    "pandas",
+    "requests",
     "ruptures",
+    "simple-useragent",
     "tiktoken",
+    "tqdm",
 ]
 
 [project.urls]

@@ -36,23 +37,24 @@ Documentation = "https://formula-code.github.io/datasmith/"
 
 [dependency-groups]
 dev = [
-    "pytest>=7.2.0",
-    "pre-commit>=2.20.0",
-    "tox-uv>=1.11.3",
+    "asv",
     "deptry>=0.23.0",
-    "mypy>=0.991",
-    "pytest-cov>=4.0.0",
     "docker",
-    "ruff>=0.11.5",
-    "asv",
-    "pyarrow",
+    "gitpython",
+    "ipykernel",
     "ipython",
+    "matplotlib",
+    "mypy>=0.991",
     "pandas-stubs",
+    "pre-commit>=2.20.0",
+    "pyarrow",
+    "pytest-cov>=4.0.0",
+    "pytest>=7.2.0",
+    "ruff>=0.11.5",
+    "tox-uv>=1.11.3",
+    "types-docker",
     "types-requests",
     "types-tqdm",
-    "types-docker",
-    "ipykernel",
-    "matplotlib",
 ]
 
 [build-system]
scripts/analyze_benchmark_results.py (new file)

Lines changed: 69 additions & 0 deletions
import argparse
import json
from pathlib import Path

import pandas as pd

from datasmith.analysis.analyze_benchmark_results import aggregate_benchmark_runs, publish_repo


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Collects benchmark results from parallel worker runs and creates a merged benchmark file."
    )
    parser.add_argument(
        "--results-dir",
        type=Path,
        required=True,
        help="Path to the directory containing benchmark results in format `<results-dir>/<commit-id>/<python-version>/results/`",
    )
    parser.add_argument(
        "--commit-metadata",
        type=Path,
        required=True,
        help="Path to the jsonl file containing the commit IDs and associated metadata (e.g. repo_name)",
    )
    parser.add_argument(
        "--output-dir", type=Path, required=True, help="Path to the directory where merged benchmarks will be saved"
    )
    parser.add_argument(
        "--default-machine-name",
        type=str,
        default=None,
        help="Default machine name to use for all runs. If not provided, the machine name from the run ID will be used.",
    )
    return parser.parse_args()


def main() -> None:
    args = parse_args()
    # Load commit metadata from the JSONL file; bare Python `None` tokens are mapped to JSON `null`.
    all_commits = []
    with open(args.commit_metadata, encoding="utf-8") as f:
        for line in f:
            commit = json.loads(line.strip().replace("None", "null"))
            all_commits.append(commit)
    all_commits_df = pd.DataFrame(all_commits)
    all_commits_df["commit_sha"] = all_commits_df["commit_sha"].astype("string")

    # Aggregate benchmark runs and save them to the output directory
    stats = aggregate_benchmark_runs(
        all_commits_df, args.results_dir, args.output_dir / "results", default_machine_name=args.default_machine_name
    )

    # Run asv publish once per repository
    repos = {stat["repo_path"]: stat for stat in stats}
    for repo_path, stat in repos.items():
        repo_url = f"https://github.com/{stat['metadata']['repo_name']}.git"
        publish_repo(
            repo_url=repo_url,
            repo_local_dir=(args.output_dir / "repos" / repo_path).resolve(),
            asv_conf_path=(args.output_dir / "results" / repo_path / "asv.conf.json").resolve(),
            results_dir=(args.output_dir / "results" / repo_path).resolve(),
            html_dir=(args.output_dir / "html" / repo_path).resolve(),
        )
    print(f"Benchmark results aggregated and saved to {(args.output_dir / 'html').resolve()}.")


if __name__ == "__main__":
    main()
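
A usage sketch for this script: the invocation mirrors the README command above, the input layout follows the `--results-dir` help string, and the machine name `fc-runner` is purely illustrative:

```bash
# Input layout expected under --results-dir (sketch):
#   benchmark_results/results/<commit-sha>/<python-version>/results/
python scripts/analyze_benchmark_results.py \
    --results-dir benchmark_results/results \
    --commit-metadata raw_datasets/benchmark_commits_filtered.jsonl \
    --output-dir analysis/ \
    --default-machine-name fc-runner  # optional; unifies machine names across parallel workers
```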
src/datasmith/analysis/analyze_benchmark_results.py (new file)

Lines changed: 201 additions & 0 deletions
import json
import shutil
from pathlib import Path
from typing import Optional

import pandas as pd
from asv.commands.publish import Publish  # type: ignore[import-untyped]
from asv.config import Config  # type: ignore[import-untyped]
from asv.util import write_json  # type: ignore[import-untyped]
from git import Repo


def _get_all_commits_dict(all_commits_df: pd.DataFrame) -> dict:
    """Return a dict mapping commit_sha to metadata."""
    return all_commits_df.set_index("commit_sha", inplace=False).to_dict(orient="index")


def _update_dict(pth: Path, new_data: dict) -> None:
    """
    Update a JSON file at the given path with new data.

    Args:
        pth (Path): Path to the JSON file.
        new_data (dict): New data to update the JSON file with.
    """
    if not pth.exists():
        pth.parent.mkdir(parents=True, exist_ok=True)
        with open(pth, "w", encoding="utf-8") as f:
            json.dump(new_data, f)
    else:
        with open(pth, "r+", encoding="utf-8") as f:
            try:
                saved_benchmarks = json.load(f)
            except json.JSONDecodeError:
                saved_benchmarks = {}
            saved_benchmarks.update(new_data)

        with open(pth, "w", encoding="utf-8") as f:
            json.dump(saved_benchmarks, f)


def _update_json(src_path: Path, dest_path: Path) -> None:
    """Load a JSON file and save it to dest_path using _update_dict."""
    with open(src_path, encoding="utf-8") as f:
        data = json.load(f)
    _update_dict(dest_path, data)


def _update_machine_jsons(runid: Path, runid_newpath: Path, default_machine_name: str) -> Optional[dict]:
    """Update machine name in machine.json and params['machine'] in other json files."""
    machine_data = None
    old_file_names = [f.name for f in runid.iterdir()]
    for fname in old_file_names:
        src_file = runid / fname
        dest_file = runid_newpath / fname
        if fname == "machine.json":
            with open(src_file, encoding="utf-8") as f:
                machine_data = json.load(f)
            machine_data["machine"] = default_machine_name
            with open(dest_file, "w", encoding="utf-8") as f:
                json.dump(machine_data, f)
        elif fname.endswith(".json"):
            with open(src_file, encoding="utf-8") as f:
                run_data = json.load(f)
            if "params" in run_data and "machine" in run_data["params"]:
                run_data["params"]["machine"] = default_machine_name
            with open(dest_file, "w", encoding="utf-8") as f:
                json.dump(run_data, f)
    return machine_data


def _process_runid_folder(runid: Path, runid_newpath: Path, default_machine_name: Optional[str]) -> Optional[dict]:
    """Copy and update runid folder, handling machine name if needed."""
    machine_data = None
    if default_machine_name is not None:
        if runid_newpath.exists():
            shutil.rmtree(runid_newpath)
        runid_newpath.mkdir(parents=True, exist_ok=True)
        machine_data = _update_machine_jsons(runid, runid_newpath, default_machine_name)
    else:
        if runid_newpath.exists():
            shutil.rmtree(runid_newpath)
        shutil.copytree(runid, runid_newpath)
    return machine_data


def aggregate_benchmark_runs(
    all_commits_df: pd.DataFrame, results_dir: Path, output_dir: Path, default_machine_name: Optional[str] = None
) -> list[dict]:
    """
    Aggregates benchmark runs from the specified results directory and saves them to the output directory.

    Args:
        all_commits_df (pd.DataFrame): DataFrame containing commit metadata.
        results_dir (Path): Path to the directory containing benchmark results.
        output_dir (Path): Path to the directory where merged benchmarks will be saved.
        default_machine_name (Optional[str]): If given, rewrite every run's machine name to this value.
    """
    stats = []
    all_commits_dict = _get_all_commits_dict(all_commits_df)

    # Runs are laid out as <results-dir>/<commit-id>/<python-version>/results/
    for commit_pth in results_dir.glob("*/[0-9].[0-9]*/results/"):
        commit_id = commit_pth.parent.parent.name
        if commit_id not in all_commits_dict:
            continue
        commit_metadata = all_commits_dict[commit_id]
        repo_path = (commit_metadata["repo_name"]).replace("/", "_")
        repo_out_dir = output_dir / repo_path
        repo_out_dir.mkdir(parents=True, exist_ok=True)

        benchmarks_path = commit_pth / "benchmarks.json"
        if benchmarks_path.exists():
            _update_json(benchmarks_path, repo_out_dir / "benchmarks.json")

        asv_conf_path = commit_pth.parent / "asv.conf.json"
        if asv_conf_path.exists():
            _update_json(asv_conf_path, repo_out_dir / "asv.conf.json")

        n_runids = 0
        machine_data = None
        for runid in commit_pth.iterdir():
            if not runid.is_dir():
                continue
            n_runids += 1
            name = default_machine_name if default_machine_name is not None else runid.name
            runid_newpath = output_dir / repo_path / name
            machine_data = _process_runid_folder(runid, runid_newpath, default_machine_name) or machine_data

        if default_machine_name is not None and machine_data is not None:
            saved_machine_path = output_dir / repo_path / "machine.json"
            with open(saved_machine_path, "w", encoding="utf-8") as f:
                json.dump(machine_data, f)

        stats.append({
            "repo_path": repo_path,
            "metadata": commit_metadata,
            "commit_sha": commit_id,
            "n_runids": n_runids,
        })
    return stats


def publish_repo(
    repo_url: str,
    repo_local_dir: Path,
    asv_conf_path: Path,
    results_dir: Path,
    html_dir: Path,
    *,
    skip_if_present: bool = True,
) -> None:
    """
    Ensure *repo_local_dir* contains an up-to-date clone of *repo_url*,
    rewrite the ASV config at *asv_conf_path* to use the supplied
    directories, then publish the results.

    Parameters
    ----------
    repo_url : str
        Full Git URL, e.g. ``"https://github.com/pandas-dev/pandas.git"``.
    repo_local_dir : pathlib.Path
        Where the repo should live locally.
    asv_conf_path : pathlib.Path
        Path to the repository's ``asv.conf.json`` on disk.
    results_dir : pathlib.Path
        Directory for ASV's benchmark result files.
    html_dir : pathlib.Path
        Directory where ASV should write its HTML report.
    skip_if_present : bool, default True
        If *repo_local_dir* already exists, skip cloning and simply use it.
        Set to ``False`` to force a fresh clone each time.

    Raises
    ------
    FileNotFoundError
        If *asv_conf_path* does not exist.
    """
    # Clone or reuse the repository
    if repo_local_dir.exists():
        if skip_if_present:
            print(f"Repository {repo_local_dir} already exists - reusing.")
        else:
            print(f"Removing {repo_local_dir} for a fresh clone…")
            shutil.rmtree(repo_local_dir)
            Repo.clone_from(repo_url, repo_local_dir)
    else:
        Repo.clone_from(repo_url, repo_local_dir)

    # Load & patch asv.conf.json
    with asv_conf_path.open(encoding="utf-8") as f:
        asv_conf = json.load(f)

    asv_conf.update(
        repo=str(repo_local_dir.resolve()),
        results_dir=str(results_dir.resolve()),
        html_dir=str(html_dir.resolve()),
    )

    cfg = Config.from_json(asv_conf)
    write_json(path=asv_conf_path, data=cfg.__dict__, api_version=1)

    # Publish the results
    Publish.run(cfg)
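
`publish_repo` leaves a static site in `html_dir`, so one way to inspect a single repo's report locally is Python's built-in HTTP server (a sketch; the directory name is a placeholder):

```bash
# Serve one published report locally; <org>_<repo> is a placeholder.
python -m http.server 8080 --directory analysis/html/<org>_<repo>
# then browse http://localhost:8080
```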

src/datasmith/docker/entrypoint.sh

Lines changed: 1 addition & 0 deletions
@@ -43,6 +43,7 @@ config.results_dir = str(path / 'results')
 config.html_dir = str(path / 'html')
 
 asv.util.write_json('asv.conf.json', config.__dict__, api_version=1)
+asv.util.write_json(path / 'asv.conf.json', config.__dict__, api_version=1)
 "
 
 micromamba create -y -n "asv_${version}" -c conda-forge python="$version" \
