|
3 | 3 | import argparse |
4 | 4 | import json |
5 | 5 | import re |
| 6 | +import tempfile |
6 | 7 | from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor |
7 | 8 | from pathlib import Path |
8 | 9 |
|
9 | 10 | import pandas as pd |
| 11 | +from git import Repo |
10 | 12 | from tqdm.auto import tqdm |
11 | 13 |
|
12 | | -from datasmith.execution.utils import _get_commit_info, find_file_in_tree |
| 14 | +from datasmith.execution.utils import _get_commit_info_offline, find_file_in_tree |
13 | 15 | from datasmith.logging_config import configure_logging |
14 | 16 |
|
15 | 17 | # Configure logging for the script |
@@ -37,10 +39,11 @@ def _asv_conf_worker(repo_name: str) -> str | None: |
37 | 39 | return find_file_in_tree(repo_name, "asv.conf.json") |
38 | 40 |
|
39 | 41 |
|
def _commit_info_worker(arg_tuple: tuple[Repo, str]) -> dict | None:
    """Wrapper for ProcessPool: unpack ``(repo, sha)`` and fetch commit metadata.

    ``Executor.map`` hands each worker a single positional argument, so the
    repository handle and the commit SHA travel together as one tuple.

    Args:
        arg_tuple: ``(repo, sha)`` pair, where ``repo`` is a ``git.Repo``
            handle (not a repository name) and ``sha`` is the commit to
            inspect.

    Returns:
        Whatever ``_get_commit_info_offline`` returns for that commit
        (annotated here as a dict, or ``None``).
    """
    repo, sha = arg_tuple
    return _get_commit_info_offline(repo, sha)
44 | 47 |
|
45 | 48 |
|
46 | 49 | NON_CORE_PATTERNS = re.compile( |
@@ -107,27 +110,51 @@ def main() -> None: |
107 | 110 | commits = commits.merge(benchmarks, how="right", on="repo_name") |
108 | 111 | commits = commits.dropna(subset=["commit_sha"]) |
109 | 112 |
|
110 | | - with ProcessPoolExecutor(max_workers=args.procs) as pp: |
111 | | - commits["commit_info"] = list( |
112 | | - tqdm( |
113 | | - pp.map(_commit_info_worker, commits[["repo_name", "commit_sha"]].itertuples(index=False, name=None)), |
114 | | - total=len(commits), |
115 | | - desc="Fetching commit metadata", |
| 113 | + all_repo_names = set(commits["repo_name"]) |
| 114 | + |
| 115 | + # download all repos to a temp dir |
| 116 | + with tempfile.TemporaryDirectory(prefix="gh-repos-") as td: |
| 117 | + all_repos = {} |
| 118 | + for repo_name in tqdm(all_repo_names, desc="Cloning repos"): |
| 119 | + repo_name = repo_name.strip("/") |
| 120 | + owner, name = repo_name.split("/", 1) |
| 121 | + path = Path(td) / f"{owner}__{name}.git" |
| 122 | + repo = Repo.clone_from( |
| 123 | + f"https://github.com/{repo_name}.git", |
| 124 | + path, |
| 125 | + bare=True, |
| 126 | + # multi_options=["--filter=tree:0"], |
| 127 | + multi_options=["--filter=blob:none"], |
| 128 | + quiet=True, |
| 129 | + ) |
| 130 | + all_repos[repo_name] = repo |
| 131 | + |
| 132 | + commit_info_args: list[tuple[Repo, str]] = [] |
| 133 | + for repo_name, commit_sha in commits[["repo_name", "commit_sha"]].itertuples(index=False, name=None): |
| 134 | + repo = all_repos[repo_name] |
| 135 | + commit_info_args.append((repo, commit_sha)) |
| 136 | + |
| 137 | + with ProcessPoolExecutor(max_workers=args.procs) as pp: |
| 138 | + commits["commit_info"] = list( |
| 139 | + tqdm( |
| 140 | + pp.map(_commit_info_worker, commit_info_args), |
| 141 | + total=len(commits), |
| 142 | + desc="Fetching commit metadata", |
| 143 | + ) |
116 | 144 | ) |
117 | | - ) |
118 | 145 |
|
119 | | - commit_meta = pd.json_normalize(commits.pop("commit_info")) |
120 | | - commits = pd.concat([commits, commit_meta], axis=1) |
121 | | - commits = commits.dropna(subset=["asv_conf_path", "sha", "date", "message"]) |
122 | | - commits = commits[commits["files_changed"].apply(has_core_file)].reset_index(drop=True) |
| 146 | + commit_meta = pd.json_normalize(commits.pop("commit_info")) |
| 147 | + commits = pd.concat([commits, commit_meta], axis=1) |
| 148 | + commits = commits.dropna(subset=["asv_conf_path", "sha", "date", "message"]) |
| 149 | + commits = commits[commits["files_changed"].apply(has_core_file)].reset_index(drop=True) |
123 | 150 |
|
124 | 151 | out_path = Path(args.output_pth) |
125 | 152 | if not out_path.parent.exists(): |
126 | 153 | out_path.parent.mkdir(parents=True, exist_ok=True) |
127 | 154 | # commits.to_csv(out_path, index=False) |
128 | 155 | commits.to_json(out_path, orient="records", lines=True, index=False) |
129 | 156 |
|
130 | | - logger.info(f"✔ Wrote {len(commits):,} rows → {out_path}") |
| 157 | + logger.info("✔ Wrote %s rows → %s", len(commits), out_path) |
131 | 158 |
|
132 | 159 |
|
133 | 160 | if __name__ == "__main__": |
|
0 commit comments