
Commit 28ba04c

update benchmarking code

1 parent cc4855c

6 files changed
Lines changed: 1030 additions & 129 deletions


scratch/merged_context_registry_2025-09-04T08:32:08.486247.json

Lines changed: 845 additions & 0 deletions
Large diffs are not rendered by default.

scratch/notebooks/collect_perf_commits.ipynb

Lines changed: 23 additions & 28 deletions
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 2,
    "id": "13626f75",
    "metadata": {},
    "outputs": [
@@ -12,6 +12,13 @@
      "text": [
       "/mnt/sdd1/atharvas/formulacode/datasmith\n"
      ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "03:16:55 WARNING simple_useragent.core: Falling back to historic user agent.\n"
+     ]
     }
    ],
    "source": [
@@ -28,7 +35,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 3,
    "id": "b4179f19",
    "metadata": {},
    "outputs": [],
@@ -48,7 +55,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 4,
    "id": "6624689c",
    "metadata": {},
    "outputs": [
@@ -218,7 +225,7 @@
       "4 numpy/numpy-financial "
      ]
     },
-    "execution_count": 6,
+    "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -231,7 +238,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 5,
    "id": "79905eb5",
    "metadata": {},
    "outputs": [
@@ -291,17 +298,7 @@
    "execution_count": null,
    "id": "567cdaa5",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "10:28:37 INFO datasmith.docker.context: Context registry saved to scratch/artifacts/pipeflush/chunk_0/context_registry.json\n",
-      "10:28:41 INFO datasmith.docker.context: Context registry saved to scratch/artifacts/pipeflush/chunk_1/context_registry.json\n",
-      "10:28:53 INFO datasmith.docker.context: Context registry saved to scratch/artifacts/pipeflush/chunk_2/context_registry.json\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "# break parquet into three chunks with fixed ratios.\n",
     "ratios = [64, 56, 127]\n",
@@ -319,31 +316,29 @@
     "\n",
     "chunks = [df1, df2, df3]\n",
     "cmds = []\n",
-    "for i, (chunk, ratio) in enumerate(zip(chunks, ratios)):\n",
+    "for i, (_, ratio) in enumerate(zip(chunks, ratios)):\n",
     "    pth = Path(f\"scratch/artifacts/pipeflush/chunk_{i}/commits_perfonly.parquet\")\n",
     "    pth.parent.mkdir(parents=True, exist_ok=True)\n",
-    "    chunk.to_parquet(pth)\n",
+    "    # chunk.to_parquet(pth)\n",
     "    # Make a new context registry:\n",
-    "    cr.save_to_file(pth.parent / \"context_registry.json\")\n",
+    "    # cr.save_to_file(pth.parent / \"context_registry.json\")\n",
     "    cmd_i = cmd.format(output_dir=pth.parent, ncpus=(ratio // 2))\n",
     "    cmds.append(cmd_i)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": null,
    "id": "3fafdd1c",
    "metadata": {},
    "outputs": [
    {
-    "ename": "NameError",
-    "evalue": "name 'cmds' is not defined",
-    "output_type": "error",
-    "traceback": [
-     "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
-     "\u001b[31mNameError\u001b[39m Traceback (most recent call last)",
-     "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[8]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[33m\"\u001b[39m.join(\u001b[43mcmds\u001b[49m).replace(\u001b[33m\"\u001b[39m\u001b[33m \u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33m\"\u001b[39m))\n",
-     "\u001b[31mNameError\u001b[39m: name 'cmds' is not defined"
+    "name": "stdout",
+    "output_type": "stream",
+    "text": [
+     "python scratch/scripts/synthesize_contexts.py --commits scratch/artifacts/pipeflush/chunk_0/commits_perfonly.parquet --output-dir scratch/artifacts/pipeflush/chunk_0/results_synthesis/ --context-registry scratch/artifacts/pipeflush/chunk_0/context_registry.json --max-workers 32 --limit-per-repo 2 --max-attempts 3 --max-steps 10\n",
+     "python scratch/scripts/synthesize_contexts.py --commits scratch/artifacts/pipeflush/chunk_1/commits_perfonly.parquet --output-dir scratch/artifacts/pipeflush/chunk_1/results_synthesis/ --context-registry scratch/artifacts/pipeflush/chunk_1/context_registry.json --max-workers 28 --limit-per-repo 2 --max-attempts 3 --max-steps 10\n",
+     "python scratch/scripts/synthesize_contexts.py --commits scratch/artifacts/pipeflush/chunk_2/commits_perfonly.parquet --output-dir scratch/artifacts/pipeflush/chunk_2/results_synthesis/ --context-registry scratch/artifacts/pipeflush/chunk_2/context_registry.json --max-workers 63 --limit-per-repo 2 --max-attempts 3 --max-steps 10\n"
    ]
   }
  ],
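Note: the `df1`/`df2`/`df3` chunks and the `cmd` template referenced in this cell are defined in cells outside the hunk. A minimal sketch of a proportional three-way split, with a hypothetical `df` standing in for the commits frame (the notebook's actual splitting cell is not shown):

import pandas as pd

# Hypothetical stand-in for the commits_perfonly frame loaded earlier.
df = pd.DataFrame({"repo_name": ["octo/demo"] * 247, "sha": [f"{i:07x}" for i in range(247)]})

# Split proportionally to the per-chunk CPU budgets used above.
ratios = [64, 56, 127]
total = sum(ratios)
cuts = [round(len(df) * sum(ratios[: i + 1]) / total) for i in range(len(ratios) - 1)]
df1, df2, df3 = df.iloc[: cuts[0]], df.iloc[cuts[0] : cuts[1]], df.iloc[cuts[1] :]
assert len(df1) + len(df2) + len(df3) == len(df)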

scratch/scripts/benchmark_commits.py

Lines changed: 60 additions & 45 deletions
@@ -8,25 +8,24 @@
 import os
 import pickle
 import shutil
-from concurrent.futures import ThreadPoolExecutor, as_completed
+from collections import defaultdict
 from pathlib import Path
 
 import asv
 import pandas as pd
 
 from datasmith.benchmark.collection import BenchmarkCollection
-from datasmith.docker.context import ContextRegistry
+from datasmith.docker.context import ContextRegistry, DockerContext, Task
 from datasmith.docker.orchestrator import (
-    build_repo_sha_image,
     get_docker_client,
     orchestrate,
 )
-from datasmith.docker.validation import BuildResult, Task
+from datasmith.execution.collect_commits_offline import find_parent_releases
 from datasmith.logging_config import configure_logging
 from datasmith.scrape.utils import _parse_commit_url
 
-# logger = configure_logging(level=logging.DEBUG, stream=open(Path(__file__).with_suffix(".log"), "w"))
-logger = configure_logging(level=logging.DEBUG)
+logger = configure_logging(level=logging.DEBUG, stream=open(Path(__file__).with_suffix(".log"), "w"))  # noqa: SIM115
+# logger = configure_logging(level=logging.DEBUG)
 
 
 def parse_args() -> argparse.Namespace:
@@ -107,11 +106,13 @@ def process_inputs(args: argparse.Namespace) -> dict[tuple[str, str], set[tuple[
             else:
                 all_states[(owner, repo)].add((sha, 0.0))
     elif args.commits:
-        commits = pd.read_json(args.commits, lines=True)
+        commits = (
+            pd.read_json(args.commits, lines=True) if args.commits.suffix == ".jsonl" else pd.read_parquet(args.commits)
+        )
         all_states = {}
         for _, row in commits.iterrows():
             repo_name = row["repo_name"]
-            sha = row["commit_sha"]
+            sha = row["sha"]
             has_asv = row.get("has_asv", True)
             if not has_asv:
                 logger.debug(f"Skipping {repo_name} commit {sha} as it does not have ASV benchmarks.")
@@ -135,16 +136,30 @@ def main(args: argparse.Namespace) -> None:
     context_registry = ContextRegistry.load_from_file(path=args.context_registry)
 
     # Prepare tasks
-    tasks: list[Task] = []
+    tasks: list[tuple[Task, DockerContext]] = []
+    repo_commit_pairs = defaultdict(list)
     for (owner, repo), uniq in all_states.items():
         limited = list(uniq)[: max(0, args.limit_per_repo)] if args.limit_per_repo > 0 else list(uniq)
         for sha, date in limited:
             task = Task(owner, repo, sha, commit_date=date)
             if task in context_registry:
-                tasks.append(task)
+                tasks.append((task, context_registry.get(task)))
+                repo_commit_pairs[f"{owner}/{repo}"].append(task)
+                # also add the parent commit.
             else:
                 logger.debug(f"main: skipping {task} as not in context registry")
 
+    # get all parent commits and add them as tasks as well.
+    for repo_name, tsks in repo_commit_pairs.items():
+        owner, repo = repo_name.split("/")
+        shas = [t.sha for t in tsks]
+        parent_commits = find_parent_releases(repo_name, shas, add_first=True, incl_datetime=True)
+        for i, (parent_sha, date) in enumerate(parent_commits):
+            parent_task = Task(owner=owner, repo=repo, sha=parent_sha, commit_date=date)  # pyright: ignore[reportArgumentType]
+            # use the child context.
+            ctx = context_registry.get(tsks[i])
+            tasks.append((parent_task, ctx))
+
     max_concurrency = (
         args.max_concurrency if args.max_concurrency != -1 else max(4, math.floor(0.5 * (os.cpu_count() or 1)))
     )
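The parent-release pass pairs each parent commit with the Docker context of the child it was derived from, which is only sound if `find_parent_releases` returns one `(sha, date)` tuple per input sha, in input order. A toy illustration of that index alignment (all names below are stand-ins, not the datasmith API):

# Stand-ins: child shas and their (parent_sha, date) results, index-aligned.
child_shas = ["c1", "c2", "c3"]
parent_commits = [("p1", 1690000000.0), ("p2", 1691000000.0), ("p3", 1692000000.0)]
child_context = {"c1": "ctx-a", "c2": "ctx-b", "c3": "ctx-c"}  # sha -> build context

tasks = []
for i, (parent_sha, date) in enumerate(parent_commits):
    # The parent's benchmark run reuses the child commit's container context.
    tasks.append((parent_sha, child_context[child_shas[i]]))

assert tasks[0] == ("p1", "ctx-a")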
@@ -170,44 +185,44 @@ def main(args: argparse.Namespace) -> None:
     }
     logger.debug("main: machine_defaults keys=%d", len(machine_defaults))
 
-    builds: list[BuildResult] = []
-    if args.max_concurrency < 1:
-        for t in tasks:
-            build_res: BuildResult = build_repo_sha_image(
-                client=client,
-                context_registry=context_registry,
-                task=t,
-                force=args.force_rebuild,
-            )
-            builds.append(build_res)
-    else:
-        with ThreadPoolExecutor(max_workers=args.max_concurrency) as pool:
-            futures = [
-                pool.submit(
-                    build_repo_sha_image,
-                    client,
-                    context_registry,
-                    task,
-                    args.force_rebuild,
-                )
-                for task in tasks
-            ]
-            for fut in as_completed(futures):
-                builds.append(fut.result())
-
-    successful_builds = [b for b in builds if b.rc != 1]
-
-    logger.info("Running benchmarks for %d images", len(successful_builds))
-    logger.info("Failed builds for %d images", len(builds) - len(successful_builds))
-    for b in builds:
-        if b.rc == 1:
-            logger.warning("Build failed for %s", b.image_name)
+    # builds: list[BuildResult] = []
+    # if args.max_concurrency < 1:
+    #     for t in tasks:
+    #         build_res: BuildResult = build_repo_sha_image(
+    #             client=client,
+    #             context_registry=context_registry,
+    #             task=t,
+    #             force=args.force_rebuild,
+    #         )
+    #         builds.append(build_res)
+    # else:
+    #     with ThreadPoolExecutor(max_workers=args.max_concurrency) as pool:
+    #         futures = [
+    #             pool.submit(
+    #                 build_repo_sha_image,
+    #                 client,
+    #                 context_registry,
+    #                 task,
+    #                 args.force_rebuild,
+    #             )
+    #             for task in tasks
+    #         ]
+    #         for fut in as_completed(futures):
+    #             builds.append(fut.result())
+
+    # successful_builds = [b for b in builds if b.rc != 1]
+
+    # logger.info("Running benchmarks for %d images", len(successful_builds))
+    # logger.info("Failed builds for %d images", len(builds) - len(successful_builds))
+    # for b in builds:
+    #     if b.rc == 1:
+    #         logger.warning("Build failed for %s", b.image_name)
 
     machine_args: dict[str, str] = asv.machine.Machine.get_defaults()  # pyright: ignore[reportAttributeAccessIssue]
     machine_args["num_cpu"] = str(args.num_cores)
-    files_by_image: dict[str, dict[str, str]] = asyncio.run(
+    files_by_image: dict[Task, dict[str, str]] = asyncio.run(
         orchestrate(
-            docker_image_names=[b.image_name for b in successful_builds],
+            contexts=tasks,
             asv_args=asv_args,
             machine_args=machine_args,
             max_concurrency=max_concurrency,
@@ -217,7 +232,7 @@ def main(args: argparse.Namespace) -> None:
         )
     )
     # save the files by image as a pickle file.
-    with open(output_dir / "files_by_image.pkl", "wb") as f:
+    with open(output_dir / "files_by_image.json", "wb") as f:
         pickle.dump(files_by_image, f)
 
     # save the files by image as a JSON file
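One caveat worth flagging: the output filename now ends in .json, but the payload is still written with pickle.dump (the trailing comment suggests a JSON copy is saved separately). If a true JSON artifact is the intent, the dict keys (now Task objects per the new annotation) would need string conversion first. A hedged sketch of both serializations; the paths and data below are illustrative only:

import json
import pickle
from pathlib import Path

output_dir = Path("scratch/artifacts/demo")  # illustrative path
output_dir.mkdir(parents=True, exist_ok=True)

# Toy payload: keys imitate Task-like tuples, values are per-image file maps.
files_by_image = {("numpy", "numpy-financial", "abc1234"): {"results.json": "..."}}

# Pickle round-trips arbitrary keys but is not JSON; a .pkl suffix avoids confusion.
with open(output_dir / "files_by_image.pkl", "wb") as f:
    pickle.dump(files_by_image, f)

# A JSON copy needs string keys.
with open(output_dir / "files_by_image.json", "w") as f:
    json.dump({"/".join(k): v for k, v in files_by_image.items()}, f, indent=2)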

src/datasmith/agents/context_synthesis.py

Lines changed: 25 additions & 27 deletions
@@ -15,6 +15,7 @@
 
 from datasmith.agents.tool_executor import ContainerToolExecutor
 from datasmith.docker.context import BuildResult, ContextRegistry, DockerContext
+from datasmith.docker.orchestrator import gen_run_labels
 from datasmith.docker.validation import Task
 
 logger = logging.getLogger(__name__)
@@ -23,6 +24,7 @@
 
 def remove_containers_by_label(client: docker.DockerClient, run_id: str) -> None:
     with contextlib.suppress(Exception):
         for c in client.containers.list(all=True, filters={"label": f"datasmith.run={run_id}"}):
+            logger.debug("Removing container %s", c.name)
             c.remove(force=True)
 
 
@@ -32,6 +34,7 @@ def remove_images_by_label(client: docker.DockerClient, run_id: str) -> None:
     imgs = client.images.list(filters={"label": f"datasmith.run={run_id}"})
     for img in imgs:
         try:
+            logger.debug("Removing image %s (%s)", img.tags, img.id)
             client.images.remove(img.id, force=True, noprune=False)
         except (ImageNotFound, NotFound):
             pass
@@ -42,14 +45,6 @@ def remove_images_by_label(client: docker.DockerClient, run_id: str) -> None:
             pass
 
 
-def gen_run_labels(t: Task, runid: str) -> dict[str, str]:
-    return {
-        "datasmith.run": runid,
-        "datasmith.task": f"{t.owner}/{t.repo}",
-        "datasmith.sha": t.sha if t.sha else "unknown",
-    }
-
-
 def _preview(s: str, n: int = 160) -> str:
     s = s or ""
     s = s.replace("\n", "\\n")
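`gen_run_labels` now lives in `datasmith.docker.orchestrator` (see the import added above); the label scheme it emits is unchanged. A minimal sketch of how such labels drive bulk cleanup with the Docker SDK, mirroring `remove_containers_by_label` (the run id and label values here are stand-ins):

import docker

run_id = "run-1234"  # stand-in run identifier
labels = {
    "datasmith.run": run_id,
    "datasmith.task": "numpy/numpy-financial",
    "datasmith.sha": "abc1234",
}

client = docker.from_env()
# Containers started with labels=labels can later be found and removed in bulk.
for c in client.containers.list(all=True, filters={"label": f"datasmith.run={run_id}"}):
    c.remove(force=True)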
@@ -645,24 +640,27 @@ def agent_build_and_validate(  # noqa: C901
         with contextlib.suppress(Exception):
             tool_exec.shutdown()
 
-        run_id = run_labels.get("datasmith.run", "unknown")
-        remove_containers_by_label(client, run_id)
-        for name in [
-            task.with_tag("env").get_container_name(),
-            task.with_tag("pkg").get_container_name(),
-            f"{task.with_tag('env').get_container_name()}-{run_id[:8]}",
-            f"{task.with_tag('pkg').get_container_name()}-{run_id[:8]}",
-        ]:
-            with contextlib.suppress(Exception, NotFound):
-                c = client.containers.get(name)
-                c.remove(force=True)
-
-        remove_images_by_label(client, run_id)
-        for tag in [task.with_tag("env").get_image_name(), task.with_tag("pkg").get_image_name()]:
-            with contextlib.suppress(NotFound, ImageNotFound):
-                client.images.remove(tag, force=True, noprune=False)
-
         try:
-            client.images.prune(filters={"dangling": True})
+            run_id = run_labels.get("datasmith.run", "unknown")
+            remove_containers_by_label(client, run_id)
+            for name in [
+                task.with_tag("env").get_container_name(),
+                task.with_tag("pkg").get_container_name(),
+                f"{task.with_tag('env').get_container_name()}-{run_id[:8]}",
+                f"{task.with_tag('pkg').get_container_name()}-{run_id[:8]}",
+            ]:
+                with contextlib.suppress(Exception, NotFound):
+                    c = client.containers.get(name)
+                    c.remove(force=True)
+
+            remove_images_by_label(client, run_id)
+            for tag in [task.with_tag("env").get_image_name(), task.with_tag("pkg").get_image_name()]:
+                with contextlib.suppress(NotFound, ImageNotFound):
+                    client.images.remove(tag, force=True, noprune=False)
+
+            try:
+                client.images.prune(filters={"dangling": True})
+            except Exception:
+                logger.exception("image prune failed")
         except Exception:
-            logger.exception("image prune failed")
+            logger.exception("agent_build_and_validate: cleanup error")
