Skip to content

Commit 65f2408

Browse files
committed
Large scale benchmarking works.
1 parent 9a5eadb commit 65f2408

8 files changed

Lines changed: 1181 additions & 50 deletions

File tree

scratch/merged_context_registry_2025-09-04T23:54:53.035665.json

Lines changed: 1073 additions & 0 deletions
Large diffs are not rendered by default.

scratch/scripts/benchmark_commits.py

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,14 +10,16 @@
1010
import pickle
1111
import shutil
1212
from collections import defaultdict
13+
from concurrent.futures import ThreadPoolExecutor, as_completed
1314
from pathlib import Path
1415

1516
import asv
1617
import pandas as pd
1718

1819
from datasmith.benchmark.collection import BenchmarkCollection
19-
from datasmith.docker.context import ContextRegistry, DockerContext, Task
20+
from datasmith.docker.context import ContextRegistry, DockerContext, Task, build_base_image
2021
from datasmith.docker.orchestrator import (
22+
build_repo_sha_image,
2123
get_docker_client,
2224
orchestrate,
2325
)
@@ -131,12 +133,16 @@ def process_inputs(args: argparse.Namespace) -> dict[tuple[str, str], set[tuple[
131133
return all_states
132134

133135

134-
def main(args: argparse.Namespace) -> None:
135-
client = get_docker_client()
136+
def main(args: argparse.Namespace) -> None: # noqa: C901
137+
client = get_docker_client(args.max_concurrency)
136138
all_states = process_inputs(args)
137139
context_registry = ContextRegistry.load_from_file(path=args.context_registry)
138140
interim_path = Path(os.environ["CACHE_LOCATION"]).parent / "interim" # Look here for cached docker contexts
139141

142+
logger.info("Building base image...")
143+
base_tag = build_base_image(client, DockerContext())
144+
os.environ["DOCKER_CACHE_FROM"] = base_tag
145+
140146
# Prepare tasks
141147
tasks: list[tuple[Task, DockerContext]] = []
142148
repo_commit_pairs = defaultdict(list)
@@ -199,6 +205,18 @@ def main(args: argparse.Namespace) -> None:
199205
already_benchmarked = list(filter(lambda x: (interim_path / f"{x[0].get_container_name()}.json").exists(), tasks))
200206
logger.info("Skipping %d tasks that have already been benchmarked", len(already_benchmarked))
201207

208+
# build the containers.
209+
builds = []
210+
with ThreadPoolExecutor(max_workers=args.max_concurrency) as pool:
211+
futures = [
212+
pool.submit(build_repo_sha_image, client, ctx, task, args.force_rebuild, run_id="CANARY-BUILD")
213+
for (task, ctx) in tasks
214+
]
215+
for fut in as_completed(futures):
216+
builds.append(fut.result())
217+
218+
to_benchmark = [t for (t, b) in zip(tasks, builds) if b.rc == 0]
219+
202220
machine_args: dict[str, str] = asv.machine.Machine.get_defaults() # pyright: ignore[reportAttributeAccessIssue]
203221
machine_args["num_cpu"] = str(args.num_cores)
204222
files_by_image: dict[Task, dict[str, str]] = asyncio.run(
@@ -227,6 +245,12 @@ def main(args: argparse.Namespace) -> None:
227245

228246
logger.info("Benchmark results saved to %s", output_file)
229247

248+
# remove all images with CANARY-BUILD tag.
249+
try:
250+
client.images.prune(filters={"label": "datasmith.run=CANARY-BUILD"})
251+
except Exception:
252+
logger.exception("Failed to prune images with CANARY-BUILD tag")
253+
230254

231255
if __name__ == "__main__":
232256
args = parse_args()

scratch/scripts/synthesize_contexts.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import argparse
44
import datetime
55
import json
6+
import os
67
from concurrent.futures import ThreadPoolExecutor, as_completed
78
from pathlib import Path
89

@@ -12,7 +13,7 @@
1213
from datasmith.agents.config import configure_agent_backends
1314
from datasmith.agents.context_synthesis import agent_build_and_validate
1415
from datasmith.benchmark.collection import BenchmarkCollection
15-
from datasmith.docker.context import ContextRegistry
16+
from datasmith.docker.context import ContextRegistry, DockerContext, build_base_image
1617
from datasmith.docker.orchestrator import get_docker_client
1718
from datasmith.docker.validation import Task, _err_lock
1819
from datasmith.logging_config import configure_logging
@@ -131,6 +132,10 @@ def main(args: argparse.Namespace) -> None:
131132
else ContextRegistry()
132133
)
133134

135+
logger.info("Building base image...")
136+
base_tag = build_base_image(client, DockerContext())
137+
os.environ["DOCKER_CACHE_FROM"] = base_tag
138+
134139
# Prepare tasks
135140
tasks = prepare_tasks(all_states, args.limit_per_repo, context_registry)
136141

src/datasmith/agents/context_synthesis.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ def fast_cleanup_run_artifacts( # noqa: C901
7575
for iid in img_ids:
7676
try:
7777
logger.debug("Removing image id=%s", iid[:20])
78-
client.images.remove(iid, force=True, noprune=False)
78+
client.images.remove(iid, force=True, noprune=True)
7979
except (ImageNotFound, NotFound):
8080
pass
8181
except APIError as e:
@@ -400,7 +400,7 @@ def build_once_with_context(
400400
run_labels: dict[str, str],
401401
probe: bool = False,
402402
pull: bool = False,
403-
force: bool = True,
403+
force: bool = False,
404404
) -> BuildResult:
405405
logger.info("build_once_with_context: registering context key=%s", task.get_image_name())
406406
logger.debug(
@@ -487,7 +487,7 @@ def agent_build_and_validate( # noqa: C901
487487
tail_chars=args.tail_chars,
488488
probe=True,
489489
pull=True,
490-
force=True, # If the env is already present, don't rebuild (saves time)
490+
force=False, # If the env is already present, don't rebuild (saves time)
491491
run_labels=run_labels,
492492
)
493493
if not env_res.ok:
@@ -582,7 +582,7 @@ def agent_build_and_validate( # noqa: C901
582582
sha=task.sha,
583583
timeout_s=args.build_timeout,
584584
tail_chars=args.tail_chars * 2,
585-
force=True, # Always rebuild package image to pick up new script
585+
force=False, # NOTE(review): was "Always rebuild package image to pick up new script", but force is now False — image is reused when present; confirm updated scripts still propagate
586586
run_labels=run_labels,
587587
)
588588
attempts.append(AttemptRecord(attempt_idx=i, building_data=script, build_result=build_res))

src/datasmith/docker/Dockerfile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# syntax=docker/dockerfile:1.7
22

3-
FROM buildpack-deps:jammy AS base
3+
ARG BASE_IMAGE=buildpack-deps:jammy
4+
FROM ${BASE_IMAGE} AS base
45

56
RUN apt-get update && \
67
apt-get install -y --no-install-recommends \

src/datasmith/docker/context.py

Lines changed: 55 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,25 @@
2424
logger = get_logger("docker.context")
2525

2626

27+
def build_base_image(client: docker.DockerClient, ctx: DockerContext) -> str:
    """Build (or reuse) the shared base image for *ctx* and return its tag.

    The tag is derived from a content digest of the context's files, so
    identical contexts map to the same image name across processes and the
    ``force=False`` build below can reuse a previously built image.

    Args:
        client: Docker client used for the streaming build.
        ctx: DockerContext whose Dockerfile/scripts define the base layer.

    Returns:
        The tag of the (possibly pre-existing) base image.

    Raises:
        RuntimeError: if the streaming build fails.
    """
    import hashlib  # local import: keeps this edit self-contained

    # BUG FIX: the builtin hash() is salted per interpreter run
    # (PYTHONHASHSEED), so `hash(ctx)` yields a different tag every process,
    # defeating image reuse. Use a stable content digest instead.
    digest = hashlib.sha256(
        "\x00".join(
            repr(part)  # repr() tolerates None fields (e.g. base_building_data)
            for part in (
                ctx.dockerfile_data,
                ctx.entrypoint_data,
                ctx.building_data,
                ctx.env_building_data,
                ctx.base_building_data,
            )
        ).encode("utf-8")
    ).hexdigest()[:16]
    base_tag = f"asv-base-rev-{digest}:base"

    res = ctx.build_container_streaming(
        client=client,
        image_name=base_tag,
        build_args={},
        probe=True,  # base layer only; no package build script needed
        force=False,  # reuse an existing image with this content-derived tag
        pull=False,
        timeout_s=1800,
    )
    if not res.ok:
        # BUG FIX: logger.exception() is only meaningful inside an `except`
        # block (it attaches the active traceback); none is active here.
        logger.error("Failed to build base image %s error=%s", base_tag, res.stderr_tail)
        raise RuntimeError("Failed to build base image")
    return base_tag
44+
45+
2746
@dataclass
2847
class BuildResult:
2948
ok: bool
@@ -141,43 +160,32 @@ def __init__(
141160
self.base_building_data = base_building_data
142161
self.building_data = building_data
143162

163+
@staticmethod
def add_bytes(tar: tarfile.TarFile, name: str, data: bytes, mode: int = 0o644) -> None:
    """Append *data* to *tar* as a file entry called *name*.

    Ownership and timestamp fields are zeroed so the resulting tarball is
    byte-for-byte reproducible regardless of when or by whom it is built.
    """
    member = tarfile.TarInfo(name=name)
    member.size = len(data)
    member.mode = mode
    # Deterministic metadata: fixed epoch, root ownership, no user names.
    member.mtime = 0
    member.uid = 0
    member.gid = 0
    member.uname = ""
    member.gname = ""
    tar.addfile(member, io.BytesIO(data))
172+
144173
def build_tarball_stream(self, probe: bool = False) -> io.BytesIO:
145174
tar_stream = io.BytesIO()
146175
with tarfile.open(fileobj=tar_stream, mode="w") as tar:
147176
# Add Dockerfile
148-
dockerfile_bytes = self.dockerfile_data.encode("utf-8")
149-
dockerfile_info = tarfile.TarInfo(name="Dockerfile")
150-
dockerfile_info.size = len(dockerfile_bytes)
151-
tar.addfile(dockerfile_info, io.BytesIO(dockerfile_bytes))
152-
177+
DockerContext.add_bytes(tar, "Dockerfile", self.dockerfile_data.encode("utf-8"))
153178
# Add entrypoint.sh
154-
entrypoint_data = self.entrypoint_data.encode("utf-8")
155-
entrypoint_info = tarfile.TarInfo(name="entrypoint.sh")
156-
entrypoint_info.size = len(entrypoint_data)
157-
entrypoint_info.mode = 0o755 # Make it executable
158-
tar.addfile(entrypoint_info, io.BytesIO(entrypoint_data))
159-
179+
DockerContext.add_bytes(tar, "entrypoint.sh", self.entrypoint_data.encode("utf-8"), mode=0o755)
160180
# Add docker_build_env.sh
161-
env_building_data = self.env_building_data.encode("utf-8")
162-
env_building_info = tarfile.TarInfo(name="docker_build_env.sh")
163-
env_building_info.size = len(env_building_data)
164-
env_building_info.mode = 0o755 # Make it executable
165-
tar.addfile(env_building_info, io.BytesIO(env_building_data))
181+
DockerContext.add_bytes(tar, "docker_build_env.sh", self.env_building_data.encode("utf-8"), mode=0o755)
166182

167183
# Add docker_build_base.sh
168-
base_building_data = self.base_building_data.encode("utf-8")
169-
base_building_info = tarfile.TarInfo(name="docker_build_base.sh")
170-
base_building_info.size = len(base_building_data)
171-
base_building_info.mode = 0o755 # Make it executable
172-
tar.addfile(base_building_info, io.BytesIO(base_building_data))
184+
DockerContext.add_bytes(tar, "docker_build_base.sh", self.base_building_data.encode("utf-8"), mode=0o755)
173185

174186
if not probe:
175187
# Add docker_build_pkg.sh
176-
building_data = self.building_data.encode("utf-8")
177-
building_info = tarfile.TarInfo(name="docker_build_pkg.sh")
178-
building_info.size = len(building_data)
179-
building_info.mode = 0o755 # Make it executable
180-
tar.addfile(building_info, io.BytesIO(building_data))
188+
DockerContext.add_bytes(tar, "docker_build_pkg.sh", self.building_data.encode("utf-8"), mode=0o755)
181189

182190
# Reset the stream position to the beginning
183191
tar_stream.seek(0)
@@ -216,6 +224,11 @@ def build_container(
216224
pass # Image doesn't exist or was removed, proceed to build
217225

218226
if not image_exists:
227+
cache_from = None
228+
if base_image := os.environ.get("DOCKER_CACHE_FROM", None):
229+
build_args = {**build_args, "BASE_IMAGE": base_image}
230+
cache_from = [base_image]
231+
219232
if len(build_args):
220233
build_args_str = " --build-arg ".join(f"{k}={v}" for k, v in build_args.items())
221234
logger.info("$ docker build -t %s src/datasmith/docker/ --build-arg %s", image_name, build_args_str)
@@ -229,6 +242,7 @@ def build_container(
229242
rm=True,
230243
labels=run_labels,
231244
network_mode=os.environ.get("DOCKER_NETWORK_MODE", None),
245+
cache_from=cache_from,
232246
)
233247
except DockerException:
234248
logger.exception("Failed to build Docker image '%s'", image_name)
@@ -289,6 +303,11 @@ def build_container_streaming( # noqa: C901
289303
stdout_buf: deque[str] = deque(maxlen=2000) # chunk-tail buffers
290304
stderr_buf: deque[str] = deque(maxlen=2000)
291305

306+
cache_from = None
307+
if base_image := os.environ.get("DOCKER_CACHE_FROM", None):
308+
build_args = {**build_args, "BASE_IMAGE": base_image}
309+
cache_from = [base_image]
310+
292311
# Pretty log line for transparency
293312
if build_args:
294313
build_args_str = " --build-arg ".join(f"{k}={v}" for k, v in build_args.items())
@@ -308,6 +327,7 @@ def build_container_streaming( # noqa: C901
308327
target=target,
309328
labels=run_labels,
310329
network_mode=os.environ.get("DOCKER_NETWORK_MODE", None),
330+
cache_from=cache_from,
311331
)
312332
except DockerException:
313333
logger.exception("Failed to initiate build for '%s'", image_name)
@@ -424,6 +444,15 @@ def from_dict(cls, data: Mapping[str, Any]) -> DockerContext:
424444
base_building_data=data.get("base_building_data", None),
425445
)
426446

447+
def __hash__(self) -> int:
    """Content-based hash: contexts with identical file payloads hash alike.

    NOTE(review): the builtin hash() is salted per interpreter run, so this
    value is stable only within a single process — do not persist it.
    """
    fingerprint = (
        self.dockerfile_data,
        self.entrypoint_data,
        self.building_data,
        self.env_building_data,
        self.base_building_data,
    )
    return hash(fingerprint)
455+
427456

428457
class ContextRegistry:
429458
"""Registry for Docker contexts keyed by owner/repo[/sha], independent of tag.

src/datasmith/docker/orchestrator.py

Lines changed: 13 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
from docker.errors import APIError, DockerException, ImageNotFound, NotFound
1717
from docker.models.containers import Container
1818

19-
from datasmith.docker.context import BuildResult, ContextRegistry, DockerContext, Task
19+
from datasmith.docker.context import BuildResult, DockerContext, Task
2020
from datasmith.logging_config import get_logger
2121

2222
logger = get_logger("docker.orchestrator")
@@ -118,10 +118,10 @@ async def _guard_loop(
118118
await asyncio.wait_for(stop_event.wait(), timeout=interval_s)
119119

120120

121-
def get_docker_client(max_concurrency: int = 10) -> docker.DockerClient:
    """Return an authenticated Docker client or exit with an error.

    *max_concurrency* sizes the client's HTTP connection pool so that
    parallel build/run threads do not starve waiting for a free connection.
    """
    try:
        client = docker.from_env(timeout=60, max_pool_size=max_concurrency)
    except DockerException as exc:
        sys.exit(f"Could not connect to Docker daemon: {exc}")
    else:
        return client
127127

@@ -151,21 +151,20 @@ def build_repo_image(client: docker.DockerClient, image_name: str, repo_url: str
151151

152152

153153
def build_repo_sha_image(
    client: docker.DockerClient, docker_ctx: DockerContext, task: Task, force: bool = False, run_id: str | None = None
) -> BuildResult:
    """Build the per-(repo, sha) benchmark image for *task* using *docker_ctx*.

    Args:
        client: Docker client used for the streaming build.
        docker_ctx: Context providing the Dockerfile and build scripts.
        task: Identifies owner/repo/sha; ``task.sha`` must be set.
        force: When True, rebuild even if an image with this name exists.
        run_id: Run label value used to group (and later prune) images;
            falls back to "unknown" when not given.

    Returns:
        The BuildResult from the streaming build (inspect ``rc``/``ok``).
    """
    assert task.sha is not None, "Task.sha must be set"  # noqa: S101
    # NOTE(review): www.github.com relies on a redirect; plain github.com is
    # canonical — confirm before changing, callers may depend on the exact URL.
    repo_url = f"https://www.github.com/{task.owner}/{task.repo}"
    build_res: BuildResult = docker_ctx.build_container_streaming(
        client=client,
        image_name=task.get_image_name(),
        build_args={"REPO_URL": repo_url, "COMMIT_SHA": task.sha},
        probe=False,
        # BUG FIX: `force` was accepted but hard-coded to False here, so a
        # caller's force_rebuild flag was silently ignored.
        force=force,
        timeout_s=1800,  # 30 minutes
        tail_chars=10_000,
        pull=False,
        run_labels=gen_run_labels(task, runid="unknown" if run_id is None else run_id),
    )
    return build_res
171170

@@ -214,7 +213,7 @@ def _launch() -> tuple[int, dict[str, str]]: # noqa: C901
214213
image_name=task.get_image_name(),
215214
build_args={"REPO_URL": repo_url, "COMMIT_SHA": task.sha},
216215
probe=False,
217-
force=True,
216+
force=False,
218217
timeout_s=1800, # 30 minutes
219218
tail_chars=10_000,
220219
pull=False,
@@ -299,7 +298,7 @@ def _launch() -> tuple[int, dict[str, str]]: # noqa: C901
299298
logger.exception("Failed to remove container %s", task.get_container_name())
300299
pass
301300
try:
302-
client.images.remove(image=task.get_image_name(), force=True, noprune=False)
301+
client.images.remove(image=task.get_image_name(), force=True, noprune=True)
303302
except Exception:
304303
logger.exception("Failed to remove image %s", task.get_image_name())
305304
pass

src/datasmith/docker/validation.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,7 @@ def validate_one( # noqa: C901
193193
"REPO_URL": f"https://www.github.com/{task.owner}/{task.repo}",
194194
"COMMIT_SHA": task.sha,
195195
},
196-
force=True, # preserve your original behavior
196+
force=False,
197197
timeout_s=args.build_timeout,
198198
tail_chars=args.tail_chars,
199199
pull=False,

0 commit comments

Comments
 (0)