2 changes: 1 addition & 1 deletion .gitignore
@@ -195,4 +195,4 @@ cython_debug/

tokens.env
benchmark_results/
artifacts/
scratch/artifacts/
72 changes: 57 additions & 15 deletions Dockerfile
@@ -1,21 +1,63 @@
# Install uv
FROM python:3.12-slim
COPY --from=ghcr.io/astral-sh/uv:latest /uv /bin/uv
FROM buildpack-deps:jammy

# Change the working directory to the `app` directory
WORKDIR /app
ARG REPO_URL
ARG COMMIT_SHA
# A build script with custom installation commands provided by the user
ARG BUILD_SCRIPT
RUN apt-get update && \
apt-get install -y --no-install-recommends \
curl git build-essential jq && \
rm -rf /var/lib/apt/lists/*

# Copy the lockfile and `pyproject.toml` into the image
COPY uv.lock /app/uv.lock
COPY pyproject.toml /app/pyproject.toml
RUN curl -Ls https://micro.mamba.pm/api/micromamba/linux-64/latest \
| tar -xvj -C /usr/local/bin --strip-components=1 bin/micromamba

# Install dependencies
RUN uv sync --frozen --no-install-project
ENV MAMBA_ROOT_PREFIX=/opt/conda \
PATH=/opt/conda/bin:$PATH \
MAMBA_DOCKERFILE_ACTIVATE=1 \
OPENBLAS_NUM_THREADS=1 \
MKL_NUM_THREADS=1 \
OMP_NUM_THREADS=1

# Copy the project into the image
COPY . /app
RUN micromamba install -y -p $MAMBA_ROOT_PREFIX -c conda-forge \
python=3.10 \
git asv pyperf mamba conda libmambapy jq && \
micromamba clean --all --yes

# Sync the project
RUN uv sync --frozen
RUN mkdir -p /workspace /output
WORKDIR /workspace

CMD [ "python", "datasmith/foo.py" ]
COPY entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh

RUN git clone "${REPO_URL}" repo && \
cd repo && \
git checkout "${COMMIT_SHA}" && \
\
CONF_FILE=$(find . -type f -name "asv.*.json" | head -n 1) && \
if [ -z "${CONF_FILE}" ]; then \
echo "❌ No asv.*.json found." && exit 1; \
fi && \
echo "✅ Using ASV config: ${CONF_FILE}" && \
\
PY_VERS=$(echo "import json, pathlib; \
cfg = pathlib.Path('${CONF_FILE}').read_text(); \
data = json.loads(cfg); \
vers = data.get('pythons') or data.get('python') or []; \
vers = [vers] if isinstance(vers, str) else vers; \
print(' '.join(dict.fromkeys(vers)))" | python -) && \
if [ -z "${PY_VERS}" ]; then \
echo "❌ No Python versions declared in ${CONF_FILE}" && exit 1; \
fi && \
echo "🐍 Creating Conda envs for: ${PY_VERS}" && \
\
for v in ${PY_VERS}; do \
micromamba create -y -n "asv_${v}" -c conda-forge \
python=${v} git mamba conda "libmambapy<=1.9.9"; \
done

WORKDIR /workspace/repo

RUN echo "${BUILD_SCRIPT}" > /workspace/repo/docker_build.sh && \
chmod +x /workspace/repo/docker_build.sh && \
/workspace/repo/docker_build.sh

ENTRYPOINT ["/entrypoint.sh"]
4 changes: 2 additions & 2 deletions Makefile
@@ -16,8 +16,8 @@ backup: ## Create a backup of the datasets, results, and analysis directories
echo "❌ Error: BACKUP_DIR not defined in tokens.env"; exit 1; \
fi; \
mkdir -p "$$BACKUP_DIR"; \
zip -qr "$$BACKUP_DIR/datasmith.bckp" artifacts/benchmark_results artifacts/raw; \
cp -f artifacts/cache.db "$$BACKUP_DIR/datasmith.cache.bckp"; \
zip -qr "$$BACKUP_DIR/datasmith.bckp" scratch/artifacts/benchmark_results scratch/artifacts/raw; \
cp -f scratch/artifacts/cache.db "$$BACKUP_DIR/datasmith.cache.bckp"; \
'

.PHONY: check
74 changes: 39 additions & 35 deletions README.md
@@ -22,7 +22,7 @@ FormulaCode is designed to benchmark the capabilities of large language models (
## Data layout
The general layout of the artifacts is as follows:
```bash
artifacts/
scratch/artifacts
├── raw/ # Raw downloads & lists produced by scripts
│   ├── downloads/ # Per‑repo dashboard archives
│   ├── online_dashboards.jsonl # Updated config for dashboard scraper
@@ -61,7 +61,7 @@ $ cat tokens.env
GH_TOKEN=github_pat_???
COVERALLS_TOKEN=XdK???
CODECOV_TOKEN=54c6???
CACHE_LOCATION=/home/???/formulacode/datasmith/artifacts/cache.db
CACHE_LOCATION=/home/???/formulacode/datasmith/scratch/artifacts/cache.db
BACKUP_DIR=/home/???/formulacode/backup/
```

@@ -74,30 +74,30 @@ FormulaCode Lite is a small dataset of 5 repositories with ~440 performance impr

### Scrape online dashboards

Each of these repositories has a publicly accessible perpetually updating dashboard (e.g. Astropy's dashboard lives [here](https://spacetelescope.github.io/bench/astropy-benchmarks)) that tracks the performance of each commit against various benchmarks. These dashboards were manually curated and placed in a file called `artifacts/raw/online_dashboards.jsonl`.
Each of these repositories has a publicly accessible, continuously updated dashboard (e.g. Astropy's dashboard lives [here](https://spacetelescope.github.io/bench/astropy-benchmarks)) that tracks the performance of each commit against various benchmarks. These dashboards were manually curated and listed in a file called `scratch/artifacts/raw/online_dashboards.jsonl`.

```json
{"url": "https://pv.github.io/scipy-bench/", "output_dir": "artifacts/raw/downloads/scipy"}
{"url": "https://pandas-dev.github.io/asv-runner/", "output_dir": "artifacts/raw/downloads/pandas"}
{"url": "https://scikit-learn.org/scikit-learn-benchmarks/", "output_dir": "artifacts/raw/downloads/sklearn"}
{"url": "https://spacetelescope.github.io/bench/astropy-benchmarks/", "output_dir": "artifacts/raw/downloads/astropy"}
{"url": "https://pv.github.io/numpy-bench/", "output_dir": "artifacts/raw/downloads/numpy"}
{"url": "https://asv-runner.github.io/asv-collection/pandas/", "output_dir": "artifacts/processed/downloads/pandas"}
{"url": "https://pv.github.io/scipy-bench/", "output_dir": "artifacts/processed/downloads/scipy"}
{"url": "https://scikit-learn.org/scikit-learn-benchmarks/", "output_dir": "artifacts/processed/downloads/sklearn"}
{"url": "https://spacetelescope.github.io/bench/astropy-benchmarks/", "output_dir": "artifacts/processed/downloads/astropy"}
{"url": "https://pv.github.io/numpy-bench/", "output_dir": "artifacts/processed/downloads/numpy"}
```
As all these dashboards have the same structure, we developed an ethical scraper that can scrape these dashboards and download the performance data in a structured format. The scraper is invoked using `scripts/download_dataset.py` and can be run as follows:

```bash
$ python scripts/download_dataset.py \
--force \
--dashboards artifacts/raw/online_dashboards.jsonl
--dashboards scratch/artifacts/raw/online_dashboards.jsonl
# machines: 100%|██████████████████████████████████████| 7/7 [00:56<00:00, 8.05s/it]
# Collected 46,143 rows from 805 benchmark files.
# summaries: 100%|█████████████████████████████████| 115/115 [00:09<00:00, 12.56it/s]
# Saved 46,143 benchmark rows and 22,577 summary rows -> /home/???/formulacode/datasmith/artifacts/raw/downloads/sklearn/dashboard.fc.pkl
# Data downloaded to artifacts/raw/downloads/sklearn
# Saved 46,143 benchmark rows and 22,577 summary rows -> /home/???/formulacode/datasmith/scratch/artifacts/processed/downloads/sklearn/dashboard.fc.pkl
# Data downloaded to scratch/artifacts/processed/downloads/sklearn
# ...
```

This should create a directory called `artifacts/raw/downloads` that contains the downloaded data for each repository. The data is stored in a structured format that can be easily processed later. More information about the format is available in `datasmith/benchmark/collection.py`.
This should create a directory called `scratch/artifacts/processed/downloads` that contains the downloaded data for each repository. The data is stored in a structured format that can be easily processed later. More information about the format is available in `datasmith/benchmark/collection.py`.
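Each line of `online_dashboards.jsonl` is an independent JSON object, so the config can be read line by line. A minimal sketch of that parsing (the function name is illustrative, not the actual `download_dataset.py` API):

```python
import json

def load_dashboards(path):
    """Parse the JSONL dashboard config: one {"url", "output_dir"} object per line."""
    entries = []
    with open(path) as fh:
        for line in fh:
            line = line.strip()
            if not line:
                continue  # tolerate blank lines between records
            rec = json.loads(line)
            entries.append((rec["url"], rec["output_dir"]))
    return entries
```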


### 2. Detect performance improving commits
@@ -113,12 +113,12 @@ $ python scripts/detect_breakpoints.py \
--build-reports \
--method rbf \
--compute-coverage \
--dataset artifacts/raw/downloads/astropy/dashboard.fc.pkl
--dataset scratch/artifacts/processed/downloads/astropy/dashboard.fc.pkl
# Found 1,085 potential downward shifts.
# Codecov: 100%|███████████████████████████████| 119/119 [08:50<00:00, 4.46s/commit]
# Building GitHub commit reports and merged dataframe ...
# Reports: 100%|█████████████████████████████████| 40/40 [02:55<00:00, 4.38s/commit]
# Enriched breakpoints saved to '/home/???/formulacode/datasmith/artifacts/raw/downloads/astropy/breakpoints.fc.pkl'.
# Enriched breakpoints saved to '/home/???/formulacode/datasmith/scratch/artifacts/processed/downloads/astropy/breakpoints.fc.pkl'.
```
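The actual script uses an RBF-kernel change-point method (`--method rbf`); as a much cruder illustration of the idea, a downward shift in a benchmark time series can be flagged by comparing the mean runtime before and after each candidate index (a real change-point method would then pick the single best split rather than every index in the transition region):

```python
def downward_shifts(series, window=5, drop=0.10):
    """Flag indices where the mean of the next `window` samples is at least
    `drop` (as a fraction) below the mean of the previous `window` samples."""
    hits = []
    for i in range(window, len(series) - window):
        before = sum(series[i - window:i]) / window
        after = sum(series[i:i + window]) / window
        if before > 0 and (before - after) / before >= drop:
            hits.append(i)
    return hits
```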

The `breakpoints.fc.pkl` collection contains all the information about the detected performance improving commits, a markdown report for each commit with useful hints for the optimizer, and a merged CSV file that contains the performance data for all commits in the repository. These files can then be used in the evaluation harness for benchmarking the performance of an optimizer `[@TODO:link formula-code/evaluation-harness]`.
@@ -145,13 +145,13 @@ To run the script, you need to have a GitHub token with `repo` and `read:org` pe
The scraper can be run using the following command:
```bash
$ python scripts/scrape_repositories.py \
--outfile artifacts/raw/repos_discovered.csv \
--outfile scratch/artifacts/processed/repos_discovered.csv \
--min-stars 500 \
--filtered-outfile artifacts/raw/repos_valid.csv
# Writes artifacts/raw/repos_discovered.csv and artifacts/raw/repos_valid.csv
--filtered-outfile scratch/artifacts/processed/repos_valid.csv
# Writes scratch/artifacts/processed/repos_discovered.csv and scratch/artifacts/processed/repos_valid.csv
```

The `artifacts/raw/repos_valid.csv` file contains the subset of repositories that aren't forks or re-uploads, have at least 500 stars, and pass other sanity checks. We found ~700 filtered repositories for this dataset.
The `scratch/artifacts/processed/repos_valid.csv` file contains the subset of repositories that aren't forks or re-uploads, have at least 500 stars, and pass other sanity checks. We found ~700 filtered repositories for this dataset.
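In spirit, the sanity filter reduces to a predicate over the scraped records. A sketch of that filter, where the record fields (`fork`, `stargazers_count`) are illustrative and not the actual `scrape_repositories.py` schema:

```python
def filter_repos(repos, min_stars=500):
    """Keep repositories that aren't forks and meet the star threshold.
    Field names are hypothetical; the real script applies additional checks."""
    return [
        r for r in repos
        if not r["fork"] and r["stargazers_count"] >= min_stars
    ]
```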


### 4. Collect relevant commits for all repositories
@@ -160,13 +160,13 @@ Given the list of repositories, we find the subset of commits that have already

```bash
$ python scripts/collect_commits.py \
--dashboards artifacts/raw/repos_valid.csv \
--outfile artifacts/raw/commits_all.jsonl \
--dashboards scratch/artifacts/raw/repos_valid.csv \
--outfile scratch/artifacts/raw/commits_all.jsonl \
--max-pages 50
$ python scripts/filter_commits.py \
--filtered-benchmarks-pth artifacts/raw/repos_valid.csv \
--merged-commits-pth artifacts/raw/commits_all.jsonl \
--output-pth artifacts/raw/commits_filtered.jsonl \
--filtered-benchmarks-pth scratch/artifacts/raw/repos_valid.csv \
--merged-commits-pth scratch/artifacts/raw/commits_all.jsonl \
--output-pth scratch/artifacts/raw/commits_filtered.jsonl \
--max-repos 350 \
--threads 8 \
--procs 8
@@ -178,6 +178,8 @@ $ python scripts/filter_commits.py \

Once we've collected the relevant commits, we can benchmark their performance using `asv`. `asv` includes many quality-of-life features to ensure that benchmarks are robust to noise and that the results are reproducible. Our script benchmarks multiple commits in parallel. Proper benchmarking requires some system tuning. Refer to the [asv tuning guidelines](https://asv.readthedocs.io/en/latest/tuning.html) for more details.

The `dependency_recommendations.json` file is a dictionary of recommended dependencies per package. Each key is a `pandas.query` expression over the `filtered-commits` dataframe, and the value is a list of dependencies to install before running the benchmarks. For example, certain commits in the `scikit_learn_scikit_learn` repository require `numpy==1.22.0` to run properly. This is a stop-gap solution to ensure that the benchmarks run correctly.
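A toy version of that lookup, using an in-memory dict rather than the real file and handling only simplified `col == 'value'` queries (the actual script would evaluate the keys with `DataFrame.query`):

```python
# Illustrative only: keys are pandas.query-style expressions over the
# filtered-commits dataframe; values are dependencies to pin beforehand.
DEP_RECS = {
    "repo == 'scikit_learn_scikit_learn'": ["numpy==1.22.0"],
}

def deps_for(row, recs):
    """Collect extra dependencies whose (simplified) query matches this row."""
    extra = []
    for query, deps in recs.items():
        col, _, value = query.partition(" == ")
        if str(row.get(col)) == value.strip("'"):
            extra.extend(deps)
    return extra
```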

```bash
# in a root shell:
(sudo) $ export OPENBLAS_NUM_THREADS=1
@@ -186,24 +188,25 @@ Once we've collected the relevant commits, we can benchmark their performance us
(sudo) $ sudo python -m pyperf system tune
# in userspace:
$ python scripts/benchmark_commits.py \
--filtered-commits artifacts/raw/commits_filtered.jsonl \
--max-concurrency 15 \
--num-cores 4 \
--filtered-commits scratch/artifacts/raw/commits_filtered.jsonl \
--dep-recs scratch/artifacts/raw/dependency_recommendations.json \
--max-concurrency 30 \
--num-cores 2 \
--asv-args "--interleave-rounds --append-samples -a rounds=2 -a repeat=2" \
--output-dir artifacts/benchmark_results/
--output-dir scratch/artifacts/benchmark_results/
```

Generally, each benchmark takes ~2 minutes to run, so benchmarking 70,000 commits on 16 dedicated 4-core machines takes around 6 days. The script will create a directory called `artifacts/benchmark_results/` that contains the results of the benchmarks for each commit. The results are stored in a structured format that can be easily processed later.
Generally, each benchmark takes ~2 minutes to run, so benchmarking 70,000 commits on 16 dedicated 4-core machines takes around 6 days. The script will create a directory called `scratch/artifacts/benchmark_results/` that contains the results of the benchmarks for each commit. The results are stored in a structured format that can be easily processed later.
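The ~6-day figure follows from a back-of-envelope calculation, assuming each machine works through its share of commits serially:

```python
# Back-of-envelope check of the quoted throughput (ignores per-machine concurrency).
commits = 70_000
minutes_per_commit = 2
machines = 16

total_minutes = commits * minutes_per_commit   # 140,000 machine-minutes
days = total_minutes / machines / (60 * 24)    # wall-clock days
print(round(days, 1))  # -> 6.1
```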

### 6. Collate benchmark results

This step aggregates the benchmark results and generates the `*.fc.pkl` file. The `detect_breakpoints.py` script can then be used unchanged to detect performance improving commits. The script can be run as follows:

```bash
$ python scripts/collate_benchmark_results.py \
--results-dir artifacts/benchmark_results/results \
--output-dir artifacts/benchmark_results/published/ \
--commit-metadata artifacts/raw/commits_filtered.jsonl \
--results-dir scratch/artifacts/benchmark_results/results \
--output-dir scratch/artifacts/benchmark_results/published/ \
--commit-metadata scratch/artifacts/raw/commits_filtered.jsonl \
--default-machine-name "docker"
# machines: 100%|██████████████████████████████████████████████| 1/1 [00:00<00:00, 1.53it/s]
# Collected 53,705 rows from 115 benchmark files.
Expand All @@ -214,7 +217,7 @@ $ python scripts/detect_breakpoints.py \
--build-reports \
--method rbf \
--compute-coverage \
--dataset artifacts/benchmark_results/published/html/scikit-learn_scikit-learn/dashboard.fc.pkl
--dataset scratch/artifacts/benchmark_results/published/html/scikit-learn_scikit-learn/dashboard.fc.pkl
# ...
```

@@ -226,9 +229,9 @@ How closely do our benchmarked metrics match the original performance improvemen

```bash
$ python scripts/replication_experiment.py \
--dataset1 artifacts/benchmark_results/published/html/scikit-learn_scikit-learn/breakpoints.fc.pkl \
--dataset2 artifacts/raw/downloads/sklearn/breakpoints.fc.pkl \
--output-dir artifacts/replication/
--dataset1 scratch/artifacts/benchmark_results/published/html/scikit-learn_scikit-learn/breakpoints.fc.pkl \
--dataset2 scratch/artifacts/raw/downloads/sklearn/breakpoints.fc.pkl \
--output-dir scratch/artifacts/replication/
```
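At its core the replication check matches breakpoint commits across the two datasets and measures how well they agree. A toy version of that overlap metric (the mapping shape is hypothetical; the real inputs are `.fc.pkl` collections):

```python
def agreement(ds1, ds2):
    """Fraction of commits flagged in ds1 that are also flagged in ds2.
    ds1/ds2 are {commit_sha: speedup} mappings in this sketch."""
    if not ds1:
        return 0.0
    shared = set(ds1) & set(ds2)
    return len(shared) / len(ds1)
```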
### Pipeline flowchart

@@ -295,3 +298,4 @@ flowchart TD
- [ ] FormulaCode: `asv` supports profiling the benchmarking function. We should collect such profiling data for all commits in the dataset.
- [ ] FormulaCode: In `search_commits` replace the endpoint with `"/search/issues?q=type:pr+is:merged+repo:{repo_name}&per_page={per_page}&page={page}&advanced_search=true` endpoint to use each query more efficiently.
- [ ] FormulaCode: Make an object oriented API for the dataset. Do not rely on a folder structure.
- [ ] FormulaCode Docker: derive the relative path from `asv.conf.json` instead of assuming the root directory is the base directory.
5 changes: 0 additions & 5 deletions artifacts/raw/online_dashboards.jsonl

This file was deleted.

12 changes: 12 additions & 0 deletions scratch/artifacts/raw/online_dashboards.jsonl
@@ -0,0 +1,12 @@
{"url": "https://asv-runner.github.io/asv-collection/pandas/", "output_dir": "artifacts/processed/downloads/pandas"}
{"url": "https://asv-runner.github.io/asv-collection/xarray/", "output_dir": "artifacts/processed/downloads/xarray"}
{"url": "https://asv-runner.github.io/asv-collection/distributed/", "output_dir": "artifacts/processed/downloads/distributed"}
{"url": "https://asv-runner.github.io/asv-collection/pymc3/", "output_dir": "artifacts/processed/downloads/pymc3"}
{"url": "https://asv-runner.github.io/asv-collection/scikit-image/", "output_dir": "artifacts/processed/downloads/scikit-image"}
{"url": "https://asv-runner.github.io/asv-collection/joblib/", "output_dir": "artifacts/processed/downloads/joblib"}
{"url": "https://asv-runner.github.io/asv-collection/arrow/", "output_dir": "artifacts/processed/downloads/arrow"}
{"url": "https://pv.github.io/scipy-bench/", "output_dir": "artifacts/processed/downloads/scipy"}
{"url": "https://scikit-learn.org/scikit-learn-benchmarks/", "output_dir": "artifacts/processed/downloads/sklearn"}
{"url": "https://spacetelescope.github.io/bench/astropy-benchmarks/", "output_dir": "artifacts/processed/downloads/astropy"}
{"url": "https://pv.github.io/numpy-bench/", "output_dir": "artifacts/processed/downloads/numpy"}
{"url": "https://asv-runner.github.io/asv-collection/dask/", "output_dir": "artifacts/processed/downloads/dask"}
Binary file added scratch/notebooks/cache.db