seqra · Saloed · Jun 18, 2026 · Jun 17, 2026 · Jun 17, 2026 · Jun 17, 2026
diff --git a/.github/workflows/regression.yaml b/.github/workflows/regression.yaml
@@ -253,7 +253,7 @@ jobs:
             --filter   "${{ inputs.projects_filter }}" \
             --misses-only misses.json)
           echo "matrix=$matrix" >> "$GITHUB_OUTPUT"
-          has=$(python -c "import json;print('true' if json.loads('''$matrix''')['include'] else 'false')")
+          has=$(printf '%s' "$matrix" | python -c "import json, sys; print('true' if json.load(sys.stdin)['include'] else 'false')")
           echo "has_cells=$has" >> "$GITHUB_OUTPUT"
 
   analyze:
@@ -296,14 +296,18 @@ jobs:
           echo "key=$key" >> "$GITHUB_OUTPUT"
       - name: Run analysis
         id: ra
+        env:
+          SCAN_FLAGS_JSON: ${{ toJson(matrix.scan_flags) }}
         run: |
           set +e
           python scripts/run_analysis.py \
-            --build-dir   build \
-            --project-dir project-root \
-            --results-dir results-bundle \
-            --max-memory  "${{ matrix.max_memory }}" \
-            --timeout     1200
+            --build-dir       build \
+            --project-dir     project-root \
+            --results-dir     results-bundle \
+            --max-memory      "${{ matrix.max_memory }}" \
+            --timeout         "${{ matrix.compilation_timeout }}" \
+            --extensions-dir  projects/extensions \
+            --scan-flags-json "$SCAN_FLAGS_JSON"
           rc=$?
           echo "rc=$rc" >> "$GITHUB_OUTPUT"
           # Exit 0 so cache/upload steps still run; we fail the job at the end

diff --git a/README.md b/README.md
@@ -40,6 +40,7 @@ Full diff detail is available in the `regression-diff` artifact.
 | --------------------------------- | ------------------------------------------------------------- |
 | `.github/workflows/regression.yaml` | Workflow: resolve → probe → build → analyze → compare.     |
 | `projects/repos.yaml`             | Benchmark project list (name, git URL, pinned head, etc.).   |
+| `projects/extensions/`            | Files (passthroughs, approximations, custom rules…) referenced by per-project `scan-flags`. |
 | `scripts/build_opentaint.sh`      | Build analyzer + autobuilder JARs and Go CLI from a checkout.|
 | `scripts/generate_matrix.py`      | Expand `repos.yaml` into a GH Actions matrix.                |
 | `scripts/run_analysis.py`         | Run opentaint `compile` + `scan`, extract analyzer status.   |
@@ -48,6 +49,24 @@ Full diff detail is available in the `regression-diff` artifact.
 | `tests/`                          | Unit tests for pure-Python logic. Run `python -m pytest tests`. |
 | `test-system-design-plan.md`      | Design document (authoritative spec).                        |
 
+## Per-project fields (`repos.yaml`)
+
+Each entry requires `name`, `git`, and `head` (a pinned commit/tag SHA). These
+optional fields tune one project:
+
+| Field                 | Default | Purpose                                                      |
+| --------------------- | ------- | ------------------------------------------------------------ |
+| `java-version`        | `17`    | JDK the project is compiled against (`actions/setup-java`).  |
+| `max-memory`          | `8G`    | Analyzer scan memory ceiling.                                |
+| `compilation-timeout` | `1200`  | Wall-clock limit, in seconds, for the autobuilder compile step; the scan gets the same limit minus the usual margin. Raise it for large reactors that pull from slow mirrors (e.g. `ruoyi-vue-pro`, `yudao-cloud`). |
+
+The autobuilder's Maven invocation runs with download-retry/timeout system
+properties (`run_analysis.py:maven_resilient_env`) so a transient mirror hiccup
+(e.g. an HTTP 502 from a project-pinned mirror) is retried rather than failing
+the job. A project whose build is deterministically broken at its pinned `head`
+(e.g. an inconsistent dev SNAPSHOT, or a non-Java module that fails to build) is
+commented out with a `QUARANTINED` note explaining the cause.
+
 ## Caching
 
 Per-project results (SARIF + `status.json` + analyzer log) are cached in GitHub
@@ -73,6 +92,69 @@ cd new-test
 python -m pytest tests -v
 ```
 
+## Per-project `opentaint scan` flags
+
+Each entry in `projects/repos.yaml` may declare a `scan-flags` list whose
+tokens are appended verbatim to the `opentaint scan` invocation. Use the
+literal substring `{ext}` to reference files shipped in `projects/extensions/`
+— the runner substitutes it with that directory's absolute path. Since the
+substitution is plain string replacement, the resolved path may point at
+either a **file** or a **directory** — whichever the underlying flag accepts
+(e.g. `--passthrough-approximations` takes a YAML file or a directory and
+`--dataflow-approximations` takes a directory; both may be repeated):
+
+```yaml
+- name: spring-petclinic
+  git: https://github.com/spring-projects/spring-petclinic.git
+  head: 3e1ce239f4488f20abda24441388a515ea55a815
+  scan-flags:
+    - --passthrough-approximations              # single YAML file
+    - "{ext}/spring-petclinic/passthroughs.yaml"
+    - --passthrough-approximations              # …or repeat with a directory
+    - "{ext}/spring-petclinic/passthroughs"
+    - --dataflow-approximations                 # directory of approximations
+    - "{ext}/spring-petclinic/approximations"
+    - --rule-id
+    - java.taint.sql-injection
+```
+
+Flags reserved by the runner (`--analyzer-jar`, `--project-model`,
+`--output`, `--timeout`, `--max-memory`, `--debug`, `--experimental`) must
+not be repeated here. `--ruleset` is **not** reserved — if `scan-flags`
+contains no `--ruleset`, the runner inserts a default pointing at the
+staged source-tree rule pack at `<build>/rules` (copied from
+`opentaint/rules/ruleset` at build time). Supplying any `--ruleset` value
+puts the project in full control of which rule packs are loaded and in
+what order.
+
+The literal value `builtin` is **rewritten by the runner** to that same
+staged pack — the CLI would otherwise try to fetch the pack from a GitHub
+release that does not exist for in-development opentaint SHAs (and the
+current CLI's URL is malformed, producing a 404 against
+`api.github.com/repos/seqra/seqra/opentaint/…`). Example — stack
+`builtin` with a custom YAML file and a custom rules directory:
+
+```yaml
+scan-flags:
+  - --ruleset
+  - builtin                                          # → <build>/rules (staged)
+  - --ruleset
+  - "{ext}/my-project/rules/sql-injection.yaml"     # custom YAML file
+  - --ruleset
+  - "{ext}/my-project/rules"                        # custom rules directory
+```
+
+Two placeholders are expanded inside `scan-flags`:
+
+* `{ext}`    → absolute path of `projects/extensions/`.
+* `{rules}`  → absolute path of the staged source-tree pack at
+  `<build>/rules`. Because `builtin` is already rewritten to this path, the
+  placeholder is mostly redundant for `--ruleset` values, but it remains
+  useful for other flags that take a rules directory.
+
+See [`projects/extensions/README.md`](projects/extensions/README.md) for
+the layout convention and the rationale behind the `builtin` rewrite.
+
 ## Open items
 
 See `test-system-design-plan.md` §10. The exact spelling of the

diff --git a/projects/extensions/README.md b/projects/extensions/README.md
@@ -0,0 +1,161 @@
+# Project extensions
+
+This directory holds files referenced by the `scan-flags` of projects in
+[`../repos.yaml`](../repos.yaml). It is handed to the analyzer runner as the
+**extensions directory** (`--extensions-dir`) and can contain anything
+`opentaint scan` knows how to consume — most commonly:
+
+| Subject                        | Typical `opentaint scan` flag      | Accepts                          |
+| ------------------------------ | ---------------------------------- | -------------------------------- |
+| Pass-through approximations    | `--passthrough-approximations`     | YAML file **or** directory       |
+| Dataflow approximations        | `--dataflow-approximations`        | Class directory **or** Java sources directory |
+| Custom YAML rules              | `--ruleset`                        | YAML file **or** directory of `*.yml`/`*.yaml` |
+| Rule-id filter                 | `--rule-id`                        | rule-id string (repeatable)      |
+
+All path-valued flags above are *repeatable*; each accepts the path kinds
+listed in its **Accepts** column — group as many or as few entries under
+`{ext}/...` as you like, then point the flag at a file or directory it accepts.
+
+## Layout convention
+
+Group files by project name to keep things tidy. Either point a flag at one
+file, or at a directory and let opentaint pick up everything inside it:
+
+```
+projects/extensions/
+├── README.md
+├── <project-name>/
+│   ├── passthroughs/          # ← pass directory to --passthrough-approximations
+│   │   ├── jackson.yaml
+│   │   └── spring.yaml
+│   ├── single-passthrough.yaml # ← or pass one YAML file
+│   └── approximations/        # ← pass directory to --dataflow-approximations
+│       └── ...
+└── shared/
+    └── ...
+```
+
+## Referencing extension files from `repos.yaml`
+
+Use the literal token `{ext}` inside `scan-flags` — the runner substitutes it
+with the absolute path of this directory at analysis time. The substitution
+is pure string replacement, so the resolved path can point at a **file** or a
+**directory**, whichever the flag accepts:
+
+```yaml
+- name: spring-petclinic
+  git: https://github.com/spring-projects/spring-petclinic.git
+  head: 3e1ce239f4488f20abda24441388a515ea55a815
+  scan-flags:
+    # Single YAML file:
+    - --passthrough-approximations
+    - "{ext}/spring-petclinic/single-passthrough.yaml"
+    # Whole directory of passthrough YAMLs (also valid):
+    - --passthrough-approximations
+    - "{ext}/spring-petclinic/passthroughs"
+    # Approximation classes / Java sources directory:
+    - --dataflow-approximations
+    - "{ext}/spring-petclinic/approximations"
+    # Plain flags without path arguments work too:
+    - --rule-id
+    - java.taint.sql-injection
+```
+
+The `--passthrough-approximations` and `--dataflow-approximations` flags are
+repeatable — add the flag multiple times in `scan-flags` to point at several
+files or directories.
+
+Flags that don't reference any extension file (e.g. `--rule-id`,
+`--code-flow-limit`) work just as well — they're appended verbatim to the
+`opentaint scan` invocation.
+
+## Custom rulesets
+
+The analyzer's `--ruleset` is a `stringArray` (default `[builtin]` per
+`opentaint scan --help`). Each value is either:
+
+* The literal **`builtin`** — normally fetched by the CLI from a GitHub
+  release tagged `rules/<version>`. The bench tests in-development
+  opentaint SHAs whose rule packs are not (yet) released, so the runner
+  **intercepts** this sentinel and points the analyzer at the same
+  source-tree pack that would have been packaged into the release.
+* A path to a YAML file, or a directory of `*.yml` / `*.yaml` files.
+  Reported by the CLI under `User ruleset`.
+
+The runner's policy:
+
+* If `scan-flags` contains no `--ruleset`, the runner inserts a default
+  pointing at the staged source-tree pack — same effect as `builtin`.
+* If `scan-flags` supplies one or more `--ruleset` tokens, they are
+  passed verbatim except that every `builtin` value is rewritten to the
+  staged source-tree pack's absolute path. The project owns the list.
+
+Example — use `builtin` plus a custom YAML file and a custom directory:
+
+```yaml
+- name: spring-petclinic
+  git: https://github.com/spring-projects/spring-petclinic.git
+  head: 3e1ce239f4488f20abda24441388a515ea55a815
+  scan-flags:
+    # Rewritten by the runner to `<build-dir>/rules` (the staged pack):
+    - --ruleset
+    - builtin
+    # A single custom YAML file:
+    - --ruleset
+    - "{ext}/spring-petclinic/rules/sql-injection.yaml"
+    # …and a whole directory of `*.yaml` / `*.yml` rule files:
+    - --ruleset
+    - "{ext}/spring-petclinic/rules"
+```
+
+Resulting analyzer command (conceptually):
+`--ruleset <build>/rules --ruleset <ext>/.../sql-injection.yaml --ruleset <ext>/.../rules`.
+
+Use `--rule-id` to narrow which rules from those sets actually run.
+
+### Why we rewrite `builtin`
+
+The CLI resolves `--ruleset builtin` by fetching
+`https://api.github.com/repos/<org>/<repo>/releases/tags/rules/<version>`.
+That path is unusable in the bench for two reasons:
+
+1. The bench analyses in-development SHAs whose rule packs may not be
+   published as a release.
+2. The current CLI's URL template duplicates the org segment, producing a
+   404 (`api.github.com/repos/seqra/seqra/opentaint/…`).
+
+`build_opentaint.sh` copies `opentaint/rules/ruleset` — the same YAMLs that
+would have been packaged into the release — into `<build-dir>/rules`.
+From the analyzer's perspective the rules are identical; only the CLI's
+coverage report labels the pack as `User ruleset` rather than `Bundled`,
+which is purely cosmetic.
+
+### The `{rules}` placeholder
+
+`{rules}` is an explicit alias for the same staged pack. After the
+`builtin` rewrite described above it is mostly redundant, but it remains
+useful in tokens that aren't `--ruleset` values (for example, if a future
+flag accepts a rules directory):
+
+```yaml
+  scan-flags:
+    - --some-future-flag
+    - "{rules}/foo.yaml"
+```
+
+## Reserved flags
+
+The runner already sets these and you should **not** repeat them in
+`scan-flags`:
+
+- `--analyzer-jar`
+- `--project-model`
+- `--output`
+- `--timeout`
+- `--max-memory`
+- `--debug`
+- `--experimental`
+
+`--ruleset` is **not** reserved. The runner only inserts a default
+`--ruleset` pointing at the staged source-tree pack when the project omits
+the flag entirely; supplying any `--ruleset` value disables the default.
diff --git a/...sticsearch/com/example/approximations/co_elastic_clients_elasticsearch/BulkOperation.java b/...sticsearch/com/example/approximations/co_elastic_clients_elasticsearch/BulkOperation.java
@@ -0,0 +1,26 @@
+package com.example.approximations.co_elastic_clients_elasticsearch;
+
+import org.opentaint.ir.approximation.annotation.Approximate;
+import org.opentaint.jvm.dataflow.approximations.ArgumentTypeContext;
+
+import java.util.function.Function;
+
+/**
+ * Dataflow approximation for
+ * co.elastic.clients.elasticsearch.core.bulk.BulkOperation.
+ *
+ * of(fn) applies fn to a fresh BulkOperation.Builder, then builds the
+ * ObjectBuilder that fn returns; taint fn wrote into it reaches the result.
+ */
+@Approximate(co.elastic.clients.elasticsearch.core.bulk.BulkOperation.class)
+public class BulkOperation {
+
+    public static co.elastic.clients.elasticsearch.core.bulk.BulkOperation of(
+            @ArgumentTypeContext Function fn) throws Throwable {
+        co.elastic.clients.elasticsearch.core.bulk.BulkOperation.Builder builder =
+                new co.elastic.clients.elasticsearch.core.bulk.BulkOperation.Builder();
+        co.elastic.clients.util.ObjectBuilder ob =
+                (co.elastic.clients.util.ObjectBuilder) fn.apply(builder);
+        return (co.elastic.clients.elasticsearch.core.bulk.BulkOperation) ob.build();
+    }
+}
diff --git a/...arch/com/example/approximations/co_elastic_clients_elasticsearch/ElasticsearchClient.java b/...arch/com/example/approximations/co_elastic_clients_elasticsearch/ElasticsearchClient.java
@@ -0,0 +1,61 @@
+package com.example.approximations.co_elastic_clients_elasticsearch;
+
+import org.opentaint.ir.approximation.annotation.Approximate;
+import org.opentaint.jvm.dataflow.approximations.ArgumentTypeContext;
+
+import java.util.function.Function;
+
+/**
+ * Dataflow approximation for co.elastic.clients.elasticsearch.ElasticsearchClient
+ * fluent request methods (count / get / search).
+ *
+ * Each method hands a fresh request Builder to the supplied function, which
+ * configures it (closing over user-controlled data) and returns an
+ * ObjectBuilder<XxxRequest>; the real client builds the request and sends it.
+ * The tainted request reaching Elasticsearch is the propagation of interest.
+ * We model it by building the request and copying request values into the
+ * returned response: get() copies id/index and search() copies q into
+ * scrollId, while count() only derives the numeric count from q's length.
+ */
+@Approximate(co.elastic.clients.elasticsearch.ElasticsearchClient.class)
+public class ElasticsearchClient {
+
+    // count(Function): build the CountRequest from the lambda. CountResponse
+    // has no String field to carry the request value back, so the built request
+    // is the propagation endpoint; count is just q's length (0 when q is null).
+    public co.elastic.clients.elasticsearch.core.CountResponse count(
+            @ArgumentTypeContext Function fn) throws Throwable {
+        co.elastic.clients.elasticsearch.core.CountRequest.Builder builder =
+                new co.elastic.clients.elasticsearch.core.CountRequest.Builder();
+        co.elastic.clients.util.ObjectBuilder ob =
+                (co.elastic.clients.util.ObjectBuilder) fn.apply(builder);
+        final co.elastic.clients.elasticsearch.core.CountRequest request =
+                (co.elastic.clients.elasticsearch.core.CountRequest) ob.build();
+        return co.elastic.clients.elasticsearch.core.CountResponse.of(
+                b -> b.count(request.q() == null ? 0L : (long) request.q().length()));
+    }
+
+    public co.elastic.clients.elasticsearch.core.GetResponse get(
+            @ArgumentTypeContext Function fn, Class cls) throws Throwable {
+        co.elastic.clients.elasticsearch.core.GetRequest.Builder builder =
+                new co.elastic.clients.elasticsearch.core.GetRequest.Builder();
+        co.elastic.clients.util.ObjectBuilder ob =
+                (co.elastic.clients.util.ObjectBuilder) fn.apply(builder);
+        final co.elastic.clients.elasticsearch.core.GetRequest request =
+                (co.elastic.clients.elasticsearch.core.GetRequest) ob.build();
+        return co.elastic.clients.elasticsearch.core.GetResponse.of(
+                b -> b.id(request.id()).index(request.index()).found(true));
+    }
+
+    public co.elastic.clients.elasticsearch.core.SearchResponse search(
+            @ArgumentTypeContext Function fn, Class cls) throws Throwable {
+        co.elastic.clients.elasticsearch.core.SearchRequest.Builder builder =
+                new co.elastic.clients.elasticsearch.core.SearchRequest.Builder();
+        co.elastic.clients.util.ObjectBuilder ob =
+                (co.elastic.clients.util.ObjectBuilder) fn.apply(builder);
+        final co.elastic.clients.elasticsearch.core.SearchRequest request =
+                (co.elastic.clients.elasticsearch.core.SearchRequest) ob.build();
+        return co.elastic.clients.elasticsearch.core.SearchResponse.of(
+                b -> b.scrollId(request.q()));
+    }
+}