From 9b07623e4541b183643ed02026d2daafea08be4a Mon Sep 17 00:00:00 2001 From: Szymon Janikowski Date: Fri, 8 May 2026 18:08:24 +0200 Subject: [PATCH] Add draft-to-design-doc eval comparing three variant levels MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a quick benchmark project under `evals/draft-to-design-doc/` that runs Claude Code at three levels of scaffolding on the same Markdown design draft and produces a Design Doc JSON for inspection. Variants: - vanilla — minimal CLAUDE.md, no plugin/MCP. Reads draft.md + schema-reference.md and writes /app/output/design-doc.json directly. - guided — extended prompt mirroring noesis:analyze-design-draft's extract-design-model.md. Same task as vanilla. - noesis — full noesis plugin installed via Dockerfile (bun install on ubuntu:24.04 — needs GLIBC 2.39 for lbug native module). noesis-graph MCP server registered through [[environment.mcp_servers]]. The skill is invoked on /app/draft.md. Both task sandboxes clone DDD-starter-dotnet into /app/repo/ so the agent can inspect what is already implemented and classify model elements correctly as modified vs added. `tests/test.sh` is TDD-style: ONE specification (byte-for-byte identical between the two task copies) asserts the expected correct ChangeSet shape: - Sales BC in boundedContexts.modified (already in /app/repo/Sources/Sales) - Sales.Pricing.Discounts module in modules.modified (already exists) - ThresholdDiscount in buildingBlocks.added (genuinely new) - Discount in buildingBlocks.modified (existing union gains a new variant) Currently noesis-graph:save_design_doc enforces a green-field shape, so all three variants fail this test. The failure IS the signal — when the green-field constraint is lifted, runs producing the correct shape will start passing. `assessment_dimensions.json` is intentionally empty for now — quality grading is left to manual inspection until the dimensions are designed. 
Use `prepare-context.sh` to stage plugin sources into the noesis Dockerfile's build context before running the noesis variant; see README.md for invocation. Co-Authored-By: Claude Opus 4.7 (1M context) --- evals/draft-to-design-doc/.gitignore | 3 + evals/draft-to-design-doc/README.md | 64 ++++ .../assessment_dimensions.json | 3 + evals/draft-to-design-doc/nasde.toml | 20 ++ .../draft.md | 103 ++++++ .../environment/.dockerignore | 6 + .../environment/.gitignore | 1 + .../environment/Dockerfile | 68 ++++ .../environment/prepare-context.sh | 32 ++ .../instruction.md | 61 ++++ .../task.toml | 38 +++ .../tests/test.sh | 92 ++++++ .../threshold-discount-extraction/draft.md | 103 ++++++ .../environment/Dockerfile | 20 ++ .../instruction.md | 41 +++ .../schema-reference.md | 309 ++++++++++++++++++ .../threshold-discount-extraction/task.toml | 25 ++ .../tests/test.sh | 92 ++++++ .../variants/guided/CLAUDE.md | 63 ++++ .../variants/guided/harbor_config.json | 15 + .../variants/guided/variant.toml | 2 + .../variants/noesis/CLAUDE.md | 7 + .../variants/noesis/harbor_config.json | 14 + .../variants/noesis/variant.toml | 2 + .../variants/vanilla/CLAUDE.md | 7 + .../variants/vanilla/harbor_config.json | 15 + .../variants/vanilla/variant.toml | 2 + 27 files changed, 1208 insertions(+) create mode 100644 evals/draft-to-design-doc/.gitignore create mode 100644 evals/draft-to-design-doc/README.md create mode 100644 evals/draft-to-design-doc/assessment_dimensions.json create mode 100644 evals/draft-to-design-doc/nasde.toml create mode 100644 evals/draft-to-design-doc/tasks/threshold-discount-extraction-noesis/draft.md create mode 100644 evals/draft-to-design-doc/tasks/threshold-discount-extraction-noesis/environment/.dockerignore create mode 100644 evals/draft-to-design-doc/tasks/threshold-discount-extraction-noesis/environment/.gitignore create mode 100644 evals/draft-to-design-doc/tasks/threshold-discount-extraction-noesis/environment/Dockerfile create mode 100755 
evals/draft-to-design-doc/tasks/threshold-discount-extraction-noesis/environment/prepare-context.sh create mode 100644 evals/draft-to-design-doc/tasks/threshold-discount-extraction-noesis/instruction.md create mode 100644 evals/draft-to-design-doc/tasks/threshold-discount-extraction-noesis/task.toml create mode 100755 evals/draft-to-design-doc/tasks/threshold-discount-extraction-noesis/tests/test.sh create mode 100644 evals/draft-to-design-doc/tasks/threshold-discount-extraction/draft.md create mode 100644 evals/draft-to-design-doc/tasks/threshold-discount-extraction/environment/Dockerfile create mode 100644 evals/draft-to-design-doc/tasks/threshold-discount-extraction/instruction.md create mode 100644 evals/draft-to-design-doc/tasks/threshold-discount-extraction/schema-reference.md create mode 100644 evals/draft-to-design-doc/tasks/threshold-discount-extraction/task.toml create mode 100755 evals/draft-to-design-doc/tasks/threshold-discount-extraction/tests/test.sh create mode 100644 evals/draft-to-design-doc/variants/guided/CLAUDE.md create mode 100644 evals/draft-to-design-doc/variants/guided/harbor_config.json create mode 100644 evals/draft-to-design-doc/variants/guided/variant.toml create mode 100644 evals/draft-to-design-doc/variants/noesis/CLAUDE.md create mode 100644 evals/draft-to-design-doc/variants/noesis/harbor_config.json create mode 100644 evals/draft-to-design-doc/variants/noesis/variant.toml create mode 100644 evals/draft-to-design-doc/variants/vanilla/CLAUDE.md create mode 100644 evals/draft-to-design-doc/variants/vanilla/harbor_config.json create mode 100644 evals/draft-to-design-doc/variants/vanilla/variant.toml diff --git a/evals/draft-to-design-doc/.gitignore b/evals/draft-to-design-doc/.gitignore new file mode 100644 index 0000000..0fda25f --- /dev/null +++ b/evals/draft-to-design-doc/.gitignore @@ -0,0 +1,3 @@ +jobs/ +.env +__pycache__/ diff --git a/evals/draft-to-design-doc/README.md b/evals/draft-to-design-doc/README.md new file mode 100644 
index 0000000..94cd374 --- /dev/null +++ b/evals/draft-to-design-doc/README.md @@ -0,0 +1,64 @@ +# Draft → Design Doc + +Quick eval comparing how Claude Code extracts a Design Doc JSON from a Markdown design draft, across three levels of scaffolding: + +| Variant | Task | Prompt | Plugin / MCP | Expected output | +|---|---|---|---|---| +| `vanilla` | `threshold-discount-extraction` | minimal | none | `/app/output/design-doc.json` | +| `guided` | `threshold-discount-extraction` | extended (mirrors `analyze-design-draft` reference) | none | `/app/output/design-doc.json` | +| `noesis` | `threshold-discount-extraction-noesis` | "use the skill" | full plugin + `noesis-graph` MCP | `/app/noesis/design-docs/<name>-<id>.json` | + +## TDD note (read this first) + +`tests/test.sh` is **TDD-style** — it asserts the *expected correct behaviour*, not the current behaviour of the noesis plugin. It is **identical** for all three variants (one spec, three implementations). + +Currently `noesis-graph:save_design_doc` enforces a green-field ChangeSet shape: every collection's `added` is populated and `modified` / `removed` are empty. That's a bug — the Design Doc is supposed to express the diff against the implemented codebase. The DDD-starter-dotnet repo is cloned into `/app/repo/`; the agent should use it as the diff baseline. + +The test asserts: + +- `Sales` BC in `boundedContexts.modified` (it already exists in `/app/repo/Sources/Sales/`) +- `Sales.Pricing.Discounts` module in `Sales.modules.modified` (already exists) +- `ThresholdDiscount` in `buildingBlocks.added` under that module (genuinely new) +- `Discount` in `buildingBlocks.modified` (already exists; the draft adds a new `Threshold` factory behaviour) + +Until the green-field constraint is lifted, **all three variants will FAIL**. That's the intended TDD signal — when the bug is fixed, runs that produce the correct shape start passing. + +Each task ships its own copy of `test.sh`, but the assertions are byte-for-byte identical.
The two paths exist only because Harbor binds `task.toml` → one Dockerfile → one verifier per task, and the noesis variant needs a different sandbox (Bun + plugin install + MCP server). + +## Auth + +```bash +source ~/Documents/git/noesis/sdlc-projects/nasde-toolkit/scripts/export_oauth_token.sh +``` + +## Run + +The two non-noesis variants share a single task: + +```bash +nasde run --variant vanilla --tasks threshold-discount-extraction --without-eval -C evals/draft-to-design-doc +nasde run --variant guided --tasks threshold-discount-extraction --without-eval -C evals/draft-to-design-doc +``` + +The noesis variant has a dedicated task (different sandbox: bun + plugin install + MCP server). Stage the plugin sources into the docker build context once, then run: + +```bash +bash evals/draft-to-design-doc/tasks/threshold-discount-extraction-noesis/environment/prepare-context.sh +nasde run --variant noesis --tasks threshold-discount-extraction-noesis --without-eval -C evals/draft-to-design-doc +``` + +Re-run `prepare-context.sh` whenever the plugin sources under `src/agent_extensions/plugins/noesis/` change — it's an `rsync` mirror so it stays cheap. + +## Inspect output + +```bash +LATEST=$(ls -td evals/draft-to-design-doc/jobs/*____*/ | head -1) +DOC=$(find "$LATEST" -name "design-doc.json" -o -name "*.json" -path "*noesis/design-docs/*" | head -1) +jq '.' "$DOC" | less +``` + +## Notes + +- `--without-eval` skips the LLM-as-Judge phase. `assessment_dimensions.json` is intentionally empty here — quality grading is owned by manual inspection until the dimensions are designed. +- The vanilla and guided variants do NOT pass `--with-opik` (no Opik tracking by default). Add the flag when comparing across runs. +- Per-variant results live under `jobs/____/__/`. The produced design doc is in `artifacts/workspace/output/design-doc.json` (vanilla/guided) or `artifacts/workspace/noesis/design-docs/<name>-<id>.json` (noesis).
diff --git a/evals/draft-to-design-doc/assessment_dimensions.json b/evals/draft-to-design-doc/assessment_dimensions.json new file mode 100644 index 0000000..45e9b41 --- /dev/null +++ b/evals/draft-to-design-doc/assessment_dimensions.json @@ -0,0 +1,3 @@ +{ + "dimensions": [] +} diff --git a/evals/draft-to-design-doc/nasde.toml b/evals/draft-to-design-doc/nasde.toml new file mode 100644 index 0000000..0d27548 --- /dev/null +++ b/evals/draft-to-design-doc/nasde.toml @@ -0,0 +1,20 @@ +[project] +name = "draft-to-design-doc" +version = "1.0.0" + +[defaults] +variant = "vanilla" +model = "claude-sonnet-4-6" +timeout_sec = 600 + +[docker] +base_image = "ubuntu:22.04" +build_commands = [] + +[evaluation] +model = "claude-sonnet-4-6" +dimensions_file = "assessment_dimensions.json" + +[reporting] +platform = "opik" +project_name = "draft-to-design-doc" diff --git a/evals/draft-to-design-doc/tasks/threshold-discount-extraction-noesis/draft.md b/evals/draft-to-design-doc/tasks/threshold-discount-extraction-noesis/draft.md new file mode 100644 index 0000000..00e6545 --- /dev/null +++ b/evals/draft-to-design-doc/tasks/threshold-discount-extraction-noesis/draft.md @@ -0,0 +1,103 @@ +# Threshold-activated percentage discount + +Author: Sales Product Team + Architecture +Date: 2026-05-07 +Status: Draft for review + +## Background + +Our sales reps already have two ways to grant discounts on a single product: + +- **Percentage discount** — e.g. 10% off, applied to any price. +- **Value discount** — e.g. 50 PLN off, capped at the price (never goes negative). + +Both live in `Sales.DeepModel.Pricing.Discounts` as immutable value objects, and they are exposed to the rest of the system through a single `Discount` discriminated-union type (so callers don't have to know which variant they hold). + +A percentage discount applies regardless of the underlying price. 
Reps have been complaining that this is too blunt for premium products: a 10% promotion on a 50-PLN accessory doesn't move the needle, but the same promo on a 600-PLN device is a real margin hit. They want a discount type that **only kicks in once the price clears a certain bar** — below that bar, the customer pays the original price. + +## Business requirement + +Add a third kind of discount: a **threshold-activated percentage discount**. + +It carries two parameters: + +- a **percentage** (same shape as in `PercentageDiscount`), +- a **price threshold** (a `Money` amount that the price must *exceed* to activate). + +Behaviour when applied to a price: + +- if the price is **above** the threshold → return the price reduced by the percentage, +- if the price is **at** the threshold or **below** it → return the price unchanged. + +The new discount must be usable everywhere the existing `Discount` type is used today — no new caller, no parallel union, no separate wiring. From the outside, a `Discount` should now have three variants instead of two. + +## Worked examples + +| Price | Threshold | Percentage | Result | Why | +|---|---|---|---|---| +| 600 PLN | 500 PLN | 10% | 540 PLN | price > threshold → discount applies | +| 500 PLN | 500 PLN | 10% | 500 PLN | price == threshold → no discount (strict greater-than) | +| 300 PLN | 500 PLN | 10% | 300 PLN | price < threshold → unchanged | +| 1000 PLN | 0 PLN | 25% | 750 PLN | any positive price clears a 0 threshold | + +## Acceptance criteria + +1. The `Discount` discriminated union has a **third variant**. Existing call sites that build a percentage or value discount keep working unchanged — no breaking change to their public surface. +2. `Discount.ApplyOn(price)` dispatches correctly to the new variant. +3. Construction of the new variant is **fail-fast**: an instance with a percentage outside the legal range, or with a non-positive threshold, cannot exist. +4. 
Existing `PercentageDiscount` and `ValueDiscount` are **not modified** — their behaviour and tests stay green. +5. Tests cover at minimum: above-threshold, at-threshold (boundary), below-threshold, zero-price, factory rejection on invalid percentage, factory rejection on non-positive threshold, equality of two equivalent instances. + +## Architectural decisions + +These were resolved during the design review on 2026-05-06. They are not up for re-debate during implementation; if a constraint conflicts with one of them, escalate. + +### A1. New value object: `ThresholdDiscount` + +A new Building Block in the `Sales.DeepModel.Pricing.Discounts` module. Type: **value object**, mirroring the shape of `PercentageDiscount` and `ValueDiscount`: + +- C# `readonly struct` annotated with `[DddValueObject]`. +- Implements the existing `PriceModifier` interface (provides `ApplyOn(Money price)`). +- Implements `IEquatable`, with `Equals(object?)`, `GetHashCode()`, and a sensible `ToString()` consistent with the two existing discount value objects. +- Internal state is **two private fields**: a `Percentage` and a `Money` threshold. No public getters — equality, application and `ToString` are the only externally observable behaviours. +- Construction goes through a **static factory method** (`Of(Percentage value, Money threshold)`, mirroring `PercentageDiscount.Of` and `ValueDiscount.Of`). The constructor is private. The factory enforces the invariant from §A4. + +### A2. `ApplyOn(Money price)` semantics + +`ApplyOn` returns: + +- `price * (Percentage.Of100 - percentage)` when `price > threshold`, +- `price` otherwise. + +The `>` comparison is **strict** (price equal to threshold returns `price` unchanged). This matches the worked examples and the way reps describe the rule ("the price has to *exceed* the threshold"). + +### A3. 
`Discount` union — third variant + +The `Discount` discriminated union in `Discount.cs` is **modified**, not replaced or paralleled: + +- A new private discriminator state must be introduced. The current `bool _isPercentage` is no longer sufficient; replace it with a small enum-like discriminator (e.g. a private nested enum or three named constants) so that the three cases are exhaustively distinguishable inside `ApplyOn`. Keep the field private. +- Add a third field `private readonly ThresholdDiscount _thresholdDiscount;` next to the two existing variant fields. +- Add a public static factory `Discount.Threshold(Percentage value, Money threshold)`, mirroring `Discount.Percentage(...)` and `Discount.Value(...)`. +- `ApplyOn(Money price)` dispatches on the discriminator across all three branches. +- `Equals`, `GetHashCode` and `ToString` continue to cover all three variants. +- The two existing factory methods (`Discount.Percentage`, `Discount.Value`) keep their signature exactly as today — call sites must not need any change. + +### A4. Construction invariants + +`ThresholdDiscount.Of(Percentage value, Money threshold)` rejects invalid inputs at construction time: + +- `value` outside `[0%, 100%]` → reject (this validation already lives in the `Percentage` value object; rely on it, do not duplicate). +- `threshold` non-positive (≤ zero in its currency) → reject with a clear domain error indicating the threshold must be strictly positive. + +Apply-time code does **not** re-validate. The factory is the only invariant gatekeeper. + +### A5. Persistence + +The SQL repository in `Sales.Adapters/Pricing/Discounts/DiscountsSqlRepository.cs` already round-trips the `Discount` union. Extend it minimally to handle the third variant — same shape of change as the two existing variants. 
No schema migration is in scope of this draft (the storage representation is whatever the repository today maps onto; if the existing serialisation is shape-flexible enough, no DB change is needed). + +### A6. Out of scope + +- Multi-product / cart-level threshold discounts (this is per single product price). +- Time-bounded promotions (a discount valid only in a date range). +- Stacking rules between discount types — unchanged. +- Renaming or refactoring the existing `PercentageDiscount`, `ValueDiscount`, or the `Discount` discriminator field. diff --git a/evals/draft-to-design-doc/tasks/threshold-discount-extraction-noesis/environment/.dockerignore b/evals/draft-to-design-doc/tasks/threshold-discount-extraction-noesis/environment/.dockerignore new file mode 100644 index 0000000..a0f2028 --- /dev/null +++ b/evals/draft-to-design-doc/tasks/threshold-discount-extraction-noesis/environment/.dockerignore @@ -0,0 +1,6 @@ +_plugin-staging/node_modules +_plugin-staging/.serena +_plugin-staging/noesis +_plugin-staging/.git +_plugin-staging/**/__pycache__ +_plugin-staging/**/.DS_Store diff --git a/evals/draft-to-design-doc/tasks/threshold-discount-extraction-noesis/environment/.gitignore b/evals/draft-to-design-doc/tasks/threshold-discount-extraction-noesis/environment/.gitignore new file mode 100644 index 0000000..be198d4 --- /dev/null +++ b/evals/draft-to-design-doc/tasks/threshold-discount-extraction-noesis/environment/.gitignore @@ -0,0 +1 @@ +_plugin-staging/ diff --git a/evals/draft-to-design-doc/tasks/threshold-discount-extraction-noesis/environment/Dockerfile b/evals/draft-to-design-doc/tasks/threshold-discount-extraction-noesis/environment/Dockerfile new file mode 100644 index 0000000..8764f35 --- /dev/null +++ b/evals/draft-to-design-doc/tasks/threshold-discount-extraction-noesis/environment/Dockerfile @@ -0,0 +1,68 @@ +# Sandbox for the Noesis variant of the threshold-discount-extraction task. +# +# Layers: +# 1. 
Ubuntu base + apt deps (jq, curl, ca-certs, python3 for prepare.ts, git for the repo clone, unzip for the Bun installer). +# 2. Bun install (cached unless the install URL changes). +# 3. Plugin source copy (re-runs only when plugin sources change). +# 4. bun install + plugin registration (re-runs only when sources change). +# +# The noesis-graph MCP server is started by Harbor via the [[environment.mcp_servers]] +# entry in task.toml: `cd /opt/noesis-plugin && bun run mcp/noesis-graph/server.ts`. +# The skill is registered as a Claude Code plugin under /root/.claude/plugins/noesis/. + +# ubuntu:24.04 ships GLIBC 2.39 — required by `lbug` (ladybug-db) native module. +# Older ubuntu:22.04 has GLIBC 2.35 and fails to dlopen lbugjs.node. +FROM ubuntu:24.04 + +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get update && apt-get install -y \ + jq \ + curl \ + ca-certificates \ + python3 \ + git \ + unzip \ + && rm -rf /var/lib/apt/lists/* + +# Install Bun. NOTE: the installer below fetches the latest release — no version is pinned, so rebuilds may pick up a newer Bun. +RUN curl -fsSL https://bun.sh/install | BUN_INSTALL=/usr/local bash +ENV PATH="/usr/local/bin:${PATH}" +RUN bun --version + +# Plugin sources are staged into ./_plugin-staging/ next to this Dockerfile by +# the prepare-context.sh helper before `nasde run` is invoked. Harbor's docker +# build context is the directory containing this Dockerfile, so a relative COPY +# is the only portable way to pull plugin sources in. +# +# .dockerignore (sibling file) excludes node_modules, .serena, noesis/ scratch +# dirs and other host-only artefacts so the COPY stays small (~6 MB). +COPY _plugin-staging/ /opt/noesis-plugin/ + +WORKDIR /opt/noesis-plugin +RUN bun install --production --frozen-lockfile + +# Register the plugin so Claude Code in the sandbox finds the noesis:analyze-design-draft skill. +# Claude Code looks for plugins under ~/.claude/plugins/<name>/ with a .claude-plugin/plugin.json manifest.
+RUN mkdir -p /root/.claude/plugins && \ + ln -s /opt/noesis-plugin /root/.claude/plugins/noesis && \ + ls /root/.claude/plugins/noesis/.claude-plugin/ + +# CLAUDE_PLUGIN_ROOT is referenced by SKILL.md scripts at runtime — set it so +# any "${CLAUDE_PLUGIN_ROOT}/..." path inside the skill resolves correctly. +ENV CLAUDE_PLUGIN_ROOT=/opt/noesis-plugin + +# Workdir where the agent will operate. The skill will write Design Doc JSON +# under /app/noesis/design-docs/ via save_design_doc. +WORKDIR /app +RUN mkdir -p /app/noesis/design-docs /app/noesis/conversations /app/noesis/documents /app/noesis/topics /app/noesis/decisions /app/noesis/design-drafts + +# Clone the target codebase so the agent (and the noesis-graph scan_to_tmp tool) +# can verify what is already implemented. The skill's Step 6 expects the diff +# baseline to be the implemented codebase. Without the repo the agent cannot +# tell that the existing `Sales` BC, `Sales.Pricing.Discounts` module and +# the `Discount` / `PercentageDiscount` / `ValueDiscount` value objects already +# exist — and therefore must be classified as `modified` rather than `added`. +RUN git clone --depth 1 https://github.com/itlibrium/DDD-starter-dotnet.git /app/repo + +CMD ["/bin/bash"] diff --git a/evals/draft-to-design-doc/tasks/threshold-discount-extraction-noesis/environment/prepare-context.sh b/evals/draft-to-design-doc/tasks/threshold-discount-extraction-noesis/environment/prepare-context.sh new file mode 100755 index 0000000..072c2c1 --- /dev/null +++ b/evals/draft-to-design-doc/tasks/threshold-discount-extraction-noesis/environment/prepare-context.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# Stage the noesis plugin sources into ./_plugin-staging/ so the Dockerfile +# can COPY them. Run this once (or whenever plugin sources change) BEFORE +# `nasde run --variant noesis -C evals/draft-to-design-doc`. +# +# The staging dir is git-ignored. 
+# +# Usage (from anywhere): +# bash evals/draft-to-design-doc/tasks/threshold-discount-extraction-noesis/environment/prepare-context.sh +# +set -euo pipefail + +HERE="$(cd "$(dirname "$0")" && pwd)" +STAGING="$HERE/_plugin-staging" +PLUGIN_SRC="$(cd "$HERE/../../../../../src/agent_extensions/plugins/noesis" && pwd)" + +echo "[prepare-context] plugin source: $PLUGIN_SRC" +echo "[prepare-context] staging dir: $STAGING" + +# Use rsync to mirror only what we need. Excludes match the .dockerignore so the +# Docker build context stays small. +mkdir -p "$STAGING" +rsync -a --delete \ + --exclude=node_modules \ + --exclude=.serena \ + --exclude=noesis \ + --exclude=.git \ + --exclude=__pycache__ \ + --exclude=.DS_Store \ + "$PLUGIN_SRC/" "$STAGING/" + +echo "[prepare-context] staged $(du -sh "$STAGING" | cut -f1) into _plugin-staging/" diff --git a/evals/draft-to-design-doc/tasks/threshold-discount-extraction-noesis/instruction.md b/evals/draft-to-design-doc/tasks/threshold-discount-extraction-noesis/instruction.md new file mode 100644 index 0000000..b8b941f --- /dev/null +++ b/evals/draft-to-design-doc/tasks/threshold-discount-extraction-noesis/instruction.md @@ -0,0 +1,61 @@ +# Extract a Design Doc using the noesis:analyze-design-draft skill + +The Noesis plugin is installed in this sandbox and exposes the `noesis:analyze-design-draft` skill plus the `noesis-graph` MCP server. Use them. + +## Inputs + +- `/app/draft.md` — the design draft to analyse. +- `/app/repo/` — the target codebase the Design Doc describes (DDD-starter-dotnet, cloned from upstream). The skill / MCP `scan_to_tmp` tool can use this to determine what is already implemented and what is genuinely new. + +## Output + +- The skill persists the extracted Design Doc as a JSON file under `/app/noesis/design-docs/`. The filename is derived from the design doc's name and id by `prepare_design_doc_path`. Do not write the JSON manually — let the skill do it via `save_design_doc`. 
+ +## How to invoke the skill + +Invoke the skill with the draft path and a design-doc title: + +``` +@/app/draft.md title="Threshold-activated discount" date=2026-05-08 main_topic="Discount value objects in Sales pricing" design_doc_title="threshold-activated-discount" +``` + +When the skill asks (during Setup) for the `design_doc_path`, answer: + +``` +/app/noesis/design-docs/ +``` + +(the skill computes the canonical filename from the title and id; just give it the directory under `noesis/`). + +The skill will: + +1. Run `prepare.ts` on the draft → working dir + section tree + fragment list. +2. Find existing topics in the graph (Goldilocks). +3. Categorise fragments + assign topics. +4. Find existing decisions. +5. Review topics, generate summaries, extract decisions. +6. Extract the design model (per `extract-design-model.md` + `design-doc-schema.md`) and persist via `save_design_doc`. The diff baseline is the implemented codebase — call `noesis-graph:scan_to_tmp` first if you need to see what already exists in `/app/repo/`. +7. Merge topics, fragments and decisions via `merge_document`. + +Follow the skill's workflow exactly; don't shortcut steps. The MCP server is `noesis-graph` (already registered in the sandbox's Claude config). + +## Diff baseline + +Per `extract-design-model.md` and `design-doc-schema.md` Section 3, the Design Doc is a diff against the **currently implemented codebase** under `/app/repo/`. For each element, ask: *is this already in /app/repo/, and is the draft changing it?* + +- **Already in /app/repo, draft changes it** → goes in `modified`. +- **Not in /app/repo** → goes in `added`. +- **Already in /app/repo, no change** → omit. +- **In /app/repo, draft removes it** → `removed` (by name). + +For this draft specifically: +- `Sales` BC already exists in `/app/repo/Sources/Sales/`. → `boundedContexts.modified`. +- `Sales.Pricing.Discounts` module already exists. → `modules.modified` under Sales. 
+- `Discount`, `PercentageDiscount`, `ValueDiscount` value objects already exist. → `modified` if changed, else omitted. +- `ThresholdDiscount` is genuinely new. → `buildingBlocks.added`. + +## Rules + +- Do not modify `/app/draft.md` or `/app/repo/`. +- Do not write JSON files outside the path the `save_design_doc` MCP tool returns. +- Do not bypass the skill — write the design doc through `save_design_doc`, never with `Write` directly. diff --git a/evals/draft-to-design-doc/tasks/threshold-discount-extraction-noesis/task.toml b/evals/draft-to-design-doc/tasks/threshold-discount-extraction-noesis/task.toml new file mode 100644 index 0000000..43a5df8 --- /dev/null +++ b/evals/draft-to-design-doc/tasks/threshold-discount-extraction-noesis/task.toml @@ -0,0 +1,38 @@ +name = "threshold-discount-extraction-noesis" +description = "Extract a Design Doc from a Markdown design draft using the noesis:analyze-design-draft skill backed by the noesis-graph MCP server." +difficulty = "intermediate" +estimated_time_minutes = 15 +tags = [ + "design-doc", + "model-extraction", + "DDD", + "noesis-plugin", + "noesis-mcp" +] +instruction = "./instruction.md" + +[environment] +type = "docker" +dockerfile = "./environment/Dockerfile" + +# Register the noesis-graph MCP server for the agent. Harbor writes this into +# the sandbox's .claude.json under mcpServers, so the noesis:analyze-design-draft +# skill can call its MCP tools (list_topics, save_design_doc, …). 
+[[environment.mcp_servers]] +name = "noesis-graph" +transport = "stdio" +command = "sh" +args = [ + "-c", + "export CLAUDE_PROJECT_DIR=/app && export CLAUDE_PLUGIN_DATA=/var/noesis-plugin-data && mkdir -p $CLAUDE_PLUGIN_DATA && cd /opt/noesis-plugin && exec bun run mcp/noesis-graph/server.ts" +] + +[evaluation] +type = "script" +script = "./tests/test.sh" +timeout_seconds = 60 + +[metadata] +domain = "Software Architecture" +draft_language = "English" +expected_output_dir = "/app/noesis/design-docs/" diff --git a/evals/draft-to-design-doc/tasks/threshold-discount-extraction-noesis/tests/test.sh b/evals/draft-to-design-doc/tasks/threshold-discount-extraction-noesis/tests/test.sh new file mode 100755 index 0000000..ab063fe --- /dev/null +++ b/evals/draft-to-design-doc/tasks/threshold-discount-extraction-noesis/tests/test.sh @@ -0,0 +1,92 @@ +#!/bin/bash +# TDD-style verifier for draft-to-design-doc. The test asserts the EXPECTED +# correct behaviour, not the current behaviour of the noesis plugin. +# +# At the time of writing, the noesis plugin's `save_design_doc` enforces a +# green-field ChangeSet shape (everything in `added`). That is a bug — the +# Design Doc should reflect the diff against the implemented codebase, where +# the `Sales` Bounded Context, `Sales.Pricing.Discounts` module and the +# `Discount` discriminated union ALREADY EXIST in the DDD-starter-dotnet repo +# (mounted at /app/repo). The correct shape is therefore: +# +# - `Sales` BC in `boundedContexts.modified` +# - `Sales.Pricing.Discounts` module in `modules.modified` +# - `Discount` BB in `buildingBlocks.modified` (a new `Threshold` factory +# behaviour is added, but the BB itself already exists) +# - `ThresholdDiscount` BB in `buildingBlocks.added` (genuinely new) +# +# Until the plugin lifts the green-field constraint, ALL three variants will +# fail this test. That's the intended TDD signal — the test is the spec. 
+ +set -u + +# Search for the design doc JSON in the two known canonical locations: +# - /app/output/design-doc.json (vanilla / guided variants) +# - /app/noesis/design-docs/*.json (noesis variant — name computed by the plugin) +OUT="" +if [ -f /app/output/design-doc.json ]; then + OUT=/app/output/design-doc.json +elif compgen -G "/app/noesis/design-docs/*.json" > /dev/null; then + # shellcheck disable=SC2012 + OUT=$(ls /app/noesis/design-docs/*.json 2>/dev/null | head -1) +fi + +REWARD_DIR=/logs/verifier +mkdir -p "$REWARD_DIR" 2>/dev/null || true + +fail() { + echo "✗ FAILURE: $1" + echo 0 > "$REWARD_DIR/reward.txt" + exit 1 +} + +if [ -z "$OUT" ]; then + fail "no design-doc JSON found (looked at /app/output/design-doc.json and /app/noesis/design-docs/*.json)" +fi + +echo "Inspecting: $OUT" + +if ! jq -e . "$OUT" > /dev/null 2>&1; then + fail "$OUT is not valid JSON" +fi + +# --- Top-level shape --- +jq -e '.name and .description and .boundedContexts' "$OUT" > /dev/null \ + || fail "missing top-level field: name, description, or boundedContexts" + +# --- Sales BC must be in `boundedContexts.modified` (it already exists in /app/repo) --- +jq -e ' + (.boundedContexts.modified // []) | map(.name) | index("Sales") +' "$OUT" > /dev/null \ + || fail "Sales is not in boundedContexts.modified — it should be classified as modified, since it already exists in /app/repo/Sources/Sales/. 
Found instead: added=$(jq -r ".boundedContexts.added // [] | map(.name) | join(\",\")" "$OUT"), modified=$(jq -r ".boundedContexts.modified // [] | map(.name) | join(\",\")" "$OUT")" + +# --- Sales.Pricing.Discounts module must be in modules.modified under Sales --- +jq -e ' + (.boundedContexts.modified // [])[] | select(.name == "Sales") | + (.modules.modified // []) | map(.name) | index("Sales.Pricing.Discounts") +' "$OUT" > /dev/null \ + || fail "Sales.Pricing.Discounts is not in Sales.modules.modified — it should be modified, since it already exists in /app/repo/Sources/Sales/Sales.DeepModel/Pricing/Discounts/" + +# --- ThresholdDiscount must be in buildingBlocks.added under that module (genuinely new) --- +jq -e ' + (.boundedContexts.modified // [])[] | select(.name == "Sales") | + (.modules.modified // [])[] | select(.name == "Sales.Pricing.Discounts") | + (.buildingBlocks.added // []) | map(.name) | index("ThresholdDiscount") +' "$OUT" > /dev/null \ + || fail "ThresholdDiscount is not in Sales.Pricing.Discounts.buildingBlocks.added — it is the genuinely new building block introduced by this draft" + +# --- Discount must be in buildingBlocks.modified (already exists; gains a new factory behaviour) --- +jq -e ' + (.boundedContexts.modified // [])[] | select(.name == "Sales") | + (.modules.modified // [])[] | select(.name == "Sales.Pricing.Discounts") | + (.buildingBlocks.modified // []) | map(.name) | index("Discount") +' "$OUT" > /dev/null \ + || fail "Discount is not in Sales.Pricing.Discounts.buildingBlocks.modified — it already exists in /app/repo/Sources/Sales/Sales.DeepModel/Pricing/Discounts/Discount.cs and the draft adds a Threshold variant to it" + +# All checks passed. +echo "✓ SUCCESS: design doc reflects the correct diff against the implemented codebase" +echo " Path: $OUT" +echo " Sales is modified, Sales.Pricing.Discounts is modified," +echo " ThresholdDiscount is added, Discount is modified." 
+echo 1 > "$REWARD_DIR/reward.txt" +exit 0 diff --git a/evals/draft-to-design-doc/tasks/threshold-discount-extraction/draft.md b/evals/draft-to-design-doc/tasks/threshold-discount-extraction/draft.md new file mode 100644 index 0000000..00e6545 --- /dev/null +++ b/evals/draft-to-design-doc/tasks/threshold-discount-extraction/draft.md @@ -0,0 +1,103 @@ +# Threshold-activated percentage discount + +Author: Sales Product Team + Architecture +Date: 2026-05-07 +Status: Draft for review + +## Background + +Our sales reps already have two ways to grant discounts on a single product: + +- **Percentage discount** — e.g. 10% off, applied to any price. +- **Value discount** — e.g. 50 PLN off, capped at the price (never goes negative). + +Both live in `Sales.DeepModel.Pricing.Discounts` as immutable value objects, and they are exposed to the rest of the system through a single `Discount` discriminated-union type (so callers don't have to know which variant they hold). + +A percentage discount applies regardless of the underlying price. Reps have been complaining that this is too blunt for premium products: a 10% promotion on a 50-PLN accessory doesn't move the needle, but the same promo on a 600-PLN device is a real margin hit. They want a discount type that **only kicks in once the price clears a certain bar** — below that bar, the customer pays the original price. + +## Business requirement + +Add a third kind of discount: a **threshold-activated percentage discount**. + +It carries two parameters: + +- a **percentage** (same shape as in `PercentageDiscount`), +- a **price threshold** (a `Money` amount that the price must *exceed* to activate). + +Behaviour when applied to a price: + +- if the price is **above** the threshold → return the price reduced by the percentage, +- if the price is **at** the threshold or **below** it → return the price unchanged. 
+ +The new discount must be usable everywhere the existing `Discount` type is used today — no new caller, no parallel union, no separate wiring. From the outside, a `Discount` should now have three variants instead of two. + +## Worked examples + +| Price | Threshold | Percentage | Result | Why | +|---|---|---|---|---| +| 600 PLN | 500 PLN | 10% | 540 PLN | price > threshold → discount applies | +| 500 PLN | 500 PLN | 10% | 500 PLN | price == threshold → no discount (strict greater-than) | +| 300 PLN | 500 PLN | 10% | 300 PLN | price < threshold → unchanged | +| 1000 PLN | 1 PLN | 25% | 750 PLN | price far above a small (but strictly positive, see §A4) threshold → discount applies | + +## Acceptance criteria + +1. The `Discount` discriminated union has a **third variant**. Existing call sites that build a percentage or value discount keep working unchanged — no breaking change to their public surface. +2. `Discount.ApplyOn(price)` dispatches correctly to the new variant. +3. Construction of the new variant is **fail-fast**: an instance with a percentage outside the legal range, or with a non-positive threshold, cannot exist. +4. Existing `PercentageDiscount` and `ValueDiscount` are **not modified** — their behaviour and tests stay green. +5. Tests cover at minimum: above-threshold, at-threshold (boundary), below-threshold, zero-price, factory rejection on invalid percentage, factory rejection on non-positive threshold, equality of two equivalent instances. + +## Architectural decisions + +These were resolved during the design review on 2026-05-06. They are not up for re-debate during implementation; if a constraint conflicts with one of them, escalate. + +### A1. New value object: `ThresholdDiscount` + +A new Building Block in the `Sales.DeepModel.Pricing.Discounts` module. Type: **value object**, mirroring the shape of `PercentageDiscount` and `ValueDiscount`: + +- C# `readonly struct` annotated with `[DddValueObject]`. +- Implements the existing `PriceModifier` interface (provides `ApplyOn(Money price)`).
+- Implements `IEquatable<ThresholdDiscount>`, with `Equals(object?)`, `GetHashCode()`, and a sensible `ToString()` consistent with the two existing discount value objects. +- Internal state is **two private fields**: a `Percentage` and a `Money` threshold. No public getters — equality, application and `ToString` are the only externally observable behaviours. +- Construction goes through a **static factory method** (`Of(Percentage value, Money threshold)`, mirroring `PercentageDiscount.Of` and `ValueDiscount.Of`). The constructor is private. The factory enforces the invariant from §A4. + +### A2. `ApplyOn(Money price)` semantics + +`ApplyOn` returns: + +- `price * (Percentage.Of100 - percentage)` when `price > threshold`, +- `price` otherwise. + +The `>` comparison is **strict** (price equal to threshold returns `price` unchanged). This matches the worked examples and the way reps describe the rule ("the price has to *exceed* the threshold"). + +### A3. `Discount` union — third variant + +The `Discount` discriminated union in `Discount.cs` is **modified**, not replaced or paralleled: + +- A new private discriminator state must be introduced. The current `bool _isPercentage` is no longer sufficient; replace it with a small enum-like discriminator (e.g. a private nested enum or three named constants) so that the three cases are exhaustively distinguishable inside `ApplyOn`. Keep the field private. +- Add a third field `private readonly ThresholdDiscount _thresholdDiscount;` next to the two existing variant fields. +- Add a public static factory `Discount.Threshold(Percentage value, Money threshold)`, mirroring `Discount.Percentage(...)` and `Discount.Value(...)`. +- `ApplyOn(Money price)` dispatches on the discriminator across all three branches. +- `Equals`, `GetHashCode` and `ToString` continue to cover all three variants. +- The two existing factory methods (`Discount.Percentage`, `Discount.Value`) keep their signature exactly as today — call sites must not need any change.
+ +### A4. Construction invariants + +`ThresholdDiscount.Of(Percentage value, Money threshold)` rejects invalid inputs at construction time: + +- `value` outside `[0%, 100%]` → reject (this validation already lives in the `Percentage` value object; rely on it, do not duplicate). +- `threshold` non-positive (≤ zero in its currency) → reject with a clear domain error indicating the threshold must be strictly positive. + +Apply-time code does **not** re-validate. The factory is the only invariant gatekeeper. + +### A5. Persistence + +The SQL repository in `Sales.Adapters/Pricing/Discounts/DiscountsSqlRepository.cs` already round-trips the `Discount` union. Extend it minimally to handle the third variant — same shape of change as the two existing variants. No schema migration is in scope of this draft (the storage representation is whatever the repository today maps onto; if the existing serialisation is shape-flexible enough, no DB change is needed). + +### A6. Out of scope + +- Multi-product / cart-level threshold discounts (this is per single product price). +- Time-bounded promotions (a discount valid only in a date range). +- Stacking rules between discount types — unchanged. +- Renaming or refactoring the existing `PercentageDiscount` or `ValueDiscount`, or changing the `Discount` union beyond the discriminator replacement and third variant mandated in §A3. diff --git a/evals/draft-to-design-doc/tasks/threshold-discount-extraction/environment/Dockerfile b/evals/draft-to-design-doc/tasks/threshold-discount-extraction/environment/Dockerfile new file mode 100644 index 0000000..3c5fc92 --- /dev/null +++ b/evals/draft-to-design-doc/tasks/threshold-discount-extraction/environment/Dockerfile @@ -0,0 +1,20 @@ +# Draft → Design Doc extraction — sandbox for vanilla / guided variants. +# +# Includes the DDD-starter-dotnet repo at /app/repo so the agent can verify what +# already exists in code (e.g. the existing `Discount`, `PercentageDiscount`, +# `ValueDiscount` value objects in `Sources/Sales/Sales.DeepModel/Pricing/Discounts/`).
+# This is needed for the agent to correctly classify the BC + module + existing BBs +# as `modified` rather than `added` in the produced Design Doc. +FROM ubuntu:22.04 + +RUN apt-get update && apt-get install -y \ + jq \ + ca-certificates \ + git \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app +RUN git clone --depth 1 https://github.com/itlibrium/DDD-starter-dotnet.git /app/repo +RUN mkdir -p /app/output + +CMD ["/bin/bash"] diff --git a/evals/draft-to-design-doc/tasks/threshold-discount-extraction/instruction.md b/evals/draft-to-design-doc/tasks/threshold-discount-extraction/instruction.md new file mode 100644 index 0000000..ce12dd8 --- /dev/null +++ b/evals/draft-to-design-doc/tasks/threshold-discount-extraction/instruction.md @@ -0,0 +1,41 @@ +# Extract a Design Doc JSON from a design draft + +Your task is to read the design draft at `/app/draft.md` and produce a Design Doc JSON file at `/app/output/design-doc.json`. + +The Design Doc JSON must conform to the schema described at `/app/schema-reference.md`. Read the schema reference before writing the JSON — it is the contract the file is validated against. + +## Inputs + +- `/app/draft.md` — the design draft (Markdown). It mixes business requirements with explicit architectural decisions; treat the *Architectural decisions* section as authoritative for every "how" choice and the *Business requirement* / *Worked examples* / *Acceptance criteria* sections as authoritative for every "what" choice. +- `/app/schema-reference.md` — the JSON schema and modelling rules the Design Doc must satisfy. +- `/app/repo/` — the target codebase the Design Doc describes (DDD-starter-dotnet, cloned from upstream). Inspect it to determine what is already implemented and what is genuinely new. + +## Output + +- `/app/output/design-doc.json` — a single JSON document conforming to the schema. 
+ +## What "good" looks like + +The Design Doc must be **faithful**: every Building Block, Behaviour, Rule, Scenario, Property and Quality Attribute corresponds to something stated in the draft. Do not invent elements the draft does not describe; do not omit decisions the draft makes explicit. + +The Design Doc must be **schema-valid**: ChangeSets are correctly populated; every referenced Building Block name resolves; descriptions meet the length and structure requirements stated in the schema reference. + +The Design Doc must be a **correct diff against the implemented codebase**. Per `schema-reference.md` Section 3, the diff baseline is the currently implemented codebase — i.e. `/app/repo/`. For each element in the doc, ask: *is this already in /app/repo/, and is the draft changing it?* + +- **Already in /app/repo, draft changes it** → goes in `modified` (with only the changed sub-fields plus the identity `name`). +- **Not in /app/repo** → goes in `added`. +- **Already in /app/repo, no change** → omit (don't restate). +- **In /app/repo, draft removes it** → `removed` (by name). + +Example for this draft: +- The `Sales` Bounded Context already exists in `/app/repo/Sources/Sales/`. → `boundedContexts.modified`. +- The `Sales.Pricing.Discounts` module already exists. → `modules.modified` under Sales. +- `Discount`, `PercentageDiscount`, `ValueDiscount` value objects already exist. → if the draft changes them, `buildingBlocks.modified`; if not, omit. +- `ThresholdDiscount` is genuinely new. → `buildingBlocks.added`. + +## Rules + +- Do not modify `/app/draft.md`, `/app/schema-reference.md`, or `/app/repo/`. +- Do not add files outside `/app/output/`. +- All JSON keys are camelCase. +- Write a single JSON file; do not split the doc across multiple files. 
diff --git a/evals/draft-to-design-doc/tasks/threshold-discount-extraction/schema-reference.md b/evals/draft-to-design-doc/tasks/threshold-discount-extraction/schema-reference.md new file mode 100644 index 0000000..d06fe7a --- /dev/null +++ b/evals/draft-to-design-doc/tasks/threshold-discount-extraction/schema-reference.md @@ -0,0 +1,309 @@ +# Design Doc Schema Reference + +This file is loaded only when Step 6 of `noesis:analyze-design-draft` is reached. It captures (a) the JSON schema the produced design doc must conform to, (b) the heuristics for recognising model-describing content in a design draft, and (c) ChangeSet semantics. + +## 1. Recognising model content in a draft + +A section is treated as **model-describing** when its heading or body contains terms from the following lexicon. The match is fuzzy — "Bounded Context", "BC", "Context: Pricing" all count. + +| Concept | Heading / inline cues | +|---|---| +| Actor | "Actor", "User Role", "Persona" | +| Bounded Context | "Bounded Context", "BC", "Context:", "Subdomain" | +| Module | "Module", "Domain Module", "Package" | +| Building Block | "Aggregate", "Entity", "Value Object", "Domain Event", "Command", "Query", "Domain Service", "Application Service", "Repository", "Factory", "External Integration" | +| Behaviour | "Behaviour", "Use case", "Operation", "Scenario name" | +| Rule | "Business rule", "Invariant", "Constraint", "Rule:" — must be a **domain concern** | +| Scenario | "Scenario:", "Given/When/Then", "Acceptance criteria" | +| Quality Attribute | "Performance", "Availability", "Security", "Quality attribute", "NFR", "SLA", "SLO" — must be a **technical concern** | + +A section that contains none of the above (e.g. background discussion, narrative, comparison tables) is NOT a model section. Skip it. + +**Rule vs Quality Attribute discriminator.** Rules describe *what the business says is true* (invariants over domain state, transitions, computations). 
Quality attributes describe *how the system must behave technically* (latency, throughput, availability targets, authn/authz constraints, data-protection requirements). When in doubt, ask: does the constraint live in the ubiquitous language of the domain, or in the operational vocabulary of the platform? Domain → Rule. Platform → Quality Attribute. + +## 2. JSON schema (TypeScript / Zod, mirrors `shared-contracts/design-doc.ts`) + +All field names are **camelCase**. All collection fields are wrapped in a `ChangeSet<T>`. + +```ts +DesignDoc { + id?: string // omit for first iteration → server generates UUID + name: string // stable human-readable, e.g. "pageindex-tree-search" + description: string // 1–2 sentences, what this design covers + boundedContexts?: ChangeSet<DesignedBoundedContext> +} + +DesignedBoundedContext { + name + description? + modules?: ChangeSet<DesignedDomainModule> + buildingBlocks?: ChangeSet<DesignedBuildingBlock> // blocks not belonging to any module + qualityAttributes?: ChangeSet<DesignedQualityAttribute> // BC-wide technical concerns +} + +DesignedDomainModule { + name; description? + buildingBlocks?: ChangeSet<DesignedBuildingBlock> + qualityAttributes?: ChangeSet<DesignedQualityAttribute> // Module-wide technical concerns +} + +DesignedBuildingBlock { + name + type?: "aggregate"|"entity"|"value_object"|"domain_event"|"domain_command"|"domain_query" + | "domain_service"|"application_service"|"repository"|"factory"|"external_integration" + description? + implements?: string[] // BB names this block implements (OOP-style polymorphism) + properties?: ChangeSet<DesignedProperty> + behaviours?: ChangeSet<DesignedBehaviour> + rules?: ChangeSet<DesignedRule> + scenarios?: ChangeSet<DesignedScenario> + qualityAttributes?: ChangeSet<DesignedQualityAttribute> // BB-wide technical concerns +} + +DesignedProperty { + name + type? // BuildingBlock name OR primitive + description? // free-form per-property note (range, format, special semantics) + nullable?: boolean // default false + collection?: boolean // default false — true means "list of <type>" +} + +DesignedBehaviour { + name + description // Required ≥400 chars on `added`; omit on `modified` when not changing.
+ // For application_service behaviours or those using ≥3 building blocks, + // embed a ```mermaid sequence diagram (warning, not error, when missing). + type?: "Command"|"Event"|"Query" + input?: ChangeSet<string> // BuildingBlock names + output?: ChangeSet<string> // BuildingBlock names + usedBuildingBlocks?: ChangeSet<string> + rules?: ChangeSet<DesignedRule> + scenarios?: ChangeSet<DesignedScenario> + qualityAttributes?: ChangeSet<DesignedQualityAttribute> // Behaviour-scoped technical concerns + isPublic: boolean // default false + actor?: string // Graph-global actor name. Only valid when host BuildingBlock type === "application_service". +} + +DesignedRule { + name + ruleType?: "Consistency"|"Structure"|"Computation"|"State change" + description // Domain concern. Required ≥80 chars on `added`; omit on `modified` when not changing. + // Server rejects tautologies that paraphrase `name`. +} +DesignedScenario { name; description; given; when; then } +DesignedQualityAttribute { + name + type?: "performance"|"availability"|"security"|"other" + description // Technical concern. Required ≥80 chars on `added`; omit on `modified` when not changing. + // State a measurable expectation (target metric, threshold, scope). +} + +ChangeSet<T> { added: T[]; modified: T[]; removed: string[] } // removed by name +``` + +**Actors are not part of the DesignDoc tree.** They live as a graph-global catalog. Use them via three side channels: + +1. `noesis-graph:list_actors` returns the catalog `[{ name, description }, …]`. Call it before authoring to see what already exists. +2. Reference an actor on an `application_service` behaviour by setting `behaviour.actor = "<actor name>"`. Reuse names verbatim from the catalog whenever the persona matches. +3. Introduce a new actor with `noesis-graph:upsert_actor` (`{ name, description }`) **before** calling `save_design_doc`. The save validates that every referenced actor exists in the catalog and that `actor` is set only on `application_service` behaviours; both are hard errors.
+ +`save_design_doc` rejects: +- A `qualityAttributes` ChangeSet attached at the wrong level when the same QA name is also declared at a wider scope in the same doc (pick one — the narrowest level that covers the constraint). +- An `actor` set on a behaviour whose host BuildingBlock type is anything other than `application_service`. +- A `behaviour.actor` whose name is not present in the actor catalog. + +## 3. ChangeSet rules + +The diff baseline is the **currently implemented codebase** (what `noesis:implement-design-doc` will read as the starting state of the codebase), **not** the prior Design Doc record. Bucket every item by asking *"is this item already in code?"*: + +- **Not in code** → `added`. The implementer needs to bring it into existence. +- **In code, definition unchanged** → omit (don't restate). +- **In code, definition changed** → `modified` with only the changed sub-fields plus the identity `name`. +- **In code, no longer wanted** → `removed` (by name). + +For nested `ChangeSet`s (e.g. `DesignedBuildingBlock.properties`), recurse with the same baseline question per item. A `modified` building block whose only change is a new property emits `{ name: "...", properties: { added: [{name, type}] } }`. + +When all of `added`, `modified` and `removed` are empty for a given collection field, **omit the field entirely** rather than emitting `{ "added": [], "modified": [], "removed": [] }`. + +**Implementation status — pin the baseline before authoring the diff.** + +- **Green-field implementation status** (no `implement-design-doc` run has materialised this design in code yet — the typical case for a first or second authoring pass): every item belongs in `added`, even when a prior Design Doc record already lists them. `modified` and `removed` stay empty until implementation has happened. A second authoring pass against the same unimplemented design keeps items in `added` (with refined definitions); it does **not** move them to `modified`. 
+- **Post-implementation status** (one or more `implement-design-doc` runs have produced code from this design): the diff is against the resulting code. The prior Design Doc record is a *hint* about what was last asked-for; the system of record is the code. + +**Identity.** `name` is the identity key for every entity. A rename of an item already in code is `removed: ["<old name>"]` + `added: [<item under new name>]`, plus a sweep of cross-references (`input`, `output`, `usedBuildingBlocks`, `properties[].type`, `implements`, `behaviour.actor`) to point at the new name. A rename of an item *not* in code is just `added: [<item under new name>]` — the prior design doc's `<old name>` is irrelevant because no code has it yet. + +## 4. Naming conventions + +- BuildingBlock names: `PascalCase` (e.g. `OrderAggregate`, `PriceCalculated`). +- Behaviour names: `PascalCase` matching the type — Commands as imperative (`PlaceOrder`), Events past tense (`OrderPlaced`), Queries noun + `By...` (`OrderById`). +- Actor names: free-form (`Customer`, `Warehouse Operator`). **Always check `list_actors` for an existing match before introducing a new one.** +- BoundedContext / Module names: domain-language nouns (`Pricing`, `InventoryManagement`). +- Quality attribute names: `<Category>:<Name>` is a useful convention (`Performance:OrderListLatency`, `Security:CardholderData`), but free-form is allowed when a single descriptive token is clearer. +- Re-use names that already exist in the knowledge graph (check by reading the current design doc first); only introduce new names when no semantic match exists. + +## 5. Modelling conventions + +### 5.1 Quality attribute placement — narrowest level that applies + +Each quality attribute attaches at exactly one of: `Behaviour`, `BuildingBlock`, `DesignedDomainModule`, `DesignedBoundedContext`. Pick the **narrowest** scope that covers the constraint: + +- The constraint applies only to one specific behaviour (e.g. *"`PlaceOrder` p95 ≤ 200 ms"*) → attach to that `Behaviour`.
+- The constraint applies to a BuildingBlock as a whole (e.g. *"`OrderRepository` reads must be served by a read replica"*) → attach to that `BuildingBlock`. +- The constraint spans sibling BuildingBlocks in one Module (e.g. *"`Pricing.Calculation` must be deterministic and pure"*) → attach to that `DesignedDomainModule`. +- The constraint covers the whole Bounded Context (e.g. *"`Billing` must redact PII from all logs"*) → attach to that `DesignedBoundedContext`. + +Do not duplicate a QA across levels. If you find yourself attaching the same QA to multiple levels, lift it to the lowest common ancestor. + +### 5.2 Rule vs Quality Attribute + +- **Rule** = domain concern. Belongs to the ubiquitous language. Examples: *"An invoice cannot be issued before the order has been paid in full."*, *"A discount of more than 30% requires manager approval."*. Verified at runtime by domain code, exercised by Scenarios. +- **Quality Attribute** = technical concern. Belongs to operations, security, performance, availability vocabulary. Examples: *"`PlaceOrder` p95 ≤ 200 ms under 100 RPS sustained"*, *"All requests authenticated via OAuth2 bearer."*, *"Service availability ≥ 99.9% measured monthly."*. Verified by tests/SLOs/policies, not by domain rule machinery. + +A constraint that fits both buckets is almost always a Rule with one or more derived Quality Attributes — express the domain truth as a Rule, then add a QA only when the technical envelope is also part of the contract. + +### 5.3 Interchangeable Building Blocks + +When two or more BBs need to be interchangeable in some context (heterogeneous collection elements, polymorphic property values, behaviour I/O), introduce an explicit **base Building Block** that models the common abstraction. Implementing BBs declare `implements: ["<base BB name>"]`; property / input / output `type` then references the **base BB by name**. + +The base BB is a real domain concept — name the role and the shared shape, not "Anything" or "Item".
Do not invent a phantom umbrella BB just to satisfy the schema. If the polymorphism does not correspond to a real shared abstraction, rethink the model rather than fabricate a base. + +**Worked example.** A composite component tree — a `CompositeComponent` whose `children` is a list of either `CompositeComponent` or `SimpleComponent`: + +```jsonc +{ + "name": "Component", + "type": "value_object", + "description": "Common abstraction over composite and leaf components in an emissions breakdown tree.", + "properties": { "added": [ + { "name": "id", "type": "ComponentId" }, + { "name": "label", "type": "String" } + ] } +}, +{ + "name": "CompositeComponent", + "type": "value_object", + "implements": ["Component"], + "properties": { "added": [ + { "name": "children", "type": "Component", "collection": true } + ] } +}, +{ + "name": "SimpleComponent", + "type": "value_object", + "implements": ["Component"] +} +``` + +The validator resolves every `implements` entry against declared BBs (in this doc or in the prior model). Property `type` values like `"Component"` are valid because `Component` is declared. + +### 5.4 Mermaid blocks inside `description` fields + +Embed mermaid diagrams in JSON `description` fields with `\n`-separated lines: + +```json +"description": "...behaviour body...\n\n```mermaid\nsequenceDiagram\n Actor->>Service: trigger\n Service-->>Actor: ack\n```" +``` + +The triple-backtick `mermaid` opener and the trailing triple-backtick close the block. Use this for application_service behaviours and any behaviour using ≥3 Building Blocks. Authoring the diagram in §3.5 of `noesis:create-design-doc` is preferred over deferring it to a save-warning round-trip. + +## 6. Producing the JSON + +1. If iterating, call `noesis-graph:read_design_doc` first; cache the rendered Markdown, never load the JSON wholesale. Treat it as a hint — the diff baseline is the implemented codebase (§3). +2. 
Call `noesis-graph:list_actors` and keep the catalog handy — it is the deduplication source for `behaviour.actor` names. +3. Walk model-bearing fragments grouped by Bounded Context. +4. For each entity built, set its name from the source heading or the first declarative sentence; do NOT invent names that are absent from the draft. +5. Validate locally: every `usedBuildingBlocks` / `input` / `output` reference, every `properties[].type`, every entry in `implements`, and every `behaviour.actor` must resolve — BB references against BBs in this doc (existing or `added`), actor names against the catalog (or new actors you will introduce). Unknown references are bugs in extraction — flag and either drop the reference, promote the missing block to `added`, or call `upsert_actor` for a new actor before save. +6. Before `save_design_doc`, call `noesis-graph:upsert_actor` for every new actor name introduced — the save fails if a referenced actor isn't in the catalog. +7. SKILL.md Step 4 owns the Save flow (write the JSON, run the pre-save check, call `save_design_doc`). Do not duplicate Save instructions here. + +## 7. Worked examples + +### 7.1 Green-field iteration (no code yet) + +Second authoring pass on a Design Doc whose `implement-design-doc` has **not** run. The prior Design Doc record lists `PlaceOrder` with `description: "..."` (300 chars). The new authoring pass refines the description to 480 chars, adds a new property to `Order`, attaches an actor and a behaviour-scoped quality attribute. 
+ +Correct ChangeSets: + +```jsonc +{ + "boundedContexts": { "added": [ + { + "name": "Sales", + "buildingBlocks": { "added": [ + { + "name": "Order", + "type": "aggregate", + "properties": { "added": [ + { "name": "id", "type": "OrderId" }, + { "name": "customerId", "type": "CustomerId" }, + { "name": "couponCode", "type": "String", "nullable": true } + ] } + }, + { + "name": "PlaceOrderService", + "type": "application_service", + "behaviours": { "added": [ + { + "name": "PlaceOrder", + "type": "Command", + "isPublic": true, + "actor": "Customer", + "description": "...refined 480-char description...", + "qualityAttributes": { "added": [ + { + "name": "Performance:PlaceOrderLatency", + "type": "performance", + "description": "p95 ≤ 200 ms at 100 RPS sustained, measured at the API boundary; degrades to p95 ≤ 500 ms at 250 RPS." + } + ] } + } + ] } + } + ] } + } + ] } +} +``` + +Both Order and PlaceOrder stay in `added` because no code exists yet. `modified` and `removed` are absent. The QA hangs off the behaviour because it is specific to `PlaceOrder` and would not survive being lifted to the BC. + +The skill must have called `upsert_actor({ name: "Customer", description: "..." })` before this save — `Customer` either reuses the catalog entry or is freshly registered. + +### 7.2 Post-implementation rename + +Code exists. The design renames `Lock` → `PriceStateLock`. The aggregate keeps its body; only the name changes. `Lock.quantity` is also referenced from `PriceState.lock`. 
+ +```jsonc +{ + "boundedContexts": { "modified": [ + { + "name": "Pricing", + "buildingBlocks": { + "removed": ["Lock"], + "added": [ + { + "name": "PriceStateLock", + "type": "aggregate", + "properties": { "added": [{ "name": "id", "type": "PriceStateLockId" }, { "name": "quantity", "type": "Quantity" }] }, + "behaviours": { "added": [/* full new spec — same body, new name */] } + } + ] + } + } + ] } +} +``` + +Plus a sweep so `PriceState.lock`'s property `type` and any `usedBuildingBlocks` referencing the old name now reference `PriceStateLock`. The validator rejects a save where `removed: ["Lock"]` coexists with any reference to `"Lock"` elsewhere. + +## 8. Things this agent must NOT do + +- Do not invent business rules, scenarios, properties, or quality attributes not stated in the source. The draft is the source of truth; gap-filling is the architect's job, not the extractor's. +- Do not classify discussion / comparison content as model content. Comparison tables ("Vector RAG vs PageIndex") are NOT Building Blocks. +- Do not invent module names from arbitrary headings. Modules must reflect actual cohesion in the model — not arbitrary heading structure in the source. **However**, once a Bounded Context contains more than ~15 Building Blocks, group them into 3–7 Modules along natural cohesion axes (typically the topic structure pulled in Step 2). Reuse those topic names rather than inventing fresh module names. A Module with fewer than 3 Building Blocks is a smell — fold it back into the BC or merge with a sibling. `save_design_doc` emits a warning when a Bounded Context has >20 building blocks and zero modules. +- Do not produce an empty DesignDoc (no contexts). If extraction yields no model material, skip writing the design doc JSON and skip the `save_design_doc` call. +- Do not invent a phantom umbrella Building Block to satisfy a heterogeneous collection — model the real abstraction or omit `type` (§5.3). 
+- Do not classify a domain invariant as a Quality Attribute or vice versa (§5.2). The discriminator is "domain concern vs technical concern". +- Do not attach a Quality Attribute at a level wider than its actual scope (§5.1) — pick the narrowest container. +- Do not place an `actor` on a behaviour whose host is not an `application_service` — `save_design_doc` rejects it. +- Do not reference an actor name that is not in the catalog. Either reuse from `list_actors` or call `upsert_actor` first. diff --git a/evals/draft-to-design-doc/tasks/threshold-discount-extraction/task.toml b/evals/draft-to-design-doc/tasks/threshold-discount-extraction/task.toml new file mode 100644 index 0000000..dd324da --- /dev/null +++ b/evals/draft-to-design-doc/tasks/threshold-discount-extraction/task.toml @@ -0,0 +1,25 @@ +name = "threshold-discount-extraction" +description = "Extract a Design Doc JSON from a Markdown design draft describing a new threshold-activated discount value object in a Sales pricing model." +difficulty = "intermediate" +estimated_time_minutes = 8 +tags = [ + "design-doc", + "model-extraction", + "DDD", + "noesis-plugin" +] +instruction = "./instruction.md" + +[environment] +type = "docker" +dockerfile = "./environment/Dockerfile" + +[evaluation] +type = "script" +script = "./tests/test.sh" +timeout_seconds = 60 + +[metadata] +domain = "Software Architecture" +draft_language = "English" +expected_output_path = "/app/output/design-doc.json" diff --git a/evals/draft-to-design-doc/tasks/threshold-discount-extraction/tests/test.sh b/evals/draft-to-design-doc/tasks/threshold-discount-extraction/tests/test.sh new file mode 100755 index 0000000..ab063fe --- /dev/null +++ b/evals/draft-to-design-doc/tasks/threshold-discount-extraction/tests/test.sh @@ -0,0 +1,92 @@ +#!/bin/bash +# TDD-style verifier for draft-to-design-doc. The test asserts the EXPECTED +# correct behaviour, not the current behaviour of the noesis plugin. 
+# +# At the time of writing, the noesis plugin's `save_design_doc` enforces a +# green-field ChangeSet shape (everything in `added`). That is a bug — the +# Design Doc should reflect the diff against the implemented codebase, where +# the `Sales` Bounded Context, `Sales.Pricing.Discounts` module and the +# `Discount` discriminated union ALREADY EXIST in the DDD-starter-dotnet repo +# (mounted at /app/repo). The correct shape is therefore: +# +# - `Sales` BC in `boundedContexts.modified` +# - `Sales.Pricing.Discounts` module in `modules.modified` +# - `Discount` BB in `buildingBlocks.modified` (a new `Threshold` factory +# behaviour is added, but the BB itself already exists) +# - `ThresholdDiscount` BB in `buildingBlocks.added` (genuinely new) +# +# Until the plugin lifts the green-field constraint, ALL three variants will +# fail this test. That's the intended TDD signal — the test is the spec. + +set -u + +# Search for the design doc JSON in the two known canonical locations: +# - /app/output/design-doc.json (vanilla / guided variants) +# - /app/noesis/design-docs/*.json (noesis variant — name computed by the plugin) +OUT="" +if [ -f /app/output/design-doc.json ]; then + OUT=/app/output/design-doc.json +elif compgen -G "/app/noesis/design-docs/*.json" > /dev/null; then + # shellcheck disable=SC2012 + OUT=$(ls /app/noesis/design-docs/*.json 2>/dev/null | head -1) +fi + +REWARD_DIR=/logs/verifier +mkdir -p "$REWARD_DIR" 2>/dev/null || true + +fail() { + echo "✗ FAILURE: $1" + echo 0 > "$REWARD_DIR/reward.txt" + exit 1 +} + +if [ -z "$OUT" ]; then + fail "no design-doc JSON found (looked at /app/output/design-doc.json and /app/noesis/design-docs/*.json)" +fi + +echo "Inspecting: $OUT" + +if ! jq -e . 
"$OUT" > /dev/null 2>&1; then + fail "$OUT is not valid JSON" +fi + +# --- Top-level shape --- +jq -e '.name and .description and .boundedContexts' "$OUT" > /dev/null \ + || fail "missing top-level field: name, description, or boundedContexts" + +# --- Sales BC must be in `boundedContexts.modified` (it already exists in /app/repo) --- +jq -e ' + (.boundedContexts.modified // []) | map(.name) | index("Sales") +' "$OUT" > /dev/null \ + || fail "Sales is not in boundedContexts.modified — it should be classified as modified, since it already exists in /app/repo/Sources/Sales/. Found instead: added=$(jq -r ".boundedContexts.added // [] | map(.name) | join(\",\")" "$OUT"), modified=$(jq -r ".boundedContexts.modified // [] | map(.name) | join(\",\")" "$OUT")" + +# --- Sales.Pricing.Discounts module must be in modules.modified under Sales --- +jq -e ' + (.boundedContexts.modified // [])[] | select(.name == "Sales") | + (.modules.modified // []) | map(.name) | index("Sales.Pricing.Discounts") +' "$OUT" > /dev/null \ + || fail "Sales.Pricing.Discounts is not in Sales.modules.modified — it should be modified, since it already exists in /app/repo/Sources/Sales/Sales.DeepModel/Pricing/Discounts/" + +# --- ThresholdDiscount must be in buildingBlocks.added under that module (genuinely new) --- +jq -e ' + (.boundedContexts.modified // [])[] | select(.name == "Sales") | + (.modules.modified // [])[] | select(.name == "Sales.Pricing.Discounts") | + (.buildingBlocks.added // []) | map(.name) | index("ThresholdDiscount") +' "$OUT" > /dev/null \ + || fail "ThresholdDiscount is not in Sales.Pricing.Discounts.buildingBlocks.added — it is the genuinely new building block introduced by this draft" + +# --- Discount must be in buildingBlocks.modified (already exists; gains a new factory behaviour) --- +jq -e ' + (.boundedContexts.modified // [])[] | select(.name == "Sales") | + (.modules.modified // [])[] | select(.name == "Sales.Pricing.Discounts") | + (.buildingBlocks.modified // []) | 
map(.name) | index("Discount") +' "$OUT" > /dev/null \ + || fail "Discount is not in Sales.Pricing.Discounts.buildingBlocks.modified — it already exists in /app/repo/Sources/Sales/Sales.DeepModel/Pricing/Discounts/Discount.cs and the draft adds a Threshold variant to it" + +# All checks passed. +echo "✓ SUCCESS: design doc reflects the correct diff against the implemented codebase" +echo " Path: $OUT" +echo " Sales is modified, Sales.Pricing.Discounts is modified," +echo " ThresholdDiscount is added, Discount is modified." +echo 1 > "$REWARD_DIR/reward.txt" +exit 0 diff --git a/evals/draft-to-design-doc/variants/guided/CLAUDE.md b/evals/draft-to-design-doc/variants/guided/CLAUDE.md new file mode 100644 index 0000000..96d30a7 --- /dev/null +++ b/evals/draft-to-design-doc/variants/guided/CLAUDE.md @@ -0,0 +1,63 @@ +You are an assistant that extracts a Design Doc JSON from a Markdown design draft. + +Inputs in /app: +- draft.md — the design draft (mixes business requirements with explicit architectural decisions). +- schema-reference.md — the JSON schema, naming conventions, ChangeSet rules and modelling conventions the Design Doc must satisfy. + +Output: +- /app/output/design-doc.json — a single JSON document. + +# Approach + +Work through the draft section by section. The draft author splits intent across two flavours of sections, and each flavour maps to a different part of the Design Doc: + +- **Business sections** (Background, Business requirement, Worked examples, Acceptance criteria) state *what* the system must do. Treat them as the source for: Building Block descriptions (the why and where), behaviour preconditions and post-conditions, scenarios in Given/When/Then form. +- **Architectural decisions** (sections like A1, A2, …) state *how* the model is shaped. Treat them as the source for: Building Block types and structure, behaviour algorithms, rules (Trigger / Pre / Algorithm / Post / Edge), interactions between Building Blocks. 
+ +# Step 1 — recognise model content + +Walk fragments grouped by section. Detect model-bearing sections using the lexicon in `schema-reference.md` Section 1 (Bounded Context, Module, Aggregate / Entity / Value Object / Service / Repository / Factory, Behaviour, Rule, Scenario, Quality Attribute). A section that is pure narrative or background is NOT model content — skip it. + +# Step 2 — distribute over Bounded Contexts and Modules + +Decide which Bounded Context(s) the draft touches. If the draft names a module path (e.g. `Sales.Pricing.Discounts`), use that path verbatim. Do not invent module names from arbitrary headings; group only when there is real cohesion in the model. + +# Step 3 — extract elements + +For each model-bearing section, decide what kind of element it produces: + +- **Building Block** — a concept with state and behaviour (or a value object with state alone). Sections that name a new type, give it properties, and describe its operations. +- **Behaviour** on a Building Block — a Command / Event / Query the BB exposes. Sections that describe an operation's input, validation, steps, output. +- **Rule** on a BB or Behaviour — a domain invariant, transition guard, or computation. Use Rules for *domain* concerns; use Quality Attributes for *technical* concerns (latency, throughput, security, observability). +- **Scenario** on a BB or Behaviour — a Given/When/Then triplet that verifies a Rule or exercises a Behaviour. Worked examples in the draft are excellent scenario sources. +- **Property** on a BB — a named field (with a type referring to another BB or a primitive). + +Attach Rules and Scenarios to **exactly one** parent (BB or Behaviour) — the narrowest level the constraint covers. Same for Quality Attributes (BB / Module / Bounded Context — never the top of the doc). + +# Step 4 — write descriptions that satisfy the schema's quality gates + +`schema-reference.md` mandates description shapes for the gated fields. 
Match them — the file is rejected on save if you don't: + +- Every `Rule.description` ≥ 80 characters, structured as: **Trigger** (when it fires), **Pre-conditions**, **Algorithm** (numbered steps or formula), **Post-conditions**, **Edge cases**. No tautologies that paraphrase the rule's `name`. No pure rationale without an algorithm. +- Every `Behaviour.description` ≥ 400 characters, structured as: **Input** (message/command/event with its fields), **Validation / preconditions**, **Steps** (numbered list with the transactional boundary), **Output** (events emitted, what the caller observes). For an `application_service` behaviour or any behaviour with `usedBuildingBlocks.added.length ≥ 3`, embed a Mermaid sequence diagram (a fenced `mermaid` code block) inside the description. +- Every `QualityAttribute.description` ≥ 80 characters, stating a **measurable expectation** (target metric, threshold, scope). + +# Step 5 — references must resolve + +Every `properties[].type`, every entry in `behaviour.input` / `output` / `usedBuildingBlocks`, every entry in `implements`, must refer to a Building Block name that exists in this Design Doc OR a primitive (`String`, `Int`, `Boolean`, `Money`, `Percentage`, …). If a reference points at a name that has no declaring BB and is not a primitive, either declare the BB or remove the reference. Dangling references make the doc useless to a downstream implementer. + +# Step 6 — ChangeSet shape + +Follow `schema-reference.md` Section 3 — the diff baseline is the **currently implemented codebase**. Decide for each element whether it goes in `added`, `modified`, or `removed` based on what the draft says relative to that baseline. + +# Don'ts + +- Do not invent business rules, scenarios, properties, or quality attributes not stated in the draft. Gap-filling is the architect's job, not the extractor's. +- Do not classify discussion or comparison content as model content (e.g. "Vector RAG vs PageIndex" is not a Building Block).
+- Do not invent module names from arbitrary headings. Group BBs into modules only when the draft itself does, or when a Bounded Context grows beyond ~15 BBs. +- Do not collapse a Rule into a Quality Attribute or vice versa. Domain invariant → Rule. Technical envelope → Quality Attribute. +- Do not place an `actor` on a Behaviour whose host is anything other than `application_service`. Actors are graph-global personas (end-user roles), never an external system, scheduler, or another module. + +# Output + +Write the validated `DesignDoc` payload to `/app/output/design-doc.json`. Single file, single top-level JSON object. All field names camelCase. Top-level fields: `name`, `description`, `boundedContexts`. Nested ChangeSets follow the shape `{ "added": [...], "modified": [...], "removed": [...] }`; use `[]` for any bucket with no entries. diff --git a/evals/draft-to-design-doc/variants/guided/harbor_config.json b/evals/draft-to-design-doc/variants/guided/harbor_config.json new file mode 100644 index 0000000..55b4bab --- /dev/null +++ b/evals/draft-to-design-doc/variants/guided/harbor_config.json @@ -0,0 +1,15 @@ +{ + "agents": [ + { + "import_path": "nasde_toolkit.agents.configurable_claude:ConfigurableClaude", + "name": "guided", + "kwargs": { + "sandbox_files": { + "/app/CLAUDE.md": "variants/guided/CLAUDE.md", + "/app/draft.md": "tasks/threshold-discount-extraction/draft.md", + "/app/schema-reference.md": "tasks/threshold-discount-extraction/schema-reference.md" + } + } + } + ] +} diff --git a/evals/draft-to-design-doc/variants/guided/variant.toml b/evals/draft-to-design-doc/variants/guided/variant.toml new file mode 100644 index 0000000..6268cc1 --- /dev/null +++ b/evals/draft-to-design-doc/variants/guided/variant.toml @@ -0,0 +1,2 @@ +agent = "claude" +model = "claude-sonnet-4-6" diff --git a/evals/draft-to-design-doc/variants/noesis/CLAUDE.md b/evals/draft-to-design-doc/variants/noesis/CLAUDE.md new file mode 100644 index 0000000..298a798 --- /dev/null +++
b/evals/draft-to-design-doc/variants/noesis/CLAUDE.md @@ -0,0 +1,7 @@ +You are an assistant inside a sandbox where the Noesis plugin is installed and the `noesis-graph` MCP server is registered. + +The plugin exposes the `noesis:analyze-design-draft` skill. Invoke it on `/app/draft.md` exactly as the task instruction tells you. Do not write the Design Doc JSON manually — the skill writes it through `save_design_doc` under `/app/noesis/design-docs/`. + +Plugin location: `/opt/noesis-plugin` (registered at `/root/.claude/plugins/noesis/`). +Available MCP server: `noesis-graph` (registered in the sandbox's Claude config). +Available skill: `noesis:analyze-design-draft`. diff --git a/evals/draft-to-design-doc/variants/noesis/harbor_config.json b/evals/draft-to-design-doc/variants/noesis/harbor_config.json new file mode 100644 index 0000000..4772e56 --- /dev/null +++ b/evals/draft-to-design-doc/variants/noesis/harbor_config.json @@ -0,0 +1,14 @@ +{ + "agents": [ + { + "import_path": "nasde_toolkit.agents.configurable_claude:ConfigurableClaude", + "name": "noesis", + "kwargs": { + "sandbox_files": { + "/app/CLAUDE.md": "variants/noesis/CLAUDE.md", + "/app/draft.md": "tasks/threshold-discount-extraction-noesis/draft.md" + } + } + } + ] +} diff --git a/evals/draft-to-design-doc/variants/noesis/variant.toml b/evals/draft-to-design-doc/variants/noesis/variant.toml new file mode 100644 index 0000000..6268cc1 --- /dev/null +++ b/evals/draft-to-design-doc/variants/noesis/variant.toml @@ -0,0 +1,2 @@ +agent = "claude" +model = "claude-sonnet-4-6" diff --git a/evals/draft-to-design-doc/variants/vanilla/CLAUDE.md b/evals/draft-to-design-doc/variants/vanilla/CLAUDE.md new file mode 100644 index 0000000..79a553d --- /dev/null +++ b/evals/draft-to-design-doc/variants/vanilla/CLAUDE.md @@ -0,0 +1,7 @@ +You are an assistant that extracts a Design Doc JSON from a Markdown design draft. + +Inputs in /app: +- draft.md — the design draft. 
+- schema-reference.md — the JSON schema and modelling rules. + +Read both files before producing output. Then write a single JSON document to /app/output/design-doc.json. diff --git a/evals/draft-to-design-doc/variants/vanilla/harbor_config.json b/evals/draft-to-design-doc/variants/vanilla/harbor_config.json new file mode 100644 index 0000000..fda914e --- /dev/null +++ b/evals/draft-to-design-doc/variants/vanilla/harbor_config.json @@ -0,0 +1,15 @@ +{ + "agents": [ + { + "import_path": "nasde_toolkit.agents.configurable_claude:ConfigurableClaude", + "name": "vanilla", + "kwargs": { + "sandbox_files": { + "/app/CLAUDE.md": "variants/vanilla/CLAUDE.md", + "/app/draft.md": "tasks/threshold-discount-extraction/draft.md", + "/app/schema-reference.md": "tasks/threshold-discount-extraction/schema-reference.md" + } + } + } + ] +} diff --git a/evals/draft-to-design-doc/variants/vanilla/variant.toml b/evals/draft-to-design-doc/variants/vanilla/variant.toml new file mode 100644 index 0000000..6268cc1 --- /dev/null +++ b/evals/draft-to-design-doc/variants/vanilla/variant.toml @@ -0,0 +1,2 @@ +agent = "claude" +model = "claude-sonnet-4-6"