From 1412ddfbd9de7d8a264032d7868db92e2d44df3a Mon Sep 17 00:00:00 2001 From: mohammed naji Date: Wed, 10 Jun 2026 13:25:30 +0400 Subject: [PATCH] chore: prepare 0.28.0 release --- CHANGELOG.md | 17 ++++++++++++++ README.md | 38 +++++++++++++++++++++++--------- docs/mcp-registry/server.json | 4 ++-- package-lock.json | 4 ++-- package.json | 2 +- tests/unit/why-madar-doc.test.ts | 13 ++++++----- 6 files changed, 56 insertions(+), 22 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 16fa7c40..58e9c49d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,23 @@ All notable changes to the TypeScript package will be documented in this file. +## [0.28.0] - 2026-06-10 + +### Added + +- **Proof-backed public TypeScript benchmark receipts are now part of the stable release**: the public `documenso`, `formbricks`, `dub`, `twenty`, `cal-diy`, and `novu` `explain-runtime` legacy rows now have checked-in share-safe receipts with `benchmark_outcome = "full_win"`, `benchmark_readiness = "ready"`, passing Madar answer-quality gates, and empty runtime-proof missing obligations. +- **Strict runtime-proof benchmarking is now first-class**: benchmark rows can require explicit entrypoint, handoff, and terminal-effect obligations, and reports now expose runtime-proof evidence so a row cannot be claimed as a win when required flow evidence is missing. + +### Changed + +- **README and claim surfaces now lead with the 0.28.0 proof boundary**: public copy now shows the six-row TypeScript `explain-runtime` legacy benchmark table while keeping the claim scoped to single-trial, repo/task-specific receipts and keeping SPI arms separate. +- **Runtime retrieval is more completeness-driven**: slice selection, targeted recovery, source evidence, scoped benchmark roots, and framework/runtime handoff handling were tightened so Madar can surface direct evidence before the agent answers. + +### Fixed + +- **Benchmark receipts no longer hide missing proof behind soft wins**: strict rows now fail closed when required runtime obligations are absent, direct-evidence answer checks are enforced, nested trace tool inputs are summarized more reliably, and mixed workspace-relative evidence path issues are removed from the saved reports. +- **Public benchmark reproducibility is stronger**: the suite honors explicit benchmark CLI overrides, keeps scoped-root fixtures platform-aware, avoids dropping source-visible runtime files behind broad ignore rules, and records share-safe reports for each public legacy row. + ## [0.27.9] - 2026-06-04 ### Added diff --git a/README.md b/README.md index bbb57c18..f95b51f2 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,8 @@ Madar builds a local graph of your TypeScript/Node repo, then gives agents like It helps agents spend less time rediscovering the same files, routes, imports, and flows. +In the latest public TypeScript benchmark receipts, Madar produced proof-backed `full_win` outcomes on 6/6 `explain-runtime` legacy rows with strict runtime-proof gates enabled. + [![npm](https://img.shields.io/npm/v/%40lubab%2Fmadar)](https://www.npmjs.com/package/@lubab/madar) [![node >=20](https://img.shields.io/badge/node-%E2%89%A520-3c873a)](https://nodejs.org/) [![local first](https://img.shields.io/badge/local--first-no%20cloud%20required-0f766e)](#privacy) @@ -23,6 +25,18 @@ Madar gives the agent a smaller, repo-grounded starting point. It does not replace the agent. It helps the agent start from better evidence. +## What Agents Get + +For each task, Madar can surface: + +- the likely entry files, symbols, routes, and handlers +- direct snippets and file paths for the current question +- relationships such as imports, calls, framework roles, and runtime handoffs +- freshness metadata tied to git state +- share-safe benchmark and handoff artifacts for review + +The goal is not to make the agent blind to the repo. The goal is to make the first pass smaller, more relevant, and easier to verify. + ## Install ```bash @@ -145,16 +159,18 @@ Use `--require-fresh-context` when the selected files must be fresh. Use `--requ ## Evidence -On one verified GoValidate backend explain task, Madar reduced: +Madar now has proof-backed public TypeScript `explain-runtime` legacy benchmark receipts across six open-source repos. Each row below has `benchmark_outcome = "full_win"`, `benchmark_readiness = "ready"`, `answer_quality.madar.passed = true`, and `answer_contract.runtime_proof.missing_obligations = []`. -| Metric | Without Madar | With Madar | -| --- | ---: | ---: | -| Tool calls | 28 | 7 | -| Input tokens | 2,366,946 | 498,688 | -| Wall-clock latency | 158,995 ms | 72,420 ms | -| Cost | $2.6595 | $0.9728 | +| Repo | Input tokens | Fresh tokens | Tool calls | Turns | Latency | Cost | +| --- | ---: | ---: | ---: | ---: | ---: | ---: | +| `documenso` | 174,504 -> 76,721 (2.27x) | 31,754 -> 16,001 (1.98x) | 7 -> 2 | 8 -> 3 | 58.2s -> 35.3s (1.65x) | $0.3498 -> $0.1634 (2.14x) | +| `formbricks` | 163,482 -> 74,395 (2.20x) | 19,471 -> 14,663 (1.33x) | 37 -> 2 | 6 -> 3 | 157.6s -> 22.6s (6.99x) | $0.4973 -> $0.1350 (3.68x) | +| `dub` | 233,038 -> 76,538 (3.04x) | 33,088 -> 15,847 (2.09x) | 9 -> 2 | 10 -> 3 | 69.4s -> 30.2s (2.29x) | $0.3928 -> $0.1570 (2.50x) | +| `twenty` | 694,972 -> 103,125 (6.74x) | 48,000 -> 22,355 (2.15x) | 21 -> 3 | 22 -> 4 | 128.5s -> 58.7s (2.19x) | $0.8000 -> $0.2069 (3.87x) | +| `cal-diy` | 1,588,241 -> 101,820 (15.60x) | 61,669 -> 21,688 (2.84x) | 37 -> 3 | 38 -> 4 | 252.0s -> 38.7s (6.51x) | $1.4263 -> $0.1946 (7.33x) | +| `novu` | 1,055,389 -> 75,772 (13.93x) | 63,542 -> 15,491 (4.10x) | 23 -> 2 | 24 -> 3 | 220.3s -> 31.1s (7.09x) | $1.1316 -> $0.1620 (6.98x) | -This is not a universal benchmark claim. It is one repo, one prompt, one agent runtime, and one verified install path. +This is not a universal benchmark claim. These are repo/task-specific, single-trial, legacy-row receipts for public TypeScript `explain-runtime` prompts. SPI arms are tracked separately and are not folded into this 6/6 claim. The public evidence map tracks what is proven, what is mixed, and what should not be claimed yet: [claims and evidence](https://github.com/mohanagy/madar/blob/main/docs/claims-and-evidence.md). @@ -184,11 +200,11 @@ It does not record prompt text, answer text, source paths, source content, or re ## What's New -Current version: `0.27.9`. +Current version: `0.28.0`. -This release includes the stable next-track adoption bundle: the one-command `madar try` flow, opt-in telemetry, verified agent quickstarts, public benchmark-suite work, freshness improvements, and Windows Claude workflow fixes. +This release promotes the public benchmark work to a proof-backed stable release: six public TypeScript `explain-runtime` legacy rows now have checked-in `full_win` receipts, strict runtime-proof gates, direct-evidence answer checks, scoped benchmark roots, and share-safe reports. It also includes retrieval and extraction improvements for runtime handoffs, source-visible framework flows, and benchmark reproducibility. -Read the full notes in the [0.27.9 changelog](https://github.com/mohanagy/madar/blob/main/CHANGELOG.md#0279---2026-06-04). +Read the full notes in the [0.28.0 changelog](https://github.com/mohanagy/madar/blob/main/CHANGELOG.md#0280---2026-06-10). ## Docs diff --git a/docs/mcp-registry/server.json b/docs/mcp-registry/server.json index 440f3e48..51e2540c 100644 --- a/docs/mcp-registry/server.json +++ b/docs/mcp-registry/server.json @@ -9,13 +9,13 @@ "source": "github", "url": "https://github.com/mohanagy/madar" }, - "version": "0.27.9", + "version": "0.28.0", "packages": [ { "registryType": "npm", "registryBaseUrl": "https://registry.npmjs.org", "identifier": "@lubab/madar", - "version": "0.27.9", + "version": "0.28.0", "runtimeHint": "npx", "transport": { "type": "stdio" diff --git a/package-lock.json b/package-lock.json index 16afb1ca..f152451a 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "@lubab/madar", - "version": "0.27.9", + "version": "0.28.0", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "@lubab/madar", - "version": "0.27.9", + "version": "0.28.0", "license": "MIT", "dependencies": { "@vscode/tree-sitter-wasm": "^0.3.1", diff --git a/package.json b/package.json index e8a14bcb..864e6890 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@lubab/madar", - "version": "0.27.9", + "version": "0.28.0", "description": "Stop AI coding agents from rediscovering large TypeScript/Node repos. Madar compiles task-aware local context packs from what runs for this task.", "license": "MIT", "author": "mohanagy", diff --git a/tests/unit/why-madar-doc.test.ts b/tests/unit/why-madar-doc.test.ts index 78ca8d3d..57225c7b 100644 --- a/tests/unit/why-madar-doc.test.ts +++ b/tests/unit/why-madar-doc.test.ts @@ -86,8 +86,8 @@ describe('public marketing copy honesty', () => { }) it('surfaces the current stable release and benchmark evidence pointers in the main README flow', () => { - expect(content).toContain('Current version: `0.27.9`') - expect(content).toContain('0.27.9 changelog') + expect(content).toContain('Current version: `0.28.0`') + expect(content).toContain('0.28.0 changelog') expect(content).toContain('madar summary') expect(content).toContain('docs/claims-and-evidence.md') expect(content).toContain('docs/benchmarks/suite/') @@ -142,10 +142,11 @@ describe('public marketing copy honesty', () => { }) it('pins the benchmark evidence table values in the README', () => { - expect(content).toContain('| Tool calls | 28 | 7 |') - expect(content).toContain('| Input tokens | 2,366,946 | 498,688 |') - expect(content).toContain('| Wall-clock latency | 158,995 ms | 72,420 ms |') - expect(content).toContain('| Cost | $2.6595 | $0.9728 |') + expect(content).toContain('6/6 `explain-runtime` legacy rows') + expect(content).toContain('| `documenso` | 174,504 -> 76,721 (2.27x)') + expect(content).toContain('| `cal-diy` | 1,588,241 -> 101,820 (15.60x)') + expect(content).toContain('| `novu` | 1,055,389 -> 75,772 (13.93x)') + expect(content).toContain('SPI arms are tracked separately and are not folded into this 6/6 claim.') }) it('keeps claim buckets in the evidence docs while the README stays a compact pointer', () => {