From 95e2756272d2ecfff150e2939a85742ed98530a9 Mon Sep 17 00:00:00 2001
From: James Wade <github@jameshwade.com>
Date: Sun, 14 Jun 2026 18:00:25 -0400
Subject: [PATCH 1/7] fix: release-unblock + silent-failure correctness fixes
 (wave 1)

Highest-value, lowest-risk findings from the codebase audit.

Release readiness (dsprrr-w0e):
- Lower the ellmer requirement to the released >= 0.4.1 and drop the
  CRAN-forbidden "Remotes: tidyverse/ellmer" entry.
- Remove unused Suggests: future and lifecycle.
- Add NEWS.md.

Correctness:
- print.dsprrr_batch_result now counts failed items. It read the error from
  item$error / the output object, but create_error_result() stores it in
  metadata$error with output = NA, so failed batches always printed
  "All items completed successfully" (dsprrr-8l0).
- Unknown or misspelled signature types now error with a closest-match
  suggestion instead of silently becoming string fields. validate_type_string()
  was fully written but never called (dsprrr-47p).
- module_trials() warns and returns an inspectable summary when every trial
  fails, instead of a cryptic "attempt to select less than one element"
  (dsprrr-hew).
- .cache is accepted and validated by all module types and pipelines, not
  only PredictModule, where it previously fell into ... and was dropped with
  a spurious "unknown input" warning (dsprrr-jup).

Test isolation:
- Add tests/testthat/setup.R plus a DSPRRR_CACHE_PATH env var so the suite
  no longer writes the disk cache into the package source tree (dsprrr-63v).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 DESCRIPTION                            |  6 +---
 NEWS.md                                | 43 +++++++++++++++++++++++
 R/cache.R                              | 14 ++++++--
 R/module-base.R                        | 12 +++++--
 R/optimize.R                           | 16 +++++++++
 R/run.R                                | 47 ++++++++++++++++++--------
 R/signature-parser.R                   | 11 ++++++
 man/configure_cache.Rd                 |  2 +-
 tests/testthat/setup.R                 | 32 ++++++++++++++++++
 tests/testthat/test-cache.R            | 14 ++++++++
 tests/testthat/test-optimize.R         | 19 +++++++++++
 tests/testthat/test-run.R              | 34 +++++++++++++++++++
 tests/testthat/test-signature-parser.R | 26 ++++++++++++++
 13 files changed, 250 insertions(+), 26 deletions(-)
 create mode 100644 NEWS.md
 create mode 100644 tests/testthat/setup.R

diff --git a/DESCRIPTION b/DESCRIPTION
index 9f0dc788..0966cff7 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -26,7 +26,7 @@ Imports:
     cachem,
     cli,
     digest,
-    ellmer (>= 0.4.0.9000),
+    ellmer (>= 0.4.1),
     glue,
     jsonlite,
     methods,
@@ -38,8 +38,6 @@ Imports:
     tibble,
     withr
 Suggests:
-    future,
-    lifecycle,
     callr,
     coro,
     dials,
@@ -60,8 +58,6 @@ Suggests:
     vcr (>= 2.0.0),
     vitals,
     yardstick
-Remotes:
-    tidyverse/ellmer
 Collate:
     'accessors.R'
     'assertion-helpers.R'
diff --git a/NEWS.md b/NEWS.md
new file mode 100644
index 00000000..8838be9e
--- /dev/null
+++ b/NEWS.md
@@ -0,0 +1,43 @@
+# dsprrr (development version)
+
+First development changelog. dsprrr is experimental; the API may change.
+
+## Bug fixes
+
+* `BootstrapFewShot` now harvests demonstrations when the metric targets a
+  specific output field (e.g. `metric_exact_match(field = "answer")`).
+  Previously the single-module path passed a bare value to the metric, which
+  errored internally and silently bootstrapped zero demos (#dsprrr-s3b).
+
+* Failed items in a batch are now counted and reported. Previously
+  `print()` on a batch result always said "All items completed successfully"
+  because it looked for the error in the wrong place (#dsprrr-8l0).
+
+* Unknown or misspelled types in a signature string (e.g. `"q -> a: interger"`)
+  now raise an error suggesting the closest valid type, instead of silently
+  becoming a string field (#dsprrr-47p).
+
+* `optimize_grid()` now gives a clear error when every trial fails (for
+  example, when the API is unreachable) instead of a cryptic
+  "attempt to select less than one element" (#dsprrr-hew).
+
+* `.cache` is now honored by all module types and pipelines, not only
+  `PredictModule`. Previously it was silently dropped (with a spurious
+  "unknown input" warning) for every other module (#dsprrr-jup).
+
+* Evaluation now treats a failed prediction as a score of 0 rather than
+  dropping it from the mean, so optimizers no longer prefer configurations
+  that fail on most of the data (#dsprrr-tn1).
+
+## Internal
+
+* Tests now isolate the on-disk cache to a temporary directory, so running
+  the test suite no longer writes into the package source tree (#dsprrr-63v).
+
+## Packaging
+
+* Lowered the minimum `ellmer` requirement to the released `>= 0.4.1` and
+  removed the `Remotes:` entry, so the package installs from CRAN-released
+  dependencies (#dsprrr-w0e).
+
+* Removed unused `Suggests`: `future` and `lifecycle`.
diff --git a/R/cache.R b/R/cache.R
index 2c35605a..e2b990d3 100644
--- a/R/cache.R
+++ b/R/cache.R
@@ -12,6 +12,16 @@ NULL
 
 # ── Cache Configuration ──────────────────────────────────────────────────────
 
+#' Default on-disk cache directory
+#'
+#' Honors the `DSPRRR_CACHE_PATH` environment variable so the test suite (and
+#' users with read-only working directories) can redirect the disk cache
+#' without writing into the current working directory.
+#' @noRd
+default_disk_cache_path <- function() {
+  Sys.getenv("DSPRRR_CACHE_PATH", unset = ".dsprrr_cache")
+}
+
 #' Configure dsprrr Cache
 #'
 #' @description
@@ -69,7 +79,7 @@ configure_cache <- function(
   enable = TRUE,
   enable_memory = TRUE,
   enable_disk = TRUE,
-  disk_path = ".dsprrr_cache",
+  disk_path = default_disk_cache_path(),
   memory_max_entries = 1000L,
   disk_max_size = 500 * 1024^2,
   disk_max_age = Inf
@@ -245,7 +255,7 @@ get_cache_config <- function() {
       enable = !env_disabled,
       enable_memory = TRUE,
       enable_disk = TRUE,
-      disk_path = ".dsprrr_cache",
+      disk_path = default_disk_cache_path(),
       memory_max_entries = 1000L,
       disk_max_size = 500 * 1024^2,
       disk_max_age = Inf
diff --git a/R/module-base.R b/R/module-base.R
index f652f756..f097ac64 100644
--- a/R/module-base.R
+++ b/R/module-base.R
@@ -83,7 +83,8 @@ Module <- R6::R6Class(
       .verbose = FALSE,
       .parallel = FALSE,
       .progress = TRUE,
-      .return_format = "simple"
+      .return_format = "simple",
+      .cache = NULL
     ) {
       inputs <- list(...)
 
@@ -131,7 +132,12 @@ Module <- R6::R6Class(
         results <- vector("list", max_length)
         for (i in seq_len(max_length)) {
           input_set <- lapply(inputs, `[[`, i)
-          result <- self$forward(input_set, .llm = .llm, trace = TRUE)
+          result <- self$forward(
+            input_set,
+            .llm = .llm,
+            trace = TRUE,
+            .cache = .cache
+          )
 
           if (.return_format == "simple") {
             results[[i]] <- result$output[[1]]
@@ -148,7 +154,7 @@ Module <- R6::R6Class(
       }
 
       # Single input processing
-      result <- self$forward(inputs, .llm = .llm, trace = TRUE)
+      result <- self$forward(inputs, .llm = .llm, trace = TRUE, .cache = .cache)
 
       if (.return_format == "simple") {
         return(result$output[[1]])
diff --git a/R/optimize.R b/R/optimize.R
index ca96401f..f041d13a 100644
--- a/R/optimize.R
+++ b/R/optimize.R
@@ -458,6 +458,22 @@ module_trials <- function(
   }
 
   scores <- trials$score
+  if (all(is.na(scores))) {
+    cli::cli_warn(c(
+      "All {nrow(trials)} optimization trial{?s} failed (no valid scores).",
+      "i" = "Check that the LLM is reachable and the metric returns numeric scores.",
+      "i" = "Inspect the {.field trials} list-column for per-trial details."
+    ))
+    return(tibble::tibble(
+      n_trials = nrow(trials),
+      best_trial = NA_integer_,
+      best_score = NA_real_,
+      mean_score = NA_real_,
+      std_error = NA_real_,
+      best_params = list(NULL),
+      trials = list(trials)
+    ))
+  }
   best_idx <- if (objective == "maximize") {
     which.max(scores)
   } else {
diff --git a/R/run.R b/R/run.R
index 29f1273d..07644cd8 100644
--- a/R/run.R
+++ b/R/run.R
@@ -72,6 +72,27 @@ run <- function(module, ...) {
   UseMethod("run")
 }
 
+#' Validate the `.cache` argument
+#'
+#' Shared by [run()] methods so every module type rejects malformed `.cache`
+#' values the same way.
+#' @noRd
+validate_cache_arg <- function(.cache) {
+  if (!is.null(.cache)) {
+    if (!is.logical(.cache) || length(.cache) != 1 || is.na(.cache)) {
+      cache_value <- .cache
+      cli::cli_abort(c(
+        "{.arg .cache} must be {.code TRUE}, {.code FALSE}, or {.code NULL}",
+        "x" = "You provided: {.cls {class(cache_value)}} with value {.val {cache_value}}",
+        "i" = "{.code TRUE} attempts to use cache (if available)",
+        "i" = "{.code FALSE} bypasses cache for this call",
+        "i" = "{.code NULL} uses global cache configuration (default)"
+      ))
+    }
+  }
+  invisible(.cache)
+}
+
 #' @export
 run.Module <- function(
   module,
@@ -82,9 +103,11 @@ run.Module <- function(
   .parallel_method = c("ellmer", "mirai"),
   .progress = TRUE,
   .return_format = "simple",
-  .show_prompt = FALSE
+  .show_prompt = FALSE,
+  .cache = NULL
 ) {
   .parallel_method <- match.arg(.parallel_method)
+  validate_cache_arg(.cache)
 
   # Show prompt preview if requested
   if (.show_prompt) {
@@ -98,7 +121,8 @@ run.Module <- function(
     .verbose = .verbose,
     .parallel = .parallel,
     .progress = .progress,
-    .return_format = .return_format
+    .return_format = .return_format,
+    .cache = .cache
   )
 }
 
@@ -118,18 +142,7 @@ run.PredictModule <- function(
   .parallel_method <- match.arg(.parallel_method)
 
   # Validate .cache parameter
-  if (!is.null(.cache)) {
-    if (!is.logical(.cache) || length(.cache) != 1 || is.na(.cache)) {
-      cache_value <- .cache
-      cli::cli_abort(c(
-        "{.arg .cache} must be {.code TRUE}, {.code FALSE}, or {.code NULL}",
-        "x" = "You provided: {.cls {class(cache_value)}} with value {.val {cache_value}}",
-        "i" = "{.code TRUE} attempts to use cache (if available)",
-        "i" = "{.code FALSE} bypasses cache for this call",
-        "i" = "{.code NULL} uses global cache configuration (default)"
-      ))
-    }
-  }
+  validate_cache_arg(.cache)
 
   # Show prompt preview if requested
   if (.show_prompt) {
@@ -1499,7 +1512,11 @@ print.dsprrr_batch_result <- function(x, ...) {
   n_errors <- sum(vapply(
     x,
     function(item) {
-      isTRUE(item$error) || inherits(item$output, "error")
+      # Errors from create_error_result() are stored in metadata$error with
+      # output = NA; also tolerate other shapes defensively.
+      !is.null(item$metadata$error) ||
+        isTRUE(item$error) ||
+        inherits(item$output, "error")
     },
     logical(1)
   ))
diff --git a/R/signature-parser.R b/R/signature-parser.R
index 0ad63c63..a5e5f455 100644
--- a/R/signature-parser.R
+++ b/R/signature-parser.R
@@ -393,6 +393,10 @@ parse_type_string <- function(type_str, field_name = NULL) {
   # Handle common type patterns
   type_str <- trimws(type_str)
 
+  # Catch unknown / misspelled simple types early with a helpful error,
+  # rather than silently falling through to the type_string() default below.
+  validate_type_string(type_str)
+
   # Handle Optional types
   if (grepl("^Optional\\[", type_str)) {
     inner_type_str <- sub("^Optional\\[(.*)\\]$", "\\1", type_str)
@@ -718,8 +722,15 @@ validate_type_string <- function(type_str, context = "output") {
 
   if (!is_known && !grepl("\\[|\\(", type_str)) {
     # Unknown simple type
+    suggestion <- find_closest_match(
+      type_str,
+      c(known_types, known_constructors)
+    )
     cli::cli_abort(c(
       "Unknown type: {.val {type_str}}",
+      if (!is.null(suggestion)) {
+        c("i" = "Did you mean {.code {suggestion}}?")
+      },
       "i" = "Available simple types: {.code string}, {.code number}, {.code integer}, {.code boolean}",
       "i" = "Available complex types: {.code enum('a', 'b')}, {.code array(string)}, {.code object}"
     ))
diff --git a/man/configure_cache.Rd b/man/configure_cache.Rd
index 43441a6a..ec863fe6 100644
--- a/man/configure_cache.Rd
+++ b/man/configure_cache.Rd
@@ -8,7 +8,7 @@ configure_cache(
   enable = TRUE,
   enable_memory = TRUE,
   enable_disk = TRUE,
-  disk_path = ".dsprrr_cache",
+  disk_path = default_disk_cache_path(),
   memory_max_entries = 1000L,
   disk_max_size = 500 * 1024^2,
   disk_max_age = Inf
diff --git a/tests/testthat/setup.R b/tests/testthat/setup.R
new file mode 100644
index 00000000..255eb6a7
--- /dev/null
+++ b/tests/testthat/setup.R
@@ -0,0 +1,32 @@
+# Isolate the on-disk cache during the test suite.
+#
+# Without this, the default disk cache writes into the working directory
+# (tests/testthat/.dsprrr_cache during R CMD check), which pollutes the package
+# source tree and leaks state across runs -- a documented source of flaky tests.
+#
+# We redirect the cache to a temporary directory via DSPRRR_CACHE_PATH (so even
+# bare configure_cache() calls land there) and disable the disk tier by default.
+# Cache-behavior tests that need disk caching opt back in with their own
+# tempdirs, which take precedence over this default.
+
+dsprrr_test_cache_dir <- file.path(tempdir(), "dsprrr-test-cache")
+dir.create(dsprrr_test_cache_dir, showWarnings = FALSE, recursive = TRUE)
+
+withr::local_envvar(
+  DSPRRR_CACHE_PATH = dsprrr_test_cache_dir,
+  .local_envir = testthat::teardown_env()
+)
+
+dsprrr::configure_cache(
+  enable_disk = FALSE,
+  disk_path = dsprrr_test_cache_dir
+)
+
+withr::defer(
+  {
+    dsprrr::clear_cache()
+    unlink(dsprrr_test_cache_dir, recursive = TRUE)
+    dsprrr::configure_cache()
+  },
+  envir = testthat::teardown_env()
+)
diff --git a/tests/testthat/test-cache.R b/tests/testthat/test-cache.R
index 424a066a..32d2ab87 100644
--- a/tests/testthat/test-cache.R
+++ b/tests/testthat/test-cache.R
@@ -1123,3 +1123,17 @@ test_that("dsprrr_sitrep shows disabled cache", {
 
   expect_false(result$cache_enabled)
 })
+
+test_that(".cache is accepted and validated for non-Predict modules (dsprrr-jup)", {
+  # Regression: only run.PredictModule declared .cache; for every other module
+  # type and pipelines it fell into ... -> treated as an unknown signature input
+  # (spurious warning) and dropped.
+  mod <- module_fn("text -> answer", function(text) paste("Echo:", text))
+
+  expect_no_warning(res <- run(mod, text = "hi", .cache = FALSE))
+  expect_equal(res$answer, "Echo: hi")
+
+  # Invalid .cache values are rejected the same way as for PredictModule.
+  expect_error(run(mod, text = "hi", .cache = "yes"), "must be")
+  expect_error(run(mod, text = "hi", .cache = NA), "must be")
+})
diff --git a/tests/testthat/test-optimize.R b/tests/testthat/test-optimize.R
index aefe9c58..da486f72 100644
--- a/tests/testthat/test-optimize.R
+++ b/tests/testthat/test-optimize.R
@@ -248,3 +248,22 @@ test_that("module_metric_summary handles modules without trials", {
   metrics <- module_metrics(mod)
   expect_equal(nrow(metrics), 0)
 })
+
+test_that("module_trials() warns and returns an inspectable summary when all trials fail (dsprrr-hew)", {
+  # Regression: which.max() on all-NA scores returns integer(0), so indexing
+  # threw "attempt to select less than one element" instead of a clear message.
+  mod <- module(signature("question -> answer"), type = "predict")
+  mod$state$trials <- tibble::tibble(
+    trial_id = 1:3,
+    score = NA_real_,
+    parameters = list(list(a = 1), list(a = 2), list(a = 3))
+  )
+
+  expect_warning(
+    summary <- module_trials(mod),
+    "trial.*failed"
+  )
+  expect_equal(summary$n_trials, 3L)
+  expect_true(is.na(summary$best_score))
+  expect_true(is.na(summary$best_trial))
+})
diff --git a/tests/testthat/test-run.R b/tests/testthat/test-run.R
index 3f99514e..259f4116 100644
--- a/tests/testthat/test-run.R
+++ b/tests/testthat/test-run.R
@@ -1065,3 +1065,37 @@ test_that("mock_batch_chat handles character response directly", {
 
   expect_true(inherits(result, "Chat"))
 })
+
+test_that("print.dsprrr_batch_result counts failed items (dsprrr-8l0)", {
+  # Regression: the error predicate read item$error / inherits(item$output,
+  # "error"), but create_error_result() stores the message in metadata$error
+  # with output = NA, so n_errors was always 0 and failures printed as success.
+  result <- structure(
+    list(
+      list(output = "ok", chat = NULL, metadata = list()),
+      list(
+        output = NA,
+        chat = NULL,
+        metadata = list(error = "intentional failure", batch_index = 2L)
+      )
+    ),
+    class = c("dsprrr_batch_result", "list")
+  )
+
+  out <- cli::cli_fmt(print(result))
+  expect_true(any(grepl("Errors", out)))
+  expect_true(any(grepl("1 of 2", out)))
+  expect_false(any(grepl("All items completed successfully", out)))
+})
+
+test_that("print.dsprrr_batch_result reports success when there are no errors", {
+  result <- structure(
+    list(
+      list(output = "a", chat = NULL, metadata = list()),
+      list(output = "b", chat = NULL, metadata = list())
+    ),
+    class = c("dsprrr_batch_result", "list")
+  )
+  out <- cli::cli_fmt(print(result))
+  expect_true(any(grepl("All items completed successfully", out)))
+})
diff --git a/tests/testthat/test-signature-parser.R b/tests/testthat/test-signature-parser.R
index b7302775..2613d00a 100644
--- a/tests/testthat/test-signature-parser.R
+++ b/tests/testthat/test-signature-parser.R
@@ -356,3 +356,29 @@ test_that("find_closest_match works correctly", {
   # No close match
   expect_null(dsprrr:::find_closest_match("zzzzz", candidates))
 })
+
+test_that("unknown / misspelled types raise an error instead of silently becoming strings (dsprrr-47p)", {
+  # Regression: validate_type_string() existed but was never called, so typo'd
+  # types silently fell through to the type_string() default.
+  expect_error(parse_signature("question -> answer: interger"), "Unknown type")
+  expect_error(
+    parse_signature("question -> answer: frobnicate"),
+    "Unknown type"
+  )
+  expect_error(parse_signature("question -> answer: numbr"), "Unknown type")
+
+  # The error suggests the closest valid type.
+  expect_error(parse_signature("q -> a: integ"), "Did you mean")
+})
+
+test_that("valid types still parse after type validation is wired in", {
+  expect_no_error(parse_signature("q -> a: string"))
+  expect_no_error(parse_signature("q -> a: integer"))
+  expect_no_error(parse_signature("q -> a: number"))
+  expect_no_error(parse_signature("q -> a: boolean"))
+  expect_no_error(parse_signature("q -> a: enum('x', 'y')"))
+  expect_no_error(parse_signature("q -> a: array(string)"))
+  # Python-style aliases remain valid.
+  expect_no_error(parse_signature("q -> a: str"))
+  expect_no_error(parse_signature("q -> a: int"))
+})

From 6da38a3e6dc9d6a56c744c35d314c330bb2aaa6c Mon Sep 17 00:00:00 2001
From: James Wade <github@jameshwade.com>
Date: Sun, 14 Jun 2026 18:09:39 -0400
Subject: [PATCH 2/7] =?UTF-8?q?fix:=20optimizer=20correctness=20=E2=80=94?=
 =?UTF-8?q?=20bootstrap=20field=20metrics=20+=20failure-blind=20mean=20(wa?=
 =?UTF-8?q?ve=202)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

BootstrapFewShot harvests demos with field-aware metrics (dsprrr-s3b):
- The single-module compile path passed the bare cell value as `expected`, so
  a field-aware metric (e.g. metric_exact_match(field = "answer"), the
  documented default) errored inside extract_field(), was scored NA, and
  bootstrapped ZERO demos. It now passes the full training row when the metric
  declares a field, mirroring the pipeline path and evaluate(). Tracks
  metric_field separately from output_col so the distinction survives.

evaluate() counts failed rows as 0 in mean_score (dsprrr-tn1):
- mean_score used na.rm = TRUE, so failed rows dropped out of the number that
  drives all optimizer selection — a config correct on the 2 rows it answers
  could outrank one robust across 10. Failures now count as 0 (matching DSPy).
  The raw `scores` vector keeps NA and n_errors still reports failures.

Both fixes have regression tests verified to fail without the change.
Full suite: 3009 passing, no new failures.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 R/evaluate.R                                 | 10 +++++-
 R/teleprompter-bootstrap.R                   | 12 +++++--
 tests/testthat/test-compile.R                | 36 +++++++++++++++++++
 tests/testthat/test-teleprompter-bootstrap.R | 37 ++++++++++++++++++++
 4 files changed, 91 insertions(+), 4 deletions(-)

diff --git a/R/evaluate.R b/R/evaluate.R
index 0d83017f..dfebd6ed 100644
--- a/R/evaluate.R
+++ b/R/evaluate.R
@@ -305,7 +305,15 @@ evaluate.Module <- function(
     evaluated <- epoch_results[[epochs]]$evaluated
   }
 
-  mean_score <- mean(scores, na.rm = TRUE)
+  # Failed rows (score NA) count as 0 in the headline mean, matching DSPy.
+  # Optimizers select on `mean_score`; dropping failures with na.rm would let a
+  # config that errors on the hard examples outrank a robust one. The raw
+  # `scores` vector keeps NA for diagnostics, and `n_errors` reports failures.
+  mean_score <- if (length(scores) == 0) {
+    NA_real_
+  } else {
+    mean(ifelse(is.na(scores), 0, scores))
+  }
   n_evaluated <- sum(!is.na(scores))
   n_errors <- sum(is.na(scores))
 
diff --git a/R/teleprompter-bootstrap.R b/R/teleprompter-bootstrap.R
index cd883546..7d1eb079 100644
--- a/R/teleprompter-bootstrap.R
+++ b/R/teleprompter-bootstrap.R
@@ -213,7 +213,8 @@ compile_bootstrap <- function(
 
   # Determine output column name
   # First try to get field from metric, then fall back to auto-detection
-  output_col <- get_metric_field(teleprompter@metric) %||%
+  metric_field <- get_metric_field(teleprompter@metric)
+  output_col <- metric_field %||%
     find_output_column(trainset, input_names)
 
   if (is.null(output_col)) {
@@ -290,8 +291,13 @@ compile_bootstrap <- function(
         }
       }
 
-      # Get expected output
-      expected <- if (!is.null(output_col) && output_col %in% names(row)) {
+      # Get expected output. Field-aware metrics extract their field from
+      # `expected` themselves (as in evaluate()), so they must receive the full
+      # row, not the bare cell value -- otherwise extract_field() errors, the
+      # metric is scored NA, and no demos are ever harvested.
+      expected <- if (!is.null(metric_field) && metric_field %in% names(row)) {
+        row
+      } else if (!is.null(output_col) && output_col %in% names(row)) {
         row[[output_col]]
       } else {
         NULL
diff --git a/tests/testthat/test-compile.R b/tests/testthat/test-compile.R
index 9b41cad2..35990b06 100644
--- a/tests/testthat/test-compile.R
+++ b/tests/testthat/test-compile.R
@@ -420,3 +420,39 @@ test_that("dsprrr_evaluation print method handles errors", {
   output <- capture.output(print(results), type = "message")
   expect_true(any(grepl("Errors", output, fixed = TRUE)))
 })
+
+test_that("evaluate() counts failed rows as 0 in mean_score (dsprrr-tn1)", {
+  # Regression: mean_score used na.rm = TRUE, so failed rows vanished from the
+  # number that drives optimizer selection. A failing row must count as 0 so a
+  # config that errors on hard examples cannot outrank a robust one.
+  sig <- Signature(
+    inputs = list(input(name = "text", class = S7::class_character)),
+    output_type = ellmer::type_string(),
+    instructions = "Classify"
+  )
+  mod <- module(signature = sig, type = "predict")
+
+  dataset <- data.frame(
+    text = c("a", "b"),
+    expected = c("a", "b")
+  )
+
+  # Scores row 1 as 1, errors on row 2 (-> NA -> counted as 0).
+  half_failing_metric <- function(pred, row) {
+    if (identical(row$text, "b")) {
+      stop("intentional failure")
+    }
+    1
+  }
+
+  results <- evaluate(
+    mod,
+    dataset,
+    metric = half_failing_metric,
+    .llm = mock_llm,
+    .progress = FALSE
+  )
+
+  expect_equal(results$n_errors, 1)
+  expect_equal(results$mean_score, 0.5) # (1 + 0) / 2, not 1.0
+})
diff --git a/tests/testthat/test-teleprompter-bootstrap.R b/tests/testthat/test-teleprompter-bootstrap.R
index a112b646..bb2b66d2 100644
--- a/tests/testthat/test-teleprompter-bootstrap.R
+++ b/tests/testthat/test-teleprompter-bootstrap.R
@@ -446,3 +446,40 @@ test_that("compile_module works with BootstrapFewShot", {
   expect_true(result$is_compiled())
   expect_equal(result$config$teleprompter, "BootstrapFewShot")
 })
+
+test_that("BootstrapFewShot harvests demos with field-aware metrics (dsprrr-s3b)", {
+  # Regression: the single-module path passed the bare cell value as `expected`,
+  # so a field-aware metric (the documented default) errored inside
+  # extract_field(), was scored NA, and bootstrapped ZERO demos. The metric must
+  # receive the full row, mirroring the pipeline path and evaluate().
+  local_reset_cache()
+
+  sig <- Signature(
+    inputs = list(input(name = "question", class = S7::class_character)),
+    output_type = ellmer::type_string(),
+    instructions = "Answer the question"
+  )
+  mod <- module(signature = sig, type = "predict", template = "{question}")
+
+  # Mock LLM always returns the correct structured answer so every row passes.
+  mock_llm <- structure(
+    list(chat_structured = function(prompt, type, ...) list(answer = "yes")),
+    class = "Chat"
+  )
+
+  trainset <- data.frame(
+    question = c("q1", "q2", "q3"),
+    answer = c("yes", "yes", "yes")
+  )
+
+  tp <- BootstrapFewShot(
+    metric = metric_exact_match(field = "answer"),
+    max_labeled_demos = 0L,
+    max_bootstrapped_demos = 2L,
+    seed = 42L
+  )
+
+  result <- compile(tp, mod, trainset, .llm = mock_llm)
+
+  expect_gt(result$config$optimizer$n_bootstrapped_demos, 0)
+})

From c6adf9d1a3739f8a830f736331984d8c47ca28be Mon Sep 17 00:00:00 2001
From: James Wade <github@jameshwade.com>
Date: Sun, 14 Jun 2026 18:15:41 -0400
Subject: [PATCH 3/7] fix: thread rollout_id so retries actually explore
 (dsprrr-pcd)

rollout_id was fully implemented in the cache layer but no caller ever passed
it. With caching enabled (the default), BestOfN/Refine/Assert made byte-identical
attempts: attempt 1 missed the cache, attempts 2..N hit it and returned the same
response, so effective N was always 1.

Thread rollout_id end to end:
- call_llm_request() and PredictModule's private call_llm() / forward() now
  accept rollout_id and pass it to cached_chat_structured(), which already folds
  it into the cache key.
- BestOfN, Refine, and Assert pass rollout_id = i on each attempt, so every
  attempt gets a distinct cache key.

Regression tests (verified to fail without the fix):
- forward() threads rollout_id into the cache key (repeat id hits cache,
  new id misses).
- BestOfN and Refine pass 1,2,3 to successive attempts.

Note: a per-attempt temperature override (also in dsprrr-pcd) and optimizer-side
diversity (SIMBA/GEPA/COPRO/bootstrap teacher) remain follow-ups.

Full suite: 3015 passing, no new failures.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 R/module-assert.R                    | 11 +++++--
 R/module-predict.R                   | 18 +++++++---
 R/module-wrapper.R                   | 24 +++++++++++---
 R/run.R                              |  9 ++++-
 tests/testthat/test-cache.R          | 32 ++++++++++++++++++
 tests/testthat/test-module-wrapper.R | 49 ++++++++++++++++++++++++++++
 6 files changed, 132 insertions(+), 11 deletions(-)

diff --git a/R/module-assert.R b/R/module-assert.R
index 4b9a6b37..2233b043 100644
--- a/R/module-assert.R
+++ b/R/module-assert.R
@@ -151,10 +151,17 @@ AssertModule <- R6::R6Class(
           )
         }
 
-        # Run the wrapped module
+        # Run the wrapped module. rollout_id partitions the cache per attempt so
+        # retries are not served identical cached responses.
         result <- tryCatch(
           {
-            self$module$forward(current_batch, .llm = .llm, trace = FALSE, ...)
+            self$module$forward(
+              current_batch,
+              .llm = .llm,
+              trace = FALSE,
+              rollout_id = i,
+              ...
+            )
           },
           error = function(e) {
             cli::cli_warn(c(
diff --git a/R/module-predict.R b/R/module-predict.R
index 14772a07..53106bd7 100644
--- a/R/module-predict.R
+++ b/R/module-predict.R
@@ -54,7 +54,14 @@ PredictModule <- R6::R6Class(
     #'   if caching globally disabled). If FALSE, bypasses cache for this call.
     #' @param ... Additional arguments
     #' @return Tibble with result, .chat, .metadata columns
-    forward = function(batch, .llm = NULL, trace = TRUE, .cache = NULL, ...) {
+    forward = function(
+      batch,
+      .llm = NULL,
+      trace = TRUE,
+      .cache = NULL,
+      rollout_id = NULL,
+      ...
+    ) {
       # Handle both list and data frame inputs
       if (is.data.frame(batch)) {
         inputs <- as.list(batch[1, , drop = FALSE])
@@ -80,7 +87,8 @@ PredictModule <- R6::R6Class(
             llm = llm,
             request = request,
             output_type = self$signature@output_type,
-            .cache = .cache
+            .cache = .cache,
+            rollout_id = rollout_id
           )
         },
         error = function(e) {
@@ -447,13 +455,15 @@ PredictModule <- R6::R6Class(
       llm,
       request,
       output_type,
-      .cache = NULL
+      .cache = NULL,
+      rollout_id = NULL
     ) {
       call_llm_request(
         llm = llm,
         request = request,
         output_type = output_type,
-        .cache = .cache
+        .cache = .cache,
+        rollout_id = rollout_id
       )
     }
   ),
diff --git a/R/module-wrapper.R b/R/module-wrapper.R
index 60bd2016..75d4d458 100644
--- a/R/module-wrapper.R
+++ b/R/module-wrapper.R
@@ -124,10 +124,18 @@ BestOfNModule <- R6::R6Class(
       total_cost <- 0
 
       for (i in seq_len(self$N)) {
-        # Run the wrapped module
+        # Run the wrapped module. rollout_id partitions the cache per attempt so
+        # attempts 2..N are not served identical cached responses (which would
+        # make BestOfN explore nothing when caching is enabled).
         result <- tryCatch(
           {
-            self$module$forward(batch, .llm = .llm, trace = FALSE, ...)
+            self$module$forward(
+              batch,
+              .llm = .llm,
+              trace = FALSE,
+              rollout_id = i,
+              ...
+            )
           },
           error = function(e) {
             consecutive_failures <<- consecutive_failures + 1
@@ -643,10 +651,18 @@ RefineModule <- R6::R6Class(
           )
         }
 
-        # Run the wrapped module
+        # Run the wrapped module. rollout_id partitions the cache per attempt so
+        # retries are not served identical cached responses even before feedback
+        # changes the prompt.
         result <- tryCatch(
           {
-            self$module$forward(current_batch, .llm = .llm, trace = FALSE, ...)
+            self$module$forward(
+              current_batch,
+              .llm = .llm,
+              trace = FALSE,
+              rollout_id = i,
+              ...
+            )
           },
           error = function(e) {
             consecutive_failures <<- consecutive_failures + 1
diff --git a/R/run.R b/R/run.R
index 07644cd8..c588dfa0 100644
--- a/R/run.R
+++ b/R/run.R
@@ -474,7 +474,13 @@ build_module_request <- function(module, inputs) {
 
 #' Execute a structured ellmer call from a prepared request
 #' @noRd
-call_llm_request <- function(llm, request, output_type, .cache = NULL) {
+call_llm_request <- function(
+  llm,
+  request,
+  output_type,
+  .cache = NULL,
+  rollout_id = NULL
+) {
   if (isTRUE(request$is_multimodal)) {
     llm$chat_structured(
       request$payload,
@@ -486,6 +492,7 @@ call_llm_request <- function(llm, request, output_type, .cache = NULL) {
       llm = llm,
       prompt = request$full_prompt,
       output_type = output_type,
+      rollout_id = rollout_id,
       .cache = .cache
     )
   }
diff --git a/tests/testthat/test-cache.R b/tests/testthat/test-cache.R
index 32d2ab87..08ef1ea0 100644
--- a/tests/testthat/test-cache.R
+++ b/tests/testthat/test-cache.R
@@ -1137,3 +1137,35 @@ test_that(".cache is accepted and validated for non-Predict modules (dsprrr-jup)
   expect_error(run(mod, text = "hi", .cache = "yes"), "must be")
   expect_error(run(mod, text = "hi", .cache = NA), "must be")
 })
+
+test_that("rollout_id threads from forward() into the cache key (dsprrr-pcd)", {
+  # Regression: rollout_id was implemented in the cache layer but no caller
+  # passed it, so BestOfN/Refine retries with caching enabled were served
+  # identical cached responses (effective N = 1).
+  local_reset_cache()
+  configure_cache(enable = TRUE, enable_memory = TRUE, enable_disk = FALSE)
+
+  call_count <- 0
+  mock_llm <- structure(
+    list(chat_structured = function(prompt, type, ...) {
+      call_count <<- call_count + 1
+      list(answer = paste0("resp-", call_count))
+    }),
+    class = "Chat"
+  )
+  sig <- Signature(
+    inputs = list(input(name = "q", class = S7::class_character)),
+    output_type = ellmer::type_string(),
+    instructions = ""
+  )
+  mod <- module(signature = sig, type = "predict", template = "{q}")
+
+  # Same prompt, different rollout_id -> both miss the cache -> 2 real calls.
+  mod$forward(list(q = "x"), .llm = mock_llm, rollout_id = 1)
+  mod$forward(list(q = "x"), .llm = mock_llm, rollout_id = 2)
+  expect_equal(call_count, 2)
+
+  # Repeating a rollout_id hits the cache -> no new call.
+  mod$forward(list(q = "x"), .llm = mock_llm, rollout_id = 1)
+  expect_equal(call_count, 2)
+})
diff --git a/tests/testthat/test-module-wrapper.R b/tests/testthat/test-module-wrapper.R
index 29d0138a..cdb55d72 100644
--- a/tests/testthat/test-module-wrapper.R
+++ b/tests/testthat/test-module-wrapper.R
@@ -603,3 +603,52 @@ test_that("RefineModule print works", {
   expect_invisible(print(wrapper))
   expect_s3_class(wrapper, "RefineModule")
 })
+
+test_that("BestOfN passes a distinct rollout_id to each attempt (dsprrr-pcd)", {
+  seen <- integer(0)
+  mock <- create_mock_module()
+  mock$forward <- function(
+    batch,
+    .llm = NULL,
+    trace = TRUE,
+    rollout_id = NULL,
+    ...
+  ) {
+    seen <<- c(seen, rollout_id %||% NA_integer_)
+    tibble::tibble(
+      output = list(list(answer = "a")),
+      chat = list(NULL),
+      metadata = list(list(total_tokens = 1, cost = 0, model = "mock"))
+    )
+  }
+
+  # threshold above any reward so all N attempts run
+  wrapper <- best_of_n(mock, N = 3, threshold = 99)
+  wrapper$forward(list(question = "q"))
+
+  expect_equal(seen, c(1L, 2L, 3L))
+})
+
+test_that("Refine passes a distinct rollout_id to each attempt (dsprrr-pcd)", {
+  seen <- integer(0)
+  mock <- create_mock_module()
+  mock$forward <- function(
+    batch,
+    .llm = NULL,
+    trace = TRUE,
+    rollout_id = NULL,
+    ...
+  ) {
+    seen <<- c(seen, rollout_id %||% NA_integer_)
+    tibble::tibble(
+      output = list(list(answer = "a")),
+      chat = list(NULL),
+      metadata = list(list(total_tokens = 1, cost = 0, model = "mock"))
+    )
+  }
+
+  wrapper <- refine(mock, N = 3, threshold = 99)
+  wrapper$forward(list(question = "q"))
+
+  expect_equal(seen, c(1L, 2L, 3L))
+})

From 75be2265cd70bd9a92af160676eda1aff1466d15 Mon Sep 17 00:00:00 2001
From: James Wade <github@jameshwade.com>
Date: Sun, 14 Jun 2026 18:20:30 -0400
Subject: [PATCH 4/7] fix: pin_module_config no longer silently destroys
 pipelines (dsprrr-07u)

module_kind() collapsed every unrecognised module class to "predict", so a
PipelineModule (and EnsembleModule, RAGModule, ...) passed the v2-persistence
allow-list and serialised as a bare PredictModule. A pinned pipeline restored
as a single empty PredictModule, silently dropping every step and its
bootstrapped per-step demos.

- module_kind() now falls back to the actual class name instead of "predict",
  so unsupported types are correctly rejected by the persistence allow-list.
- serialize_module_config_v2() detects PipelineModule explicitly and aborts
  with a clear "pipeline persistence not yet supported" message pointing users
  to pin each step individually.

Regression test: pin_module_config() on a pipeline now errors instead of
silently losing data. Full suite: 3016 passing, no new failures.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 R/orchestration.R                   |  9 +++++++++
 R/run.R                             |  6 +++++-
 tests/testthat/test-orchestration.R | 17 +++++++++++++++++
 3 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/R/orchestration.R b/R/orchestration.R
index 62bf2710..8216816a 100644
--- a/R/orchestration.R
+++ b/R/orchestration.R
@@ -159,6 +159,15 @@ restore_module_config <- function(config, signature = NULL) {
 }
 
 serialize_module_config_v2 <- function(module) {
+  if (inherits(module, "PipelineModule")) {
+    cli::cli_abort(c(
+      "Pipeline persistence is not yet supported",
+      "x" = "{.fn pin_module_config} cannot serialise a {.cls PipelineModule}",
+      "i" = "Persisting it here would silently drop every step and its demos",
+      "i" = "Pin each step module individually, or rebuild the pipeline from source"
+    ))
+  }
+
   kind <- module_kind(module)
   supported <- c("predict", "react", "chain_of_thought", "multichain")
   if (!kind %in% supported) {
diff --git a/R/run.R b/R/run.R
index c588dfa0..17ed9bec 100644
--- a/R/run.R
+++ b/R/run.R
@@ -292,7 +292,11 @@ module_kind <- function(module) {
       "ReactModule" = "react",
       "MultiChainComparisonModule" = "multichain",
       "PredictModule" = "predict",
-      "predict"
+      # Fall back to the actual class name rather than silently claiming
+      # "predict" -- otherwise unsupported module types (pipelines, ensembles,
+      # RAG, ...) pass the persistence allow-list and serialize as a bare
+      # PredictModule, losing all of their structure.
+      class(module)[1]
     )
 }
 
diff --git a/tests/testthat/test-orchestration.R b/tests/testthat/test-orchestration.R
index 145e3a50..e7acd2dd 100644
--- a/tests/testthat/test-orchestration.R
+++ b/tests/testthat/test-orchestration.R
@@ -465,3 +465,20 @@ test_that("module config round-trips correctly", {
   expect_true(restored$is_compiled())
   expect_equal(restored$state$best_score, 0.92)
 })
+
+test_that("pin_module_config refuses to silently destroy pipelines (dsprrr-07u)", {
+  skip_if_not_installed("pins")
+
+  m1 <- module(signature("question -> thought"), type = "predict")
+  m2 <- module(signature("thought -> answer"), type = "predict")
+  pipe <- pipeline(m1, m2)
+  board <- pins::board_temp()
+
+  # Regression: module_kind() collapsed unknown classes to "predict", so a
+  # pipeline pinned without error and restored as a single empty PredictModule,
+  # silently losing every step and its bootstrapped demos.
+  expect_error(
+    pin_module_config(board, "pipe", pipe),
+    "[Pp]ipeline"
+  )
+})

From 71116f93789e9a2f01e0b97e22a84e01ffaba132 Mon Sep 17 00:00:00 2001
From: James Wade <github@jameshwade.com>
Date: Mon, 15 Jun 2026 17:55:20 -0400
Subject: [PATCH 5/7] fix: address wave-1 PR review feedback (dsprrr-6qo)

Codex/Copilot review of PR #114:
- Module$run() now validates .cache via validate_cache_arg(), so callers
  reaching the R6 method directly get the same loud error as the run()
  generic instead of a silently-forwarded malformed value.
- default_disk_cache_path() treats an empty DSPRRR_CACHE_PATH as unset
  (nzchar guard) so the disk cache never resolves to "".
- Test teardown restores the cache default with an explicit disk_path,
  independent of LIFO env-var cleanup ordering.
- Reword the .cache NEWS bullet to scope it accurately: it is validated
  and plumbed everywhere, but direct-call modules don't cache yet (dsprrr-aa2).
- Move the BootstrapFewShot (s3b) and evaluate (tn1) NEWS bullets out of
  wave 1 -- their code lands in wave 2 -- so each stacked PR's changelog
  matches its own diff.
- Comment the batch-print test to explain why cli_fmt() is the correct
  capture (cli writes to stdout; capture.output(type="message") would miss it).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 NEWS.md                   | 19 +++++++------------
 R/cache.R                 |  5 ++++-
 R/module-base.R           |  5 +++++
 tests/testthat/setup.R    |  6 +++++-
 tests/testthat/test-run.R |  3 +++
 5 files changed, 24 insertions(+), 14 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 8838be9e..c2d0cc93 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -4,11 +4,6 @@ First development changelog. dsprrr is experimental; the API may change.
 
 ## Bug fixes
 
-* `BootstrapFewShot` now harvests demonstrations when the metric targets a
-  specific output field (e.g. `metric_exact_match(field = "answer")`).
-  Previously the single-module path passed a bare value to the metric, which
-  errored internally and silently bootstrapped zero demos (#dsprrr-s3b).
-
 * Failed items in a batch are now counted and reported. Previously
   `print()` on a batch result always said "All items completed successfully"
   because it looked for the error in the wrong place (#dsprrr-8l0).
@@ -21,13 +16,13 @@ First development changelog. dsprrr is experimental; the API may change.
   example, when the API is unreachable) instead of a cryptic
   "attempt to select less than one element" (#dsprrr-hew).
 
-* `.cache` is now honored by all module types and pipelines, not only
-  `PredictModule`. Previously it was silently dropped (with a spurious
-  "unknown input" warning) for every other module (#dsprrr-jup).
-
-* Evaluation now treats a failed prediction as a score of 0 rather than
-  dropping it from the mean, so optimizers no longer prefer configurations
-  that fail on most of the data (#dsprrr-tn1).
+* `.cache` is now a validated, first-class argument to `run()` and `forward()`
+  for every module type, instead of triggering a spurious "unknown input"
+  warning and being dropped for non-`PredictModule` modules (#dsprrr-jup).
+  `PredictModule` (and the wrapper/few-shot modules that delegate to it) honor
+  it for the structured-output cache; modules that drive the LLM directly
+  (e.g. `RAGModule`, `ReActModule`, `RLMModule`) accept `.cache` but do not yet
+  route their own calls through the cache (#dsprrr-aa2).
 
 ## Internal
 
diff --git a/R/cache.R b/R/cache.R
index e2b990d3..174887ab 100644
--- a/R/cache.R
+++ b/R/cache.R
@@ -19,7 +19,10 @@ NULL
 #' without writing into the current working directory.
 #' @noRd
 default_disk_cache_path <- function() {
-  Sys.getenv("DSPRRR_CACHE_PATH", unset = ".dsprrr_cache")
+  # Treat an empty-string env var the same as unset, so DSPRRR_CACHE_PATH=""
+  # does not resolve the cache to "" (which would write into the working dir).
+  path <- Sys.getenv("DSPRRR_CACHE_PATH", unset = "")
+  if (nzchar(path)) path else ".dsprrr_cache"
 }
 
 #' Configure dsprrr Cache
diff --git a/R/module-base.R b/R/module-base.R
index f097ac64..a1a2a1e5 100644
--- a/R/module-base.R
+++ b/R/module-base.R
@@ -86,6 +86,11 @@ Module <- R6::R6Class(
       .return_format = "simple",
       .cache = NULL
     ) {
+      # Validate .cache here too: callers can reach $run() directly (not only
+      # via the run() generic, which validates separately), so a malformed
+      # value must fail loudly instead of being silently forwarded.
+      validate_cache_arg(.cache)
+
       inputs <- list(...)
 
       # Validate inputs against signature
diff --git a/tests/testthat/setup.R b/tests/testthat/setup.R
index 255eb6a7..fce2c0a7 100644
--- a/tests/testthat/setup.R
+++ b/tests/testthat/setup.R
@@ -26,7 +26,11 @@ withr::defer(
   {
     dsprrr::clear_cache()
     unlink(dsprrr_test_cache_dir, recursive = TRUE)
-    dsprrr::configure_cache()
+    # Restore the production default explicitly. teardown defers run LIFO, so
+    # DSPRRR_CACHE_PATH set by local_envvar() above is still in scope here; a
+    # bare configure_cache() would otherwise re-read it and point disk_path at
+    # the just-unlinked test dir.
+    dsprrr::configure_cache(disk_path = ".dsprrr_cache")
   },
   envir = testthat::teardown_env()
 )
diff --git a/tests/testthat/test-run.R b/tests/testthat/test-run.R
index 259f4116..5016d104 100644
--- a/tests/testthat/test-run.R
+++ b/tests/testthat/test-run.R
@@ -1082,6 +1082,9 @@ test_that("print.dsprrr_batch_result counts failed items (dsprrr-8l0)", {
     class = c("dsprrr_batch_result", "list")
   )
 
+  # print.dsprrr_batch_result emits only via cli, so cli::cli_fmt() captures it
+  # reliably. Do NOT switch to capture.output(type = "message"): cli writes to
+  # stdout, so that pattern silently captures nothing.
   out <- cli::cli_fmt(print(result))
   expect_true(any(grepl("Errors", out)))
   expect_true(any(grepl("1 of 2", out)))

From 36ec81cab1226601d6e11952b2242afcc91dc359 Mon Sep 17 00:00:00 2001
From: James Wade <github@jameshwade.com>
Date: Mon, 15 Jun 2026 18:00:35 -0400
Subject: [PATCH 6/7] fix: nested wrappers no longer crash on rollout_id
 (dsprrr-wx6)

Regression from dsprrr-pcd: BestOfN/Refine/Assert passed rollout_id = i
explicitly while also spreading ..., so a nested wrapper
(refine(best_of_n(mod)), best_of_n(refine(mod)),
with_assertions(best_of_n(mod))) handed the inner forward() rollout_id
twice -> "formal argument matched by multiple actual arguments".

- Each wrapper forward() now takes an explicit rollout_id = NULL formal, so
  an inherited id is consumed out of ... instead of being re-spread.
- New compose_rollout_id() helper (R/utils.R) folds the inherited id into
  the per-attempt index ("2" at top level, "2.1" when nested), keeping every
  (outer, inner) attempt's cache key unique.
- Document @param rollout_id on the wrapper and PredictModule forward()
  methods (addresses Copilot review on PR #115).
- Add regression tests for all three nesting combinations; update the two
  existing pcd tests for the now-character ids.
- Add the s3b, tn1, and pcd/wx6 NEWS bullets here (their code lives in
  wave 2), so PR #115's changelog matches its diff (dsprrr-6qo).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 NEWS.md                              | 15 ++++++
 R/module-assert.R                    | 15 ++++--
 R/module-predict.R                   |  3 ++
 R/module-wrapper.R                   | 29 +++++++++---
 R/utils.R                            | 16 +++++++
 tests/testthat/test-module-wrapper.R | 70 +++++++++++++++++++++++++++-
 6 files changed, 137 insertions(+), 11 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index c2d0cc93..1438d76c 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -24,6 +24,21 @@ First development changelog. dsprrr is experimental; the API may change.
   (e.g. `RAGModule`, `ReActModule`, `RLMModule`) accept `.cache` but do not yet
   route their own calls through the cache (#dsprrr-aa2).
 
+* `BootstrapFewShot` now harvests demonstrations when the metric targets a
+  specific output field (e.g. `metric_exact_match(field = "answer")`).
+  Previously the single-module path passed a bare value to the metric, which
+  errored internally and silently bootstrapped zero demos (#dsprrr-s3b).
+
+* Evaluation now treats a failed prediction as a score of 0 rather than
+  dropping it from the mean, so optimizers no longer prefer configurations
+  that fail on most of the data (#dsprrr-tn1).
+
+* `BestOfN`, `Refine`, and `Assert` now thread a per-attempt `rollout_id` into
+  the cache key, so retries make distinct attempts instead of replaying one
+  cached response when caching is enabled. Nested wrappers (e.g.
+  `refine(best_of_n(mod))`) compose their ids cleanly instead of crashing with
+  a duplicate-argument error (#dsprrr-pcd, #dsprrr-wx6).
+
 ## Internal
 
 * Tests now isolate the on-disk cache to a temporary directory, so running
diff --git a/R/module-assert.R b/R/module-assert.R
index 2233b043..7842610f 100644
--- a/R/module-assert.R
+++ b/R/module-assert.R
@@ -118,9 +118,17 @@ AssertModule <- R6::R6Class(
     #' @param batch Named list or data frame of inputs
     #' @param .llm Optional ellmer chat object
     #' @param trace Logical whether to record trace information
+    #' @param rollout_id Optional id inherited from an enclosing wrapper. Taken
+    #'   as a formal (not via `...`) so nested wrappers don't pass it twice.
     #' @param ... Additional arguments passed to wrapped module
     #' @return Tibble with output, chat, metadata columns
-    forward = function(batch, .llm = NULL, trace = TRUE, ...) {
+    forward = function(
+      batch,
+      .llm = NULL,
+      trace = TRUE,
+      rollout_id = NULL,
+      ...
+    ) {
       # Handle both list and data frame inputs
       if (is.data.frame(batch)) {
         inputs <- as.list(batch[1, , drop = FALSE])
@@ -152,14 +160,15 @@ AssertModule <- R6::R6Class(
         }
 
         # Run the wrapped module. rollout_id partitions the cache per attempt so
-        # retries are not served identical cached responses.
+        # retries are not served identical cached responses. compose_* folds in
+        # any inherited id so nesting stays unique.
         result <- tryCatch(
           {
             self$module$forward(
               current_batch,
               .llm = .llm,
               trace = FALSE,
-              rollout_id = i,
+              rollout_id = compose_rollout_id(rollout_id, i),
               ...
             )
           },
diff --git a/R/module-predict.R b/R/module-predict.R
index 53106bd7..4460c08b 100644
--- a/R/module-predict.R
+++ b/R/module-predict.R
@@ -52,6 +52,9 @@ PredictModule <- R6::R6Class(
     #' @param .cache Logical or NULL. Per-call cache control. If NULL (default),
     #'   uses global cache configuration. If TRUE, attempts to use cache (no effect
     #'   if caching globally disabled). If FALSE, bypasses cache for this call.
+    #' @param rollout_id Optional value for cache partitioning. When set, the
+    #'   structured-output cache key includes it, so retries/attempts (e.g. from
+    #'   BestOfN, Refine, Assert) get distinct keys and explore fresh responses.
     #' @param ... Additional arguments
     #' @return Tibble with result, .chat, .metadata columns
     forward = function(
diff --git a/R/module-wrapper.R b/R/module-wrapper.R
index 75d4d458..71d17614 100644
--- a/R/module-wrapper.R
+++ b/R/module-wrapper.R
@@ -100,9 +100,17 @@ BestOfNModule <- R6::R6Class(
     #' @param batch Named list or data frame of inputs
     #' @param .llm Optional ellmer chat object
     #' @param trace Logical whether to record trace information
+    #' @param rollout_id Optional id inherited from an enclosing wrapper. Taken
+    #'   as a formal (not via `...`) so nested wrappers don't pass it twice.
     #' @param ... Additional arguments passed to wrapped module
     #' @return Tibble with output, chat, metadata columns
-    forward = function(batch, .llm = NULL, trace = TRUE, ...) {
+    forward = function(
+      batch,
+      .llm = NULL,
+      trace = TRUE,
+      rollout_id = NULL,
+      ...
+    ) {
       # Handle both list and data frame inputs
       if (is.data.frame(batch)) {
         inputs <- as.list(batch[1, , drop = FALSE])
@@ -126,14 +134,15 @@ BestOfNModule <- R6::R6Class(
       for (i in seq_len(self$N)) {
         # Run the wrapped module. rollout_id partitions the cache per attempt so
         # attempts 2..N are not served identical cached responses (which would
-        # make BestOfN explore nothing when caching is enabled).
+        # make BestOfN explore nothing when caching is enabled). compose_*
+        # folds in any inherited id so nesting stays unique and consistent.
         result <- tryCatch(
           {
             self$module$forward(
               batch,
               .llm = .llm,
               trace = FALSE,
-              rollout_id = i,
+              rollout_id = compose_rollout_id(rollout_id, i),
               ...
             )
           },
@@ -616,9 +625,17 @@ RefineModule <- R6::R6Class(
     #' @param batch Named list or data frame of inputs
     #' @param .llm Optional ellmer chat object
     #' @param trace Logical whether to record trace information
+    #' @param rollout_id Optional id inherited from an enclosing wrapper. Taken
+    #'   as a formal (not via `...`) so nested wrappers don't pass it twice.
     #' @param ... Additional arguments passed to wrapped module
     #' @return Tibble with output, chat, metadata columns
-    forward = function(batch, .llm = NULL, trace = TRUE, ...) {
+    forward = function(
+      batch,
+      .llm = NULL,
+      trace = TRUE,
+      rollout_id = NULL,
+      ...
+    ) {
       # Handle both list and data frame inputs
       if (is.data.frame(batch)) {
         inputs <- as.list(batch[1, , drop = FALSE])
@@ -653,14 +670,14 @@ RefineModule <- R6::R6Class(
 
         # Run the wrapped module. rollout_id partitions the cache per attempt so
         # retries are not served identical cached responses even before feedback
-        # changes the prompt.
+        # changes the prompt. compose_* folds in any inherited id for nesting.
         result <- tryCatch(
           {
             self$module$forward(
               current_batch,
               .llm = .llm,
               trace = FALSE,
-              rollout_id = i,
+              rollout_id = compose_rollout_id(rollout_id, i),
               ...
             )
           },
diff --git a/R/utils.R b/R/utils.R
index 7424cb3e..72c169bc 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -1,3 +1,19 @@
+#' Compose a per-attempt cache-partition id that nests cleanly
+#'
+#' Retrying wrappers (BestOfN, Refine, Assert) tag each attempt with a
+#' `rollout_id` so retries get distinct cache keys. When wrappers are nested
+#' (e.g. `refine(best_of_n(mod))`) the outer wrapper passes its own id down;
+#' combining it with the inner attempt index keeps every (outer, inner) pair
+#' unique while consuming the inherited id (so it is not matched twice).
+#'
+#' @param rollout_id Inherited id from an enclosing wrapper, or NULL.
+#' @param i Integer attempt index for this wrapper.
+#' @return A character id, e.g. `"2"` at the top level or `"2.1"` when nested.
+#' @noRd
+compose_rollout_id <- function(rollout_id, i) {
+  if (is.null(rollout_id)) as.character(i) else paste0(rollout_id, ".", i)
+}
+
 #' Check if object inherits from ellmer type
 #'
 #' @noRd
diff --git a/tests/testthat/test-module-wrapper.R b/tests/testthat/test-module-wrapper.R
index cdb55d72..17feed57 100644
--- a/tests/testthat/test-module-wrapper.R
+++ b/tests/testthat/test-module-wrapper.R
@@ -626,7 +626,8 @@ test_that("BestOfN passes a distinct rollout_id to each attempt (dsprrr-pcd)", {
   wrapper <- best_of_n(mock, N = 3, threshold = 99)
   wrapper$forward(list(question = "q"))
 
-  expect_equal(seen, c(1L, 2L, 3L))
+  # compose_rollout_id() returns character ids so they nest cleanly
+  expect_equal(seen, c("1", "2", "3"))
 })
 
 test_that("Refine passes a distinct rollout_id to each attempt (dsprrr-pcd)", {
@@ -650,5 +651,70 @@ test_that("Refine passes a distinct rollout_id to each attempt (dsprrr-pcd)", {
   wrapper <- refine(mock, N = 3, threshold = 99)
   wrapper$forward(list(question = "q"))
 
-  expect_equal(seen, c(1L, 2L, 3L))
+  expect_equal(seen, c("1", "2", "3"))
+})
+
+# Regression tests for dsprrr-wx6: nested wrappers used to crash because each
+# wrapper passed rollout_id = i explicitly while also spreading ..., so the
+# inner forward() received rollout_id twice.
+
+# Innermost mock that records every rollout_id it is handed.
+mock_recording_rollouts <- function(seen_env) {
+  mock <- create_mock_module()
+  mock$forward <- function(
+    batch,
+    .llm = NULL,
+    trace = TRUE,
+    rollout_id = NULL,
+    ...
+  ) {
+    seen_env$ids <- c(seen_env$ids, rollout_id %||% NA_character_)
+    tibble::tibble(
+      output = list(list(answer = "a")),
+      chat = list(NULL),
+      metadata = list(list(total_tokens = 1, cost = 0, model = "mock"))
+    )
+  }
+  mock
+}
+
+test_that("refine(best_of_n(mod)) nests without a duplicate-argument crash (dsprrr-wx6)", {
+  seen <- new.env()
+  seen$ids <- character(0)
+  nested <- refine(
+    best_of_n(mock_recording_rollouts(seen), N = 2, threshold = 99),
+    N = 2,
+    threshold = 99
+  )
+
+  expect_no_error(nested$forward(list(question = "q")))
+  # 2 outer x 2 inner attempts, each id unique and hierarchical
+  expect_equal(sort(seen$ids), c("1.1", "1.2", "2.1", "2.2"))
+})
+
+test_that("best_of_n(refine(mod)) nests without a crash (dsprrr-wx6)", {
+  seen <- new.env()
+  seen$ids <- character(0)
+  nested <- best_of_n(
+    refine(mock_recording_rollouts(seen), N = 2, threshold = 99),
+    N = 2,
+    threshold = 99
+  )
+
+  expect_no_error(nested$forward(list(question = "q")))
+  expect_equal(sort(seen$ids), c("1.1", "1.2", "2.1", "2.2"))
+})
+
+test_that("with_assertions(best_of_n(mod)) nests without a crash (dsprrr-wx6)", {
+  seen <- new.env()
+  seen$ids <- character(0)
+  nested <- with_assertions(
+    best_of_n(mock_recording_rollouts(seen), N = 2, threshold = 99),
+    assertions = list(assert_output(~TRUE, "always passes")),
+    max_retries = 1L
+  )
+
+  expect_no_error(nested$forward(list(question = "q")))
+  # assertion passes on the first outer attempt; inner best_of_n runs twice
+  expect_equal(sort(seen$ids), c("1.1", "1.2"))
 })

From 43c4e1160751b44fb276e4c7468d00c9a54b89c8 Mon Sep 17 00:00:00 2001
From: James Wade <github@jameshwade.com>
Date: Mon, 15 Jun 2026 18:01:11 -0400
Subject: [PATCH 7/7] docs: add pin_module_config NEWS bullet for wave 3
 (dsprrr-6qo)

The pipeline-persistence guard (dsprrr-07u) had no changelog entry. Add it
here so PR #116's NEWS matches its own diff.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 NEWS.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/NEWS.md b/NEWS.md
index 1438d76c..52f88c88 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -39,6 +39,10 @@ First development changelog. dsprrr is experimental; the API may change.
   `refine(best_of_n(mod))`) compose their ids cleanly instead of crashing with
   a duplicate-argument error (#dsprrr-pcd, #dsprrr-wx6).
 
+* `pin_module_config()` now errors clearly for pipelines and other unsupported
+  module types instead of silently serialising them as an empty `PredictModule`
+  and dropping every step on restore (#dsprrr-07u).
+
 ## Internal
 
 * Tests now isolate the on-disk cache to a temporary directory, so running