From 95e2756272d2ecfff150e2939a85742ed98530a9 Mon Sep 17 00:00:00 2001 From: James Wade Date: Sun, 14 Jun 2026 18:00:25 -0400 Subject: [PATCH 1/7] fix: release-unblock + silent-failure correctness fixes (wave 1) Highest-value, lowest-risk findings from the codebase audit. Release readiness (dsprrr-w0e): - Lower the ellmer requirement to the released >= 0.4.1 and drop the CRAN-forbidden "Remotes: tidyverse/ellmer" entry. - Remove unused Suggests: future and lifecycle. - Add NEWS.md. Correctness: - print.dsprrr_batch_result now counts failed items. It read the error from item$error / the output object, but create_error_result() stores it in metadata$error with output = NA, so failed batches always printed "All items completed successfully" (dsprrr-8l0). - Unknown or misspelled signature types now error with a closest-match suggestion instead of silently becoming string fields. validate_type_string() was fully written but never called (dsprrr-47p). - module_trials() warns and returns an inspectable summary when every trial fails, instead of a cryptic "attempt to select less than one element" (dsprrr-hew). - .cache is accepted and validated by all module types and pipelines, not only PredictModule, where it previously fell into ... and was dropped with a spurious "unknown input" warning (dsprrr-jup). Test isolation: - Add tests/testthat/setup.R plus a DSPRRR_CACHE_PATH env var so the suite no longer writes the disk cache into the package source tree (dsprrr-63v). Co-Authored-By: Claude Opus 4.8 (1M context) --- DESCRIPTION | 6 +--- NEWS.md | 43 +++++++++++++++++++++++ R/cache.R | 14 ++++++-- R/module-base.R | 12 +++++-- R/optimize.R | 16 +++++++++ R/run.R | 47 ++++++++++++++++++-------- R/signature-parser.R | 11 ++++++ man/configure_cache.Rd | 2 +- tests/testthat/setup.R | 32 ++++++++++++++++++ tests/testthat/test-cache.R | 14 ++++++++ tests/testthat/test-optimize.R | 19 +++++++++++ tests/testthat/test-run.R | 34 +++++++++++++++++++ tests/testthat/test-signature-parser.R | 26 ++++++++++++++ 13 files changed, 250 insertions(+), 26 deletions(-) create mode 100644 NEWS.md create mode 100644 tests/testthat/setup.R diff --git a/DESCRIPTION b/DESCRIPTION index 9f0dc788..0966cff7 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -26,7 +26,7 @@ Imports: cachem, cli, digest, - ellmer (>= 0.4.0.9000), + ellmer (>= 0.4.1), glue, jsonlite, methods, @@ -38,8 +38,6 @@ Imports: tibble, withr Suggests: - future, - lifecycle, callr, coro, dials, @@ -60,8 +58,6 @@ Suggests: vcr (>= 2.0.0), vitals, yardstick -Remotes: - tidyverse/ellmer Collate: 'accessors.R' 'assertion-helpers.R' diff --git a/NEWS.md b/NEWS.md new file mode 100644 index 00000000..8838be9e --- /dev/null +++ b/NEWS.md @@ -0,0 +1,43 @@ +# dsprrr (development version) + +First development changelog. dsprrr is experimental; the API may change. + +## Bug fixes + +* `BootstrapFewShot` now harvests demonstrations when the metric targets a + specific output field (e.g. `metric_exact_match(field = "answer")`). + Previously the single-module path passed a bare value to the metric, which + errored internally and silently bootstrapped zero demos (#dsprrr-s3b). + +* Failed items in a batch are now counted and reported. Previously + `print()` on a batch result always said "All items completed successfully" + because it looked for the error in the wrong place (#dsprrr-8l0). + +* Unknown or misspelled types in a signature string (e.g. `"q -> a: interger"`) + now raise an error suggesting the closest valid type, instead of silently + becoming a string field (#dsprrr-47p). + +* `optimize_grid()` now gives a clear error when every trial fails (for + example, when the API is unreachable) instead of a cryptic + "attempt to select less than one element" (#dsprrr-hew). + +* `.cache` is now honored by all module types and pipelines, not only + `PredictModule`. Previously it was silently dropped (with a spurious + "unknown input" warning) for every other module (#dsprrr-jup). + +* Evaluation now treats a failed prediction as a score of 0 rather than + dropping it from the mean, so optimizers no longer prefer configurations + that fail on most of the data (#dsprrr-tn1). + +## Internal + +* Tests now isolate the on-disk cache to a temporary directory, so running + the test suite no longer writes into the package source tree (#dsprrr-63v). + +## Packaging + +* Lowered the minimum `ellmer` requirement to the released `>= 0.4.1` and + removed the `Remotes:` entry, so the package installs from CRAN-released + dependencies (#dsprrr-w0e). + +* Removed unused `Suggests`: `future` and `lifecycle`. diff --git a/R/cache.R b/R/cache.R index 2c35605a..e2b990d3 100644 --- a/R/cache.R +++ b/R/cache.R @@ -12,6 +12,16 @@ NULL # ── Cache Configuration ────────────────────────────────────────────────────── +#' Default on-disk cache directory +#' +#' Honors the `DSPRRR_CACHE_PATH` environment variable so the test suite (and +#' users with read-only working directories) can redirect the disk cache +#' without writing into the current working directory. +#' @noRd +default_disk_cache_path <- function() { + Sys.getenv("DSPRRR_CACHE_PATH", unset = ".dsprrr_cache") +} + #' Configure dsprrr Cache #' #' @description @@ -69,7 +79,7 @@ configure_cache <- function( enable = TRUE, enable_memory = TRUE, enable_disk = TRUE, - disk_path = ".dsprrr_cache", + disk_path = default_disk_cache_path(), memory_max_entries = 1000L, disk_max_size = 500 * 1024^2, disk_max_age = Inf @@ -245,7 +255,7 @@ get_cache_config <- function() { enable = !env_disabled, enable_memory = TRUE, enable_disk = TRUE, - disk_path = ".dsprrr_cache", + disk_path = default_disk_cache_path(), memory_max_entries = 1000L, disk_max_size = 500 * 1024^2, disk_max_age = Inf diff --git a/R/module-base.R b/R/module-base.R index f652f756..f097ac64 100644 --- a/R/module-base.R +++ b/R/module-base.R @@ -83,7 +83,8 @@ Module <- R6::R6Class( .verbose = FALSE, .parallel = FALSE, .progress = TRUE, - .return_format = "simple" + .return_format = "simple", + .cache = NULL ) { inputs <- list(...) @@ -131,7 +132,12 @@ Module <- R6::R6Class( results <- vector("list", max_length) for (i in seq_len(max_length)) { input_set <- lapply(inputs, `[[`, i) - result <- self$forward(input_set, .llm = .llm, trace = TRUE) + result <- self$forward( + input_set, + .llm = .llm, + trace = TRUE, + .cache = .cache + ) if (.return_format == "simple") { results[[i]] <- result$output[[1]] @@ -148,7 +154,7 @@ Module <- R6::R6Class( } # Single input processing - result <- self$forward(inputs, .llm = .llm, trace = TRUE) + result <- self$forward(inputs, .llm = .llm, trace = TRUE, .cache = .cache) if (.return_format == "simple") { return(result$output[[1]]) diff --git a/R/optimize.R b/R/optimize.R index ca96401f..f041d13a 100644 --- a/R/optimize.R +++ b/R/optimize.R @@ -458,6 +458,22 @@ module_trials <- function( } scores <- trials$score + if (all(is.na(scores))) { + cli::cli_warn(c( + "All {nrow(trials)} optimization trial{?s} failed (no valid scores).", + "i" = "Check that the LLM is reachable and the metric returns numeric scores.", + "i" = "Inspect the {.field trials} list-column for per-trial details." + )) + return(tibble::tibble( + n_trials = nrow(trials), + best_trial = NA_integer_, + best_score = NA_real_, + mean_score = NA_real_, + std_error = NA_real_, + best_params = list(NULL), + trials = list(trials) + )) + } best_idx <- if (objective == "maximize") { which.max(scores) } else { diff --git a/R/run.R b/R/run.R index 29f1273d..07644cd8 100644 --- a/R/run.R +++ b/R/run.R @@ -72,6 +72,27 @@ run <- function(module, ...) { UseMethod("run") } +#' Validate the `.cache` argument +#' +#' Shared by [run()] methods so every module type rejects malformed `.cache` +#' values the same way. +#' @noRd +validate_cache_arg <- function(.cache) { + if (!is.null(.cache)) { + if (!is.logical(.cache) || length(.cache) != 1 || is.na(.cache)) { + cache_value <- .cache + cli::cli_abort(c( + "{.arg .cache} must be {.code TRUE}, {.code FALSE}, or {.code NULL}", + "x" = "You provided: {.cls {class(cache_value)}} with value {.val {cache_value}}", + "i" = "{.code TRUE} attempts to use cache (if available)", + "i" = "{.code FALSE} bypasses cache for this call", + "i" = "{.code NULL} uses global cache configuration (default)" + )) + } + } + invisible(.cache) +} + #' @export run.Module <- function( module, @@ -82,9 +103,11 @@ run.Module <- function( .parallel_method = c("ellmer", "mirai"), .progress = TRUE, .return_format = "simple", - .show_prompt = FALSE + .show_prompt = FALSE, + .cache = NULL ) { .parallel_method <- match.arg(.parallel_method) + validate_cache_arg(.cache) # Show prompt preview if requested if (.show_prompt) { @@ -98,7 +121,8 @@ run.Module <- function( .verbose = .verbose, .parallel = .parallel, .progress = .progress, - .return_format = .return_format + .return_format = .return_format, + .cache = .cache ) } @@ -118,18 +142,7 @@ run.PredictModule <- function( .parallel_method <- match.arg(.parallel_method) # Validate .cache parameter - if (!is.null(.cache)) { - if (!is.logical(.cache) || length(.cache) != 1 || is.na(.cache)) { - cache_value <- .cache - cli::cli_abort(c( - "{.arg .cache} must be {.code TRUE}, {.code FALSE}, or {.code NULL}", - "x" = "You provided: {.cls {class(cache_value)}} with value {.val {cache_value}}", - "i" = "{.code TRUE} attempts to use cache (if available)", - "i" = "{.code FALSE} bypasses cache for this call", - "i" = "{.code NULL} uses global cache configuration (default)" - )) - } - } + validate_cache_arg(.cache) # Show prompt preview if requested if (.show_prompt) { @@ -1499,7 +1512,11 @@ print.dsprrr_batch_result <- function(x, ...) { n_errors <- sum(vapply( x, function(item) { - isTRUE(item$error) || inherits(item$output, "error") + # Errors from create_error_result() are stored in metadata$error with + # output = NA; also tolerate other shapes defensively. + !is.null(item$metadata$error) || + isTRUE(item$error) || + inherits(item$output, "error") }, logical(1) )) diff --git a/R/signature-parser.R b/R/signature-parser.R index 0ad63c63..a5e5f455 100644 --- a/R/signature-parser.R +++ b/R/signature-parser.R @@ -393,6 +393,10 @@ parse_type_string <- function(type_str, field_name = NULL) { # Handle common type patterns type_str <- trimws(type_str) + # Catch unknown / misspelled simple types early with a helpful error, + # rather than silently falling through to the type_string() default below. + validate_type_string(type_str) + # Handle Optional types if (grepl("^Optional\\[", type_str)) { inner_type_str <- sub("^Optional\\[(.*)\\]$", "\\1", type_str) @@ -718,8 +722,15 @@ validate_type_string <- function(type_str, context = "output") { if (!is_known && !grepl("\\[|\\(", type_str)) { # Unknown simple type + suggestion <- find_closest_match( + type_str, + c(known_types, known_constructors) + ) cli::cli_abort(c( "Unknown type: {.val {type_str}}", + if (!is.null(suggestion)) { + c("i" = "Did you mean {.code {suggestion}}?") + }, "i" = "Available simple types: {.code string}, {.code number}, {.code integer}, {.code boolean}", "i" = "Available complex types: {.code enum('a', 'b')}, {.code array(string)}, {.code object}" )) diff --git a/man/configure_cache.Rd b/man/configure_cache.Rd index 43441a6a..ec863fe6 100644 --- a/man/configure_cache.Rd +++ b/man/configure_cache.Rd @@ -8,7 +8,7 @@ configure_cache( enable = TRUE, enable_memory = TRUE, enable_disk = TRUE, - disk_path = ".dsprrr_cache", + disk_path = default_disk_cache_path(), memory_max_entries = 1000L, disk_max_size = 500 * 1024^2, disk_max_age = Inf diff --git a/tests/testthat/setup.R b/tests/testthat/setup.R new file mode 100644 index 00000000..255eb6a7 --- /dev/null +++ b/tests/testthat/setup.R @@ -0,0 +1,32 @@ +# Isolate the on-disk cache during the test suite. +# +# Without this, the default disk cache writes into the working directory +# (tests/testthat/.dsprrr_cache during R CMD check), which pollutes the package +# source tree and leaks state across runs -- a documented source of flaky tests. +# +# We redirect the cache to a temporary directory via DSPRRR_CACHE_PATH (so even +# bare configure_cache() calls land there) and disable the disk tier by default. +# Cache-behavior tests that need disk caching opt back in with their own +# tempdirs, which take precedence over this default. + +dsprrr_test_cache_dir <- file.path(tempdir(), "dsprrr-test-cache") +dir.create(dsprrr_test_cache_dir, showWarnings = FALSE, recursive = TRUE) + +withr::local_envvar( + DSPRRR_CACHE_PATH = dsprrr_test_cache_dir, + .local_envir = testthat::teardown_env() +) + +dsprrr::configure_cache( + enable_disk = FALSE, + disk_path = dsprrr_test_cache_dir +) + +withr::defer( + { + dsprrr::clear_cache() + unlink(dsprrr_test_cache_dir, recursive = TRUE) + dsprrr::configure_cache() + }, + envir = testthat::teardown_env() +) diff --git a/tests/testthat/test-cache.R b/tests/testthat/test-cache.R index 424a066a..32d2ab87 100644 --- a/tests/testthat/test-cache.R +++ b/tests/testthat/test-cache.R @@ -1123,3 +1123,17 @@ test_that("dsprrr_sitrep shows disabled cache", { expect_false(result$cache_enabled) }) + +test_that(".cache is accepted and validated for non-Predict modules (dsprrr-jup)", { + # Regression: only run.PredictModule declared .cache; for every other module + # type and pipelines it fell into ... -> treated as an unknown signature input + # (spurious warning) and dropped. + mod <- module_fn("text -> answer", function(text) paste("Echo:", text)) + + expect_no_warning(res <- run(mod, text = "hi", .cache = FALSE)) + expect_equal(res$answer, "Echo: hi") + + # Invalid .cache values are rejected the same way as for PredictModule. + expect_error(run(mod, text = "hi", .cache = "yes"), "must be") + expect_error(run(mod, text = "hi", .cache = NA), "must be") +}) diff --git a/tests/testthat/test-optimize.R b/tests/testthat/test-optimize.R index aefe9c58..da486f72 100644 --- a/tests/testthat/test-optimize.R +++ b/tests/testthat/test-optimize.R @@ -248,3 +248,22 @@ test_that("module_metric_summary handles modules without trials", { metrics <- module_metrics(mod) expect_equal(nrow(metrics), 0) }) + +test_that("module_trials() warns and returns an inspectable summary when all trials fail (dsprrr-hew)", { + # Regression: which.max() on all-NA scores returns integer(0), so indexing + # threw "attempt to select less than one element" instead of a clear message. + mod <- module(signature("question -> answer"), type = "predict") + mod$state$trials <- tibble::tibble( + trial_id = 1:3, + score = NA_real_, + parameters = list(list(a = 1), list(a = 2), list(a = 3)) + ) + + expect_warning( + summary <- module_trials(mod), + "trial.*failed" + ) + expect_equal(summary$n_trials, 3L) + expect_true(is.na(summary$best_score)) + expect_true(is.na(summary$best_trial)) +}) diff --git a/tests/testthat/test-run.R b/tests/testthat/test-run.R index 3f99514e..259f4116 100644 --- a/tests/testthat/test-run.R +++ b/tests/testthat/test-run.R @@ -1065,3 +1065,37 @@ test_that("mock_batch_chat handles character response directly", { expect_true(inherits(result, "Chat")) }) + +test_that("print.dsprrr_batch_result counts failed items (dsprrr-8l0)", { + # Regression: the error predicate read item$error / inherits(item$output, + # "error"), but create_error_result() stores the message in metadata$error + # with output = NA, so n_errors was always 0 and failures printed as success. + result <- structure( + list( + list(output = "ok", chat = NULL, metadata = list()), + list( + output = NA, + chat = NULL, + metadata = list(error = "intentional failure", batch_index = 2L) + ) + ), + class = c("dsprrr_batch_result", "list") + ) + + out <- cli::cli_fmt(print(result)) + expect_true(any(grepl("Errors", out))) + expect_true(any(grepl("1 of 2", out))) + expect_false(any(grepl("All items completed successfully", out))) +}) + +test_that("print.dsprrr_batch_result reports success when there are no errors", { + result <- structure( + list( + list(output = "a", chat = NULL, metadata = list()), + list(output = "b", chat = NULL, metadata = list()) + ), + class = c("dsprrr_batch_result", "list") + ) + out <- cli::cli_fmt(print(result)) + expect_true(any(grepl("All items completed successfully", out))) +}) diff --git a/tests/testthat/test-signature-parser.R b/tests/testthat/test-signature-parser.R index b7302775..2613d00a 100644 --- a/tests/testthat/test-signature-parser.R +++ b/tests/testthat/test-signature-parser.R @@ -356,3 +356,29 @@ test_that("find_closest_match works correctly", { # No close match expect_null(dsprrr:::find_closest_match("zzzzz", candidates)) }) + +test_that("unknown / misspelled types raise an error instead of silently becoming strings (dsprrr-47p)", { + # Regression: validate_type_string() existed but was never called, so typo'd + # types silently fell through to the type_string() default. + expect_error(parse_signature("question -> answer: interger"), "Unknown type") + expect_error( + parse_signature("question -> answer: frobnicate"), + "Unknown type" + ) + expect_error(parse_signature("question -> answer: numbr"), "Unknown type") + + # The error suggests the closest valid type. + expect_error(parse_signature("q -> a: integ"), "Did you mean") +}) + +test_that("valid types still parse after type validation is wired in", { + expect_no_error(parse_signature("q -> a: string")) + expect_no_error(parse_signature("q -> a: integer")) + expect_no_error(parse_signature("q -> a: number")) + expect_no_error(parse_signature("q -> a: boolean")) + expect_no_error(parse_signature("q -> a: enum('x', 'y')")) + expect_no_error(parse_signature("q -> a: array(string)")) + # Python-style aliases remain valid. + expect_no_error(parse_signature("q -> a: str")) + expect_no_error(parse_signature("q -> a: int")) +}) From 6da38a3e6dc9d6a56c744c35d314c330bb2aaa6c Mon Sep 17 00:00:00 2001 From: James Wade Date: Sun, 14 Jun 2026 18:09:39 -0400 Subject: [PATCH 2/7] =?UTF-8?q?fix:=20optimizer=20correctness=20=E2=80=94?= =?UTF-8?q?=20bootstrap=20field=20metrics=20+=20failure-blind=20mean=20(wa?= =?UTF-8?q?ve=202)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BootstrapFewShot harvests demos with field-aware metrics (dsprrr-s3b): - The single-module compile path passed the bare cell value as `expected`, so a field-aware metric (e.g. metric_exact_match(field = "answer"), the documented default) errored inside extract_field(), was scored NA, and bootstrapped ZERO demos. It now passes the full training row when the metric declares a field, mirroring the pipeline path and evaluate(). Tracks metric_field separately from output_col so the distinction survives. evaluate() counts failed rows as 0 in mean_score (dsprrr-tn1): - mean_score used na.rm = TRUE, so failed rows dropped out of the number that drives all optimizer selection — a config correct on the 2 rows it answers could outrank one robust across 10. Failures now count as 0 (matching DSPy). The raw `scores` vector keeps NA and n_errors still reports failures. Both fixes have regression tests verified to fail without the change. Full suite: 3009 passing, no new failures. Co-Authored-By: Claude Opus 4.8 (1M context) --- R/evaluate.R | 10 +++++- R/teleprompter-bootstrap.R | 12 +++++-- tests/testthat/test-compile.R | 36 +++++++++++++++++++ tests/testthat/test-teleprompter-bootstrap.R | 37 ++++++++++++++++++++ 4 files changed, 91 insertions(+), 4 deletions(-) diff --git a/R/evaluate.R b/R/evaluate.R index 0d83017f..dfebd6ed 100644 --- a/R/evaluate.R +++ b/R/evaluate.R @@ -305,7 +305,15 @@ evaluate.Module <- function( evaluated <- epoch_results[[epochs]]$evaluated } - mean_score <- mean(scores, na.rm = TRUE) + # Failed rows (score NA) count as 0 in the headline mean, matching DSPy. + # Optimizers select on `mean_score`; dropping failures with na.rm would let a + # config that errors on the hard examples outrank a robust one. The raw + # `scores` vector keeps NA for diagnostics, and `n_errors` reports failures. + mean_score <- if (length(scores) == 0) { + NA_real_ + } else { + mean(ifelse(is.na(scores), 0, scores)) + } n_evaluated <- sum(!is.na(scores)) n_errors <- sum(is.na(scores)) diff --git a/R/teleprompter-bootstrap.R b/R/teleprompter-bootstrap.R index cd883546..7d1eb079 100644 --- a/R/teleprompter-bootstrap.R +++ b/R/teleprompter-bootstrap.R @@ -213,7 +213,8 @@ compile_bootstrap <- function( # Determine output column name # First try to get field from metric, then fall back to auto-detection - output_col <- get_metric_field(teleprompter@metric) %||% + metric_field <- get_metric_field(teleprompter@metric) + output_col <- metric_field %||% find_output_column(trainset, input_names) if (is.null(output_col)) { @@ -290,8 +291,13 @@ compile_bootstrap <- function( } } - # Get expected output - expected <- if (!is.null(output_col) && output_col %in% names(row)) { + # Get expected output. Field-aware metrics extract their field from + # `expected` themselves (as in evaluate()), so they must receive the full + # row, not the bare cell value -- otherwise extract_field() errors, the + # metric is scored NA, and no demos are ever harvested. + expected <- if (!is.null(metric_field) && metric_field %in% names(row)) { + row + } else if (!is.null(output_col) && output_col %in% names(row)) { row[[output_col]] } else { NULL diff --git a/tests/testthat/test-compile.R b/tests/testthat/test-compile.R index 9b41cad2..35990b06 100644 --- a/tests/testthat/test-compile.R +++ b/tests/testthat/test-compile.R @@ -420,3 +420,39 @@ test_that("dsprrr_evaluation print method handles errors", { output <- capture.output(print(results), type = "message") expect_true(any(grepl("Errors", output, fixed = TRUE))) }) + +test_that("evaluate() counts failed rows as 0 in mean_score (dsprrr-tn1)", { + # Regression: mean_score used na.rm = TRUE, so failed rows vanished from the + # number that drives optimizer selection. A failing row must count as 0 so a + # config that errors on hard examples cannot outrank a robust one. + sig <- Signature( + inputs = list(input(name = "text", class = S7::class_character)), + output_type = ellmer::type_string(), + instructions = "Classify" + ) + mod <- module(signature = sig, type = "predict") + + dataset <- data.frame( + text = c("a", "b"), + expected = c("a", "b") + ) + + # Scores row 1 as 1, errors on row 2 (-> NA -> counted as 0). + half_failing_metric <- function(pred, row) { + if (identical(row$text, "b")) { + stop("intentional failure") + } + 1 + } + + results <- evaluate( + mod, + dataset, + metric = half_failing_metric, + .llm = mock_llm, + .progress = FALSE + ) + + expect_equal(results$n_errors, 1) + expect_equal(results$mean_score, 0.5) # (1 + 0) / 2, not 1.0 +}) diff --git a/tests/testthat/test-teleprompter-bootstrap.R b/tests/testthat/test-teleprompter-bootstrap.R index a112b646..bb2b66d2 100644 --- a/tests/testthat/test-teleprompter-bootstrap.R +++ b/tests/testthat/test-teleprompter-bootstrap.R @@ -446,3 +446,40 @@ test_that("compile_module works with BootstrapFewShot", { expect_true(result$is_compiled()) expect_equal(result$config$teleprompter, "BootstrapFewShot") }) + +test_that("BootstrapFewShot harvests demos with field-aware metrics (dsprrr-s3b)", { + # Regression: the single-module path passed the bare cell value as `expected`, + # so a field-aware metric (the documented default) errored inside + # extract_field(), was scored NA, and bootstrapped ZERO demos. The metric must + # receive the full row, mirroring the pipeline path and evaluate(). + local_reset_cache() + + sig <- Signature( + inputs = list(input(name = "question", class = S7::class_character)), + output_type = ellmer::type_string(), + instructions = "Answer the question" + ) + mod <- module(signature = sig, type = "predict", template = "{question}") + + # Mock LLM always returns the correct structured answer so every row passes. + mock_llm <- structure( + list(chat_structured = function(prompt, type, ...) list(answer = "yes")), + class = "Chat" + ) + + trainset <- data.frame( + question = c("q1", "q2", "q3"), + answer = c("yes", "yes", "yes") + ) + + tp <- BootstrapFewShot( + metric = metric_exact_match(field = "answer"), + max_labeled_demos = 0L, + max_bootstrapped_demos = 2L, + seed = 42L + ) + + result <- compile(tp, mod, trainset, .llm = mock_llm) + + expect_gt(result$config$optimizer$n_bootstrapped_demos, 0) +}) From c6adf9d1a3739f8a830f736331984d8c47ca28be Mon Sep 17 00:00:00 2001 From: James Wade Date: Sun, 14 Jun 2026 18:15:41 -0400 Subject: [PATCH 3/7] fix: thread rollout_id so retries actually explore (dsprrr-pcd) rollout_id was fully implemented in the cache layer but no caller ever passed it. With caching enabled (the default), BestOfN/Refine/Assert made byte-identical attempts: attempt 1 missed the cache, attempts 2..N hit it and returned the same response, so effective N was always 1. Thread rollout_id end to end: - call_llm_request() and PredictModule's private call_llm() / forward() now accept rollout_id and pass it to cached_chat_structured(), which already folds it into the cache key. - BestOfN, Refine, and Assert pass rollout_id = i on each attempt, so every attempt gets a distinct cache key. Regression tests (verified to fail without the fix): - forward() threads rollout_id into the cache key (repeat id hits cache, new id misses). - BestOfN and Refine pass 1,2,3 to successive attempts. Note: a per-attempt temperature override (also in dsprrr-pcd) and optimizer-side diversity (SIMBA/GEPA/COPRO/bootstrap teacher) remain follow-ups. Full suite: 3015 passing, no new failures. Co-Authored-By: Claude Opus 4.8 (1M context) --- R/module-assert.R | 11 +++++-- R/module-predict.R | 18 +++++++--- R/module-wrapper.R | 24 +++++++++++--- R/run.R | 9 ++++- tests/testthat/test-cache.R | 32 ++++++++++++++++++ tests/testthat/test-module-wrapper.R | 49 ++++++++++++++++++++++++++++ 6 files changed, 132 insertions(+), 11 deletions(-) diff --git a/R/module-assert.R b/R/module-assert.R index 4b9a6b37..2233b043 100644 --- a/R/module-assert.R +++ b/R/module-assert.R @@ -151,10 +151,17 @@ AssertModule <- R6::R6Class( ) } - # Run the wrapped module + # Run the wrapped module. rollout_id partitions the cache per attempt so + # retries are not served identical cached responses. result <- tryCatch( { - self$module$forward(current_batch, .llm = .llm, trace = FALSE, ...) + self$module$forward( + current_batch, + .llm = .llm, + trace = FALSE, + rollout_id = i, + ... + ) }, error = function(e) { cli::cli_warn(c( diff --git a/R/module-predict.R b/R/module-predict.R index 14772a07..53106bd7 100644 --- a/R/module-predict.R +++ b/R/module-predict.R @@ -54,7 +54,14 @@ PredictModule <- R6::R6Class( #' if caching globally disabled). If FALSE, bypasses cache for this call. #' @param ... Additional arguments #' @return Tibble with result, .chat, .metadata columns - forward = function(batch, .llm = NULL, trace = TRUE, .cache = NULL, ...) { + forward = function( + batch, + .llm = NULL, + trace = TRUE, + .cache = NULL, + rollout_id = NULL, + ... + ) { # Handle both list and data frame inputs if (is.data.frame(batch)) { inputs <- as.list(batch[1, , drop = FALSE]) @@ -80,7 +87,8 @@ PredictModule <- R6::R6Class( llm = llm, request = request, output_type = self$signature@output_type, - .cache = .cache + .cache = .cache, + rollout_id = rollout_id ) }, error = function(e) { @@ -447,13 +455,15 @@ PredictModule <- R6::R6Class( llm, request, output_type, - .cache = NULL + .cache = NULL, + rollout_id = NULL ) { call_llm_request( llm = llm, request = request, output_type = output_type, - .cache = .cache + .cache = .cache, + rollout_id = rollout_id ) } ), diff --git a/R/module-wrapper.R b/R/module-wrapper.R index 60bd2016..75d4d458 100644 --- a/R/module-wrapper.R +++ b/R/module-wrapper.R @@ -124,10 +124,18 @@ BestOfNModule <- R6::R6Class( total_cost <- 0 for (i in seq_len(self$N)) { - # Run the wrapped module + # Run the wrapped module. rollout_id partitions the cache per attempt so + # attempts 2..N are not served identical cached responses (which would + # make BestOfN explore nothing when caching is enabled). result <- tryCatch( { - self$module$forward(batch, .llm = .llm, trace = FALSE, ...) + self$module$forward( + batch, + .llm = .llm, + trace = FALSE, + rollout_id = i, + ... + ) }, error = function(e) { consecutive_failures <<- consecutive_failures + 1 @@ -643,10 +651,18 @@ RefineModule <- R6::R6Class( ) } - # Run the wrapped module + # Run the wrapped module. rollout_id partitions the cache per attempt so + # retries are not served identical cached responses even before feedback + # changes the prompt. result <- tryCatch( { - self$module$forward(current_batch, .llm = .llm, trace = FALSE, ...) + self$module$forward( + current_batch, + .llm = .llm, + trace = FALSE, + rollout_id = i, + ... + ) }, error = function(e) { consecutive_failures <<- consecutive_failures + 1 diff --git a/R/run.R b/R/run.R index 07644cd8..c588dfa0 100644 --- a/R/run.R +++ b/R/run.R @@ -474,7 +474,13 @@ build_module_request <- function(module, inputs) { #' Execute a structured ellmer call from a prepared request #' @noRd -call_llm_request <- function(llm, request, output_type, .cache = NULL) { +call_llm_request <- function( + llm, + request, + output_type, + .cache = NULL, + rollout_id = NULL +) { if (isTRUE(request$is_multimodal)) { llm$chat_structured( request$payload, @@ -486,6 +492,7 @@ call_llm_request <- function(llm, request, output_type, .cache = NULL) { llm = llm, prompt = request$full_prompt, output_type = output_type, + rollout_id = rollout_id, .cache = .cache ) } diff --git a/tests/testthat/test-cache.R b/tests/testthat/test-cache.R index 32d2ab87..08ef1ea0 100644 --- a/tests/testthat/test-cache.R +++ b/tests/testthat/test-cache.R @@ -1137,3 +1137,35 @@ test_that(".cache is accepted and validated for non-Predict modules (dsprrr-jup) expect_error(run(mod, text = "hi", .cache = "yes"), "must be") expect_error(run(mod, text = "hi", .cache = NA), "must be") }) + +test_that("rollout_id threads from forward() into the cache key (dsprrr-pcd)", { + # Regression: rollout_id was implemented in the cache layer but no caller + # passed it, so BestOfN/Refine retries with caching enabled were served + # identical cached responses (effective N = 1). + local_reset_cache() + configure_cache(enable = TRUE, enable_memory = TRUE, enable_disk = FALSE) + + call_count <- 0 + mock_llm <- structure( + list(chat_structured = function(prompt, type, ...) { + call_count <<- call_count + 1 + list(answer = paste0("resp-", call_count)) + }), + class = "Chat" + ) + sig <- Signature( + inputs = list(input(name = "q", class = S7::class_character)), + output_type = ellmer::type_string(), + instructions = "" + ) + mod <- module(signature = sig, type = "predict", template = "{q}") + + # Same prompt, different rollout_id -> both miss the cache -> 2 real calls. + mod$forward(list(q = "x"), .llm = mock_llm, rollout_id = 1) + mod$forward(list(q = "x"), .llm = mock_llm, rollout_id = 2) + expect_equal(call_count, 2) + + # Repeating a rollout_id hits the cache -> no new call. + mod$forward(list(q = "x"), .llm = mock_llm, rollout_id = 1) + expect_equal(call_count, 2) +}) diff --git a/tests/testthat/test-module-wrapper.R b/tests/testthat/test-module-wrapper.R index 29d0138a..cdb55d72 100644 --- a/tests/testthat/test-module-wrapper.R +++ b/tests/testthat/test-module-wrapper.R @@ -603,3 +603,52 @@ test_that("RefineModule print works", { expect_invisible(print(wrapper)) expect_s3_class(wrapper, "RefineModule") }) + +test_that("BestOfN passes a distinct rollout_id to each attempt (dsprrr-pcd)", { + seen <- integer(0) + mock <- create_mock_module() + mock$forward <- function( + batch, + .llm = NULL, + trace = TRUE, + rollout_id = NULL, + ... + ) { + seen <<- c(seen, rollout_id %||% NA_integer_) + tibble::tibble( + output = list(list(answer = "a")), + chat = list(NULL), + metadata = list(list(total_tokens = 1, cost = 0, model = "mock")) + ) + } + + # threshold above any reward so all N attempts run + wrapper <- best_of_n(mock, N = 3, threshold = 99) + wrapper$forward(list(question = "q")) + + expect_equal(seen, c(1L, 2L, 3L)) +}) + +test_that("Refine passes a distinct rollout_id to each attempt (dsprrr-pcd)", { + seen <- integer(0) + mock <- create_mock_module() + mock$forward <- function( + batch, + .llm = NULL, + trace = TRUE, + rollout_id = NULL, + ... + ) { + seen <<- c(seen, rollout_id %||% NA_integer_) + tibble::tibble( + output = list(list(answer = "a")), + chat = list(NULL), + metadata = list(list(total_tokens = 1, cost = 0, model = "mock")) + ) + } + + wrapper <- refine(mock, N = 3, threshold = 99) + wrapper$forward(list(question = "q")) + + expect_equal(seen, c(1L, 2L, 3L)) +}) From 75be2265cd70bd9a92af160676eda1aff1466d15 Mon Sep 17 00:00:00 2001 From: James Wade Date: Sun, 14 Jun 2026 18:20:30 -0400 Subject: [PATCH 4/7] fix: pin_module_config no longer silently destroys pipelines (dsprrr-07u) module_kind() collapsed every unrecognised module class to "predict", so a PipelineModule (and EnsembleModule, RAGModule, ...) passed the v2-persistence allow-list and serialised as a bare PredictModule. A pinned pipeline restored as a single empty PredictModule, silently dropping every step and its bootstrapped per-step demos. - module_kind() now falls back to the actual class name instead of "predict", so unsupported types are correctly rejected by the persistence allow-list. - serialize_module_config_v2() detects PipelineModule explicitly and aborts with a clear "pipeline persistence not yet supported" message pointing users to pin each step individually. Regression test: pin_module_config() on a pipeline now errors instead of silently losing data. Full suite: 3016 passing, no new failures. Co-Authored-By: Claude Opus 4.8 (1M context) --- R/orchestration.R | 9 +++++++++ R/run.R | 6 +++++- tests/testthat/test-orchestration.R | 17 +++++++++++++++++ 3 files changed, 31 insertions(+), 1 deletion(-) diff --git a/R/orchestration.R b/R/orchestration.R index 62bf2710..8216816a 100644 --- a/R/orchestration.R +++ b/R/orchestration.R @@ -159,6 +159,15 @@ restore_module_config <- function(config, signature = NULL) { } serialize_module_config_v2 <- function(module) { + if (inherits(module, "PipelineModule")) { + cli::cli_abort(c( + "Pipeline persistence is not yet supported", + "x" = "{.fn pin_module_config} cannot serialise a {.cls PipelineModule}", + "i" = "Persisting it here would silently drop every step and its demos", + "i" = "Pin each step module individually, or rebuild the pipeline from source" + )) + } + kind <- module_kind(module) supported <- c("predict", "react", "chain_of_thought", "multichain") if (!kind %in% supported) { diff --git a/R/run.R b/R/run.R index c588dfa0..17ed9bec 100644 --- a/R/run.R +++ b/R/run.R @@ -292,7 +292,11 @@ module_kind <- function(module) { "ReactModule" = "react", "MultiChainComparisonModule" = "multichain", "PredictModule" = "predict", - "predict" + # Fall back to the actual class name rather than silently claiming + # "predict" -- otherwise unsupported module types (pipelines, ensembles, + # RAG, ...) pass the persistence allow-list and serialize as a bare + # PredictModule, losing all of their structure. + class(module)[1] ) } diff --git a/tests/testthat/test-orchestration.R b/tests/testthat/test-orchestration.R index 145e3a50..e7acd2dd 100644 --- a/tests/testthat/test-orchestration.R +++ b/tests/testthat/test-orchestration.R @@ -465,3 +465,20 @@ test_that("module config round-trips correctly", { expect_true(restored$is_compiled()) expect_equal(restored$state$best_score, 0.92) }) + +test_that("pin_module_config refuses to silently destroy pipelines (dsprrr-07u)", { + skip_if_not_installed("pins") + + m1 <- module(signature("question -> thought"), type = "predict") + m2 <- module(signature("thought -> answer"), type = "predict") + pipe <- pipeline(m1, m2) + board <- pins::board_temp() + + # Regression: module_kind() collapsed unknown classes to "predict", so a + # pipeline pinned without error and restored as a single empty PredictModule, + # silently losing every step and its bootstrapped demos. + expect_error( + pin_module_config(board, "pipe", pipe), + "[Pp]ipeline" + ) +}) From 71116f93789e9a2f01e0b97e22a84e01ffaba132 Mon Sep 17 00:00:00 2001 From: James Wade Date: Mon, 15 Jun 2026 17:55:20 -0400 Subject: [PATCH 5/7] fix: address wave-1 PR review feedback (dsprrr-6qo) Codex/Copilot review of PR #114: - Module$run() now validates .cache via validate_cache_arg(), so callers reaching the R6 method directly get the same loud error as the run() generic instead of a silently-forwarded malformed value. - default_disk_cache_path() treats an empty DSPRRR_CACHE_PATH as unset (nzchar guard) so the disk cache never resolves to "". - Test teardown restores the cache default with an explicit disk_path, independent of LIFO env-var cleanup ordering. - Reword the .cache NEWS bullet to scope it accurately: it is validated and plumbed everywhere, but direct-call modules don't cache yet (dsprrr-aa2). - Move the BootstrapFewShot (s3b) and evaluate (tn1) NEWS bullets out of wave 1 -- their code lands in wave 2 -- so each stacked PR's changelog matches its own diff. - Comment the batch-print test to explain why cli_fmt() is the correct capture (cli writes to stdout; capture.output(type="message") would miss it). Co-Authored-By: Claude Opus 4.8 (1M context) --- NEWS.md | 19 +++++++------------ R/cache.R | 5 ++++- R/module-base.R | 5 +++++ tests/testthat/setup.R | 6 +++++- tests/testthat/test-run.R | 3 +++ 5 files changed, 24 insertions(+), 14 deletions(-) diff --git a/NEWS.md b/NEWS.md index 8838be9e..c2d0cc93 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,11 +4,6 @@ First development changelog. dsprrr is experimental; the API may change. ## Bug fixes -* `BootstrapFewShot` now harvests demonstrations when the metric targets a - specific output field (e.g. `metric_exact_match(field = "answer")`). - Previously the single-module path passed a bare value to the metric, which - errored internally and silently bootstrapped zero demos (#dsprrr-s3b). - * Failed items in a batch are now counted and reported. Previously `print()` on a batch result always said "All items completed successfully" because it looked for the error in the wrong place (#dsprrr-8l0). @@ -21,13 +16,13 @@ First development changelog. dsprrr is experimental; the API may change. example, when the API is unreachable) instead of a cryptic "attempt to select less than one element" (#dsprrr-hew). -* `.cache` is now honored by all module types and pipelines, not only - `PredictModule`. Previously it was silently dropped (with a spurious - "unknown input" warning) for every other module (#dsprrr-jup). - -* Evaluation now treats a failed prediction as a score of 0 rather than - dropping it from the mean, so optimizers no longer prefer configurations - that fail on most of the data (#dsprrr-tn1). +* `.cache` is now a validated, first-class argument to `run()` and `forward()` + for every module type, instead of triggering a spurious "unknown input" + warning and being dropped for non-`PredictModule` modules (#dsprrr-jup). + `PredictModule` (and the wrapper/few-shot modules that delegate to it) honor + it for the structured-output cache; modules that drive the LLM directly + (e.g. `RAGModule`, `ReActModule`, `RLMModule`) accept `.cache` but do not yet + route their own calls through the cache (#dsprrr-aa2). ## Internal diff --git a/R/cache.R b/R/cache.R index e2b990d3..174887ab 100644 --- a/R/cache.R +++ b/R/cache.R @@ -19,7 +19,10 @@ NULL #' without writing into the current working directory. #' @noRd default_disk_cache_path <- function() { - Sys.getenv("DSPRRR_CACHE_PATH", unset = ".dsprrr_cache") + # Treat an empty-string env var the same as unset, so DSPRRR_CACHE_PATH="" + # does not resolve the cache to "" (which would write into the working dir). + path <- Sys.getenv("DSPRRR_CACHE_PATH", unset = "") + if (nzchar(path)) path else ".dsprrr_cache" } #' Configure dsprrr Cache diff --git a/R/module-base.R b/R/module-base.R index f097ac64..a1a2a1e5 100644 --- a/R/module-base.R +++ b/R/module-base.R @@ -86,6 +86,11 @@ Module <- R6::R6Class( .return_format = "simple", .cache = NULL ) { + # Validate .cache here too: callers can reach $run() directly (not only + # via the run() generic, which validates separately), so a malformed + # value must fail loudly instead of being silently forwarded. + validate_cache_arg(.cache) + inputs <- list(...) # Validate inputs against signature diff --git a/tests/testthat/setup.R b/tests/testthat/setup.R index 255eb6a7..fce2c0a7 100644 --- a/tests/testthat/setup.R +++ b/tests/testthat/setup.R @@ -26,7 +26,11 @@ withr::defer( { dsprrr::clear_cache() unlink(dsprrr_test_cache_dir, recursive = TRUE) - dsprrr::configure_cache() + # Restore the production default explicitly. teardown defers run LIFO, so + # DSPRRR_CACHE_PATH set by local_envvar() above is still in scope here; a + # bare configure_cache() would otherwise re-read it and point disk_path at + # the just-unlinked test dir. + dsprrr::configure_cache(disk_path = ".dsprrr_cache") }, envir = testthat::teardown_env() ) diff --git a/tests/testthat/test-run.R b/tests/testthat/test-run.R index 259f4116..5016d104 100644 --- a/tests/testthat/test-run.R +++ b/tests/testthat/test-run.R @@ -1082,6 +1082,9 @@ test_that("print.dsprrr_batch_result counts failed items (dsprrr-8l0)", { class = c("dsprrr_batch_result", "list") ) + # print.dsprrr_batch_result emits only via cli, so cli::cli_fmt() captures it + # reliably. Do NOT switch to capture.output(type = "message"): cli writes to + # stdout, so that pattern silently captures nothing. out <- cli::cli_fmt(print(result)) expect_true(any(grepl("Errors", out))) expect_true(any(grepl("1 of 2", out))) From 36ec81cab1226601d6e11952b2242afcc91dc359 Mon Sep 17 00:00:00 2001 From: James Wade Date: Mon, 15 Jun 2026 18:00:35 -0400 Subject: [PATCH 6/7] fix: nested wrappers no longer crash on rollout_id (dsprrr-wx6) Regression from dsprrr-pcd: BestOfN/Refine/Assert passed rollout_id = i explicitly while also spreading ..., so a nested wrapper (refine(best_of_n(mod)), best_of_n(refine(mod)), with_assertions(best_of_n(mod))) handed the inner forward() rollout_id twice -> "formal argument matched by multiple actual arguments". - Each wrapper forward() now takes an explicit rollout_id = NULL formal, so an inherited id is consumed out of ... instead of being re-spread. - New compose_rollout_id() helper (R/utils.R) folds the inherited id into the per-attempt index ("2" at top level, "2.1" when nested), keeping every (outer, inner) attempt's cache key unique. - Document @param rollout_id on the wrapper and PredictModule forward() methods (addresses Copilot review on PR #115). - Add regression tests for all three nesting combinations; update the two existing pcd tests for the now-character ids. - Add the s3b, tn1, and pcd/wx6 NEWS bullets here (their code lives in wave 2), so PR #115's changelog matches its diff (dsprrr-6qo). Co-Authored-By: Claude Opus 4.8 (1M context) --- NEWS.md | 15 ++++++ R/module-assert.R | 15 ++++-- R/module-predict.R | 3 ++ R/module-wrapper.R | 29 +++++++++--- R/utils.R | 16 +++++++ tests/testthat/test-module-wrapper.R | 70 +++++++++++++++++++++++++++- 6 files changed, 137 insertions(+), 11 deletions(-) diff --git a/NEWS.md b/NEWS.md index c2d0cc93..1438d76c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -24,6 +24,21 @@ First development changelog. dsprrr is experimental; the API may change. (e.g. `RAGModule`, `ReActModule`, `RLMModule`) accept `.cache` but do not yet route their own calls through the cache (#dsprrr-aa2). +* `BootstrapFewShot` now harvests demonstrations when the metric targets a + specific output field (e.g. `metric_exact_match(field = "answer")`). + Previously the single-module path passed a bare value to the metric, which + errored internally and silently bootstrapped zero demos (#dsprrr-s3b). + +* Evaluation now treats a failed prediction as a score of 0 rather than + dropping it from the mean, so optimizers no longer prefer configurations + that fail on most of the data (#dsprrr-tn1). + +* `BestOfN`, `Refine`, and `Assert` now thread a per-attempt `rollout_id` into + the cache key, so retries make distinct attempts instead of replaying one + cached response when caching is enabled. Nested wrappers (e.g. + `refine(best_of_n(mod))`) compose their ids cleanly instead of crashing with + a duplicate-argument error (#dsprrr-pcd, #dsprrr-wx6). + ## Internal * Tests now isolate the on-disk cache to a temporary directory, so running diff --git a/R/module-assert.R b/R/module-assert.R index 2233b043..7842610f 100644 --- a/R/module-assert.R +++ b/R/module-assert.R @@ -118,9 +118,17 @@ AssertModule <- R6::R6Class( #' @param batch Named list or data frame of inputs #' @param .llm Optional ellmer chat object #' @param trace Logical whether to record trace information + #' @param rollout_id Optional id inherited from an enclosing wrapper. Taken + #' as a formal (not via `...`) so nested wrappers don't pass it twice. #' @param ... Additional arguments passed to wrapped module #' @return Tibble with output, chat, metadata columns - forward = function(batch, .llm = NULL, trace = TRUE, ...) { + forward = function( + batch, + .llm = NULL, + trace = TRUE, + rollout_id = NULL, + ... + ) { # Handle both list and data frame inputs if (is.data.frame(batch)) { inputs <- as.list(batch[1, , drop = FALSE]) @@ -152,14 +160,15 @@ AssertModule <- R6::R6Class( } # Run the wrapped module. rollout_id partitions the cache per attempt so - # retries are not served identical cached responses. + # retries are not served identical cached responses. compose_* folds in + # any inherited id so nesting stays unique. result <- tryCatch( { self$module$forward( current_batch, .llm = .llm, trace = FALSE, - rollout_id = i, + rollout_id = compose_rollout_id(rollout_id, i), ... ) }, diff --git a/R/module-predict.R b/R/module-predict.R index 53106bd7..4460c08b 100644 --- a/R/module-predict.R +++ b/R/module-predict.R @@ -52,6 +52,9 @@ PredictModule <- R6::R6Class( #' @param .cache Logical or NULL. Per-call cache control. If NULL (default), #' uses global cache configuration. If TRUE, attempts to use cache (no effect #' if caching globally disabled). If FALSE, bypasses cache for this call. + #' @param rollout_id Optional value for cache partitioning. When set, the + #' structured-output cache key includes it, so retries/attempts (e.g. from + #' BestOfN, Refine, Assert) get distinct keys and explore fresh responses. #' @param ... Additional arguments #' @return Tibble with result, .chat, .metadata columns forward = function( diff --git a/R/module-wrapper.R b/R/module-wrapper.R index 75d4d458..71d17614 100644 --- a/R/module-wrapper.R +++ b/R/module-wrapper.R @@ -100,9 +100,17 @@ BestOfNModule <- R6::R6Class( #' @param batch Named list or data frame of inputs #' @param .llm Optional ellmer chat object #' @param trace Logical whether to record trace information + #' @param rollout_id Optional id inherited from an enclosing wrapper. Taken + #' as a formal (not via `...`) so nested wrappers don't pass it twice. #' @param ... Additional arguments passed to wrapped module #' @return Tibble with output, chat, metadata columns - forward = function(batch, .llm = NULL, trace = TRUE, ...) { + forward = function( + batch, + .llm = NULL, + trace = TRUE, + rollout_id = NULL, + ... + ) { # Handle both list and data frame inputs if (is.data.frame(batch)) { inputs <- as.list(batch[1, , drop = FALSE]) @@ -126,14 +134,15 @@ BestOfNModule <- R6::R6Class( for (i in seq_len(self$N)) { # Run the wrapped module. rollout_id partitions the cache per attempt so # attempts 2..N are not served identical cached responses (which would - # make BestOfN explore nothing when caching is enabled). + # make BestOfN explore nothing when caching is enabled). compose_* + # folds in any inherited id so nesting stays unique and consistent. result <- tryCatch( { self$module$forward( batch, .llm = .llm, trace = FALSE, - rollout_id = i, + rollout_id = compose_rollout_id(rollout_id, i), ... ) }, @@ -616,9 +625,17 @@ RefineModule <- R6::R6Class( #' @param batch Named list or data frame of inputs #' @param .llm Optional ellmer chat object #' @param trace Logical whether to record trace information + #' @param rollout_id Optional id inherited from an enclosing wrapper. Taken + #' as a formal (not via `...`) so nested wrappers don't pass it twice. #' @param ... Additional arguments passed to wrapped module #' @return Tibble with output, chat, metadata columns - forward = function(batch, .llm = NULL, trace = TRUE, ...) { + forward = function( + batch, + .llm = NULL, + trace = TRUE, + rollout_id = NULL, + ... + ) { # Handle both list and data frame inputs if (is.data.frame(batch)) { inputs <- as.list(batch[1, , drop = FALSE]) @@ -653,14 +670,14 @@ RefineModule <- R6::R6Class( # Run the wrapped module. rollout_id partitions the cache per attempt so # retries are not served identical cached responses even before feedback - # changes the prompt. + # changes the prompt. compose_* folds in any inherited id for nesting. result <- tryCatch( { self$module$forward( current_batch, .llm = .llm, trace = FALSE, - rollout_id = i, + rollout_id = compose_rollout_id(rollout_id, i), ... ) }, diff --git a/R/utils.R b/R/utils.R index 7424cb3e..72c169bc 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1,3 +1,19 @@ +#' Compose a per-attempt cache-partition id that nests cleanly +#' +#' Retrying wrappers (BestOfN, Refine, Assert) tag each attempt with a +#' `rollout_id` so retries get distinct cache keys. When wrappers are nested +#' (e.g. `refine(best_of_n(mod))`) the outer wrapper passes its own id down; +#' combining it with the inner attempt index keeps every (outer, inner) pair +#' unique while consuming the inherited id (so it is not matched twice). +#' +#' @param rollout_id Inherited id from an enclosing wrapper, or NULL. +#' @param i Integer attempt index for this wrapper. +#' @return A character id, e.g. `"2"` at the top level or `"2.1"` when nested. +#' @noRd +compose_rollout_id <- function(rollout_id, i) { + if (is.null(rollout_id)) as.character(i) else paste0(rollout_id, ".", i) +} + #' Check if object inherits from ellmer type #' #' @noRd diff --git a/tests/testthat/test-module-wrapper.R b/tests/testthat/test-module-wrapper.R index cdb55d72..17feed57 100644 --- a/tests/testthat/test-module-wrapper.R +++ b/tests/testthat/test-module-wrapper.R @@ -626,7 +626,8 @@ test_that("BestOfN passes a distinct rollout_id to each attempt (dsprrr-pcd)", { wrapper <- best_of_n(mock, N = 3, threshold = 99) wrapper$forward(list(question = "q")) - expect_equal(seen, c(1L, 2L, 3L)) + # compose_rollout_id() returns character ids so they nest cleanly + expect_equal(seen, c("1", "2", "3")) }) test_that("Refine passes a distinct rollout_id to each attempt (dsprrr-pcd)", { @@ -650,5 +651,70 @@ test_that("Refine passes a distinct rollout_id to each attempt (dsprrr-pcd)", { wrapper <- refine(mock, N = 3, threshold = 99) wrapper$forward(list(question = "q")) - expect_equal(seen, c(1L, 2L, 3L)) + expect_equal(seen, c("1", "2", "3")) +}) + +# Regression tests for dsprrr-wx6: nested wrappers used to crash because each +# wrapper passed rollout_id = i explicitly while also spreading ..., so the +# inner forward() received rollout_id twice. + +# Innermost mock that records every rollout_id it is handed. +mock_recording_rollouts <- function(seen_env) { + mock <- create_mock_module() + mock$forward <- function( + batch, + .llm = NULL, + trace = TRUE, + rollout_id = NULL, + ... + ) { + seen_env$ids <- c(seen_env$ids, rollout_id %||% NA_character_) + tibble::tibble( + output = list(list(answer = "a")), + chat = list(NULL), + metadata = list(list(total_tokens = 1, cost = 0, model = "mock")) + ) + } + mock +} + +test_that("refine(best_of_n(mod)) nests without a duplicate-argument crash (dsprrr-wx6)", { + seen <- new.env() + seen$ids <- character(0) + nested <- refine( + best_of_n(mock_recording_rollouts(seen), N = 2, threshold = 99), + N = 2, + threshold = 99 + ) + + expect_no_error(nested$forward(list(question = "q"))) + # 2 outer x 2 inner attempts, each id unique and hierarchical + expect_equal(sort(seen$ids), c("1.1", "1.2", "2.1", "2.2")) +}) + +test_that("best_of_n(refine(mod)) nests without a crash (dsprrr-wx6)", { + seen <- new.env() + seen$ids <- character(0) + nested <- best_of_n( + refine(mock_recording_rollouts(seen), N = 2, threshold = 99), + N = 2, + threshold = 99 + ) + + expect_no_error(nested$forward(list(question = "q"))) + expect_equal(sort(seen$ids), c("1.1", "1.2", "2.1", "2.2")) +}) + +test_that("with_assertions(best_of_n(mod)) nests without a crash (dsprrr-wx6)", { + seen <- new.env() + seen$ids <- character(0) + nested <- with_assertions( + best_of_n(mock_recording_rollouts(seen), N = 2, threshold = 99), + assertions = list(assert_output(~TRUE, "always passes")), + max_retries = 1L + ) + + expect_no_error(nested$forward(list(question = "q"))) + # assertion passes on the first outer attempt; inner best_of_n runs twice + expect_equal(sort(seen$ids), c("1.1", "1.2")) }) From 43c4e1160751b44fb276e4c7468d00c9a54b89c8 Mon Sep 17 00:00:00 2001 From: James Wade Date: Mon, 15 Jun 2026 18:01:11 -0400 Subject: [PATCH 7/7] docs: add pin_module_config NEWS bullet for wave 3 (dsprrr-6qo) The pipeline-persistence guard (dsprrr-07u) had no changelog entry. Add it here so PR #116's NEWS matches its own diff. Co-Authored-By: Claude Opus 4.8 (1M context) --- NEWS.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/NEWS.md b/NEWS.md index 1438d76c..52f88c88 100644 --- a/NEWS.md +++ b/NEWS.md @@ -39,6 +39,10 @@ First development changelog. dsprrr is experimental; the API may change. `refine(best_of_n(mod))`) compose their ids cleanly instead of crashing with a duplicate-argument error (#dsprrr-pcd, #dsprrr-wx6). +* `pin_module_config()` now errors clearly for pipelines and other unsupported + module types instead of silently serialising them as an empty `PredictModule` + and dropping every step on restore (#dsprrr-07u). + ## Internal * Tests now isolate the on-disk cache to a temporary directory, so running