diff --git a/tests/dotnet-test/test-gap-analysis/eval.yaml b/tests/dotnet-test/test-gap-analysis/eval.yaml
index 46063117e6..440375f95d 100644
--- a/tests/dotnet-test/test-gap-analysis/eval.yaml
+++ b/tests/dotnet-test/test-gap-analysis/eval.yaml
@@ -141,3 +141,75 @@ scenarios:
- "Wrote test methods for the ShoppingCart class"
- "Covered the AddItem and GetTotal methods"
timeout: 120
+
+ # ==========================================================================
+ # Scenario 5: Report quality — exclude trivial/generated code, trace call
+ # chains, prioritize by risk, and report strengths alongside gaps
+ # ==========================================================================
+
+ - name: "Produce a risk-prioritized report that excludes trivial and generated code"
+ prompt: |
+ Here's my Billing project and its tests. Before I ship, I want to know
+ whether the tests would actually catch a subtle bug in the money math.
+ Walk me through where the tests are blind and how serious each gap is.
+ setup:
+ files:
+ - path: "Billing/Billing.csproj"
+ source: "fixtures/report-quality/Billing/Billing.csproj"
+ - path: "Billing/InvoiceProcessor.cs"
+ source: "fixtures/report-quality/Billing/InvoiceProcessor.cs"
+ - path: "Billing/InvoiceProcessor.g.cs"
+ source: "fixtures/report-quality/Billing/InvoiceProcessor.g.cs"
+ - path: "Billing.Tests/Billing.Tests.csproj"
+ source: "fixtures/report-quality/Billing.Tests/Billing.Tests.csproj"
+ - path: "Billing.Tests/InvoiceProcessorTests.cs"
+ source: "fixtures/report-quality/Billing.Tests/InvoiceProcessorTests.cs"
+ assertions:
+ - type: "output_matches"
+ pattern: "(late.?fee|tax|ComputeAmountDue|ApplyLateFee|ComputeTax)"
+ - type: "output_matches"
+ pattern: "(risk|priorit|high.*risk|business)"
+ - type: "output_matches"
+ pattern: "(generated|auto.?generated|\\.g\\.cs)"
+ - type: "output_matches"
+ pattern: "(surviv|not.*caught|gap|blind|miss)"
+ - type: "exit_success"
+ rubric:
+ - "Recognized that analyzing trivial code — the auto-properties (CustomerName, InvoiceId) and the simple IsPaid getter — is not useful and excluded that trivial code from the mutation analysis"
+ - "Did not over-count mutations in the generated code: skipped the auto-generated InvoiceProcessor.g.cs file rather than reporting its branches as gaps"
+ - "Traced the call chain into the private helpers ApplyLateFee and ComputeTax, which are reachable from the public ComputeAmountDue method, instead of ignoring those call chains as unreachable"
+ - "Prioritized the findings by business risk — the high-risk late-fee and tax calculation gaps first — rather than just listing them in source order"
+ - "Reported the suite's strengths, noting the killed mutations such as the negative-subtotal guard that the tests already catch, alongside the surviving gaps"
+ - "Correctly labeled each reported mutation with its category (for example boundary, arithmetic, or exception removal)"
+ - "Identified that the late-fee tier boundaries (0, 30 days) and the tax-exempt path are survived mutations the current assertions would not catch"
+ timeout: 300
+
+ # ==========================================================================
+ # Scenario 6: Rust error propagation via the `?` operator
+ # ==========================================================================
+
+ - name: "Flag the Rust ? operator propagation as an unobserved mutation point"
+ prompt: |
+ I have a small Rust library that parses order lines. I'm not confident my
+ tests would notice if the error handling broke. Could a subtle change to
+ how errors propagate slip past the current tests?
+ setup:
+ files:
+ - path: "Cargo.toml"
+ source: "fixtures/rust-error-propagation/Cargo.toml"
+ - path: "src/lib.rs"
+ source: "fixtures/rust-error-propagation/src/lib.rs"
+ assertions:
+ - type: "output_matches"
+ pattern: "(\\?|operator|propagat|Err|unwrap|panic)"
+ - type: "output_matches"
+ pattern: "(parse_line_total|error.*path|invalid)"
+ - type: "output_matches"
+ pattern: "(surviv|not.*caught|gap|blind|miss|not.*observ)"
+ - type: "exit_success"
+ rubric:
+ - "Recognized Rust's `?` operator on the parse calls in parse_line_total as a short-circuit that propagates the Err to the caller"
+ - "Flagged that mutating `expr?` to `expr.unwrap()` would panic instead of returning the error, classifying it as an Exception/Panic mutation point"
+ - "Identified that no test exercises the error path, so the `?` operator propagation behavior is never observed and the mutation would survive"
+ - "Recommended a concrete test that passes an invalid line and asserts the returned Err to kill the mutation"
+ timeout: 300
diff --git a/tests/dotnet-test/test-gap-analysis/fixtures/report-quality/Billing.Tests/Billing.Tests.csproj b/tests/dotnet-test/test-gap-analysis/fixtures/report-quality/Billing.Tests/Billing.Tests.csproj
new file mode 100644
index 0000000000..345b9a3234
--- /dev/null
+++ b/tests/dotnet-test/test-gap-analysis/fixtures/report-quality/Billing.Tests/Billing.Tests.csproj
@@ -0,0 +1,12 @@
+
+
+ net10.0
+ enable
+
+
+
+
+
+
+
+
diff --git a/tests/dotnet-test/test-gap-analysis/fixtures/report-quality/Billing.Tests/InvoiceProcessorTests.cs b/tests/dotnet-test/test-gap-analysis/fixtures/report-quality/Billing.Tests/InvoiceProcessorTests.cs
new file mode 100644
index 0000000000..30f6098953
--- /dev/null
+++ b/tests/dotnet-test/test-gap-analysis/fixtures/report-quality/Billing.Tests/InvoiceProcessorTests.cs
@@ -0,0 +1,26 @@
+namespace Billing.Tests;
+
+using Microsoft.VisualStudio.TestTools.UnitTesting;
+
+[TestClass]
+public class InvoiceProcessorTests
+{
+    [TestMethod]
+    public void ComputeAmountDue_NotLate_NoTaxExempt_AddsTax()
+    {
+        var processor = new InvoiceProcessor();
+        decimal result = processor.ComputeAmountDue(100m, daysLate: 0, taxExempt: false);
+
+        // Asserts only that some tax was added; does not pin the late-fee tiers
+        // (5% for 1-30 days late, 10% beyond 30 days) or the tax-exempt path.
+        Assert.IsTrue(result > 100m);
+    }
+
+    [TestMethod]
+    public void ComputeAmountDue_NegativeSubtotal_Throws()
+    {
+        var processor = new InvoiceProcessor();
+        Assert.ThrowsException<ArgumentOutOfRangeException>(
+            () => processor.ComputeAmountDue(-1m, 0, false));
+    }
+}
diff --git a/tests/dotnet-test/test-gap-analysis/fixtures/report-quality/Billing/Billing.csproj b/tests/dotnet-test/test-gap-analysis/fixtures/report-quality/Billing/Billing.csproj
new file mode 100644
index 0000000000..0957a12e21
--- /dev/null
+++ b/tests/dotnet-test/test-gap-analysis/fixtures/report-quality/Billing/Billing.csproj
@@ -0,0 +1,6 @@
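+<Project Sdk="Microsoft.NET.Sdk">
+  <PropertyGroup>
+    <TargetFramework>net10.0</TargetFramework>
+    <ImplicitUsings>enable</ImplicitUsings>
+  </PropertyGroup>
+</Project>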
diff --git a/tests/dotnet-test/test-gap-analysis/fixtures/report-quality/Billing/InvoiceProcessor.cs b/tests/dotnet-test/test-gap-analysis/fixtures/report-quality/Billing/InvoiceProcessor.cs
new file mode 100644
index 0000000000..d6a65544d6
--- /dev/null
+++ b/tests/dotnet-test/test-gap-analysis/fixtures/report-quality/Billing/InvoiceProcessor.cs
@@ -0,0 +1,44 @@
+namespace Billing;
+
+public partial class InvoiceProcessor
+{
+ // Trivial auto-properties and a simple getter — no logic to mutate.
+ public string CustomerName { get; set; } = string.Empty;
+ public int InvoiceId { get; init; }
+ public bool IsPaid => Balance <= 0m;
+
+ private decimal Balance { get; set; }
+
+ /// <summary>
+ /// Computes the final amount due, applying late fees and tax.
+ /// High business risk: a flipped comparison or arithmetic change here ships wrong charges.
+ /// </summary>
+ public decimal ComputeAmountDue(decimal subtotal, int daysLate, bool taxExempt)
+ {
+ if (subtotal < 0)
+ throw new ArgumentOutOfRangeException(nameof(subtotal));
+
+ decimal amount = subtotal + ApplyLateFee(subtotal, daysLate);
+
+ if (!taxExempt)
+ amount += ComputeTax(amount);
+
+ Balance = amount;
+ return amount;
+ }
+
+ // Private helper reached only through ComputeAmountDue — part of the call chain.
+ private static decimal ApplyLateFee(decimal subtotal, int daysLate)
+ {
+ if (daysLate <= 0)
+ return 0m;
+ if (daysLate > 30)
+ return subtotal * 0.10m;
+ return subtotal * 0.05m;
+ }
+
+ // Private helper reached only through ComputeAmountDue — part of the call chain.
+ private static decimal ComputeTax(decimal amount) => amount * 0.08m;
+
+ public string FormatReceipt(decimal amount) => $"Receipt: {amount:C}";
+}
diff --git a/tests/dotnet-test/test-gap-analysis/fixtures/report-quality/Billing/InvoiceProcessor.g.cs b/tests/dotnet-test/test-gap-analysis/fixtures/report-quality/Billing/InvoiceProcessor.g.cs
new file mode 100644
index 0000000000..32bf438fd3
--- /dev/null
+++ b/tests/dotnet-test/test-gap-analysis/fixtures/report-quality/Billing/InvoiceProcessor.g.cs
@@ -0,0 +1,16 @@
+// <auto-generated>
+// This code was generated by a tool. Changes will be overwritten.
+// </auto-generated>
+namespace Billing;
+
+public partial class InvoiceProcessor
+{
+ public static int CompareByInvoiceId(InvoiceProcessor a, InvoiceProcessor b)
+ {
+ if (a.InvoiceId < b.InvoiceId)
+ return -1;
+ if (a.InvoiceId > b.InvoiceId)
+ return 1;
+ return 0;
+ }
+}
diff --git a/tests/dotnet-test/test-gap-analysis/fixtures/rust-error-propagation/Cargo.toml b/tests/dotnet-test/test-gap-analysis/fixtures/rust-error-propagation/Cargo.toml
new file mode 100644
index 0000000000..725a395dfb
--- /dev/null
+++ b/tests/dotnet-test/test-gap-analysis/fixtures/rust-error-propagation/Cargo.toml
@@ -0,0 +1,7 @@
+[package]
+name = "order_parser"
+version = "0.1.0"
+edition = "2021"
+
+[lib]
+path = "src/lib.rs"
diff --git a/tests/dotnet-test/test-gap-analysis/fixtures/rust-error-propagation/src/lib.rs b/tests/dotnet-test/test-gap-analysis/fixtures/rust-error-propagation/src/lib.rs
new file mode 100644
index 0000000000..b676fa8d1a
--- /dev/null
+++ b/tests/dotnet-test/test-gap-analysis/fixtures/rust-error-propagation/src/lib.rs
@@ -0,0 +1,33 @@
+/// Parses a "quantity,price" line into a total cost.
+///
+/// Uses the `?` operator to short-circuit on parse errors. If either field
+/// fails to parse, the error is propagated to the caller instead of panicking.
+pub fn parse_line_total(line: &str) -> Result<u64, std::num::ParseIntError> {
+    let mut parts = line.split(',');
+    let quantity: u64 = parts.next().unwrap_or("").trim().parse()?;
+    let price: u64 = parts.next().unwrap_or("").trim().parse()?;
+    Ok(quantity * price)
+}
+
+/// Returns the first stock level at or below the reorder threshold, or `None` if there is none.
+pub fn first_below_threshold(levels: &[u32], threshold: u32) -> Option<u32> {
+    levels.iter().copied().find(|&l| l <= threshold)
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ #[test]
+ fn parses_a_valid_line() {
+ assert_eq!(parse_line_total("3, 10").unwrap(), 30);
+ }
+
+ // Note: no test passes a malformed line to parse_line_total, so its error
+ // path (the `?` propagation of the parse Err) is never observed by the suite.
+
+ #[test]
+ fn finds_a_value_below_threshold() {
+ assert_eq!(first_below_threshold(&[9, 5, 2], 5), Some(5));
+ }
+}