Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 72 additions & 0 deletions tests/dotnet-test/test-gap-analysis/eval.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -141,3 +141,75 @@ scenarios:
- "Wrote test methods for the ShoppingCart class"
- "Covered the AddItem and GetTotal methods"
timeout: 120

# ==========================================================================
# Scenario 5: Report quality — exclude trivial/generated code, trace call
# chains, prioritize by risk, and report strengths alongside gaps
# ==========================================================================

- name: "Produce a risk-prioritized report that excludes trivial and generated code"
prompt: |
Here's my Billing project and its tests. Before I ship, I want to know
whether the tests would actually catch a subtle bug in the money math.
Walk me through where the tests are blind and how serious each gap is.
setup:
files:
- path: "Billing/Billing.csproj"
source: "fixtures/report-quality/Billing/Billing.csproj"
- path: "Billing/InvoiceProcessor.cs"
source: "fixtures/report-quality/Billing/InvoiceProcessor.cs"
- path: "Billing/InvoiceProcessor.g.cs"
source: "fixtures/report-quality/Billing/InvoiceProcessor.g.cs"
- path: "Billing.Tests/Billing.Tests.csproj"
source: "fixtures/report-quality/Billing.Tests/Billing.Tests.csproj"
- path: "Billing.Tests/InvoiceProcessorTests.cs"
source: "fixtures/report-quality/Billing.Tests/InvoiceProcessorTests.cs"
assertions:
- type: "output_matches"
pattern: "(late.?fee|tax|ComputeAmountDue|ApplyLateFee|ComputeTax)"
- type: "output_matches"
pattern: "(risk|priorit|high.*risk|business)"
- type: "output_matches"
pattern: "(generated|auto.?generated|\\.g\\.cs)"
- type: "output_matches"
pattern: "(surviv|not.*caught|gap|blind|miss)"
- type: "exit_success"
rubric:
- "Recognized that analyzing trivial code — the auto-properties (CustomerName, InvoiceId) and the simple IsPaid getter — is not useful and excluded that trivial code from the mutation analysis"
- "Did not over-count mutations in the generated code: skipped the auto-generated InvoiceProcessor.g.cs file rather than reporting its branches as gaps"
- "Traced the call chain into the private helpers ApplyLateFee and ComputeTax, which are reachable from the public ComputeAmountDue method, instead of ignoring those call chains as unreachable"
- "Prioritized the findings by business risk — the high-risk late-fee and tax calculation gaps first — rather than just listing them in source order"
- "Reported the suite's strengths, noting the killed mutations such as the negative-subtotal guard that the tests already catch, alongside the surviving gaps"
- "Correctly labeled each reported mutation with its category (for example boundary, arithmetic, or exception removal)"
- "Identified that the late-fee tier boundaries (0, 30 days) and the tax-exempt path are survived mutations the current assertions would not catch"
timeout: 300

# ==========================================================================
# Scenario 6: Rust error propagation via the `?` operator
# ==========================================================================

- name: "Flag the Rust ? operator propagation as an unobserved mutation point"
prompt: |
I have a small Rust library that parses order lines. I'm not confident my
tests would notice if the error handling broke. Could a subtle change to
how errors propagate slip past the current tests?
setup:
files:
- path: "Cargo.toml"
source: "fixtures/rust-error-propagation/Cargo.toml"
- path: "src/lib.rs"
source: "fixtures/rust-error-propagation/src/lib.rs"
assertions:
- type: "output_matches"
pattern: "(\\?|operator|propagat|Err|unwrap|panic)"
- type: "output_matches"
pattern: "(parse_line_total|error.*path|invalid)"
- type: "output_matches"
pattern: "(surviv|not.*caught|gap|blind|miss|not.*observ)"
- type: "exit_success"
rubric:
- "Recognized Rust's `?` operator on the parse calls in parse_line_total as a short-circuit that propagates the Err to the caller"
- "Flagged that mutating `expr?` to `expr.unwrap()` would panic instead of returning the error, classifying it as an Exception/Panic mutation point"
- "Identified that no test exercises the error path, so the `?` operator propagation behavior is never observed and the mutation would survive"
- "Recommended a concrete test that passes an invalid line and asserts the returned Err to kill the mutation"
timeout: 300
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFramework>net10.0</TargetFramework>
<Nullable>enable</Nullable>
</PropertyGroup>
<ItemGroup>
<PackageReference Include="MSTest" Version="4.1.0" />
</ItemGroup>
<ItemGroup>
<ProjectReference Include="..\Billing\Billing.csproj" />
</ItemGroup>
</Project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
namespace Billing.Tests;

using Microsoft.VisualStudio.TestTools.UnitTesting;

/// <summary>
/// Tests for <c>InvoiceProcessor.ComputeAmountDue</c>: one happy-path check that
/// tax is added and one check of the negative-subtotal guard.
/// </summary>
[TestClass]
public class InvoiceProcessorTests
{
    /// <summary>
    /// An on-time, non-exempt invoice must come out larger than its subtotal.
    /// </summary>
    [TestMethod]
    public void ComputeAmountDue_NotLate_NoTaxExempt_AddsTax()
    {
        var processor = new InvoiceProcessor();
        decimal result = processor.ComputeAmountDue(100m, daysLate: 0, taxExempt: false);

        // Asserts only that some tax was added; does not pin the late-fee tiers
        // (5% for 1-30 days late, 10% beyond 30 days) or the tax-exempt path.
        Assert.IsTrue(result > 100m);
    }

    /// <summary>
    /// A negative subtotal must be rejected with ArgumentOutOfRangeException.
    /// </summary>
    [TestMethod]
    public void ComputeAmountDue_NegativeSubtotal_Throws()
    {
        var processor = new InvoiceProcessor();

        // ThrowsExactly requires the exact exception type, matching what the
        // older Assert.ThrowsException checked. The exception type is fully
        // qualified so it resolves without a `using System;` directive.
        Assert.ThrowsExactly<System.ArgumentOutOfRangeException>(
            () => processor.ComputeAmountDue(-1m, 0, false));
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFramework>net10.0</TargetFramework>
<Nullable>enable</Nullable>
</PropertyGroup>
</Project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
namespace Billing;

/// <summary>
/// Invoice model that computes the amount due from a subtotal, a late fee
/// and tax, and remembers the last computed amount as its balance.
/// </summary>
public partial class InvoiceProcessor
{
    // Trivial auto-properties and a simple getter — no logic to mutate.
    public string CustomerName { get; set; } = string.Empty;
    public int InvoiceId { get; init; }
    public bool IsPaid => Balance <= 0m;

    // Set by ComputeAmountDue to the amount it returns; read by IsPaid.
    private decimal Balance { get; set; }

    /// <summary>
    /// Computes the final amount due, applying late fees and tax.
    /// High business risk: a flipped comparison or arithmetic change here ships wrong charges.
    /// </summary>
    /// <param name="subtotal">Pre-fee, pre-tax amount; must not be negative.</param>
    /// <param name="daysLate">Days past due; values of zero or less incur no late fee.</param>
    /// <param name="taxExempt">When true, no tax is added.</param>
    /// <returns>The subtotal plus any late fee, plus tax on that sum unless exempt.</returns>
    /// <exception cref="System.ArgumentOutOfRangeException">
    /// Thrown when <paramref name="subtotal"/> is negative.
    /// </exception>
    public decimal ComputeAmountDue(decimal subtotal, int daysLate, bool taxExempt)
    {
        // Fully qualified so the type resolves without a `using System;`
        // directive or implicit usings.
        if (subtotal < 0)
            throw new System.ArgumentOutOfRangeException(nameof(subtotal));

        decimal amount = subtotal + ApplyLateFee(subtotal, daysLate);

        // Tax is charged on the subtotal plus the late fee, not on the subtotal alone.
        if (!taxExempt)
            amount += ComputeTax(amount);

        Balance = amount;
        return amount;
    }

    // Private helper reached only through ComputeAmountDue — part of the call chain.
    // Tiers: no fee when daysLate <= 0, 10% when daysLate > 30, otherwise 5%.
    private static decimal ApplyLateFee(decimal subtotal, int daysLate)
    {
        if (daysLate <= 0)
            return 0m;
        if (daysLate > 30)
            return subtotal * 0.10m;
        return subtotal * 0.05m;
    }

    // Private helper reached only through ComputeAmountDue — part of the call chain.
    // Flat 8% of the amount passed in.
    private static decimal ComputeTax(decimal amount) => amount * 0.08m;

    /// <summary>
    /// Formats an amount as a receipt line using the current culture's currency format.
    /// </summary>
    public string FormatReceipt(decimal amount) => $"Receipt: {amount:C}";
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
// <auto-generated>
// This code was generated by a tool. Changes will be overwritten.
// </auto-generated>
namespace Billing;

// Generated half of the partial InvoiceProcessor class: contributes a static
// comparison helper keyed on InvoiceId.
public partial class InvoiceProcessor
{
    /// <summary>
    /// Three-way comparison of two invoices by their InvoiceId.
    /// </summary>
    /// <returns>
    /// -1 when <paramref name="a"/> has the smaller id, 1 when it has the
    /// larger id, and 0 when the ids are equal.
    /// </returns>
    public static int CompareByInvoiceId(InvoiceProcessor a, InvoiceProcessor b)
    {
        // Both arguments are dereferenced without a null check.
        if (a.InvoiceId < b.InvoiceId)
            return -1;
        if (a.InvoiceId > b.InvoiceId)
            return 1;
        return 0;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
[package]
name = "order_parser"
version = "0.1.0"
edition = "2021"

[lib]
path = "src/lib.rs"
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
/// Computes the total cost described by a `"quantity,price"` line.
///
/// The line is split on commas and each of the first two fields is trimmed
/// and parsed as a `u64`. Any further fields are ignored, and a missing
/// field is treated as the empty string (which fails to parse).
///
/// # Errors
///
/// Each parse is followed by the `?` operator, so the first field that is
/// not a valid `u64` short-circuits the function and its `ParseIntError`
/// is returned to the caller rather than causing a panic.
pub fn parse_line_total(line: &str) -> Result<u64, std::num::ParseIntError> {
    let mut fields = line.split(',').map(str::trim);
    let quantity = fields.next().unwrap_or("").parse::<u64>()?;
    let price = fields.next().unwrap_or("").parse::<u64>()?;
    Ok(quantity * price)
}

/// Finds the earliest stock level that does not exceed the reorder threshold.
///
/// Levels are scanned in slice order; the first one that is less than or
/// equal to `threshold` is returned by value. Yields `None` when every level
/// is above the threshold or the slice is empty.
pub fn first_below_threshold(levels: &[u32], threshold: u32) -> Option<u32> {
    levels.iter().find(|&&level| level <= threshold).copied()
}

#[cfg(test)]
mod tests {
    use super::*;

    /// A well-formed line (with whitespace after the comma) yields quantity × price.
    #[test]
    fn parses_a_valid_line() {
        let total = parse_line_total("3, 10").unwrap();
        assert_eq!(total, 30);
    }

    // Note: no test exercises the error path of parse_line_total, so the `?`
    // propagation is never observed by the suite.

    /// A level equal to the threshold counts as a match and is returned first.
    #[test]
    fn finds_a_value_below_threshold() {
        let levels = [9, 5, 2];
        assert_eq!(first_below_threshold(&levels, 5), Some(5));
    }
}
Loading