diff --git a/tests/dotnet-test/grade-tests/eval.yaml b/tests/dotnet-test/grade-tests/eval.yaml index ccabdfc6c2..c20298386c 100644 --- a/tests/dotnet-test/grade-tests/eval.yaml +++ b/tests/dotnet-test/grade-tests/eval.yaml @@ -125,3 +125,101 @@ scenarios: - "Optionally pointed the user at `test-quality-auditor` agent or `test-anti-patterns` skill for full-suite analysis" reject_tools: ["edit", "create"] timeout: 120 + + # ========================================================================== + # Scenario 4: Go table-driven tests — idiomatic loop must NOT be flagged as + # conditional logic, and every grade must rest on an observable signal + # ========================================================================== + + - name: "Grade Go table-driven tests without misreading the loop as branching" + prompt: | + Please grade each of the following Go test functions individually for + test quality and produce a compact per-test table (one row per test) + plus a short summary. They live in `calculator_test.go` and the code + under test is in `calculator.go`. Do not modify any files. 
+ + Test functions to grade: + - TestAdd_TableDriven + - TestDivide_ByZero + - TestParse_NoError + - TestReset_NoAssertions + setup: + files: + - path: "go.mod" + source: "fixtures/go-table-driven/go.mod" + - path: "calculator.go" + source: "fixtures/go-table-driven/calculator.go" + - path: "calculator_test.go" + source: "fixtures/go-table-driven/calculator_test.go" + assertions: + - type: "output_matches" + pattern: "(\\|\\s*Test\\s*\\|\\s*Grade\\s*\\|)" + - type: "output_matches" + pattern: "(TestAdd_TableDriven.*\\|\\s*A\\s*\\|)" + - type: "output_matches" + pattern: "(TestDivide_ByZero.*\\|\\s*A\\s*\\|)" + - type: "output_matches" + pattern: "(TestParse_NoError.*\\|\\s*C\\s*\\|)" + - type: "output_matches" + pattern: "(TestReset_NoAssertions.*\\|\\s*F\\s*\\|)" + - type: "exit_success" + rubric: + - "Detected Go and its standard `testing` package and loaded the Go language extension before grading" + - "Graded `TestAdd_TableDriven` as A — recognized the idiomatic table-driven subtests driven by `t.Run`" + - "Did NOT flag the Go table-driven `for ... 
range` / `if got != tt.want` loop as conditional logic or branching — it is the idiomatic assertion pattern and incurs no deduction" + - "Graded `TestDivide_ByZero` as A — the returned error is checked, which is a complete assertion of the error path" + - "Graded `TestParse_NoError` as C — only checks that no error came back and never verifies the parsed value (trivial assertion)" + - "Graded `TestReset_NoAssertions` as F — calls the function but never asserts via `t.Error`/`t.Fatal`" + - "Justified every grade with at least one observable signal from the captured test body rather than a speculative or hypothetical deduction" + - "Did NOT inflate deductions to justify a lower grade — started each test at A and deducted only for observable issues" + reject_tools: ["edit", "create"] + timeout: 300 + + # ========================================================================== + # Scenario 5: Production code unavailable — behavioral concerns are Unverified, + # not deductions, and the PR-comment report stays compact + # ========================================================================== + + - name: "Grade tests when the production code under test is unavailable" + prompt: | + Please grade each of the following test methods individually for test + quality and produce a compact per-test table (one row per test) plus a + short summary that we can post as a PR comment. They live in + `Payments.Tests/PaymentGatewayTests.cs`. The production project + `Payments.Core` is not in this workspace, so its source is unavailable. + Do not modify any files. 
+ + Test methods to grade: + - Payments.Tests.PaymentGatewayTests.Charge_ValidCard_ReturnsApprovedResult + - Payments.Tests.PaymentGatewayTests.Charge_NegativeAmount_ThrowsArgumentOutOfRange + - Payments.Tests.PaymentGatewayTests.Refund_ExistingCharge_ReturnsReceipt + - Payments.Tests.PaymentGatewayTests.Settle_PendingBatch_Runs + setup: + files: + - path: "Payments.Tests/Payments.Tests.csproj" + source: "fixtures/production-unavailable/Payments.Tests/Payments.Tests.csproj" + - path: "Payments.Tests/PaymentGatewayTests.cs" + source: "fixtures/production-unavailable/Payments.Tests/PaymentGatewayTests.cs" + assertions: + - type: "output_matches" + pattern: "(\\|\\s*Test\\s*\\|\\s*Grade\\s*\\|)" + - type: "output_matches" + pattern: "(Charge_ValidCard_ReturnsApprovedResult.*\\|\\s*A\\s*\\|)" + - type: "output_matches" + pattern: "(Charge_NegativeAmount_ThrowsArgumentOutOfRange.*\\|\\s*A\\s*\\|)" + - type: "output_matches" + pattern: "(Refund_ExistingCharge_ReturnsReceipt.*\\|\\s*C\\s*\\|)" + - type: "output_matches" + pattern: "(Settle_PendingBatch_Runs.*\\|\\s*F\\s*\\|)" + - type: "output_matches" + pattern: "(?i)(unverified)" + - type: "exit_success" + rubric: + - "Did NOT penalize the tests because the production code under test (`Payments.Core`) is unavailable — marked behavioral concerns about uncovered behaviors as `Unverified` instead of deducting" + - "Graded `Charge_ValidCard_ReturnsApprovedResult` as A on the observable signal of its equality assertions, without inventing weaknesses that would need the unavailable production source to judge" + - "Graded `Charge_NegativeAmount_ThrowsArgumentOutOfRange` as A — the exception assertion is complete on its own" + - "Graded `Refund_ExistingCharge_ReturnsReceipt` as C — only a trivial `IsNotNull` on the receipt" + - "Graded `Settle_PendingBatch_Runs` as F — no assertions at all" + - "Kept the report compact and PR-comment-friendly — did not spill a giant (e.g. 
500-row) table into the PR comment, and noted that any overflow would collapse into a `<details>
` block per the row cap"
+    reject_tools: ["edit", "create"]
+    timeout: 300
diff --git a/tests/dotnet-test/grade-tests/fixtures/go-table-driven/calculator.go b/tests/dotnet-test/grade-tests/fixtures/go-table-driven/calculator.go
new file mode 100644
index 0000000000..5351ecd593
--- /dev/null
+++ b/tests/dotnet-test/grade-tests/fixtures/go-table-driven/calculator.go
@@ -0,0 +1,38 @@
+package calc
+
+import "errors"
+
+// Add returns the sum of a and b.
+func Add(a, b int) int {
+	return a + b
+}
+
+// Divide returns the integer quotient a/b, or an error when b is zero.
+func Divide(a, b int) (int, error) {
+	if b == 0 {
+		return 0, errors.New("division by zero")
+	}
+	return a / b, nil
+}
+
+// Parse converts a string of ASCII decimal digits to an int. It is
+// intentionally simple for the fixture: no sign, no whitespace, and no
+// overflow check. Empty input or any non-digit yields a "not a number" error.
+func Parse(s string) (int, error) {
+	// Without this guard the loop never runs and "" would parse as 0.
+	if s == "" {
+		return 0, errors.New("not a number")
+	}
+	n := 0
+	for _, r := range s {
+		if r < '0' || r > '9' {
+			return 0, errors.New("not a number")
+		}
+		n = n*10 + int(r-'0')
+	}
+	return n, nil
+}
+
+// Reset is a no-op placeholder with no return value; this file declares no
+// accumulator state for it to clear.
+func Reset() {}
diff --git a/tests/dotnet-test/grade-tests/fixtures/go-table-driven/calculator_test.go b/tests/dotnet-test/grade-tests/fixtures/go-table-driven/calculator_test.go
new file mode 100644
index 0000000000..349795f542
--- /dev/null
+++ b/tests/dotnet-test/grade-tests/fixtures/go-table-driven/calculator_test.go
@@ -0,0 +1,62 @@
+package calc
+
+import "testing"
+
+// ============================================================
+// STRONG TEST: idiomatic table-driven test with subtests.
+// The `for` loop and the `if got != tt.want` comparison are
+// the canonical Go assertion pattern, NOT branching/conditional
+// logic in the test under grade.
+// Expected grade: A (90–100)
+// ============================================================
+func TestAdd_TableDriven(t *testing.T) {
+	tests := []struct {
+		name string // subtest name handed to t.Run
+		a, b int    // operands passed to Add
+		want int    // expected sum
+	}{
+		{"positives", 2, 3, 5},
+		{"with zero", 0, 7, 7},
+		{"negatives", -4, -6, -10},
+		{"mixed sign", -2, 5, 3},
+	}
+	for _, tt := range tests { // one subtest per table row
+		t.Run(tt.name, func(t *testing.T) {
+			got := Add(tt.a, tt.b)
+			if got != tt.want { // assertion: report a mismatch for this row
+				t.Errorf("Add(%d, %d) = %d, want %d", tt.a, tt.b, got, tt.want)
+			}
+		})
+	}
+}
+
+// ============================================================
+// STRONG TEST: error path verified by checking the returned error.
+// Expected grade: A (90–100)
+// ============================================================
+func TestDivide_ByZero(t *testing.T) {
+	_, err := Divide(10, 0) // quotient is discarded; only the error is inspected
+	if err == nil { // assertion: a nil error fails the test immediately
+		t.Fatal("expected an error dividing by zero, got nil")
+	}
+}
+
+// ============================================================
+// WEAK TEST: only checks that no error came back — does not
+// verify the parsed value. Trivial assertion.
+// Expected grade: C (70–79)
+// ============================================================
+func TestParse_NoError(t *testing.T) {
+	_, err := Parse("123") // parsed value is discarded; only the error is inspected
+	if err != nil {
+		t.Errorf("unexpected error: %v", err)
+	}
+}
+
+// ============================================================
+// BAD TEST: calls the function but never asserts anything.
+// Expected grade: F (0–59)
+// ============================================================
+func TestReset_NoAssertions(t *testing.T) {
+	Reset() // nothing is asserted after the call
+}
diff --git a/tests/dotnet-test/grade-tests/fixtures/go-table-driven/go.mod b/tests/dotnet-test/grade-tests/fixtures/go-table-driven/go.mod
new file mode 100644
index 0000000000..766dd12cce
--- /dev/null
+++ b/tests/dotnet-test/grade-tests/fixtures/go-table-driven/go.mod
@@ -0,0 +1,3 @@
+module calc
+
+go 1.22
diff --git a/tests/dotnet-test/grade-tests/fixtures/production-unavailable/Payments.Tests/PaymentGatewayTests.cs b/tests/dotnet-test/grade-tests/fixtures/production-unavailable/Payments.Tests/PaymentGatewayTests.cs
new file mode 100644
index 0000000000..d99a947ced
--- /dev/null
+++ b/tests/dotnet-test/grade-tests/fixtures/production-unavailable/Payments.Tests/PaymentGatewayTests.cs
@@ -0,0 +1,62 @@
+using Microsoft.VisualStudio.TestTools.UnitTesting;
+using Payments.Core; // Production assembly is NOT present in this fixture.
+
+namespace Payments.Tests;
+
+[TestClass]
+public class PaymentGatewayTests
+{
+    // ============================================================
+    // STRONG TEST: AAA structure, equality assertion on the result.
+    // Expected grade: A (90–100)
+    // ============================================================
+    [TestMethod]
+    public void Charge_ValidCard_ReturnsApprovedResult()
+    {
+        var gateway = new PaymentGateway();
+
+        var result = gateway.Charge("4111111111111111", 49.99m);
+
+        Assert.AreEqual(PaymentStatus.Approved, result.Status);
+        Assert.AreEqual(49.99m, result.AmountCharged);
+    }
+
+    // ============================================================
+    // STRONG TEST: exception path is complete on its own.
+    // Expected grade: A (90–100)
+    // ============================================================
+    [TestMethod]
+    public void Charge_NegativeAmount_ThrowsArgumentOutOfRange()
+    {
+        var gateway = new PaymentGateway();
+
+        // ThrowsException needs an explicit exception type; it cannot be inferred from the lambda.
+        Assert.ThrowsException<System.ArgumentOutOfRangeException>(
+            () => gateway.Charge("4111111111111111", -1m));
+    }
+
+    // ============================================================
+    // WEAK TEST: only a not-null check on the returned receipt.
+    // Expected grade: C (70–79)
+    // ============================================================
+    [TestMethod]
+    public void Refund_ExistingCharge_ReturnsReceipt()
+    {
+        var gateway = new PaymentGateway();
+
+        var receipt = gateway.Refund("txn-123");
+
+        Assert.IsNotNull(receipt);
+    }
+
+    // ============================================================
+    // BAD TEST: no assertions at all.
+    // Expected grade: F (0–59)
+    // ============================================================
+    [TestMethod]
+    public void Settle_PendingBatch_Runs()
+    {
+        var gateway = new PaymentGateway();
+        gateway.SettleBatch();
+    }
+}
diff --git a/tests/dotnet-test/grade-tests/fixtures/production-unavailable/Payments.Tests/Payments.Tests.csproj b/tests/dotnet-test/grade-tests/fixtures/production-unavailable/Payments.Tests/Payments.Tests.csproj
new file mode 100644
index 0000000000..71ea7b2df6
--- /dev/null
+++ b/tests/dotnet-test/grade-tests/fixtures/production-unavailable/Payments.Tests/Payments.Tests.csproj
@@ -0,0 +1,19 @@
+
+
+
+    net8.0
+    enable
+    false
+
+
+
+
+
+
+
+
+
+
+
+
+