diff --git a/IMPLEMENTATION_PLAN.md b/IMPLEMENTATION_PLAN.md
index 26fd722..ed753c1 100644
--- a/IMPLEMENTATION_PLAN.md
+++ b/IMPLEMENTATION_PLAN.md
@@ -1085,7 +1085,15 @@ func BenchmarkCancel_Atomic_Done_Parallel(b *testing.B) {
 }
 ```
 
-### 4.2 `internal/queue/queue_bench_test.go`
+### 4.2 Queue Benchmarks: Goroutine Topology
+
+Queue performance varies dramatically based on how goroutines interact. We benchmark three scenarios:
+
+#### 4.2.1 Single Goroutine (Baseline)
+
+Push+pop in the same goroutine—no lock contention:
+
+**File:** `internal/queue/queue_bench_test.go`
 
 ```go
 package queue_test
@@ -1099,6 +1107,7 @@ import (
 var sinkInt int
 var sinkOK bool
 
+// Single goroutine: push+pop in same routine (no contention)
 func BenchmarkQueue_Channel_PushPop_Direct(b *testing.B) {
     q := queue.NewChannel[int](1024)
     b.ReportAllocs()
@@ -1128,38 +1137,250 @@ func BenchmarkQueue_RingBuffer_PushPop_Direct(b *testing.B) {
     sinkInt = val
     sinkOK = ok
 }
+```
+
+**Expected results:**
+
+| Implementation | Latency | Notes |
+|----------------|---------|-------|
+| Channel | ~39 ns | Go channel with no contention |
+| RingBuffer (guarded) | ~36 ns | SPSC guards add overhead |
+| RingBuffer (unguarded) | ~9.5 ns | True lock-free performance |
+
+#### 4.2.2 SPSC: 1 Producer → 1 Consumer (2 Goroutines)
+
+The classic producer/consumer pattern—one goroutine writes, another reads:
+
+**File:** `internal/combined/combined_bench_test.go`
+
+```go
+// BenchmarkPipeline_Channel benchmarks 2-goroutine SPSC with channels.
+func BenchmarkPipeline_Channel(b *testing.B) {
+    q := queue.NewChannel[int](1024)
+    done := make(chan struct{})
+
+    // Consumer goroutine
+    go func() {
+        for {
+            select {
+            case <-done:
+                return
+            default:
+                q.Pop()
+            }
+        }
+    }()
 
-func BenchmarkQueue_Channel_PushPop_Interface(b *testing.B) {
-    var q queue.Queue[int] = queue.NewChannel[int](1024)
     b.ReportAllocs()
     b.ResetTimer()
 
-    var val int
-    var ok bool
+    // Producer (benchmark loop)
     for i := 0; i < b.N; i++ {
-        q.Push(i)
-        val, ok = q.Pop()
+        for !q.Push(i) {
+            // Spin until push succeeds
+        }
     }
-    sinkInt = val
-    sinkOK = ok
+
+    b.StopTimer()
+    close(done)
 }
 
-func BenchmarkQueue_RingBuffer_PushPop_Interface(b *testing.B) {
-    var q queue.Queue[int] = queue.NewRingBuffer[int](1024)
+// BenchmarkPipeline_RingBuffer benchmarks 2-goroutine SPSC with ring buffer.
+func BenchmarkPipeline_RingBuffer(b *testing.B) {
+    q := queue.NewRingBuffer[int](1024)
+    done := make(chan struct{})
+
+    // Consumer goroutine (single consumer - SPSC contract)
+    go func() {
+        for {
+            select {
+            case <-done:
+                return
+            default:
+                q.Pop()
+            }
+        }
+    }()
+
     b.ReportAllocs()
     b.ResetTimer()
 
-    var val int
-    var ok bool
+    // Producer (single producer - SPSC contract)
     for i := 0; i < b.N; i++ {
-        q.Push(i)
-        val, ok = q.Pop()
+        for !q.Push(i) {}
     }
-    sinkInt = val
-    sinkOK = ok
+
+    b.StopTimer()
+    close(done)
+}
+```
+
+**Expected results:**
+
+| Implementation | Latency | Speedup |
+|----------------|---------|---------|
+| Channel | ~128 ns | baseline |
+| RingBuffer (guarded) | ~147 ns | 0.9x (slower due to guards!) |
+| RingBuffer (unguarded) | ~39 ns | **3.3x** |
+
+#### 4.2.3 MPSC: N Producers → 1 Consumer (Channels Only)
+
+Multiple producers sending to one consumer—a very common Go pattern:
+
+```go
+// BenchmarkMPSC_Channel_2Producers benchmarks 2 producers -> 1 consumer.
+func BenchmarkMPSC_Channel_2Producers(b *testing.B) {
+    ch := make(chan int, 1024)
+    done := make(chan struct{})
+    consumerDone := make(chan struct{})
+
+    // Consumer goroutine
+    go func() {
+        defer close(consumerDone)
+        for {
+            select {
+            case <-done:
+                return
+            case <-ch:
+            default:
+            }
+        }
+    }()
+
+    b.ReportAllocs()
+    b.ResetTimer()
+
+    b.RunParallel(func(pb *testing.PB) {
+        i := 0
+        for pb.Next() {
+            for {
+                select {
+                case ch <- i:
+                    goto sent
+                default:
+                }
+            }
+        sent:
+            i++
+        }
+    })
+
+    b.StopTimer()
+    close(done)
+    <-consumerDone
 }
 ```
 
+**Expected results (showing channel lock contention):**
+
+| Producers | Channel Latency | vs SPSC |
+|-----------|-----------------|---------|
+| 1 (SPSC) | ~128 ns | baseline |
+| 2 | ~5.9 µs | 46x slower |
+| 4 | ~26 µs | 200x slower |
+| 8 | ~49 µs | 380x slower |
+
+> **Why this matters:** Channel lock contention scales poorly. For high-throughput MPSC, use go-lock-free-ring.
+
+#### 4.2.4 go-lock-free-ring Comparison
+
+The [go-lock-free-ring](https://github.com/randomizedcoder/go-lock-free-ring) library provides a sharded MPSC ring buffer that dramatically outperforms channels under contention.
+
+**File:** `internal/combined/lockfreering_bench_test.go`
+
+```go
+import ring "github.com/randomizedcoder/go-lock-free-ring"
+
+// BenchmarkLFR_SPSC_ShardedRing1 - go-lock-free-ring with 1 shard
+func BenchmarkLFR_SPSC_ShardedRing1(b *testing.B) {
+    r, _ := ring.NewShardedRing(1024, 1)
+    done := make(chan struct{})
+
+    go func() {
+        for {
+            select {
+            case <-done:
+                return
+            default:
+                r.TryRead()
+            }
+        }
+    }()
+
+    b.ResetTimer()
+    for i := 0; i < b.N; i++ {
+        for !r.Write(0, i) {}
+    }
+    b.StopTimer()
+    close(done)
+}
+
+// BenchmarkLFR_MPSC_ShardedRing_8P_8S - 8 producers, 8 shards
+func BenchmarkLFR_MPSC_ShardedRing_8P_8S(b *testing.B) {
+    r, _ := ring.NewShardedRing(2048, 8)
+    done := make(chan struct{})
+    consumerDone := make(chan struct{})
+
+    go func() {
+        defer close(consumerDone)
+        for {
+            select {
+            case <-done:
+                return
+            default:
+                r.TryRead()
+            }
+        }
+    }()
+
+    var producerID atomic.Uint64
+    b.SetParallelism(8)
+    b.ResetTimer()
+
+    b.RunParallel(func(pb *testing.PB) {
+        pid := producerID.Add(1) - 1
+        i := 0
+        for pb.Next() {
+            for !r.Write(pid, i) {}
+            i++
+        }
+    })
+
+    b.StopTimer()
+    close(done)
+    <-consumerDone
+}
+```
+
+**Comparison Results:**
+
+##### SPSC (1 Producer → 1 Consumer)
+
+| Implementation | Latency | Allocs | Speedup |
+|----------------|---------|--------|---------|
+| Channel | 248 ns | 0 | baseline |
+| go-lock-free-ring (1 shard) | 114 ns | 1 | 2.2x |
+| **Our SPSC Ring (unguarded)** | **36.5 ns** | **0** | **6.8x** |
+
+> For pure SPSC, our simple ring buffer wins due to minimal overhead and zero allocations.
+
+##### MPSC (N Producers → 1 Consumer)
+
+| Producers | Channel | go-lock-free-ring | Speedup |
+|-----------|---------|-------------------|---------|
+| 4 | 35.3 µs | 539 ns | **65x** |
+| 8 | 47.1 µs | 464 ns | **101x** |
+
+> The sharded design of go-lock-free-ring eliminates lock contention, providing **65-100x** speedup.
+
+##### Choosing the Right Queue
+
+| Pattern | Best Choice | Why |
+|---------|-------------|-----|
+| 1 producer, 1 consumer | Our SPSC Ring | Fastest, zero allocs |
+| N producers, 1 consumer | go-lock-free-ring | Sharding eliminates contention |
+| Simple/infrequent | Channel | Simplicity matters more |
+
 ### 4.3 `internal/tick/tick_bench_test.go`
 
 ```go
@@ -1392,79 +1613,30 @@ func BenchmarkCombined_CancelTick_Optimized(b *testing.B) {
 
 > **Why this matters:** Isolated benchmarks often show 10-20x speedups, but real loops have multiple operations. The combined benchmark shows the *actual* end-to-end improvement you'll see in production.
 
-### 4.5 Two-Goroutine SPSC Pipeline Benchmark
+### 4.5 Queue Benchmark Summary
 
-The **most representative** benchmark for real Go systems—a producer/consumer pipeline:
+> **Note:** The 2-goroutine SPSC pipeline and MPSC benchmarks are now documented in section 4.2.2 and 4.2.3 respectively, as part of the comprehensive queue benchmark suite.
 
-```go
-// internal/combined/pipeline_bench_test.go
-package combined_test
+**Key benchmark commands:**
 
-import (
-    "testing"
-
-    "github.com/randomizedcoder/some-go-benchmarks/internal/queue"
-)
-
-func BenchmarkPipeline_Channel(b *testing.B) {
-    q := queue.NewChannel[int](1024)
-    done := make(chan struct{})
-
-    // Consumer
-    go func() {
-        for {
-            select {
-            case <-done:
-                return
-            default:
-                q.Pop()
-            }
-        }
-    }()
-
-    b.ReportAllocs()
-    b.ResetTimer()
-
-    for i := 0; i < b.N; i++ {
-        for !q.Push(i) {
-            // Spin until push succeeds
-        }
-    }
-
-    b.StopTimer()
-    close(done)
-}
-
-func BenchmarkPipeline_RingBuffer(b *testing.B) {
-    q := queue.NewRingBuffer[int](1024)
-    done := make(chan struct{})
+```bash
+# Single-goroutine (baseline)
+go test -bench=BenchmarkQueue -benchmem ./internal/queue
 
-    // Consumer (single goroutine - SPSC contract)
-    go func() {
-        for {
-            select {
-            case <-done:
-                return
-            default:
-                q.Pop()
-            }
-        }
-    }()
+# 2-goroutine SPSC pipeline
+go test -bench=BenchmarkPipeline -benchmem ./internal/combined
 
-    b.ReportAllocs()
-    b.ResetTimer()
+# MPSC (multiple producers)
+go test -bench=BenchmarkMPSC -benchmem ./internal/combined
+```
 
-    // Producer (single goroutine - SPSC contract)
-    for i := 0; i < b.N; i++ {
-        for !q.Push(i) {
-            // Spin until push succeeds
-        }
-    }
+**What these benchmarks reveal:**
 
-    b.StopTimer()
-    close(done)
-}
-```
+| Pattern | Best Use Case |
+|---------|---------------|
+| Single goroutine | Testing raw queue overhead |
+| SPSC (2 goroutines) | Classic producer/consumer pipelines |
+| MPSC (N producers) | Fan-in patterns, worker pools |
 
 ### 4.6 Benchmark Methodology Validation
 
diff --git a/Makefile b/Makefile
index 14a9613..4917f0a 100644
--- a/Makefile
+++ b/Makefile
@@ -11,7 +11,11 @@ build:
 test:
 	go test ./...
 
-# Run benchmarks with memory stats
+# =============================================================================
+# Benchmarks - All
+# =============================================================================
+
+# Run all benchmarks with memory stats
 bench:
 	go test -bench=. -benchmem ./internal/...
 
@@ -26,6 +30,42 @@ bench-variance:
 	@echo ""
 	@echo "Analyze with: benchstat bench_results.txt"
 
+# =============================================================================
+# Benchmarks - By Category
+# =============================================================================
+
+# Cancel benchmarks (context vs atomic)
+bench-cancel:
+	go test -bench=BenchmarkCancel -benchmem ./internal/cancel
+
+# Tick benchmarks (ticker implementations)
+bench-tick:
+	go test -bench=BenchmarkTick -benchmem ./internal/tick
+
+# Queue benchmarks (single goroutine)
+bench-queue:
+	go test -bench=BenchmarkQueue -benchmem ./internal/queue
+
+# Pipeline benchmarks (2-goroutine SPSC)
+bench-pipeline:
+	go test -bench=BenchmarkPipeline -benchmem ./internal/combined
+
+# MPSC benchmarks (multiple producers, channel contention)
+bench-mpsc:
+	go test -bench=BenchmarkMPSC -benchmem ./internal/combined
+
+# go-lock-free-ring comparison benchmarks
+bench-lfr:
+	go test -bench=BenchmarkLFR -benchmem ./internal/combined
+
+# Combined loop benchmarks (cancel + tick + queue)
+bench-combined:
+	go test -bench=BenchmarkCombined -benchmem ./internal/combined
+
+# =============================================================================
+# Testing & Quality
+# =============================================================================
+
 # Run tests with race detector
 race:
 	go test -race ./...
@@ -48,16 +88,34 @@ clean:
 check: build test race
 	@echo "All checks passed!"
 
+# =============================================================================
 # Help
+# =============================================================================
+
 help:
 	@echo "Available targets:"
-	@echo "  build         - Build all packages"
-	@echo "  test          - Run all tests"
-	@echo "  bench         - Run benchmarks with memory stats"
-	@echo "  bench-count   - Run benchmarks 10 times"
-	@echo "  bench-variance- Run benchmarks and save for benchstat"
-	@echo "  race          - Run tests with race detector"
-	@echo "  lint          - Run golangci-lint"
-	@echo "  bench-race    - Run benchmarks with race detector"
-	@echo "  clean         - Remove generated files"
-	@echo "  check         - Run build, test, and race"
+	@echo ""
+	@echo "Build & Test:"
+	@echo "  build          - Build all packages"
+	@echo "  test           - Run all tests"
+	@echo "  race           - Run tests with race detector"
+	@echo "  lint           - Run golangci-lint"
+	@echo "  check          - Run build, test, and race"
+	@echo ""
+	@echo "All Benchmarks:"
+	@echo "  bench          - Run all benchmarks with memory stats"
+	@echo "  bench-count    - Run benchmarks 10 times (for variance)"
+	@echo "  bench-variance - Run benchmarks and save for benchstat"
+	@echo "  bench-race     - Run benchmarks with race detector"
+	@echo ""
+	@echo "Category Benchmarks:"
+	@echo "  bench-cancel   - Cancel check: context vs atomic"
+	@echo "  bench-tick     - Tick check: ticker implementations"
+	@echo "  bench-queue    - Queue: single goroutine push+pop"
+	@echo "  bench-pipeline - Pipeline: 2-goroutine SPSC producer/consumer"
+	@echo "  bench-mpsc     - MPSC: N producers -> 1 consumer (channel contention)"
+	@echo "  bench-lfr      - go-lock-free-ring comparison (SPSC vs MPSC)"
+	@echo "  bench-combined - Combined loop: cancel + tick + queue"
+	@echo ""
+	@echo "Cleanup:"
+	@echo "  clean          - Remove generated files"
diff --git a/README.md b/README.md
index 5fcb5fe..0c0f8e8 100644
--- a/README.md
+++ b/README.md
@@ -16,7 +16,27 @@ Measured on AMD Ryzen Threadripper PRO 3945WX, Go 1.25, Linux:
 |-----------|----------|-----------|---------|
 | Cancel check | 8.2 ns | 0.36 ns | **23x** |
 | Tick check | 86 ns | 5.6 ns | **15x** |
-| Queue push+pop | 37 ns | 36 ns | ~1x |
+
+### Queue Patterns: SPSC vs MPSC
+
+Queue performance depends heavily on your goroutine topology:
+
+**SPSC (1 Producer → 1 Consumer):**
+
+| Implementation | Latency | Speedup |
+|----------------|---------|---------|
+| Channel | 248 ns | baseline |
+| [go-lock-free-ring](https://github.com/randomizedcoder/go-lock-free-ring) (1 shard) | 114 ns | 2.2x |
+| Our SPSC Ring (unguarded) | **36.5 ns** | **6.8x** |
+
+**MPSC (Multiple Producers → 1 Consumer):**
+
+| Producers | Channel | go-lock-free-ring | Speedup |
+|-----------|---------|-------------------|---------|
+| 4 | 35 µs | 539 ns | **65x** |
+| 8 | 47 µs | 464 ns | **101x** |
+
+> **Key insight:** Channels scale terribly with multiple producers due to lock contention. For MPSC patterns, [go-lock-free-ring](https://github.com/randomizedcoder/go-lock-free-ring) provides **65-100x** speedup through sharded lock-free design.
 
 ### Combined Hot-Loop Pattern
 
@@ -141,14 +161,168 @@ Measure the raw cost of individual operations:
 
 > **Why combined matters:** Isolated benchmarks can be misleading. A 10x speedup on context checking means nothing if your loop is bottlenecked on channel receives. The combined benchmarks reveal the *actual* improvement in realistic scenarios.
 
-## High-Performance Alternatives
+### Queue Benchmarks: Goroutine Patterns
+
+Queue performance varies dramatically based on goroutine topology. We benchmark three implementations:
+
+| Implementation | Type | Best For |
+|----------------|------|----------|
+| Go Channel | MPSC | Simple code, moderate throughput |
+| Our SPSC Ring | SPSC | Maximum SPSC performance, zero allocs |
+| [go-lock-free-ring](https://github.com/randomizedcoder/go-lock-free-ring) | MPSC | High-throughput multi-producer scenarios |
+
+#### SPSC: 1 Producer → 1 Consumer
+
+**Cross-goroutine polling** (our benchmark - separate producer/consumer goroutines):
+
+| Implementation | Latency | Allocs | Speedup |
+|----------------|---------|--------|---------|
+| Channel | 248 ns | 0 | baseline |
+| go-lock-free-ring (1 shard) | 114 ns | 1 | 2.2x |
+| **Our SPSC Ring (unguarded)** | **36.5 ns** | **0** | **6.8x** |
+
+**Same-goroutine** (go-lock-free-ring native benchmarks):
+
+| Benchmark | Latency | Notes |
+|-----------|---------|-------|
+| `BenchmarkWrite` | 35 ns | Single write operation |
+| `BenchmarkTryRead` | 31 ns | Single read operation |
+| `BenchmarkProducerConsumer` | 31 ns | Write + periodic drain in same goroutine |
+| `BenchmarkConcurrentWrite` (8 producers) | 10.7 ns | Parallel writes, sharded |
+
+> **Note:** Cross-goroutine coordination adds ~80ns overhead. For batched same-goroutine patterns, go-lock-free-ring achieves 31 ns/op.
+
+#### MPSC: N Producers → 1 Consumer
+
+This is where [go-lock-free-ring](https://github.com/randomizedcoder/go-lock-free-ring) shines:
+
+| Producers | Channel | go-lock-free-ring | Speedup |
+|-----------|---------|-------------------|---------|
+| 4 | 35.3 µs | 539 ns | **65x** |
+| 8 | 47.1 µs | 464 ns | **101x** |
+
+> **Key insight:** Channel lock contention scales terribly. With 8 producers, go-lock-free-ring is **101x faster** due to its sharded design.
+
+#### Choosing the Right Queue
+
+| Your Pattern | Recommendation | Why |
+|--------------|----------------|-----|
+| 1 producer, 1 consumer | Our SPSC Ring | Fastest, zero allocs |
+| N producers, 1 consumer | go-lock-free-ring | Sharding eliminates contention |
+| Simple/infrequent | Channel | Simplicity, good enough |
+
+#### Why Our SPSC Ring is Faster in Cross-Goroutine Tests
+
+For SPSC scenarios with **separate producer/consumer goroutines**, our simple ring (36.5 ns) beats go-lock-free-ring (114 ns).
+
+> **Important:** go-lock-free-ring's native benchmarks show ~31 ns/op for producer-consumer, but that's in the **same goroutine**. Our 114 ns measurement is for **cross-goroutine** polling, which adds coordination overhead. Both measurements are valid for their respective patterns.
+
+Here's why our ring is faster in cross-goroutine scenarios:
+
+**1. CAS vs Simple Store**
+
+go-lock-free-ring must use Compare-And-Swap to safely handle multiple producers:
+
+```go
+// go-lock-free-ring: CAS to claim slot (expensive!)
+if !atomic.CompareAndSwapUint64(&s.writePos, pos, pos+1) {
+    continue  // Retry if another producer won
+}
+```
+
+Our SPSC ring just does a simple atomic store:
+
+```go
+// Our SPSC: simple store (fast!)
+r.head.Store(head + 1)
+```
+
+CAS is **3-10x more expensive** than a simple store because it must read, compare, and conditionally write while handling cache invalidation across cores.
+
+**2. Sequence Numbers for Race Protection**
+
+go-lock-free-ring uses per-slot sequence numbers to prevent a consumer from reading partially-written data:
+
+```go
+// go-lock-free-ring: extra atomic ops for safety
+seq := atomic.LoadUint64(&sl.seq)      // Check slot ready
+if seq != pos { return false }
+// ... write value ...
+atomic.StoreUint64(&sl.seq, pos+1)     // Signal to reader
+```
+
+Our SPSC ring skips this because we **trust** only one producer exists.
+
+**3. Boxing Allocations**
+
+```go
+// go-lock-free-ring uses 'any' → 8 B allocation per write
+sl.value = value
+
+// Our ring uses generics → zero allocations
+r.buf[head&r.mask] = v
+```
+
+**What We Give Up:**
 
+| Safety Feature | Our SPSC Ring | go-lock-free-ring |
+|----------------|---------------|-------------------|
+| Multiple producers | ❌ Undefined behavior | ✅ Safe |
+| Race protection | ❌ Trust-based | ✅ Sequence numbers |
+| Weak memory (ARM) | ⚠️ May need barriers | ✅ Proven safe |
 
-### Lock-Free Ring Buffer
+> **Bottom line:** Our SPSC ring is faster because it makes **dangerous assumptions** (single producer, x86 memory model). go-lock-free-ring is slower because it's **provably safe** for MPSC with explicit race protection. Use go-lock-free-ring for production multi-producer scenarios.
 
-In place of standard channels, we evaluate lock-free ring buffers for lower-latency communication between goroutines.
+#### Why Our Guarded RingBuffer is Slow
 
-→ [github.com/randomizedcoder/go-lock-free-ring](https://github.com/randomizedcoder/go-lock-free-ring)
+The in-repo `RingBuffer` includes debug guards that add ~25ns overhead:
+
+```go
+func (r *RingBuffer[T]) Push(v T) bool {
+    if !r.pushActive.CompareAndSwap(0, 1) { // +10-15ns
+        panic("concurrent Push")
+    }
+    defer r.pushActive.Store(0)             // +10-15ns
+    // ...
+}
+```
+
+**For production**: Use the unguarded version or [go-lock-free-ring](https://github.com/randomizedcoder/go-lock-free-ring).
+
+## High-Performance Alternatives
+
+### Lock-Free Ring Buffers
+
+We provide two lock-free queue implementations with different safety/performance tradeoffs:
+
+**1. Our SPSC Ring Buffer** (`internal/queue/ringbuf.go`)
+- Single-Producer, Single-Consumer only
+- Generics-based (`[T any]`) — zero boxing allocations
+- Simple atomic Load/Store (no CAS) — maximum speed
+- Debug guards catch contract violations (disable for production)
+- ⚠️ **No race protection** — trusts caller to maintain SPSC contract
+- ⚠️ **x86 optimized** — may need memory barriers on ARM
+- **Best for:** Dedicated producer/consumer goroutine pairs where you control both ends
+
+**2. [go-lock-free-ring](https://github.com/randomizedcoder/go-lock-free-ring)** (external library)
+- Multi-Producer, Single-Consumer (MPSC)
+- Sharded design reduces contention across producers
+- Uses CAS + sequence numbers for **proven race-free operation**
+- Uses `any` type (causes boxing allocations)
+- Configurable retry strategies for different load patterns
+- ✅ **Production-tested** at 2300+ Mb/s throughput
+- **Best for:** Fan-in patterns, worker pools, high-throughput pipelines
+
+| Feature | Our SPSC Ring | go-lock-free-ring |
+|---------|---------------|-------------------|
+| Producers | 1 only | Multiple |
+| Consumers | 1 only | 1 only |
+| Allocations | 0 | 1+ (boxing) |
+| SPSC latency | **36.5 ns** | 114 ns |
+| 8-producer latency | N/A | **464 ns** |
+| Race protection | ❌ None | ✅ Sequence numbers |
+| Write mechanism | Store | CAS + retry |
+| Production ready | ⚠️ SPSC only | ✅ Battle-tested |
 
 ### Atomic Flags for Cancellation
 
@@ -210,27 +384,55 @@ The Go runtime has an internal function `nanotime()` that returns a monotonic cl
 func nanotime() int64
 ```
 
-## Repo layout
+## Repo Layout
 
-The project layout is:
 ```
-[das@l:~/Downloads/some-go-benchmarks]$ tree
 .
-├── cmd
-│   ├── channel
-│   ├── context
-│   ├── context-ticker
-│   └── ticker
-├── internal
-├── LICENSE
-└── README.md
-
-7 directories, 2 files
+├── cmd/                        # CLI tools for interactive benchmarking
+│   ├── channel/main.go         # Queue comparison demo
+│   ├── context/main.go         # Cancel check comparison demo
+│   ├── context-ticker/main.go  # Combined benchmark demo
+│   └── ticker/main.go          # Tick check comparison demo
+│
+├── internal/
+│   ├── cancel/                 # Cancellation signaling
+│   │   ├── cancel.go           # Canceler interface
+│   │   ├── context.go          # Standard: ctx.Done() via select
+│   │   ├── atomic.go           # Optimized: atomic.Bool
+│   │   └── *_test.go           # Unit + benchmark tests
+│   │
+│   ├── queue/                  # SPSC message passing
+│   │   ├── queue.go            # Queue[T] interface
+│   │   ├── channel.go          # Standard: buffered channel
+│   │   ├── ringbuf.go          # Optimized: lock-free ring buffer
+│   │   └── *_test.go           # Unit + benchmark + contract tests
+│   │
+│   ├── tick/                   # Periodic triggers
+│   │   ├── tick.go             # Ticker interface
+│   │   ├── ticker.go           # Standard: time.Ticker
+│   │   ├── batch.go            # Optimized: check every N ops
+│   │   ├── atomic.go           # Optimized: runtime.nanotime
+│   │   ├── tsc_amd64.go/.s     # Optimized: raw RDTSC (x86 only)
+│   │   ├── tsc_stub.go         # Stub for non-x86 architectures
+│   │   └── *_test.go           # Unit + benchmark tests
+│   │
+│   └── combined/               # Interaction benchmarks
+│       └── combined_bench_test.go
+│
+├── .github/workflows/ci.yml    # CI: multi-version, multi-platform
+├── Makefile                    # Build targets
+├── README.md                   # This file
+├── WALKTHROUGH.md              # Guided tutorial with example output
+├── BENCHMARKING.md             # Environment setup & methodology
+├── IMPLEMENTATION_PLAN.md      # Design document
+└── IMPLEMENTATION_LOG.md       # Development log
 ```
 
-The internal folder is for small library functions that holds the main code.
+**Key directories:**
 
-The ./cmd/ folder has a main.go implmentations that use the libraries, to demostrate limits.
+- `internal/` — Core library implementations (standard vs optimized)
+- `cmd/` — CLI tools that demonstrate the libraries with human-readable output
+- `.github/workflows/` — CI testing across Go 1.21-1.23, Linux/macOS
 
 ## How to Run
 
diff --git a/WALKTHROUGH.md b/WALKTHROUGH.md
index 98ca43e..fdd0dce 100644
--- a/WALKTHROUGH.md
+++ b/WALKTHROUGH.md
@@ -33,6 +33,57 @@ ok      github.com/randomizedcoder/some-go-benchmarks/internal/queue    0.004s
 ok      github.com/randomizedcoder/some-go-benchmarks/internal/tick     0.735s
 ```
 
+### Using the Makefile
+
+The project includes a Makefile with convenient targets:
+
+```bash
+$ make help
+```
+
+**Output:**
+
+```
+Available targets:
+
+Build & Test:
+  build          - Build all packages
+  test           - Run all tests
+  race           - Run tests with race detector
+  lint           - Run golangci-lint
+  check          - Run build, test, and race
+
+All Benchmarks:
+  bench          - Run all benchmarks with memory stats
+  bench-count    - Run benchmarks 10 times (for variance)
+  bench-variance - Run benchmarks and save for benchstat
+  bench-race     - Run benchmarks with race detector
+
+Category Benchmarks:
+  bench-cancel   - Cancel check: context vs atomic
+  bench-tick     - Tick check: ticker implementations
+  bench-queue    - Queue: single goroutine push+pop
+  bench-pipeline - Pipeline: 2-goroutine SPSC producer/consumer
+  bench-mpsc     - MPSC: N producers -> 1 consumer (channel contention)
+  bench-lfr      - go-lock-free-ring comparison (SPSC vs MPSC)
+  bench-combined - Combined loop: cancel + tick + queue
+
+Cleanup:
+  clean          - Remove generated files
+```
+
+**Quick Start:**
+
+```bash
+# Run all benchmarks
+$ make bench
+
+# Run specific category
+$ make bench-lfr      # go-lock-free-ring comparison
+$ make bench-mpsc     # Channel contention with multiple producers
+$ make bench-pipeline # 2-goroutine producer/consumer
+```
+
 ---
 
 ## Step 2: Run Basic Benchmarks
@@ -120,6 +171,146 @@ BenchmarkCombined_FullLoop_Optimized-24         19513278                62.86 ns
 
 ---
 
+### Queue Benchmarks: Goroutine Patterns
+
+Queue performance varies dramatically based on goroutine topology.
+
+#### Single Goroutine (No Contention)
+
+```bash
+$ go test -bench=BenchmarkQueue -benchmem ./internal/queue
+```
+
+**Output:**
+
+```
+goos: linux
+goarch: amd64
+pkg: github.com/randomizedcoder/some-go-benchmarks/internal/queue
+cpu: AMD Ryzen Threadripper PRO 3945WX 12-Cores
+BenchmarkQueue_Channel_PushPop_Direct-24           30932498                38.96 ns/op            0 B/op          0 allocs/op
+BenchmarkQueue_RingBuffer_PushPop_Direct-24        32920832                35.89 ns/op            0 B/op          0 allocs/op
+BenchmarkQueue_Channel_PushPop_Interface-24        27947314                43.26 ns/op            0 B/op          0 allocs/op
+BenchmarkQueue_RingBuffer_PushPop_Interface-24     30313048                40.37 ns/op            0 B/op          0 allocs/op
+```
+
+> **Note:** The in-repo RingBuffer has SPSC guards that add overhead. An unguarded ring buffer achieves ~9.5 ns/op.
+
+#### SPSC: 1 Producer → 1 Consumer (2 Goroutines)
+
+This is the classic producer/consumer pattern—the most common Go concurrency pattern:
+
+```bash
+$ go test -bench=BenchmarkPipeline -benchmem ./internal/combined
+```
+
+**Output:**
+
+```
+goos: linux
+goarch: amd64
+pkg: github.com/randomizedcoder/some-go-benchmarks/internal/combined
+cpu: AMD Ryzen Threadripper PRO 3945WX 12-Cores
+BenchmarkPipeline_Channel-24             7858700               127.9 ns/op            0 B/op           0 allocs/op
+BenchmarkPipeline_RingBuffer-24         11740012               146.8 ns/op            0 B/op           0 allocs/op
+```
+
+> **Key insight:** The guarded RingBuffer is *slower* than channels due to SPSC guard overhead. An unguarded lock-free ring buffer achieves ~39 ns/op (**3.3x faster** than channels).
+
+#### MPSC: N Producers → 1 Consumer (Channel Lock Contention)
+
+Multiple goroutines sending to one consumer shows channel lock contention:
+
+```bash
+$ go test -bench=BenchmarkMPSC -benchmem ./internal/combined
+```
+
+**Output:**
+
+```
+goos: linux
+goarch: amd64
+pkg: github.com/randomizedcoder/some-go-benchmarks/internal/combined
+cpu: AMD Ryzen Threadripper PRO 3945WX 12-Cores
+BenchmarkMPSC_Channel_2Producers-24       180842              5922 ns/op              0 B/op           0 allocs/op
+BenchmarkMPSC_Channel_4Producers-24       119090             26351 ns/op              0 B/op           0 allocs/op
+BenchmarkMPSC_Channel_8Producers-24       171520             49074 ns/op              0 B/op           0 allocs/op
+```
+
+**Lock contention scaling:**
+
+| Producers | Latency | vs 1 Producer |
+|-----------|---------|---------------|
+| 1 (SPSC) | 128 ns | baseline |
+| 2 | 5.9 µs | **46x** slower |
+| 4 | 26 µs | **200x** slower |
+| 8 | 49 µs | **380x** slower |
+
+> **Key insight:** Channel lock contention scales poorly. For high-throughput fan-in patterns, use go-lock-free-ring.
+
+#### go-lock-free-ring Comparison
+
+The [go-lock-free-ring](https://github.com/randomizedcoder/go-lock-free-ring) library provides a sharded MPSC ring buffer. Let's compare:
+
+```bash
+$ go test -bench=BenchmarkLFR -benchmem ./internal/combined
+```
+
+**Output:**
+
+```
+goos: linux
+goarch: amd64
+pkg: github.com/randomizedcoder/some-go-benchmarks/internal/combined
+cpu: AMD Ryzen Threadripper PRO 3945WX 12-Cores
+BenchmarkLFR_SPSC_Channel-24                     4372419               247.8 ns/op             0 B/op          0 allocs/op
+BenchmarkLFR_SPSC_OurRing-24                    33405556                36.53 ns/op            0 B/op          0 allocs/op
+BenchmarkLFR_SPSC_ShardedRing1-24               10212240               114.1 ns/op             8 B/op          1 allocs/op
+BenchmarkLFR_MPSC_Channel_4P-24                    85386             35337 ns/op               0 B/op          0 allocs/op
+BenchmarkLFR_MPSC_ShardedRing_4P_4S-24           2179492               539.4 ns/op           412 B/op         51 allocs/op
+BenchmarkLFR_MPSC_Channel_8P-24                    58347             47067 ns/op               1 B/op          0 allocs/op
+BenchmarkLFR_MPSC_ShardedRing_8P_8S-24           2596642               464.0 ns/op           412 B/op         51 allocs/op
+```
+
+**SPSC Comparison (1 Producer → 1 Consumer):**
+
+These are **cross-goroutine** benchmarks (separate producer/consumer goroutines):
+
+| Implementation | Latency | Allocs | Speedup |
+|----------------|---------|--------|---------|
+| Channel | 248 ns | 0 | baseline |
+| go-lock-free-ring (1 shard) | 114 ns | 1 | 2.2x |
+| **Our SPSC Ring** | **36.5 ns** | **0** | **6.8x** |
+
+> **Note:** go-lock-free-ring's native `BenchmarkProducerConsumer` shows 31 ns/op, but that's in the **same goroutine**. Cross-goroutine polling adds ~80ns coordination overhead.
+
+**Why Our SPSC Ring is Faster in Cross-Goroutine Tests:**
+
+1. **CAS vs Store**: go-lock-free-ring uses `CompareAndSwap` to safely handle multiple producers (3-10x slower than simple Store)
+2. **Sequence numbers**: go-lock-free-ring uses per-slot sequence numbers to prevent race conditions (extra atomic ops)
+3. **Boxing**: go-lock-free-ring uses `any` type causing allocations; our ring uses generics (zero allocs)
+
+> **Tradeoff**: Our ring is faster because it makes dangerous assumptions (single producer, x86 memory model). go-lock-free-ring is slower because it's provably race-free.
+
+**MPSC Comparison (N Producers → 1 Consumer):**
+
+| Producers | Channel | go-lock-free-ring | Speedup |
+|-----------|---------|-------------------|---------|
+| 4 | 35.3 µs | 539 ns | **65x** |
+| 8 | 47.1 µs | 464 ns | **101x** |
+
+> **Key insight:** go-lock-free-ring's sharded design eliminates lock contention, providing **65-101x** speedup over channels for multi-producer scenarios!
+
+**Choosing the Right Queue:**
+
+| Your Pattern | Best Choice | Why |
+|--------------|-------------|-----|
+| 1 producer, 1 consumer | Our SPSC Ring | Fastest (36.5 ns), zero allocs |
+| N producers, 1 consumer | go-lock-free-ring | Sharding eliminates contention |
+| Simple/infrequent | Channel | Simplicity over speed |
+
+---
+
 ## Step 3: Use CLI Tools
 
 The CLI tools provide easier-to-read output with throughput analysis.
@@ -275,6 +466,8 @@ $ sudo nice -n -20 taskset -c 0 GOMAXPROCS=1 go test -bench=. ./internal/cancel
 |-----------|----------|-----------|---------|
 | Cancel check | 8.2 ns | 0.36 ns | **23x** |
 | Tick check | 86 ns | 5.6 ns (batch) | **15x** |
+| Queue SPSC (2 goroutines) | 248 ns | 36.5 ns | **6.8x** |
+| Queue MPSC (8 producers) | 47 µs | 464 ns | **101x** |
 | Combined loop | 130 ns | 63 ns | **2.1x** |
 
 ### When Do These Optimizations Matter?
diff --git a/go.mod b/go.mod
index 305fa46..46a8db5 100644
--- a/go.mod
+++ b/go.mod
@@ -1,3 +1,5 @@
 module github.com/randomizedcoder/some-go-benchmarks
 
-go 1.25
+go 1.25.4
+
+require github.com/randomizedcoder/go-lock-free-ring v1.0.4 // indirect
diff --git a/go.sum b/go.sum
new file mode 100644
index 0000000..89baee8
--- /dev/null
+++ b/go.sum
@@ -0,0 +1,2 @@
+github.com/randomizedcoder/go-lock-free-ring v1.0.4 h1:BmhAuW2L9SER/f0NMYZ/XppBooF8dw2Hko6zw7wutzs=
+github.com/randomizedcoder/go-lock-free-ring v1.0.4/go.mod h1:Vlxt5+13n/4mqwbHrYJF20R5RcyYumTXIMiSEL5POSk=
diff --git a/internal/combined/combined_bench_test.go b/internal/combined/combined_bench_test.go
index af5be9b..9f186a2 100644
--- a/internal/combined/combined_bench_test.go
+++ b/internal/combined/combined_bench_test.go
@@ -177,3 +177,155 @@ func BenchmarkPipeline_RingBuffer(b *testing.B) {
 	b.StopTimer()
 	close(done)
 }
+
+// ============================================================================
+// MPSC benchmarks (Multiple Producer, Single Consumer)
+// ============================================================================
+// This is a very common Go pattern: multiple goroutines sending to one consumer.
+// Channels naturally support this. Lock-free MPSC queues are more complex
+// and require different data structures than SPSC.
+
+// BenchmarkMPSC_Channel_2Producers benchmarks 2 producers -> 1 consumer.
+func BenchmarkMPSC_Channel_2Producers(b *testing.B) {
+	ch := make(chan int, 1024)
+	done := make(chan struct{})
+	var consumerDone chan struct{}
+
+	// Consumer goroutine
+	consumerDone = make(chan struct{})
+	go func() {
+		defer close(consumerDone)
+		for {
+			select {
+			case <-done:
+				return
+			case <-ch:
+				// consume
+			default:
+				// non-blocking
+			}
+		}
+	}()
+
+	b.ReportAllocs()
+	b.ResetTimer()
+
+	// Run producers in parallel using b.RunParallel
+	b.RunParallel(func(pb *testing.PB) {
+		i := 0
+		for pb.Next() {
+			select {
+			case ch <- i:
+			default:
+				// Channel full, spin
+				for {
+					select {
+					case ch <- i:
+						goto sent
+					default:
+					}
+				}
+			sent:
+			}
+			i++
+		}
+	})
+
+	b.StopTimer()
+	close(done)
+	<-consumerDone
+}
+
+// BenchmarkMPSC_Channel_4Producers benchmarks 4 producers -> 1 consumer.
+func BenchmarkMPSC_Channel_4Producers(b *testing.B) {
+	ch := make(chan int, 1024)
+	done := make(chan struct{})
+	var consumerDone chan struct{}
+
+	// Consumer goroutine
+	consumerDone = make(chan struct{})
+	go func() {
+		defer close(consumerDone)
+		for {
+			select {
+			case <-done:
+				return
+			case <-ch:
+				// consume
+			default:
+				// non-blocking
+			}
+		}
+	}()
+
+	// Set GOMAXPROCS for consistent producer count
+	b.SetParallelism(4)
+	b.ReportAllocs()
+	b.ResetTimer()
+
+	b.RunParallel(func(pb *testing.PB) {
+		i := 0
+		for pb.Next() {
+			for {
+				select {
+				case ch <- i:
+					goto sent
+				default:
+				}
+			}
+		sent:
+			i++
+		}
+	})
+
+	b.StopTimer()
+	close(done)
+	<-consumerDone
+}
+
+// BenchmarkMPSC_Channel_8Producers benchmarks 8 producers -> 1 consumer.
+// This stresses channel lock contention heavily.
+func BenchmarkMPSC_Channel_8Producers(b *testing.B) {
+	ch := make(chan int, 1024)
+	done := make(chan struct{})
+	var consumerDone chan struct{}
+
+	// Consumer goroutine
+	consumerDone = make(chan struct{})
+	go func() {
+		defer close(consumerDone)
+		for {
+			select {
+			case <-done:
+				return
+			case <-ch:
+				// consume
+			default:
+				// non-blocking
+			}
+		}
+	}()
+
+	b.SetParallelism(8)
+	b.ReportAllocs()
+	b.ResetTimer()
+
+	b.RunParallel(func(pb *testing.PB) {
+		i := 0
+		for pb.Next() {
+			for {
+				select {
+				case ch <- i:
+					goto sent
+				default:
+				}
+			}
+		sent:
+			i++
+		}
+	})
+
+	b.StopTimer()
+	close(done)
+	<-consumerDone
+}
diff --git a/internal/combined/lockfreering_bench_test.go b/internal/combined/lockfreering_bench_test.go
new file mode 100644
index 0000000..4e65ebf
--- /dev/null
+++ b/internal/combined/lockfreering_bench_test.go
@@ -0,0 +1,304 @@
+package combined_test
+
+import (
+	"sync/atomic"
+	"testing"
+
+	ring "github.com/randomizedcoder/go-lock-free-ring"
+)
+
+// ============================================================================
+// Comparison Benchmarks: Channel vs Our SPSC vs go-lock-free-ring (MPSC)
+// ============================================================================
+//
+// KEY DIFFERENCE:
+// - Our RingBuffer: SPSC (Single-Producer, Single-Consumer)
+// - go-lock-free-ring: MPSC (Multi-Producer, Single-Consumer) with sharding
+//
+// The sharded MPSC design is optimized for multiple producers, not single.
+
+var sinkAny any
+var sinkOkLfr bool
+
+// ============================================================================
+// SPSC: 1 Producer → 1 Consumer (comparing apples to apples)
+// ============================================================================
+
+// Our unguarded SPSC ring buffer (for fair comparison)
+type spscRing struct {
+	buf  []int
+	mask uint64
+	head atomic.Uint64
+	tail atomic.Uint64
+}
+
+func newSPSCRing(size int) *spscRing {
+	n := uint64(1)
+	for n < uint64(size) {
+		n <<= 1
+	}
+	return &spscRing{buf: make([]int, n), mask: n - 1}
+}
+
+func (r *spscRing) Push(v int) bool {
+	head := r.head.Load()
+	tail := r.tail.Load()
+	if head-tail >= uint64(len(r.buf)) {
+		return false
+	}
+	r.buf[head&r.mask] = v
+	r.head.Store(head + 1)
+	return true
+}
+
+func (r *spscRing) Pop() (int, bool) {
+	tail := r.tail.Load()
+	head := r.head.Load()
+	if tail >= head {
+		return 0, false
+	}
+	v := r.buf[tail&r.mask]
+	r.tail.Store(tail + 1)
+	return v, true
+}
+
+// BenchmarkLFR_SPSC_Channel - baseline channel
+func BenchmarkLFR_SPSC_Channel(b *testing.B) {
+	ch := make(chan int, 1024)
+	done := make(chan struct{})
+
+	go func() {
+		for {
+			select {
+			case <-done:
+				return
+			case <-ch:
+			default:
+			}
+		}
+	}()
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		for {
+			select {
+			case ch <- i:
+				goto sent
+			default:
+			}
+		}
+	sent:
+	}
+	b.StopTimer()
+	close(done)
+}
+
+// BenchmarkLFR_SPSC_OurRing - our unguarded SPSC
+func BenchmarkLFR_SPSC_OurRing(b *testing.B) {
+	q := newSPSCRing(1024)
+	done := make(chan struct{})
+
+	go func() {
+		for {
+			select {
+			case <-done:
+				return
+			default:
+				q.Pop()
+			}
+		}
+	}()
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		for !q.Push(i) {
+		}
+	}
+	b.StopTimer()
+	close(done)
+}
+
+// BenchmarkLFR_SPSC_ShardedRing1 - go-lock-free-ring with 1 shard (SPSC-like)
+func BenchmarkLFR_SPSC_ShardedRing1(b *testing.B) {
+	r, _ := ring.NewShardedRing(1024, 1)
+	done := make(chan struct{})
+
+	go func() {
+		for {
+			select {
+			case <-done:
+				return
+			default:
+				r.TryRead()
+			}
+		}
+	}()
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		for !r.Write(0, i) {
+		}
+	}
+	b.StopTimer()
+	close(done)
+}
+
+// ============================================================================
+// MPSC: N Producers → 1 Consumer (where go-lock-free-ring shines)
+// ============================================================================
+
+// BenchmarkLFR_MPSC_Channel_4P - 4 producers using channel
+func BenchmarkLFR_MPSC_Channel_4P(b *testing.B) {
+	ch := make(chan int, 1024)
+	done := make(chan struct{})
+	consumerDone := make(chan struct{})
+
+	go func() {
+		defer close(consumerDone)
+		for {
+			select {
+			case <-done:
+				return
+			case <-ch:
+			default:
+			}
+		}
+	}()
+
+	b.SetParallelism(4)
+	b.ResetTimer()
+
+	b.RunParallel(func(pb *testing.PB) {
+		i := 0
+		for pb.Next() {
+			for {
+				select {
+				case ch <- i:
+					goto sent
+				default:
+				}
+			}
+		sent:
+			i++
+		}
+	})
+
+	b.StopTimer()
+	close(done)
+	<-consumerDone
+}
+
+// BenchmarkLFR_MPSC_ShardedRing_4P_4S - 4 producers, 4 shards
+func BenchmarkLFR_MPSC_ShardedRing_4P_4S(b *testing.B) {
+	r, _ := ring.NewShardedRing(1024, 4)
+	done := make(chan struct{})
+	consumerDone := make(chan struct{})
+
+	go func() {
+		defer close(consumerDone)
+		for {
+			select {
+			case <-done:
+				return
+			default:
+				r.TryRead()
+			}
+		}
+	}()
+
+	var producerID atomic.Uint64
+	b.SetParallelism(4)
+	b.ResetTimer()
+
+	b.RunParallel(func(pb *testing.PB) {
+		pid := producerID.Add(1) - 1
+		i := 0
+		for pb.Next() {
+			for !r.Write(pid, i) {
+			}
+			i++
+		}
+	})
+
+	b.StopTimer()
+	close(done)
+	<-consumerDone
+}
+
+// BenchmarkLFR_MPSC_Channel_8P - 8 producers using channel
+func BenchmarkLFR_MPSC_Channel_8P(b *testing.B) {
+	ch := make(chan int, 1024)
+	done := make(chan struct{})
+	consumerDone := make(chan struct{})
+
+	go func() {
+		defer close(consumerDone)
+		for {
+			select {
+			case <-done:
+				return
+			case <-ch:
+			default:
+			}
+		}
+	}()
+
+	b.SetParallelism(8)
+	b.ResetTimer()
+
+	b.RunParallel(func(pb *testing.PB) {
+		i := 0
+		for pb.Next() {
+			for {
+				select {
+				case ch <- i:
+					goto sent
+				default:
+				}
+			}
+		sent:
+			i++
+		}
+	})
+
+	b.StopTimer()
+	close(done)
+	<-consumerDone
+}
+
+// BenchmarkLFR_MPSC_ShardedRing_8P_8S - 8 producers, 8 shards
+func BenchmarkLFR_MPSC_ShardedRing_8P_8S(b *testing.B) {
+	r, _ := ring.NewShardedRing(2048, 8) // Larger capacity for 8 producers
+	done := make(chan struct{})
+	consumerDone := make(chan struct{})
+
+	go func() {
+		defer close(consumerDone)
+		for {
+			select {
+			case <-done:
+				return
+			default:
+				r.TryRead()
+			}
+		}
+	}()
+
+	var producerID atomic.Uint64
+	b.SetParallelism(8)
+	b.ResetTimer()
+
+	b.RunParallel(func(pb *testing.PB) {
+		pid := producerID.Add(1) - 1
+		i := 0
+		for pb.Next() {
+			for !r.Write(pid, i) {
+			}
+			i++
+		}
+	})
+
+	b.StopTimer()
+	close(done)
+	<-consumerDone
+}