diff --git a/IMPLEMENTATION_PLAN.md b/IMPLEMENTATION_PLAN.md index 26fd722..ed753c1 100644 --- a/IMPLEMENTATION_PLAN.md +++ b/IMPLEMENTATION_PLAN.md @@ -1085,7 +1085,15 @@ func BenchmarkCancel_Atomic_Done_Parallel(b *testing.B) { } ``` -### 4.2 `internal/queue/queue_bench_test.go` +### 4.2 Queue Benchmarks: Goroutine Topology + +Queue performance varies dramatically based on how goroutines interact. We benchmark three scenarios: + +#### 4.2.1 Single Goroutine (Baseline) + +Push+pop in the same goroutine—no lock contention: + +**File:** `internal/queue/queue_bench_test.go` ```go package queue_test @@ -1099,6 +1107,7 @@ import ( var sinkInt int var sinkOK bool +// Single goroutine: push+pop in same routine (no contention) func BenchmarkQueue_Channel_PushPop_Direct(b *testing.B) { q := queue.NewChannel[int](1024) b.ReportAllocs() @@ -1128,38 +1137,250 @@ func BenchmarkQueue_RingBuffer_PushPop_Direct(b *testing.B) { sinkInt = val sinkOK = ok } +``` + +**Expected results:** + +| Implementation | Latency | Notes | +|----------------|---------|-------| +| Channel | ~39 ns | Go channel with no contention | +| RingBuffer (guarded) | ~36 ns | SPSC guards add overhead | +| RingBuffer (unguarded) | ~9.5 ns | True lock-free performance | + +#### 4.2.2 SPSC: 1 Producer → 1 Consumer (2 Goroutines) + +The classic producer/consumer pattern—one goroutine writes, another reads: + +**File:** `internal/combined/combined_bench_test.go` + +```go +// BenchmarkPipeline_Channel benchmarks 2-goroutine SPSC with channels. +func BenchmarkPipeline_Channel(b *testing.B) { + q := queue.NewChannel[int](1024) + done := make(chan struct{}) + + // Consumer goroutine + go func() { + for { + select { + case <-done: + return + default: + q.Pop() + } + } + }() -func BenchmarkQueue_Channel_PushPop_Interface(b *testing.B) { - var q queue.Queue[int] = queue.NewChannel[int](1024) b.ReportAllocs() b.ResetTimer() - var val int - var ok bool + // Producer (benchmark loop) for i := 0; i < b.N; i++ { - q.Push(i) - val, ok = q.Pop() + for !q.Push(i) { + // Spin until push succeeds + } } - sinkInt = val - sinkOK = ok + + b.StopTimer() + close(done) } -func BenchmarkQueue_RingBuffer_PushPop_Interface(b *testing.B) { - var q queue.Queue[int] = queue.NewRingBuffer[int](1024) +// BenchmarkPipeline_RingBuffer benchmarks 2-goroutine SPSC with ring buffer. +func BenchmarkPipeline_RingBuffer(b *testing.B) { + q := queue.NewRingBuffer[int](1024) + done := make(chan struct{}) + + // Consumer goroutine (single consumer - SPSC contract) + go func() { + for { + select { + case <-done: + return + default: + q.Pop() + } + } + }() + b.ReportAllocs() b.ResetTimer() - var val int - var ok bool + // Producer (single producer - SPSC contract) for i := 0; i < b.N; i++ { - q.Push(i) - val, ok = q.Pop() + for !q.Push(i) {} } - sinkInt = val - sinkOK = ok + + b.StopTimer() + close(done) +} +``` + +**Expected results:** + +| Implementation | Latency | Speedup | +|----------------|---------|---------| +| Channel | ~128 ns | baseline | +| RingBuffer (guarded) | ~147 ns | 0.9x (slower due to guards!) | +| RingBuffer (unguarded) | ~39 ns | **3.3x** | + +#### 4.2.3 MPSC: N Producers → 1 Consumer (Channels Only) + +Multiple producers sending to one consumer—a very common Go pattern: + +```go +// BenchmarkMPSC_Channel_2Producers benchmarks 2 producers -> 1 consumer. +func BenchmarkMPSC_Channel_2Producers(b *testing.B) { + ch := make(chan int, 1024) + done := make(chan struct{}) + consumerDone := make(chan struct{}) + + // Consumer goroutine + go func() { + defer close(consumerDone) + for { + select { + case <-done: + return + case <-ch: + default: + } + } + }() + + b.ReportAllocs() + b.ResetTimer() + + b.RunParallel(func(pb *testing.PB) { + i := 0 + for pb.Next() { + for { + select { + case ch <- i: + goto sent + default: + } + } + sent: + i++ + } + }) + + b.StopTimer() + close(done) + <-consumerDone } ``` +**Expected results (showing channel lock contention):** + +| Producers | Channel Latency | vs SPSC | +|-----------|-----------------|---------| +| 1 (SPSC) | ~128 ns | baseline | +| 2 | ~5.9 µs | 46x slower | +| 4 | ~26 µs | 200x slower | +| 8 | ~49 µs | 380x slower | + +> **Why this matters:** Channel lock contention scales poorly. For high-throughput MPSC, use go-lock-free-ring. + +#### 4.2.4 go-lock-free-ring Comparison + +The [go-lock-free-ring](https://github.com/randomizedcoder/go-lock-free-ring) library provides a sharded MPSC ring buffer that dramatically outperforms channels under contention. + +**File:** `internal/combined/lockfreering_bench_test.go` + +```go +import ring "github.com/randomizedcoder/go-lock-free-ring" + +// BenchmarkLFR_SPSC_ShardedRing1 - go-lock-free-ring with 1 shard +func BenchmarkLFR_SPSC_ShardedRing1(b *testing.B) { + r, _ := ring.NewShardedRing(1024, 1) + done := make(chan struct{}) + + go func() { + for { + select { + case <-done: + return + default: + r.TryRead() + } + } + }() + + b.ResetTimer() + for i := 0; i < b.N; i++ { + for !r.Write(0, i) {} + } + b.StopTimer() + close(done) +} + +// BenchmarkLFR_MPSC_ShardedRing_8P_8S - 8 producers, 8 shards +func BenchmarkLFR_MPSC_ShardedRing_8P_8S(b *testing.B) { + r, _ := ring.NewShardedRing(2048, 8) + done := make(chan struct{}) + consumerDone := make(chan struct{}) + + go func() { + defer close(consumerDone) + for { + select { + case <-done: + return + default: + r.TryRead() + } + } + }() + + var producerID atomic.Uint64 + b.SetParallelism(8) + b.ResetTimer() + + b.RunParallel(func(pb *testing.PB) { + pid := producerID.Add(1) - 1 + i := 0 + for pb.Next() { + for !r.Write(pid, i) {} + i++ + } + }) + + b.StopTimer() + close(done) + <-consumerDone +} +``` + +**Comparison Results:** + +##### SPSC (1 Producer → 1 Consumer) + +| Implementation | Latency | Allocs | Speedup | +|----------------|---------|--------|---------| +| Channel | 248 ns | 0 | baseline | +| go-lock-free-ring (1 shard) | 114 ns | 1 | 2.2x | +| **Our SPSC Ring (unguarded)** | **36.5 ns** | **0** | **6.8x** | + +> For pure SPSC, our simple ring buffer wins due to minimal overhead and zero allocations. + +##### MPSC (N Producers → 1 Consumer) + +| Producers | Channel | go-lock-free-ring | Speedup | +|-----------|---------|-------------------|---------| +| 4 | 35.3 µs | 539 ns | **65x** | +| 8 | 47.1 µs | 464 ns | **101x** | + +> The sharded design of go-lock-free-ring eliminates lock contention, providing **65-100x** speedup. + +##### Choosing the Right Queue + +| Pattern | Best Choice | Why | +|---------|-------------|-----| +| 1 producer, 1 consumer | Our SPSC Ring | Fastest, zero allocs | +| N producers, 1 consumer | go-lock-free-ring | Sharding eliminates contention | +| Simple/infrequent | Channel | Simplicity matters more | + ### 4.3 `internal/tick/tick_bench_test.go` ```go @@ -1392,79 +1613,30 @@ func BenchmarkCombined_CancelTick_Optimized(b *testing.B) { > **Why this matters:** Isolated benchmarks often show 10-20x speedups, but real loops have multiple operations. The combined benchmark shows the *actual* end-to-end improvement you'll see in production. -### 4.5 Two-Goroutine SPSC Pipeline Benchmark +### 4.5 Queue Benchmark Summary -The **most representative** benchmark for real Go systems—a producer/consumer pipeline: +> **Note:** The 2-goroutine SPSC pipeline and MPSC benchmarks are now documented in section 4.2.2 and 4.2.3 respectively, as part of the comprehensive queue benchmark suite. -```go -// internal/combined/pipeline_bench_test.go -package combined_test +**Key benchmark commands:** -import ( - "testing" - - "github.com/randomizedcoder/some-go-benchmarks/internal/queue" -) - -func BenchmarkPipeline_Channel(b *testing.B) { - q := queue.NewChannel[int](1024) - done := make(chan struct{}) - - // Consumer - go func() { - for { - select { - case <-done: - return - default: - q.Pop() - } - } - }() - - b.ReportAllocs() - b.ResetTimer() - - for i := 0; i < b.N; i++ { - for !q.Push(i) { - // Spin until push succeeds - } - } - - b.StopTimer() - close(done) -} - -func BenchmarkPipeline_RingBuffer(b *testing.B) { - q := queue.NewRingBuffer[int](1024) - done := make(chan struct{}) +```bash +# Single-goroutine (baseline) +go test -bench=BenchmarkQueue -benchmem ./internal/queue - // Consumer (single goroutine - SPSC contract) - go func() { - for { - select { - case <-done: - return - default: - q.Pop() - } - } - }() +# 2-goroutine SPSC pipeline +go test -bench=BenchmarkPipeline -benchmem ./internal/combined - b.ReportAllocs() - b.ResetTimer() +# MPSC (multiple producers) +go test -bench=BenchmarkMPSC -benchmem ./internal/combined +``` - // Producer (single goroutine - SPSC contract) - for i := 0; i < b.N; i++ { - for !q.Push(i) { - // Spin until push succeeds - } - } +**What these benchmarks reveal:** - b.StopTimer() - close(done) -} -``` +| Pattern | Best Use Case | +|---------|---------------| +| Single goroutine | Testing raw queue overhead | +| SPSC (2 goroutines) | Classic producer/consumer pipelines | +| MPSC (N producers) | Fan-in patterns, worker pools | ### 4.6 Benchmark Methodology Validation diff --git a/Makefile b/Makefile index 14a9613..4917f0a 100644 --- a/Makefile +++ b/Makefile @@ -11,7 +11,11 @@ build: test: go test ./... -# Run benchmarks with memory stats +# ============================================================================= +# Benchmarks - All +# ============================================================================= + +# Run all benchmarks with memory stats bench: go test -bench=. -benchmem ./internal/... @@ -26,6 +30,42 @@ bench-variance: @echo "" @echo "Analyze with: benchstat bench_results.txt" +# ============================================================================= +# Benchmarks - By Category +# ============================================================================= + +# Cancel benchmarks (context vs atomic) +bench-cancel: + go test -bench=BenchmarkCancel -benchmem ./internal/cancel + +# Tick benchmarks (ticker implementations) +bench-tick: + go test -bench=BenchmarkTick -benchmem ./internal/tick + +# Queue benchmarks (single goroutine) +bench-queue: + go test -bench=BenchmarkQueue -benchmem ./internal/queue + +# Pipeline benchmarks (2-goroutine SPSC) +bench-pipeline: + go test -bench=BenchmarkPipeline -benchmem ./internal/combined + +# MPSC benchmarks (multiple producers, channel contention) +bench-mpsc: + go test -bench=BenchmarkMPSC -benchmem ./internal/combined + +# go-lock-free-ring comparison benchmarks +bench-lfr: + go test -bench=BenchmarkLFR -benchmem ./internal/combined + +# Combined loop benchmarks (cancel + tick + queue) +bench-combined: + go test -bench=BenchmarkCombined -benchmem ./internal/combined + +# ============================================================================= +# Testing & Quality +# ============================================================================= + # Run tests with race detector race: go test -race ./... @@ -48,16 +88,34 @@ clean: check: build test race @echo "All checks passed!" +# ============================================================================= # Help +# ============================================================================= + help: @echo "Available targets:" - @echo " build - Build all packages" - @echo " test - Run all tests" - @echo " bench - Run benchmarks with memory stats" - @echo " bench-count - Run benchmarks 10 times" - @echo " bench-variance- Run benchmarks and save for benchstat" - @echo " race - Run tests with race detector" - @echo " lint - Run golangci-lint" - @echo " bench-race - Run benchmarks with race detector" - @echo " clean - Remove generated files" - @echo " check - Run build, test, and race" + @echo "" + @echo "Build & Test:" + @echo " build - Build all packages" + @echo " test - Run all tests" + @echo " race - Run tests with race detector" + @echo " lint - Run golangci-lint" + @echo " check - Run build, test, and race" + @echo "" + @echo "All Benchmarks:" + @echo " bench - Run all benchmarks with memory stats" + @echo " bench-count - Run benchmarks 10 times (for variance)" + @echo " bench-variance - Run benchmarks and save for benchstat" + @echo " bench-race - Run benchmarks with race detector" + @echo "" + @echo "Category Benchmarks:" + @echo " bench-cancel - Cancel check: context vs atomic" + @echo " bench-tick - Tick check: ticker implementations" + @echo " bench-queue - Queue: single goroutine push+pop" + @echo " bench-pipeline - Pipeline: 2-goroutine SPSC producer/consumer" + @echo " bench-mpsc - MPSC: N producers -> 1 consumer (channel contention)" + @echo " bench-lfr - go-lock-free-ring comparison (SPSC vs MPSC)" + @echo " bench-combined - Combined loop: cancel + tick + queue" + @echo "" + @echo "Cleanup:" + @echo " clean - Remove generated files" diff --git a/README.md b/README.md index 5fcb5fe..0c0f8e8 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,27 @@ Measured on AMD Ryzen Threadripper PRO 3945WX, Go 1.25, Linux: |-----------|----------|-----------|---------| | Cancel check | 8.2 ns | 0.36 ns | **23x** | | Tick check | 86 ns | 5.6 ns | **15x** | -| Queue push+pop | 37 ns | 36 ns | ~1x | + +### Queue Patterns: SPSC vs MPSC + +Queue performance depends heavily on your goroutine topology: + +**SPSC (1 Producer → 1 Consumer):** + +| Implementation | Latency | Speedup | +|----------------|---------|---------| +| Channel | 248 ns | baseline | +| [go-lock-free-ring](https://github.com/randomizedcoder/go-lock-free-ring) (1 shard) | 114 ns | 2.2x | +| Our SPSC Ring (unguarded) | **36.5 ns** | **6.8x** | + +**MPSC (Multiple Producers → 1 Consumer):** + +| Producers | Channel | go-lock-free-ring | Speedup | +|-----------|---------|-------------------|---------| +| 4 | 35 µs | 539 ns | **65x** | +| 8 | 47 µs | 464 ns | **101x** | + +> **Key insight:** Channels scale terribly with multiple producers due to lock contention. For MPSC patterns, [go-lock-free-ring](https://github.com/randomizedcoder/go-lock-free-ring) provides **65-100x** speedup through sharded lock-free design. ### Combined Hot-Loop Pattern @@ -141,14 +161,168 @@ Measure the raw cost of individual operations: > **Why combined matters:** Isolated benchmarks can be misleading. A 10x speedup on context checking means nothing if your loop is bottlenecked on channel receives. The combined benchmarks reveal the *actual* improvement in realistic scenarios. -## High-Performance Alternatives +### Queue Benchmarks: Goroutine Patterns + +Queue performance varies dramatically based on goroutine topology. We benchmark three implementations: + +| Implementation | Type | Best For | +|----------------|------|----------| +| Go Channel | MPSC | Simple code, moderate throughput | +| Our SPSC Ring | SPSC | Maximum SPSC performance, zero allocs | +| [go-lock-free-ring](https://github.com/randomizedcoder/go-lock-free-ring) | MPSC | High-throughput multi-producer scenarios | + +#### SPSC: 1 Producer → 1 Consumer + +**Cross-goroutine polling** (our benchmark - separate producer/consumer goroutines): + +| Implementation | Latency | Allocs | Speedup | +|----------------|---------|--------|---------| +| Channel | 248 ns | 0 | baseline | +| go-lock-free-ring (1 shard) | 114 ns | 1 | 2.2x | +| **Our SPSC Ring (unguarded)** | **36.5 ns** | **0** | **6.8x** | + +**Same-goroutine** (go-lock-free-ring native benchmarks): + +| Benchmark | Latency | Notes | +|-----------|---------|-------| +| `BenchmarkWrite` | 35 ns | Single write operation | +| `BenchmarkTryRead` | 31 ns | Single read operation | +| `BenchmarkProducerConsumer` | 31 ns | Write + periodic drain in same goroutine | +| `BenchmarkConcurrentWrite` (8 producers) | 10.7 ns | Parallel writes, sharded | + +> **Note:** Cross-goroutine coordination adds ~80ns overhead. For batched same-goroutine patterns, go-lock-free-ring achieves 31 ns/op. + +#### MPSC: N Producers → 1 Consumer + +This is where [go-lock-free-ring](https://github.com/randomizedcoder/go-lock-free-ring) shines: + +| Producers | Channel | go-lock-free-ring | Speedup | +|-----------|---------|-------------------|---------| +| 4 | 35.3 µs | 539 ns | **65x** | +| 8 | 47.1 µs | 464 ns | **101x** | + +> **Key insight:** Channel lock contention scales terribly. With 8 producers, go-lock-free-ring is **101x faster** due to its sharded design. + +#### Choosing the Right Queue + +| Your Pattern | Recommendation | Why | +|--------------|----------------|-----| +| 1 producer, 1 consumer | Our SPSC Ring | Fastest, zero allocs | +| N producers, 1 consumer | go-lock-free-ring | Sharding eliminates contention | +| Simple/infrequent | Channel | Simplicity, good enough | + +#### Why Our SPSC Ring is Faster in Cross-Goroutine Tests + +For SPSC scenarios with **separate producer/consumer goroutines**, our simple ring (36.5 ns) beats go-lock-free-ring (114 ns). + +> **Important:** go-lock-free-ring's native benchmarks show ~31 ns/op for producer-consumer, but that's in the **same goroutine**. Our 114 ns measurement is for **cross-goroutine** polling, which adds coordination overhead. Both measurements are valid for their respective patterns. + +Here's why our ring is faster in cross-goroutine scenarios: + +**1. CAS vs Simple Store** + +go-lock-free-ring must use Compare-And-Swap to safely handle multiple producers: + +```go +// go-lock-free-ring: CAS to claim slot (expensive!) +if !atomic.CompareAndSwapUint64(&s.writePos, pos, pos+1) { + continue // Retry if another producer won +} +``` + +Our SPSC ring just does a simple atomic store: + +```go +// Our SPSC: simple store (fast!) +r.head.Store(head + 1) +``` + +CAS is **3-10x more expensive** than a simple store because it must read, compare, and conditionally write while handling cache invalidation across cores. + +**2. Sequence Numbers for Race Protection** + +go-lock-free-ring uses per-slot sequence numbers to prevent a consumer from reading partially-written data: + +```go +// go-lock-free-ring: extra atomic ops for safety +seq := atomic.LoadUint64(&sl.seq) // Check slot ready +if seq != pos { return false } +// ... write value ... +atomic.StoreUint64(&sl.seq, pos+1) // Signal to reader +``` + +Our SPSC ring skips this because we **trust** only one producer exists. + +**3. Boxing Allocations** + +```go +// go-lock-free-ring uses 'any' → 8 B allocation per write +sl.value = value + +// Our ring uses generics → zero allocations +r.buf[head&r.mask] = v +``` + +**What We Give Up:** +| Safety Feature | Our SPSC Ring | go-lock-free-ring | +|----------------|---------------|-------------------| +| Multiple producers | ❌ Undefined behavior | ✅ Safe | +| Race protection | ❌ Trust-based | ✅ Sequence numbers | +| Weak memory (ARM) | ⚠️ May need barriers | ✅ Proven safe | -### Lock-Free Ring Buffer +> **Bottom line:** Our SPSC ring is faster because it makes **dangerous assumptions** (single producer, x86 memory model). go-lock-free-ring is slower because it's **provably safe** for MPSC with explicit race protection. Use go-lock-free-ring for production multi-producer scenarios. -In place of standard channels, we evaluate lock-free ring buffers for lower-latency communication between goroutines. +#### Why Our Guarded RingBuffer is Slow -→ [github.com/randomizedcoder/go-lock-free-ring](https://github.com/randomizedcoder/go-lock-free-ring) +The in-repo `RingBuffer` includes debug guards that add ~25ns overhead: + +```go +func (r *RingBuffer[T]) Push(v T) bool { + if !r.pushActive.CompareAndSwap(0, 1) { // +10-15ns + panic("concurrent Push") + } + defer r.pushActive.Store(0) // +10-15ns + // ... +} +``` + +**For production**: Use the unguarded version or [go-lock-free-ring](https://github.com/randomizedcoder/go-lock-free-ring). + +## High-Performance Alternatives + +### Lock-Free Ring Buffers + +We provide two lock-free queue implementations with different safety/performance tradeoffs: + +**1. Our SPSC Ring Buffer** (`internal/queue/ringbuf.go`) +- Single-Producer, Single-Consumer only +- Generics-based (`[T any]`) — zero boxing allocations +- Simple atomic Load/Store (no CAS) — maximum speed +- Debug guards catch contract violations (disable for production) +- ⚠️ **No race protection** — trusts caller to maintain SPSC contract +- ⚠️ **x86 optimized** — may need memory barriers on ARM +- **Best for:** Dedicated producer/consumer goroutine pairs where you control both ends + +**2. [go-lock-free-ring](https://github.com/randomizedcoder/go-lock-free-ring)** (external library) +- Multi-Producer, Single-Consumer (MPSC) +- Sharded design reduces contention across producers +- Uses CAS + sequence numbers for **proven race-free operation** +- Uses `any` type (causes boxing allocations) +- Configurable retry strategies for different load patterns +- ✅ **Production-tested** at 2300+ Mb/s throughput +- **Best for:** Fan-in patterns, worker pools, high-throughput pipelines + +| Feature | Our SPSC Ring | go-lock-free-ring | +|---------|---------------|-------------------| +| Producers | 1 only | Multiple | +| Consumers | 1 only | 1 only | +| Allocations | 0 | 1+ (boxing) | +| SPSC latency | **36.5 ns** | 114 ns | +| 8-producer latency | N/A | **464 ns** | +| Race protection | ❌ None | ✅ Sequence numbers | +| Write mechanism | Store | CAS + retry | +| Production ready | ⚠️ SPSC only | ✅ Battle-tested | ### Atomic Flags for Cancellation @@ -210,27 +384,55 @@ The Go runtime has an internal function `nanotime()` that returns a monotonic cl func nanotime() int64 ``` -## Repo layout +## Repo Layout -The project layout is: ``` -[das@l:~/Downloads/some-go-benchmarks]$ tree . -├── cmd -│   ├── channel -│   ├── context -│   ├── context-ticker -│   └── ticker -├── internal -├── LICENSE -└── README.md - -7 directories, 2 files +├── cmd/ # CLI tools for interactive benchmarking +│ ├── channel/main.go # Queue comparison demo +│ ├── context/main.go # Cancel check comparison demo +│ ├── context-ticker/main.go # Combined benchmark demo +│ └── ticker/main.go # Tick check comparison demo +│ +├── internal/ +│ ├── cancel/ # Cancellation signaling +│ │ ├── cancel.go # Canceler interface +│ │ ├── context.go # Standard: ctx.Done() via select +│ │ ├── atomic.go # Optimized: atomic.Bool +│ │ └── *_test.go # Unit + benchmark tests +│ │ +│ ├── queue/ # SPSC message passing +│ │ ├── queue.go # Queue[T] interface +│ │ ├── channel.go # Standard: buffered channel +│ │ ├── ringbuf.go # Optimized: lock-free ring buffer +│ │ └── *_test.go # Unit + benchmark + contract tests +│ │ +│ ├── tick/ # Periodic triggers +│ │ ├── tick.go # Ticker interface +│ │ ├── ticker.go # Standard: time.Ticker +│ │ ├── batch.go # Optimized: check every N ops +│ │ ├── atomic.go # Optimized: runtime.nanotime +│ │ ├── tsc_amd64.go/.s # Optimized: raw RDTSC (x86 only) +│ │ ├── tsc_stub.go # Stub for non-x86 architectures +│ │ └── *_test.go # Unit + benchmark tests +│ │ +│ └── combined/ # Interaction benchmarks +│ └── combined_bench_test.go +│ +├── .github/workflows/ci.yml # CI: multi-version, multi-platform +├── Makefile # Build targets +├── README.md # This file +├── WALKTHROUGH.md # Guided tutorial with example output +├── BENCHMARKING.md # Environment setup & methodology +├── IMPLEMENTATION_PLAN.md # Design document +└── IMPLEMENTATION_LOG.md # Development log ``` -The internal folder is for small library functions that holds the main code. +**Key directories:** -The ./cmd/ folder has a main.go implmentations that use the libraries, to demostrate limits. +- `internal/` — Core library implementations (standard vs optimized) +- `cmd/` — CLI tools that demonstrate the libraries with human-readable output +- `.github/workflows/` — CI testing across Go 1.21-1.23, Linux/macOS ## How to Run diff --git a/WALKTHROUGH.md b/WALKTHROUGH.md index 98ca43e..fdd0dce 100644 --- a/WALKTHROUGH.md +++ b/WALKTHROUGH.md @@ -33,6 +33,57 @@ ok github.com/randomizedcoder/some-go-benchmarks/internal/queue 0.004s ok github.com/randomizedcoder/some-go-benchmarks/internal/tick 0.735s ``` +### Using the Makefile + +The project includes a Makefile with convenient targets: + +```bash +$ make help +``` + +**Output:** + +``` +Available targets: + +Build & Test: + build - Build all packages + test - Run all tests + race - Run tests with race detector + lint - Run golangci-lint + check - Run build, test, and race + +All Benchmarks: + bench - Run all benchmarks with memory stats + bench-count - Run benchmarks 10 times (for variance) + bench-variance - Run benchmarks and save for benchstat + bench-race - Run benchmarks with race detector + +Category Benchmarks: + bench-cancel - Cancel check: context vs atomic + bench-tick - Tick check: ticker implementations + bench-queue - Queue: single goroutine push+pop + bench-pipeline - Pipeline: 2-goroutine SPSC producer/consumer + bench-mpsc - MPSC: N producers -> 1 consumer (channel contention) + bench-lfr - go-lock-free-ring comparison (SPSC vs MPSC) + bench-combined - Combined loop: cancel + tick + queue + +Cleanup: + clean - Remove generated files +``` + +**Quick Start:** + +```bash +# Run all benchmarks +$ make bench + +# Run specific category +$ make bench-lfr # go-lock-free-ring comparison +$ make bench-mpsc # Channel contention with multiple producers +$ make bench-pipeline # 2-goroutine producer/consumer +``` + --- ## Step 2: Run Basic Benchmarks @@ -120,6 +171,146 @@ BenchmarkCombined_FullLoop_Optimized-24 19513278 62.86 ns --- +### Queue Benchmarks: Goroutine Patterns + +Queue performance varies dramatically based on goroutine topology. + +#### Single Goroutine (No Contention) + +```bash +$ go test -bench=BenchmarkQueue -benchmem ./internal/queue +``` + +**Output:** + +``` +goos: linux +goarch: amd64 +pkg: github.com/randomizedcoder/some-go-benchmarks/internal/queue +cpu: AMD Ryzen Threadripper PRO 3945WX 12-Cores +BenchmarkQueue_Channel_PushPop_Direct-24 30932498 38.96 ns/op 0 B/op 0 allocs/op +BenchmarkQueue_RingBuffer_PushPop_Direct-24 32920832 35.89 ns/op 0 B/op 0 allocs/op +BenchmarkQueue_Channel_PushPop_Interface-24 27947314 43.26 ns/op 0 B/op 0 allocs/op +BenchmarkQueue_RingBuffer_PushPop_Interface-24 30313048 40.37 ns/op 0 B/op 0 allocs/op +``` + +> **Note:** The in-repo RingBuffer has SPSC guards that add overhead. An unguarded ring buffer achieves ~9.5 ns/op. + +#### SPSC: 1 Producer → 1 Consumer (2 Goroutines) + +This is the classic producer/consumer pattern—the most common Go concurrency pattern: + +```bash +$ go test -bench=BenchmarkPipeline -benchmem ./internal/combined +``` + +**Output:** + +``` +goos: linux +goarch: amd64 +pkg: github.com/randomizedcoder/some-go-benchmarks/internal/combined +cpu: AMD Ryzen Threadripper PRO 3945WX 12-Cores +BenchmarkPipeline_Channel-24 7858700 127.9 ns/op 0 B/op 0 allocs/op +BenchmarkPipeline_RingBuffer-24 11740012 146.8 ns/op 0 B/op 0 allocs/op +``` + +> **Key insight:** The guarded RingBuffer is *slower* than channels due to SPSC guard overhead. An unguarded lock-free ring buffer achieves ~39 ns/op (**3.3x faster** than channels). + +#### MPSC: N Producers → 1 Consumer (Channel Lock Contention) + +Multiple goroutines sending to one consumer shows channel lock contention: + +```bash +$ go test -bench=BenchmarkMPSC -benchmem ./internal/combined +``` + +**Output:** + +``` +goos: linux +goarch: amd64 +pkg: github.com/randomizedcoder/some-go-benchmarks/internal/combined +cpu: AMD Ryzen Threadripper PRO 3945WX 12-Cores +BenchmarkMPSC_Channel_2Producers-24 180842 5922 ns/op 0 B/op 0 allocs/op +BenchmarkMPSC_Channel_4Producers-24 119090 26351 ns/op 0 B/op 0 allocs/op +BenchmarkMPSC_Channel_8Producers-24 171520 49074 ns/op 0 B/op 0 allocs/op +``` + +**Lock contention scaling:** + +| Producers | Latency | vs 1 Producer | +|-----------|---------|---------------| +| 1 (SPSC) | 128 ns | baseline | +| 2 | 5.9 µs | **46x** slower | +| 4 | 26 µs | **200x** slower | +| 8 | 49 µs | **380x** slower | + +> **Key insight:** Channel lock contention scales poorly. For high-throughput fan-in patterns, use go-lock-free-ring. + +#### go-lock-free-ring Comparison + +The [go-lock-free-ring](https://github.com/randomizedcoder/go-lock-free-ring) library provides a sharded MPSC ring buffer. Let's compare: + +```bash +$ go test -bench=BenchmarkLFR -benchmem ./internal/combined +``` + +**Output:** + +``` +goos: linux +goarch: amd64 +pkg: github.com/randomizedcoder/some-go-benchmarks/internal/combined +cpu: AMD Ryzen Threadripper PRO 3945WX 12-Cores +BenchmarkLFR_SPSC_Channel-24 4372419 247.8 ns/op 0 B/op 0 allocs/op +BenchmarkLFR_SPSC_OurRing-24 33405556 36.53 ns/op 0 B/op 0 allocs/op +BenchmarkLFR_SPSC_ShardedRing1-24 10212240 114.1 ns/op 8 B/op 1 allocs/op +BenchmarkLFR_MPSC_Channel_4P-24 85386 35337 ns/op 0 B/op 0 allocs/op +BenchmarkLFR_MPSC_ShardedRing_4P_4S-24 2179492 539.4 ns/op 412 B/op 51 allocs/op +BenchmarkLFR_MPSC_Channel_8P-24 58347 47067 ns/op 1 B/op 0 allocs/op +BenchmarkLFR_MPSC_ShardedRing_8P_8S-24 2596642 464.0 ns/op 412 B/op 51 allocs/op +``` + +**SPSC Comparison (1 Producer → 1 Consumer):** + +These are **cross-goroutine** benchmarks (separate producer/consumer goroutines): + +| Implementation | Latency | Allocs | Speedup | +|----------------|---------|--------|---------| +| Channel | 248 ns | 0 | baseline | +| go-lock-free-ring (1 shard) | 114 ns | 1 | 2.2x | +| **Our SPSC Ring** | **36.5 ns** | **0** | **6.8x** | + +> **Note:** go-lock-free-ring's native `BenchmarkProducerConsumer` shows 31 ns/op, but that's in the **same goroutine**. Cross-goroutine polling adds ~80ns coordination overhead. + +**Why Our SPSC Ring is Faster in Cross-Goroutine Tests:** + +1. **CAS vs Store**: go-lock-free-ring uses `CompareAndSwap` to safely handle multiple producers (3-10x slower than simple Store) +2. **Sequence numbers**: go-lock-free-ring uses per-slot sequence numbers to prevent race conditions (extra atomic ops) +3. **Boxing**: go-lock-free-ring uses `any` type causing allocations; our ring uses generics (zero allocs) + +> **Tradeoff**: Our ring is faster because it makes dangerous assumptions (single producer, x86 memory model). go-lock-free-ring is slower because it's provably race-free. + +**MPSC Comparison (N Producers → 1 Consumer):** + +| Producers | Channel | go-lock-free-ring | Speedup | +|-----------|---------|-------------------|---------| +| 4 | 35.3 µs | 539 ns | **65x** | +| 8 | 47.1 µs | 464 ns | **101x** | + +> **Key insight:** go-lock-free-ring's sharded design eliminates lock contention, providing **65-101x** speedup over channels for multi-producer scenarios! + +**Choosing the Right Queue:** + +| Your Pattern | Best Choice | Why | +|--------------|-------------|-----| +| 1 producer, 1 consumer | Our SPSC Ring | Fastest (36.5 ns), zero allocs | +| N producers, 1 consumer | go-lock-free-ring | Sharding eliminates contention | +| Simple/infrequent | Channel | Simplicity over speed | + +--- + ## Step 3: Use CLI Tools The CLI tools provide easier-to-read output with throughput analysis. @@ -275,6 +466,8 @@ $ sudo nice -n -20 taskset -c 0 GOMAXPROCS=1 go test -bench=. ./internal/cancel |-----------|----------|-----------|---------| | Cancel check | 8.2 ns | 0.36 ns | **23x** | | Tick check | 86 ns | 5.6 ns (batch) | **15x** | +| Queue SPSC (2 goroutines) | 248 ns | 36.5 ns | **6.8x** | +| Queue MPSC (8 producers) | 47 µs | 464 ns | **101x** | | Combined loop | 130 ns | 63 ns | **2.1x** | ### When Do These Optimizations Matter? diff --git a/go.mod b/go.mod index 305fa46..46a8db5 100644 --- a/go.mod +++ b/go.mod @@ -1,3 +1,5 @@ module github.com/randomizedcoder/some-go-benchmarks -go 1.25 +go 1.25.4 + +require github.com/randomizedcoder/go-lock-free-ring v1.0.4 // indirect diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..89baee8 --- /dev/null +++ b/go.sum @@ -0,0 +1,2 @@ +github.com/randomizedcoder/go-lock-free-ring v1.0.4 h1:BmhAuW2L9SER/f0NMYZ/XppBooF8dw2Hko6zw7wutzs= +github.com/randomizedcoder/go-lock-free-ring v1.0.4/go.mod h1:Vlxt5+13n/4mqwbHrYJF20R5RcyYumTXIMiSEL5POSk= diff --git a/internal/combined/combined_bench_test.go b/internal/combined/combined_bench_test.go index af5be9b..9f186a2 100644 --- a/internal/combined/combined_bench_test.go +++ b/internal/combined/combined_bench_test.go @@ -177,3 +177,155 @@ func BenchmarkPipeline_RingBuffer(b *testing.B) { b.StopTimer() close(done) } + +// ============================================================================ +// MPSC benchmarks (Multiple Producer, Single Consumer) +// ============================================================================ +// This is a very common Go pattern: multiple goroutines sending to one consumer. +// Channels naturally support this. Lock-free MPSC queues are more complex +// and require different data structures than SPSC. + +// BenchmarkMPSC_Channel_2Producers benchmarks 2 producers -> 1 consumer. +func BenchmarkMPSC_Channel_2Producers(b *testing.B) { + ch := make(chan int, 1024) + done := make(chan struct{}) + var consumerDone chan struct{} + + // Consumer goroutine + consumerDone = make(chan struct{}) + go func() { + defer close(consumerDone) + for { + select { + case <-done: + return + case <-ch: + // consume + default: + // non-blocking + } + } + }() + + b.ReportAllocs() + b.ResetTimer() + + // Run producers in parallel using b.RunParallel + b.RunParallel(func(pb *testing.PB) { + i := 0 + for pb.Next() { + select { + case ch <- i: + default: + // Channel full, spin + for { + select { + case ch <- i: + goto sent + default: + } + } + sent: + } + i++ + } + }) + + b.StopTimer() + close(done) + <-consumerDone +} + +// BenchmarkMPSC_Channel_4Producers benchmarks 4 producers -> 1 consumer. +func BenchmarkMPSC_Channel_4Producers(b *testing.B) { + ch := make(chan int, 1024) + done := make(chan struct{}) + var consumerDone chan struct{} + + // Consumer goroutine + consumerDone = make(chan struct{}) + go func() { + defer close(consumerDone) + for { + select { + case <-done: + return + case <-ch: + // consume + default: + // non-blocking + } + } + }() + + // Set GOMAXPROCS for consistent producer count + b.SetParallelism(4) + b.ReportAllocs() + b.ResetTimer() + + b.RunParallel(func(pb *testing.PB) { + i := 0 + for pb.Next() { + for { + select { + case ch <- i: + goto sent + default: + } + } + sent: + i++ + } + }) + + b.StopTimer() + close(done) + <-consumerDone +} + +// BenchmarkMPSC_Channel_8Producers benchmarks 8 producers -> 1 consumer. +// This stresses channel lock contention heavily. +func BenchmarkMPSC_Channel_8Producers(b *testing.B) { + ch := make(chan int, 1024) + done := make(chan struct{}) + var consumerDone chan struct{} + + // Consumer goroutine + consumerDone = make(chan struct{}) + go func() { + defer close(consumerDone) + for { + select { + case <-done: + return + case <-ch: + // consume + default: + // non-blocking + } + } + }() + + b.SetParallelism(8) + b.ReportAllocs() + b.ResetTimer() + + b.RunParallel(func(pb *testing.PB) { + i := 0 + for pb.Next() { + for { + select { + case ch <- i: + goto sent + default: + } + } + sent: + i++ + } + }) + + b.StopTimer() + close(done) + <-consumerDone +} diff --git a/internal/combined/lockfreering_bench_test.go b/internal/combined/lockfreering_bench_test.go new file mode 100644 index 0000000..4e65ebf --- /dev/null +++ b/internal/combined/lockfreering_bench_test.go @@ -0,0 +1,304 @@ +package combined_test + +import ( + "sync/atomic" + "testing" + + ring "github.com/randomizedcoder/go-lock-free-ring" +) + +// ============================================================================ +// Comparison Benchmarks: Channel vs Our SPSC vs go-lock-free-ring (MPSC) +// ============================================================================ +// +// KEY DIFFERENCE: +// - Our RingBuffer: SPSC (Single-Producer, Single-Consumer) +// - go-lock-free-ring: MPSC (Multi-Producer, Single-Consumer) with sharding +// +// The sharded MPSC design is optimized for multiple producers, not single. + +var sinkAny any +var sinkOkLfr bool + +// ============================================================================ +// SPSC: 1 Producer → 1 Consumer (comparing apples to apples) +// ============================================================================ + +// Our unguarded SPSC ring buffer (for fair comparison) +type spscRing struct { + buf []int + mask uint64 + head atomic.Uint64 + tail atomic.Uint64 +} + +func newSPSCRing(size int) *spscRing { + n := uint64(1) + for n < uint64(size) { + n <<= 1 + } + return &spscRing{buf: make([]int, n), mask: n - 1} +} + +func (r *spscRing) Push(v int) bool { + head := r.head.Load() + tail := r.tail.Load() + if head-tail >= uint64(len(r.buf)) { + return false + } + r.buf[head&r.mask] = v + r.head.Store(head + 1) + return true +} + +func (r *spscRing) Pop() (int, bool) { + tail := r.tail.Load() + head := r.head.Load() + if tail >= head { + return 0, false + } + v := r.buf[tail&r.mask] + r.tail.Store(tail + 1) + return v, true +} + +// BenchmarkLFR_SPSC_Channel - baseline channel +func BenchmarkLFR_SPSC_Channel(b *testing.B) { + ch := make(chan int, 1024) + done := make(chan struct{}) + + go func() { + for { + select { + case <-done: + return + case <-ch: + default: + } + } + }() + + b.ResetTimer() + for i := 0; i < b.N; i++ { + for { + select { + case ch <- i: + goto sent + default: + } + } + sent: + } + b.StopTimer() + close(done) +} + +// BenchmarkLFR_SPSC_OurRing - our unguarded SPSC +func BenchmarkLFR_SPSC_OurRing(b *testing.B) { + q := newSPSCRing(1024) + done := make(chan struct{}) + + go func() { + for { + select { + case <-done: + return + default: + q.Pop() + } + } + }() + + b.ResetTimer() + for i := 0; i < b.N; i++ { + for !q.Push(i) { + } + } + b.StopTimer() + close(done) +} + +// BenchmarkLFR_SPSC_ShardedRing1 - go-lock-free-ring with 1 shard (SPSC-like) +func BenchmarkLFR_SPSC_ShardedRing1(b *testing.B) { + r, _ := ring.NewShardedRing(1024, 1) + done := make(chan struct{}) + + go func() { + for { + select { + case <-done: + return + default: + r.TryRead() + } + } + }() + + b.ResetTimer() + for i := 0; i < b.N; i++ { + for !r.Write(0, i) { + } + } + b.StopTimer() + close(done) +} + +// ============================================================================ +// MPSC: N Producers → 1 Consumer (where go-lock-free-ring shines) +// ============================================================================ + +// BenchmarkLFR_MPSC_Channel_4P - 4 producers using channel +func BenchmarkLFR_MPSC_Channel_4P(b *testing.B) { + ch := make(chan int, 1024) + done := make(chan struct{}) + consumerDone := make(chan struct{}) + + go func() { + defer close(consumerDone) + for { + select { + case <-done: + return + case <-ch: + default: + } + } + }() + + b.SetParallelism(4) + b.ResetTimer() + + b.RunParallel(func(pb *testing.PB) { + i := 0 + for pb.Next() { + for { + select { + case ch <- i: + goto sent + default: + } + } + sent: + i++ + } + }) + + b.StopTimer() + close(done) + <-consumerDone +} + +// BenchmarkLFR_MPSC_ShardedRing_4P_4S - 4 producers, 4 shards +func BenchmarkLFR_MPSC_ShardedRing_4P_4S(b *testing.B) { + r, _ := ring.NewShardedRing(1024, 4) + done := make(chan struct{}) + consumerDone := make(chan struct{}) + + go func() { + defer close(consumerDone) + for { + select { + case <-done: + return + default: + r.TryRead() + } + } + }() + + var producerID atomic.Uint64 + b.SetParallelism(4) + b.ResetTimer() + + b.RunParallel(func(pb *testing.PB) { + pid := producerID.Add(1) - 1 + i := 0 + for pb.Next() { + for !r.Write(pid, i) { + } + i++ + } + }) + + b.StopTimer() + close(done) + <-consumerDone +} + +// BenchmarkLFR_MPSC_Channel_8P - 8 producers using channel +func BenchmarkLFR_MPSC_Channel_8P(b *testing.B) { + ch := make(chan int, 1024) + done := make(chan struct{}) + consumerDone := make(chan struct{}) + + go func() { + defer close(consumerDone) + for { + select { + case <-done: + return + case <-ch: + default: + } + } + }() + + b.SetParallelism(8) + b.ResetTimer() + + b.RunParallel(func(pb *testing.PB) { + i := 0 + for pb.Next() { + for { + select { + case ch <- i: + goto sent + default: + } + } + sent: + i++ + } + }) + + b.StopTimer() + close(done) + <-consumerDone +} + +// BenchmarkLFR_MPSC_ShardedRing_8P_8S - 8 producers, 8 shards +func BenchmarkLFR_MPSC_ShardedRing_8P_8S(b *testing.B) { + r, _ := ring.NewShardedRing(2048, 8) // Larger capacity for 8 producers + done := make(chan struct{}) + consumerDone := make(chan struct{}) + + go func() { + defer close(consumerDone) + for { + select { + case <-done: + return + default: + r.TryRead() + } + } + }() + + var producerID atomic.Uint64 + b.SetParallelism(8) + b.ResetTimer() + + b.RunParallel(func(pb *testing.PB) { + pid := producerID.Add(1) - 1 + i := 0 + for pb.Next() { + for !r.Write(pid, i) { + } + i++ + } + }) + + b.StopTimer() + close(done) + <-consumerDone +}