goodhistogram/mapping_test.go at main · cockroachdb/goodhistogram · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
// Copyright 2026 The Cockroach Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0

package goodhistogram

import (
	"math"
	"math/rand"
	"testing"
)

// TestBucketMappingConsistency compares the Record() path (which uses
// the bucket lookup table for O(1) bucket resolution) against the
// promBucketKey path (which uses math.Frexp + sort.SearchFloat64s).
//
// The lookup table uses 8 bits of mantissa, so ~4 of 256 entries per
// schema may straddle a bucket boundary and resolve to the next
// bucket. This test verifies that:
//  1. Mismatches are bounded by the expected straddling rate (<2%).
//  2. All mismatches differ by exactly 1 bucket (never more).
//  3. The resulting quantile estimates remain accurate despite mismatches.
func TestBucketMappingConsistency(t *testing.T) {
	cfg := newConfig(500, 60e9, 0.10) // schema 2

	// recordBucketIndex matches Record() exactly: lookup table only.
	recordBucketIndex := func(v int64) int {
		fv := float64(v)
		bits := math.Float64bits(fv)
		exp := int(bits>>52) - 1022
		sub := int(cfg.bucketLookup[(bits>>bucketLookupShift)&0xFF])
		key := sub + (exp-1)*cfg.bucketsPerGroup
		idx := key - cfg.minKey
		if idx < 0 {
			idx = 0
		} else if idx >= cfg.numBuckets {
			idx = cfg.numBuckets - 1
		}
		return idx
	}

	// promBucketIndex uses promBucketKey (the reference implementation).
	promBucketIndex := func(v int64) int {
		fv := float64(v)
		key := promBucketKey(fv, cfg.schema)
		idx := key - cfg.minKey
		if idx < 0 {
			idx = 0
		} else if idx >= cfg.numBuckets {
			idx = cfg.numBuckets - 1
		}
		return idx
	}

	// Test random values across the range. Allow a small mismatch rate
	// due to lookup table straddling, but require that all mismatches
	// differ by at most 1 bucket.
	rng := rand.New(rand.NewSource(42))
	logLo := math.Log(500)
	logHi := math.Log(60e9)
	const numSamples = 1000000
	mismatches := 0
	for i := 0; i < numSamples; i++ {
		v := int64(math.Exp(rng.Float64()*(logHi-logLo) + logLo))
		if v <= 0 {
			continue
		}
		refIdx := promBucketIndex(v)
		recIdx := recordBucketIndex(v)
		if refIdx != recIdx {
			diff := recIdx - refIdx
			if diff < -1 || diff > 1 {
				t.Errorf("v=%d: ref=%d rec=%d (diff=%d exceeds 1)", v, refIdx, recIdx, diff)
			}
			mismatches++
		}
	}
	mismatchRate := float64(mismatches) / float64(numSamples) * 100
	t.Logf("Mismatches: %d / %d (%.2f%%)", mismatches, numSamples, mismatchRate)
	if mismatchRate > 2.0 {
		t.Errorf("Mismatch rate %.2f%% exceeds 2%% threshold", mismatchRate)
	}

	// Verify that quantile estimates are close despite straddling.
	t.Run("full-histogram-comparison", func(t *testing.T) {
		h := New(Params{Lo: 500, Hi: 60e9, ErrorBound: 0.10})

		rng := rand.New(rand.NewSource(99))
		values := make([]int64, 100000)
		for i := range values {
			values[i] = int64(math.Exp(rng.Float64()*(logHi-logLo) + logLo))
			if values[i] <= 0 {
				values[i] = 1
			}
		}
		for _, v := range values {
			h.Record(v)
		}
		newSnap := h.Snapshot()

		// Build counts using the reference (promBucketKey) path.
		refCounts := make([]uint64, cfg.numBuckets)
		for _, v := range values {
			idx := promBucketIndex(v)
			refCounts[idx]++
		}

		// Compare quantile estimates between the two bucket distributions.
		// They should be very close since mismatches shift values by at
		// most one bucket width.
		refSnap := Snapshot{
			cfg:        &cfg,
			Counts:     refCounts,
			TotalCount: newSnap.TotalCount,
			TotalSum:   newSnap.TotalSum,
		}
		for _, q := range []float64{0.50, 0.75, 0.90, 0.95, 0.99, 0.999} {
			refQ := refSnap.ValueAtQuantile(q)
			recQ := newSnap.ValueAtQuantile(q)
			if refQ == 0 {
				continue
			}
			relErr := math.Abs(recQ-refQ) / refQ
			if relErr > 0.05 {
				t.Errorf("p%.1f: ref=%.6f rec=%.6f relErr=%.4f (>5%%)", q*100, refQ, recQ, relErr)
			}
		}
	})

	// Show the exponent calculation difference for edge cases.
	t.Run("exponent-comparison", func(t *testing.T) {
		edgeCases := []int64{1, 2, 3, 4, 7, 8, 15, 16, 1023, 1024, 2047, 2048}
		for _, v := range edgeCases {
			fv := float64(v)
			_, frexpExp := math.Frexp(fv)
			bits := math.Float64bits(fv)
			bitsExp := int(bits>>52) - 1022
			if frexpExp != bitsExp {
				t.Logf("v=%d: frexp_exp=%d bits_exp=%d (DIFFER)", v, frexpExp, bitsExp)
			} else {
				t.Logf("v=%d: frexp_exp=%d bits_exp=%d (match)", v, frexpExp, bitsExp)
			}
		}
	})

	// Show bucket comparison for values near group boundaries.
	t.Run("bucket-boundary-comparison", func(t *testing.T) {
		bounds := cfg.groupBounds
		for i := 0; i < len(bounds); i++ {
			// Generate a value right at a bucket boundary.
			boundary := bounds[i] * 2 // undo the /2 from Frexp normalization
			// Test values just below, at, and just above.
			for _, offset := range []float64{-1e-10, 0, 1e-10} {
				testVal := boundary + offset
				bits := math.Float64bits(testVal)
				lookupIdx := int(cfg.bucketLookup[(bits>>bucketLookupShift)&0xFF])
				oldKey := promBucketKey(testVal, cfg.schema)
				newKey := lookupIdx + (int(bits>>52)-1022-1)*cfg.bucketsPerGroup

				if oldKey != newKey && i < 5 {
					t.Logf("boundary[%d]=%.15f offset=%+e: oldKey=%d newKey=%d (DIFFER)",
						i, boundary, offset, oldKey, newKey)
				}
			}
		}
	})
}