diff --git a/.github/workflows/performance-rebaseline.yml b/.github/workflows/performance-rebaseline.yml new file mode 100644 index 0000000..749d867 --- /dev/null +++ b/.github/workflows/performance-rebaseline.yml @@ -0,0 +1,59 @@ +name: Update CI Performance Baseline + +on: + workflow_dispatch: + inputs: + reason: + description: "Reason for re-baselining" + required: true + type: string + +permissions: + contents: write + +jobs: + rebaseline: + runs-on: ubuntu-latest + timeout-minutes: 15 + + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-node@v4 + with: + node-version: 22 + cache: npm + + - name: Install dependencies + run: npm ci + + - name: Run performance tests (update baseline) + run: npm run test:performance:update-baseline + env: + CI: true + PERF_TESTS: "1" + PERF_UPDATE_BASELINE: "1" + PERF_BASELINE_FILE: "performance-baseline.ci.json" + + - name: Commit CI baseline + run: | + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + git add test/performance/performance-baseline.ci.json + git commit -m "chore: update CI performance baseline + + Reason: ${{ github.event.inputs.reason }} + Triggered by: @${{ github.actor }} + Workflow run: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + git push + + - name: Summary + run: | + echo "## Performance Baseline Updated" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "**Reason:** ${{ github.event.inputs.reason }}" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "The CI baseline has been committed to \`test/performance/performance-baseline.ci.json\`." >> $GITHUB_STEP_SUMMARY + echo "Future performance test runs will compare against this baseline." 
>> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + cat coverage/performance-report.md >> $GITHUB_STEP_SUMMARY diff --git a/.github/workflows/performance-tests.yml b/.github/workflows/performance-tests.yml new file mode 100644 index 0000000..a80d10f --- /dev/null +++ b/.github/workflows/performance-tests.yml @@ -0,0 +1,40 @@ +name: Performance Tests + +on: + pull_request: + branches: [main] + paths: + - "srv/**" + - "lib/**" + - "test/performance/**" + - "package.json" + workflow_dispatch: + +concurrency: + group: performance-${{ github.ref }} + cancel-in-progress: true + +jobs: + performance: + runs-on: ubuntu-latest + timeout-minutes: 15 + + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-node@v4 + with: + node-version: 22 + cache: npm + + - name: Install dependencies + run: npm ci + + - name: Run performance tests + run: npm run test:performance + env: + CI: true + PERF_TESTS: "1" + PERF_BASELINE_FILE: "performance-baseline.ci.json" + PERF_MAX_REGRESSION: "0.3" + PERF_MAX_SLOPE_VARIANCE: "0.3" diff --git a/.gitignore b/.gitignore index e026a18..27cc349 100644 --- a/.gitignore +++ b/.gitignore @@ -39,4 +39,7 @@ mta_archives/ # Tests coverage/ TEST-mocha.xml -.nyc_output/ \ No newline at end of file +.nyc_output/ + +# Performance baselines (machine-specific) +test/performance/performance-baseline.json diff --git a/.mocharc.json b/.mocharc.json index 647522c..6306198 100644 --- a/.mocharc.json +++ b/.mocharc.json @@ -1,5 +1,6 @@ { "spec": "test/**/*.test.ts", + "ignore": ["test/performance/**"], "recursive": true, "timeout": 600000, "require": ["ts-node/register/transpile-only"], diff --git a/package.json b/package.json index 38faeb3..97d2d46 100644 --- a/package.json +++ b/package.json @@ -23,6 +23,9 @@ "tar": "npm run build && cd gen && npm pack", "watch-data-inspector-ui": "cds watch --open data-inspector-ui/webapp/index.html?sap-ui-xx-viewCache=false", "test": "cross-env CDS_TYPESCRIPT=true mocha", + "test:performance": "cross-env 
CDS_TYPESCRIPT=true PERF_TESTS=1 mocha --config test/performance/.mocharc.performance.json", + "test:performance:update-baseline": "cross-env CDS_TYPESCRIPT=true PERF_TESTS=1 PERF_UPDATE_BASELINE=1 mocha --config test/performance/.mocharc.performance.json", + "test:performance:check-drift": "node test/performance/check-baseline-drift.js", "coverage": "cross-env CDS_TYPESCRIPT=true c8 mocha" }, "peerDependencies": { diff --git a/test/performance/.mocharc.performance.json b/test/performance/.mocharc.performance.json new file mode 100644 index 0000000..d0f5895 --- /dev/null +++ b/test/performance/.mocharc.performance.json @@ -0,0 +1,7 @@ +{ + "spec": "test/performance/**/*.test.ts", + "recursive": true, + "timeout": 600000, + "require": ["ts-node/register/transpile-only"], + "exit": true +} diff --git a/test/performance/PERFORMANCE-TESTING-STRATEGY.md b/test/performance/PERFORMANCE-TESTING-STRATEGY.md new file mode 100644 index 0000000..ab82c2c --- /dev/null +++ b/test/performance/PERFORMANCE-TESTING-STRATEGY.md @@ -0,0 +1,616 @@ +# Performance Testing Strategy — @cap-js/data-inspector + +## 1. Overview + +This document describes the performance testing strategy for the `@cap-js/data-inspector` CAP plugin. The strategy focuses on **local processing benchmarks** — measuring the CPU/memory cost of in-process data transformations performed by the plugin's core classes, with external I/O (database, network) stubbed out. + +### Why not end-to-end? + +`data-inspector` is a CDS plugin that is consumed by host CAP applications. End-to-end latency depends heavily on the host application's database, network, and authentication stack — none of which are under this plugin's control. Testing at the class/method level isolates the plugin's own computational work and produces **stable, reproducible, CI-friendly** measurements. + +## 2. 
Architecture + +### 2.1 Test location + +``` +test/performance/ +├── .mocharc.performance.json # Mocha config (perf tests only) +├── ProcessingPerformance.test.ts # All benchmarks +├── check-baseline-drift.js # Long-term drift detection across git history +├── performance-baseline.json # Local developer baseline (gitignored) +├── performance-baseline.ci.json # CI baseline (committed to repo) +└── PERFORMANCE-TESTING-STRATEGY.md # This file +``` + +### 2.2 What is benchmarked + +| Group | Benchmark | What it measures | +| ------ | ------------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------- | +| **A1** | `EntityDefinitionReader.read (collection)` | Full collection read: iterate entities, build element metadata, filter hidden entities, paginate, sort, construct response | +| **A2** | `EntityDefinitionReader.read (filtered)` | Same as A1 but with `$filter=contains(name, ...)` to measure filter parsing overhead | +| **A3** | `EntityDefinitionReader._getEntityElements (via read)` | Element extraction scaling: one entity with N elements (N = 10→1000) | +| **B1** | `DataReader.read (response construction, DB stubbed)` | Response loop after DB query: entity resolution, key construction, record transformation. DB returns pre-built synthetic records. | +| **B2** | `DataReader._emitAuditlogs (stubbed audit-log)` | Audit log emission with sensitive data fields. Audit-log service is stubbed; measures per-record processing overhead. | + +### 2.3 Measurement methodology + +For each benchmark, measurements are taken across 5 input sizes: **10, 50, 100, 500, 1000**. + +For each size: +1. **Warmup** — 10 runs (configurable) to stabilize JIT +2. **Measurement** — 30 total runs (20 kept + 10 extra for outlier trimming) +3. **Outlier removal** — Runs are sorted by distance from preliminary mean; the 50% extra runs furthest from the mean are discarded +4. 
**Statistics** — Median, mean, standard deviation, 95% confidence interval, CV% + +### 2.4 Scaling analysis + +Three complementary metrics detect non-linear scaling: + +| Metric | What it detects | Threshold | +| ------------------------------------- | ---------------------------------------------------------- | ------------------------------------ | +| **Slope ratio** | Ratio of last slope segment to first. O(n) = ~1.0 | 🟢 ≤ 2.0 / 🟡 2.0–4.0 / 🔴 > 4.0 | +| **R² (coefficient of determination)** | How well medians fit a straight line. 1.0 = perfect linear | 🟢 ≥ 0.995 / 🟡 0.98–0.995 / 🔴 < 0.98 | +| **Per-item time** | Time per item at max size; detects absolute overhead | Compared to baseline (30% tolerance) | + +### 2.5 Baseline management + +Two baselines are maintained, following the same pattern as `ai-log-analyzer`: + +| File | Git status | Purpose | +| ------------------------------ | -------------- | ------------------------------------------- | +| `performance-baseline.json` | **gitignored** | Local developer baseline (machine-specific) | +| `performance-baseline.ci.json` | **committed** | CI baseline (shared, versioned reference) | + +- **Local**: Run `npm run test:performance:update-baseline` to create `performance-baseline.json` for your machine +- **CI**: The rebaseline workflow (`performance-rebaseline.yml`) runs benchmarks on CI hardware and commits `performance-baseline.ci.json` back to the repo. The PR workflow reads this committed file via `PERF_BASELINE_FILE=performance-baseline.ci.json` +- Local baselines are **machine-specific** (gitignored) because absolute timings vary by hardware +- The CI baseline is **committed** so it is reproducible, auditable via `git log`, and immune to cache eviction +- The first run without a baseline gracefully skips (no failure) + +### 2.6 Regression detection + +When a baseline exists, each benchmark result is compared: + +1. 
**Per-item time** at maximum size must not exceed `baseline × (1 + MAX_REGRESSION)` (default: +30%) +2. **Slope ratio** must not exceed `baseline × (1 + MAX_SLOPE_VARIANCE)` (default: +30%) + +#### Warn-only behavior (by design) + +Regressions are surfaced via `console.warn` — **they do not fail the test**. The test only fails if no benchmarks run at all. This is intentional for the following reasons: + +- **CI hardware variance**: GitHub Actions shared runners have noisy neighbors, variable CPU clock speeds, and occasional GC pauses. Even with a 30% threshold and outlier trimming, hard failures would produce flaky CI. +- **Primary value is scaling detection**: The slope ratio and R² metrics detect O(n²) bugs, which produce dramatic regressions (10x+). These are obvious even in warn-only mode. +- **Per-item regression is informational**: Absolute timing depends on hardware; a 30% regression on CI may not reproduce locally. + +Warnings appear in the CI console output and in the performance report files (`coverage/performance-report.md`), so PR reviewers can investigate if they see them. + +#### Evolving to a hard gate (future) + +If a hard gate is desired in the future: +1. Change `console.warn` to `expect` assertions in the regression checks +2. Consider increasing the threshold to 50% for CI to absorb more noise +3. Alternatively, add a separate CI job with `continue-on-error: true` so it shows as a yellow check (not a red X) — signaling "review needed" without blocking merge + +## 3. 
Running the Tests + +### Local development + +```bash +# First time: create your machine's baseline +npm run test:performance:update-baseline + +# Subsequent runs: compare against baseline +npm run test:performance + +# Check for gradual drift across CI baseline git history +npm run test:performance:check-drift +``` + +### Environment variables + +| Variable | Default | Description | +| --------------------------- | --------------------------- | ---------------------------------------------- | +| `PERF_TESTS` | `0` | Set to `1` to enable performance tests | +| `PERF_UPDATE_BASELINE` | `0` | Set to `1` to write new baseline after run | +| `PERF_MAX_REGRESSION` | `0.3` | Maximum allowed per-item time regression (30%) | +| `PERF_MAX_SLOPE_VARIANCE` | `0.3` | Maximum allowed slope ratio increase (30%) | +| `PERF_WARMUP_RUNS` | `10` | Warmup iterations before measurement | +| `PERF_MEASUREMENT_RUNS` | `20` | Measurement iterations (kept after trimming) | +| `PERF_OUTLIER_TRIM_PERCENT` | `0.5` | Extra runs as fraction of measurement runs | +| `PERF_BASELINE_FILE` | `performance-baseline.json` | Baseline filename | + +### CI workflows + +| Workflow | Trigger | Purpose | +| ---------------------------- | -------------------------------------------------------- | -------------------------------------------------------------- | +| `performance-tests.yml` | PR to `main` (when srv/, lib/, test/performance/ change) | Run benchmarks, compare to committed CI baseline, log warnings | +| `performance-rebaseline.yml` | Manual dispatch | Run benchmarks on CI and commit `performance-baseline.ci.json` | + +## 4. 
Reports + +After each run, two report files are generated in `coverage/`: + +- **`performance-report.json`** — Machine-readable full results +- **`performance-report.md`** — Human-readable markdown with emoji indicators + +The markdown report includes: +- Environment details (Node version, CPU, memory, load) +- Test configuration (warmup, measurement, trim settings) +- System warnings (high CPU load, memory pressure) +- Results table with timing medians, CV%, per-item times, memory deltas, slope ratios, R², and baseline comparisons +- Legend explaining all indicators + +## 5. Synthetic Data Design + +All benchmarks use **synthetic data** rather than real CDS models: + +- **Entities**: Generated with configurable element counts, including keys, typed fields, hidden elements, associations, and various annotations (`@HideFromDataInspector`, `@PersonalData.IsPotentiallySensitive`, `@Core.Computed`) +- **Records**: Generated with configurable field counts, simulating realistic DB query results with UUIDs, strings, integers, and booleans +- **CDS Runtime**: `cds.model.all()`, `cds.services.db.run()`, `cds.parse.expr()`, and `cds.connect.to()` are monkey-patched per benchmark to return synthetic data, isolating the plugin's processing from actual CDS bootstrapping + +This approach ensures: +- No dependency on database state +- Deterministic, reproducible inputs +- Configurable scaling (the `sizes` array can be adjusted) +- Fast execution (no CDS server boot required) + +## 6. Baseline Drift Detection + +The `check-baseline-drift.js` script detects **gradual performance degradation** that no single run would catch. It reads the git history of `performance-baseline.ci.json` and analyzes how `perItemMsAtMax` values have changed across commits. 
+ +### What it detects + +| Condition | Default Threshold | Severity | +| ------------------------------------------------------- | -------------------------------- | ------------------- | +| Total per-item cost increase across the examined window | 20% (`DRIFT_MAX_TOTAL_INCREASE`) | **FAIL** | +| Consecutive per-item cost increases | 3 (`DRIFT_CONSECUTIVE_WARN`) | **WARN** (advisory) | + +### How it works + +1. Queries `git log` for commits that touched `performance-baseline.ci.json` +2. Loads up to 10 historical snapshots (configurable via `DRIFT_WINDOW`) +3. For each benchmark, computes total increase, consecutive-increase streak, and OLS trend slope +4. Outputs a history table and per-benchmark analysis + +### Configuration (env vars) + +| Variable | Default | Description | +| -------------------------- | ----------------------------------------------- | --------------------------------------------- | +| `DRIFT_BASELINE_FILE` | `test/performance/performance-baseline.ci.json` | Git path of the baseline file to inspect | +| `DRIFT_WINDOW` | `10` | Number of recent commits to examine | +| `DRIFT_MAX_TOTAL_INCREASE` | `0.20` | Max allowed total increase (fraction) | +| `DRIFT_CONSECUTIVE_WARN` | `3` | Consecutive increases before advisory warning | + +### When to use + +- After accumulating 2+ CI baseline snapshots in git history (requires running the rebaseline workflow at least twice) +- As part of periodic performance health checks +- Before major releases, to verify no gradual cost drift has occurred + +## 7. Future Enhancements + +As the plugin evolves, consider adding: + +1. **Memory profiling benchmarks** — Track heap growth across repeated operations to detect memory leaks and unexpected resource consumption growth +2. **Concurrent simulation** — If the plugin adds stateful processing, add benchmarks that simulate concurrent request patterns +3. 
**Larger scale tests** — Extend the sizes array to [100, 500, 1000, 5000, 10000] if real-world deployments involve very large CDS models +4. **UI rendering benchmarks** — If the SAPUI5 frontend becomes a performance concern, add browser-based benchmarks using Puppeteer + +--- + +## Appendix: Performance Testing 101 — Concepts & KPIs Explained + +This appendix explains every statistical concept and KPI used in this testing strategy from first principles. If you've never done performance benchmarking before, start here. + +--- + +### A.1 Why do we measure performance at all? + +Software can be "correct" (produces the right answer) yet still unusable if it's too slow. Performance testing answers two questions: + +1. **Does it scale?** — If the input doubles, does the time roughly double (good) or quadruple (bad)? +2. **Did it get slower?** — Compared to last week's version, is the same operation taking longer? + +Question 1 is about **algorithmic complexity**. Question 2 is about **regression detection**. + +--- + +### A.2 Big-O Notation + +Big-O describes how an algorithm's cost grows as input size *n* increases: + +| Notation | Name | Example | Doubling *n* does what? | +| -------------- | ---------- | -------------------------------- | --------------------------------- | +| **O(1)** | Constant | Hash table lookup | Time stays the same | +| **O(n)** | Linear | Scanning every item in a list | Time doubles | +| **O(n²)** | Quadratic | Nested loop over all pairs | Time quadruples (4×) | +| **O(n³)** | Cubic | Triple nested loop | Time increases 8× | +| **O(n·log n)** | Log-linear | Good sort algorithms (mergesort) | Time roughly doubles (a bit more) | + +**Our goal**: every operation in data-inspector should be **O(n)** or better. If we accidentally introduce an O(n²) algorithm (e.g., a nested loop that compares every entity to every other entity), the benchmarks will catch it. + +--- + +### A.3 Median vs. Mean — Which "average" to use? 
+ +Both are measures of central tendency, but they behave differently with outliers: + +- **Mean** (arithmetic average): Sum all values, divide by count. Sensitive to outliers — one very slow run pulls the mean up dramatically. +- **Median**: Sort all values, pick the middle one. Robust to outliers — even if one run was 100× slower, the median barely moves. + +**Why we use the median for benchmark reporting**: In benchmarking, you occasionally get "hiccup" runs where the garbage collector fires, the OS scheduler intervenes, or the CPU thermal-throttles. The median naturally ignores these glitches without requiring you to manually identify and remove them. + +We still report the mean (and use it internally for outlier detection), but the **median is the primary metric** in our results. + +--- + +### A.4 Standard Deviation (σ) and Coefficient of Variation (CV%) + +Imagine you time a function 20 times and get these results (in ms): + +``` +Run 1: 5.1 Run 2: 4.9 Run 3: 5.0 Run 4: 5.2 Run 5: 5.0 ... +``` + +The **mean** is 5.04ms. But how *consistent* are these numbers? That's what standard deviation tells you. + +#### Standard Deviation (σ) — "How spread out are my measurements?" + +Think of σ as the "average distance from the mean." Here's the intuition: + +1. Take each measurement and ask: "How far is this from the mean?" + - Run 1: |5.1 - 5.04| = 0.06 + - Run 2: |4.9 - 5.04| = 0.14 + - Run 3: |5.0 - 5.04| = 0.04 + - ...and so on for all 20 runs +2. Square those distances (so negative and positive don't cancel out) +3. Average the squared distances +4. Take the square root (to get back to the original units — milliseconds) + +The result is σ. A small σ (say 0.08ms when the mean is 5ms) means your measurements are very consistent. A large σ (say 2.5ms when the mean is 5ms) means they're all over the place. + +#### Coefficient of Variation (CV%) — "Is that spread *relatively* big or small?" + +Here's the problem with σ alone: is σ = 2ms "good" or "bad"? 
It depends on context: + +- If the mean is **1000ms**, then σ = 2ms is tiny (0.2% of the mean) → very stable +- If the mean is **5ms**, then σ = 2ms is huge (40% of the mean) → extremely noisy + +CV% solves this by expressing σ as a percentage of the mean: + +``` +CV% = (σ / mean) × 100 +``` + +This lets you compare the stability of a 5ms benchmark to a 500ms benchmark on equal footing. + +**Real-world example from our tests**: +- Benchmark A: mean = 0.04ms, σ = 0.008ms → CV = 20% 🔴 (noisy — the function is so fast that GC jitter dominates) +- Benchmark B: mean = 3.85ms, σ = 0.12ms → CV = 3.1% 🟢 (stable — the function takes long enough that noise is negligible) + +**Our thresholds**: + +| CV% | Indicator | Meaning | +| ----- | --------- | ------------------------------------------------------ | +| ≤ 5% | 🟢 | Stable — measurements are repeatable | +| 5–15% | 🟡 | Acceptable for Node.js (GC pauses cause some variance) | +| > 15% | 🔴 | High noise — consider more warmup or runs | + +--- + +### A.5 Confidence Interval (CI) + +Imagine you measured a function 20 times and got a median of 5.23ms. If you ran those 20 measurements again tomorrow, would you get exactly 5.23ms again? Probably not — maybe 5.18ms, or 5.31ms. The **confidence interval** tells you the range where the "true" value most likely lives. + +#### The analogy + +Think of it like measuring your height with a wobbly ruler. You measure yourself 5 times and get: 175.2cm, 174.8cm, 175.1cm, 175.5cm, 174.9cm. You're probably not exactly 175.1cm tall, but you're pretty confident you're somewhere between 174.8cm and 175.5cm. That range is your confidence interval. 
+ +#### The math (simplified) + +``` +CI = ±1.96 × (σ / √n) +``` + +Breaking this down: +- **σ** = standard deviation (how noisy your measurements are — see A.4) +- **√n** = square root of the number of runs (more runs = narrower interval, because more data = more certainty) +- **1.96** = a magic number from statistics that gives you 95% confidence (you can think of it as "about 2") + +So the formula says: *"Take the noise level (σ), shrink it by how many measurements you took (√n), and multiply by ~2."* + +#### A worked example + +- You measured 20 runs. Median = 5.23ms. σ = 0.22ms. +- CI = ±1.96 × (0.22 / √20) = ±1.96 × (0.22 / 4.47) = ±1.96 × 0.049 = **±0.097ms** +- So we report: **5.23ms ±0.10ms** +- Meaning: "We're 95% confident the true typical time is between 5.13ms and 5.33ms." + +#### Why it matters for us + +When comparing two benchmark results (e.g., before vs. after a code change), if their confidence intervals overlap, the difference is probably just measurement noise — not a real performance change. For example: +- Before: 5.23ms ±0.10ms → range [5.13, 5.33] +- After: 5.28ms ±0.12ms → range [5.16, 5.40] +- The ranges overlap heavily → **no meaningful difference** (don't panic!) + +But if: +- Before: 5.23ms ±0.10ms → range [5.13, 5.33] +- After: 6.80ms ±0.15ms → range [6.65, 6.95] +- No overlap at all → **real regression** (investigate!) + +--- + +### A.6 Outlier Trimming + +Raw benchmark timings often contain outliers — unusually slow (or fast) runs caused by GC pauses, OS scheduling, background processes, etc. + +**Our approach** (mean-distance trimming): +1. Run 30 iterations (20 to keep + 10 extra) +2. Compute the preliminary mean of all 30 +3. For each run, compute its distance from the mean +4. Sort by distance (closest to mean first) +5. 
Keep the 20 closest; discard the 10 furthest + +This differs from simple "remove top/bottom 10%" trimming, which always discards the same number of runs from each end: mean-distance trimming discards whichever runs are furthest from the central tendency, regardless of direction — if all the outliers are slow runs, all 10 discards can come from the slow side. + +--- + +### A.7 Warmup Runs + +JavaScript engines (V8 in Node.js) use **Just-In-Time (JIT) compilation**. The first few calls to a function are interpreted (slow), then V8 compiles them to optimized machine code (fast). This process is called "warming up." + +If you measure the first 5 runs, you're measuring the interpreter, not the optimized code that will run in production. That's why we run 10 warmup iterations (discarded) before starting measurements. + +**Think of it like warming up a car engine** — you don't measure fuel efficiency during the first 30 seconds after a cold start. + +--- + +### A.8 Slope and Slope Ratio + +These are the core metrics for detecting whether an algorithm is O(n) or worse. The key idea is surprisingly simple: **if adding more items always costs the same amount of extra time, the algorithm is linear. If adding more items costs *increasingly* more time, it's not.** + +#### Slope — "How much extra time does each additional item cost?" + +Imagine you're timing a function with different input sizes and you get: + +``` +Size 10 → took 1ms +Size 50 → took 5ms +Size 100 → took 10ms +Size 500 → took 50ms +Size 1000 → took 100ms +``` + +The **slope** between any two points is the "price per additional item": + +``` +slope = (time₂ - time₁) / (size₂ - size₁) +``` + +For the data above: +- Between size 10→50: slope = (5 - 1) / (50 - 10) = 4 / 40 = **0.1ms per item** +- Between size 500→1000: slope = (100 - 50) / (1000 - 500) = 50 / 500 = **0.1ms per item** + +The slope is the same! Each additional item always costs 0.1ms, regardless of whether you have 10 items or 1000. This is classic **O(n) linear** behavior. 
+ +Now imagine a *bad* function: + +``` +Size 10 → took 1ms +Size 50 → took 5ms +Size 100 → took 20ms +Size 500 → took 250ms +Size 1000 → took 1000ms +``` + +- Between size 10→50: slope = (5 - 1) / 40 = **0.1ms per item** +- Between size 500→1000: slope = (1000 - 250) / 500 = **1.5ms per item** + +The slope grew 15×! Adding items at large scale is much more expensive than at small scale. This screams **O(n²)**. + +#### Slope Ratio — "Did the slope stay the same or grow?" + +Instead of eyeballing slopes, we compute a single number: + +``` +slope_ratio = last_slope / first_slope +``` + +Using the examples above: +- Good function: 0.1 / 0.1 = **1.0** (perfect — the cost per item never changed) +- Bad function: 1.5 / 0.1 = **15.0** (terrible — the cost per item grew 15×) + +**Think of it like a road trip**: If driving the first 100km takes 1 hour, and the last 100km also takes 1 hour, the "slope" (time per km) is constant — that's a straight highway (linear). If the last 100km takes 5 hours, the road got progressively worse — that's like a quadratic algorithm bogging down as data grows. 
+ +**Interpretation**: + +| Slope ratio | What it means | Big-O | +| ----------- | ------------------------------------------------------ | --------------- | +| ~1.0 | Each additional item costs the same regardless of size | **O(n)** | +| ~2.0 | Cost per item roughly doubles at larger scale | **~O(n·log n)** | +| ~4.0+ | Cost per item grows dramatically — likely quadratic | **O(n²)** | +| ~10.0+ | Severe super-linear scaling | **O(n²)+** | + +**Our thresholds**: + +| Range | Indicator | Assessment | +| ------- | --------- | ------------------------------------------------ | +| ≤ 2.0 | 🟢 | Consistent with O(n) linear scaling | +| 2.0–4.0 | 🟡 | Suspicious — investigate for hidden nested loops | +| > 4.0 | 🔴 | Clearly non-linear (O(n²) or worse) | + +--- + +### A.9 R² — Coefficient of Determination + +R² answers a simple question: **"If I draw the best possible straight line through my data, how well does it fit?"** + +#### The school analogy + +Imagine you're a teacher plotting students' study hours (x-axis) vs. exam scores (y-axis). If every student who studied twice as long scored exactly twice as high, all the dots would fall on a perfect straight line — R² = 1.0. + +In reality, some students score higher or lower than the line predicts. R² tells you what fraction of the pattern is explained by the straight line vs. what fraction is "random scatter." + +#### Visually + +``` +R² ≈ 1.0 (linear) R² ≈ 0.7 (curved/noisy) + +Time ↑ Time ↑ + | • | • + | • | • + | • | • + | • | • + | • | • + +----------→ Size +----------→ Size + Points hug the line Points curve away from the line +``` + +#### How it works (no math degree needed) + +1. **Draw the best straight line** through your 5 data points (the computer finds the line that minimizes the total distance from all points) +2. **Measure the "misses"**: For each point, how far is it from the line? Square those distances and add them up. Call this **"unexplained scatter."** +3. 
**Measure the "baseline scatter"**: How far is each point from the simple average (a flat horizontal line)? Square and sum. Call this **"total scatter."** +4. **Compute R²**: + +``` +R² = 1 - (unexplained scatter / total scatter) +``` + +- If the line explains everything → unexplained scatter = 0 → R² = 1.0 +- If the line explains nothing (data is random) → unexplained = total → R² = 0.0 + +#### What R² values mean for our benchmarks + +| R² | Meaning | +| ---------- | --------------------------------------------------------------------------- | +| 1.000 | All points fall exactly on a straight line — perfectly linear | +| 0.995+ | Excellent linear fit — minor measurement noise only | +| 0.98–0.995 | Mostly linear with some deviation — could be noise or mild non-linearity | +| < 0.98 | Clearly not linear — the relationship curves (quadratic, exponential, etc.) | + +#### Why do we need BOTH slope ratio and R²? + +They catch **different types of problems**: + +**Slope ratio** only looks at the first and last segments — like checking the start and end of a road trip. **R²** looks at every point along the way. + +Consider this scenario: +``` +Size: 10 50 100 500 1000 +Time: 1ms 5ms 30ms 50ms 100ms +``` + +- Slope ratio = [(100-50)/(1000-500)] ÷ [(5-1)/(50-10)] = 0.1 / 0.1 = **1.0** → looks perfect! +- But R² ≈ **0.96** (below our 0.98 threshold 🔴) → wait, something's off! + +What happened? The function has a "hump" at size 100 (30ms is way above the straight line). The slope ratio missed it because it only compared the first and last segments, but R² caught it because it checks every point. + +That's why we use both: **slope ratio catches endpoint divergence, R² catches mid-range curvature.** + +--- + +### A.10 Per-Item Time + +This is the simplest metric — just divide total time by input size: + +``` +per_item_ms = median_time_ms / size +``` + +For a truly O(n) algorithm, per-item time should be roughly constant regardless of size. If per-item time grows with size, you have a scaling problem. 
+ +**Per-item time at max size** (the value stored in the baseline) is the most important data point because it amplifies any scaling issues. At size 10, even an O(n²) algorithm might only add 0.001ms overhead. At size 1000 (100× more items), that same quadratic term has grown by 100² = 10,000× to about 10ms — visible and measurable. + +--- + +### A.11 Memory Delta (Heap ΔMB) + +We measure `process.memoryUsage().heapUsed` before and after each benchmark: + +``` +ΔMB = (heapAfter - heapBefore) / (1024 × 1024) +``` + +This catches: +- **Hidden allocations** — Creating intermediate arrays, string concatenations, or object copies that scale with input size +- **Memory leaks** — Objects that survive garbage collection because they're accidentally retained + +**Note**: JavaScript GC is non-deterministic, so memory deltas are noisier than timing measurements. They're included as an advisory signal, not a hard gate. + +--- + +### A.12 Baseline and Regression Detection + +A **baseline** is a snapshot of your benchmark results at a known-good point in time. It records, for each benchmark: +- `perItemMsAtMax` — per-item time at maximum size +- `slopeRatio` — scaling behavior +- `r2` — linearity score + +**Regression detection** compares current results to the baseline: + +``` +allowed = baseline_value × (1 + threshold) + +# Example with 30% threshold: +# If baseline per-item time = 0.005ms +# allowed = 0.005 × 1.30 = 0.0065ms +# If current = 0.007ms → REGRESSION WARNING +``` + +**Why 30% threshold?** Benchmark noise on shared CI runners (GitHub Actions) typically causes 5–15% variance. A 30% threshold means only genuine code-level regressions trigger warnings, not hardware noise. + +--- + +### A.13 Putting It All Together — Reading a Result Row + +Here's how to read a line from the performance report: + +``` +| EntityDefReader.read | 0.05, 0.19, 0.38, 1.92, 3.85 | 4.2% 🟢 | 0.0050, 0.0038, 0.0038, 0.0038, 0.0039 | 0.12, 0.15, 0.18, 0.22, 0.25 | 1.0234 🟢 | 0.9998 🟢 | 0.0040 | 1.0100 | 0.9995 | +``` + +Reading left to right: +1. 
**Timings** [0.05→3.85ms]: Time grows ~77× as input grows 100× → slightly sub-linear (good) +2. **CV% 4.2% 🟢**: Low variance — stable measurements +3. **Per-item** [0.005→0.0039ms]: Cost per item stays flat → O(n) confirmed +4. **Memory** [0.12→0.25MB]: Slight growth — proportional to input (expected) +5. **Slope ratio 1.0234 🟢**: Almost exactly 1.0 → perfectly linear +6. **R² 0.9998 🟢**: Nearly perfect straight line +7. **Baseline columns**: Previous per-item=0.004ms, slope=1.01, R²=0.9995 — no regression + +**Verdict**: This benchmark is healthy — linear scaling, stable measurements, no regression. + +--- + +### A.14 Quick Reference: All Emoji Indicators + +| Metric | 🟢 Good | 🟡 Watch | 🔴 Problem | +| ----------- | ------- | ------------- | --------- | +| Slope ratio | ≤ 2.0 | 2.0 – 4.0 | > 4.0 | +| R² | ≥ 0.995 | 0.980 – 0.995 | < 0.980 | +| CV% | ≤ 5% | 5% – 15% | > 15% | + +### A.15 Glossary + +| Term | Definition | +| ----------------------- | ------------------------------------------------------------------------------------------------------ | +| **Benchmark** | A controlled, repeatable experiment measuring one specific operation | +| **Warmup** | Discarded initial runs that let the JIT compiler optimize the code path | +| **Outlier** | A measurement far from the typical value, usually caused by GC/OS interference | +| **Trimming** | Removing outlier measurements before computing statistics | +| **Median** | The middle value when measurements are sorted; our primary metric | +| **Mean** | The arithmetic average of all measurements | +| **Standard deviation** | How spread out measurements are from the mean | +| **CV%** | Standard deviation as a percentage of the mean — normalized measure of noise | +| **Confidence interval** | Range within which the true value likely falls (95% probability) | +| **Slope** | Rate of time change per unit of input size between two measurement points | +| **Slope ratio** | Last slope ÷ first slope; 1.0 = perfectly linear growth 
| +| **R²** | Coefficient of determination; 1.0 = data falls perfectly on a straight line | +| **Per-item time** | Total time ÷ input size; should stay constant for O(n) algorithms | +| **Baseline** | Stored snapshot of benchmark results used as the reference for regression detection | +| **Regression** | A statistically significant increase in cost compared to the baseline | +| **Drift** | Gradual, incremental performance degradation across many commits (no single commit triggers a warning) | +| **Heap delta** | Change in V8 heap memory usage during a benchmark run | +| **JIT** | Just-In-Time compilation — V8's process of compiling JavaScript to machine code at runtime | +| **GC** | Garbage Collection — V8's automatic memory reclamation process | diff --git a/test/performance/ProcessingPerformance.test.ts b/test/performance/ProcessingPerformance.test.ts new file mode 100644 index 0000000..a1cff54 --- /dev/null +++ b/test/performance/ProcessingPerformance.test.ts @@ -0,0 +1,445 @@ +/** + * Performance benchmarks for @cap-js/data-inspector. + * + * Measures local processing cost of EntityDefinitionReader and DataReader + * across multiple input sizes (10→1000) to detect non-linear scaling and + * regressions against a stored baseline. + * + * Run: + * npm run test:performance # compare against baseline + * npm run test:performance:update-baseline # create/update baseline + * + * See PERFORMANCE-TESTING-STRATEGY.md for full documentation. 
+ */ + +import cds from "@sap/cds"; +import fs from "fs"; +import path from "path"; +import os from "os"; +import { expect } from "chai"; + +import { EntityDefinitionReader } from "../../srv/EntityDefinitionReader"; +import { DataReader } from "../../srv/DataReader"; + +import { + type BenchmarkResult, + type BaselineData, + type Report, + sizes, + checkSystemState, + benchmarkSync, + benchmarkAsync, + buildMarkdownReport, + buildSyntheticEntities, + buildSyntheticRecords, + buildEntityDefinitionRequest, + buildDataReadRequest, +} from "./helpers"; + +// --------------------------------------------------------------------------- +// Configuration (env-overridable) +// --------------------------------------------------------------------------- +const PERF_ENABLED = process.env.PERF_TESTS === "1"; +const UPDATE_BASELINE = process.env.PERF_UPDATE_BASELINE === "1"; +const MAX_REGRESSION = Number(process.env.PERF_MAX_REGRESSION ?? "0.3"); +const MAX_SLOPE_VARIANCE = Number(process.env.PERF_MAX_SLOPE_VARIANCE ?? "0.3"); +const WARMUP_RUNS = Number(process.env.PERF_WARMUP_RUNS ?? "10"); +const MEASUREMENT_RUNS = Number(process.env.PERF_MEASUREMENT_RUNS ?? "20"); +const OUTLIER_TRIM_PERCENT = Number(process.env.PERF_OUTLIER_TRIM_PERCENT ?? "0.5"); + +const BASELINE_FILENAME = process.env.PERF_BASELINE_FILE ?? "performance-baseline.json"; +const BASELINE_PATH = path.resolve(__dirname, BASELINE_FILENAME); +const REPORT_PATH = path.resolve(__dirname, "..", "..", "coverage", "performance-report.json"); +const REPORT_MD_PATH = path.resolve(__dirname, "..", "..", "coverage", "performance-report.md"); + +const describePerf = PERF_ENABLED ? 
// ---------------------------------------------------------------------------
// Test suite
// ---------------------------------------------------------------------------
describePerf("Performance - Data Inspector Processing", function () {
  this.timeout(300000); // 5 minutes

  /** Pre-built synthetic data per size (populated in before hook). */
  const entitiesBySize = new Map();
  const recordsBySize = new Map();

  /** Assigned by the benchmark test; stays undefined when the suite is skipped. */
  let report: Report;

  // Load CDS model so cds.model, cds.parse, cds.ql are available
  before(async function () {
    const csn = await cds.load(path.resolve(__dirname, "..", ".."));
    cds.model = cds.compile.for.nodejs(csn);

    // Without a baseline there is nothing to compare against: skip instead of failing,
    // and tell the user how to create one for the current environment.
    if (!UPDATE_BASELINE && !fs.existsSync(BASELINE_PATH)) {
      const isCI = process.env.CI === "true" || !!process.env.GITHUB_ACTIONS;
      const message = isCI
        ? `Performance baseline not found at ${BASELINE_FILENAME}.\n` +
          " To establish the CI baseline, run the 'Update CI Performance Baseline' workflow.\n" +
          " See: .github/workflows/performance-rebaseline.yml"
        : `Performance baseline not found at ${BASELINE_FILENAME}.\n` +
          " Run 'npm run test:performance:update-baseline' to create a baseline for your machine.";
      console.log(`\n ⚠️ Skipping performance tests: ${message}\n`);
      this.skip();
    }
  });

  // Pre-generate synthetic data for all sizes
  before(() => {
    for (const size of sizes) {
      entitiesBySize.set(size, buildSyntheticEntities(size));
      recordsBySize.set(size, buildSyntheticRecords(size));
    }
  });

  // Write reports and optionally update baseline after all benchmarks
  after(() => {
    if (!report) return;

    const reportDir = path.dirname(REPORT_PATH);
    fs.mkdirSync(reportDir, { recursive: true });
    fs.writeFileSync(REPORT_PATH, JSON.stringify(report, null, 2), "utf8");
    fs.writeFileSync(REPORT_MD_PATH, buildMarkdownReport(report), "utf8");

    if (UPDATE_BASELINE) {
      // The baseline keeps only the values the regression check compares against.
      fs.writeFileSync(
        BASELINE_PATH,
        JSON.stringify(
          report.results.reduce((acc, result) => {
            acc[result.name] = {
              sizes: result.sizes,
              perItemMsAtMax: result.perItemMs[result.perItemMs.length - 1],
              slopeRatio: result.slopeRatio,
              r2: result.r2,
            };
            return acc;
          }, {} as BaselineData),
          null,
          2
        ),
        "utf8"
      );
    }
  });

  it("should keep local processing roughly linear", async () => {
    const results: BenchmarkResult[] = [];

    // -------------------------------------------------------------------
    // Group A: EntityDefinitionReader — pure in-memory, no DB
    // -------------------------------------------------------------------

    // A1: Collection read — iterate entities, build metadata, paginate, sort
    results.push(
      benchmarkSync("EntityDefinitionReader.read (collection)", (size) => {
        const entities = entitiesBySize.get(size)!;
        const originalAll = cds.model.all;
        cds.model.all = ((kind: string) => {
          if (kind === "entity") return entities;
          if (kind === "service") return [];
          return originalAll.call(cds.model, kind);
        }) as any;

        try {
          const reader = new EntityDefinitionReader();
          const req = buildEntityDefinitionRequest({ top: size });
          reader.read(req as any);
        } finally {
          cds.model.all = originalAll;
        }
      })
    );

    // A2: Collection read with $filter — measures filter parsing overhead
    results.push(
      benchmarkSync("EntityDefinitionReader.read (filtered)", (size) => {
        const entities = entitiesBySize.get(size)!;
        const originalAll = cds.model.all;
        cds.model.all = ((kind: string) => {
          if (kind === "entity") return entities;
          if (kind === "service") return [];
          return originalAll.call(cds.model, kind);
        }) as any;

        try {
          const reader = new EntityDefinitionReader();
          const req = buildEntityDefinitionRequest({
            filter: `contains(name, 'Entity')`,
            top: size,
          });
          reader.read(req as any);
        } finally {
          cds.model.all = originalAll;
        }
      })
    );

    // A3: Element extraction — one entity with N elements (N = 10→1000)
    results.push(
      benchmarkSync("EntityDefinitionReader._getEntityElements (via read)", (size) => {
        const entity = buildSyntheticEntities(1, size)[0];
        const entities = [entity];
        const originalAll = cds.model.all;
        cds.model.all = ((kind: string) => {
          if (kind === "entity") return entities;
          if (kind === "service") return [];
          return originalAll.call(cds.model, kind);
        }) as any;

        try {
          const reader = new EntityDefinitionReader();
          const req: any = {
            params: [{ name: entity.name }],
            query: { SELECT: { columns: ["*"] } },
            req: { query: {} },
            reject: (code: number, msg: string) => {
              throw new Error(`${code} ${msg}`);
            },
          };
          reader.read(req as any);
        } finally {
          cds.model.all = originalAll;
        }
      })
    );

    // -------------------------------------------------------------------
    // Group B: DataReader — response construction (DB stubbed)
    // -------------------------------------------------------------------

    // B1: Response loop — entity resolution, key construction, record transformation
    results.push(
      await benchmarkAsync("DataReader.read (response construction, DB stubbed)", async (size) => {
        const records = recordsBySize.get(size)!;
        const entityName = "perf.test.Entity_0";

        const syntheticEntity = buildSyntheticEntityForDataReader(entityName);

        // Stub cds.model.all
        const originalAll = cds.model.all;
        cds.model.all = ((kind: string) => {
          if (kind === "entity") return [syntheticEntity];
          if (kind === "service") return [];
          return originalAll.call(cds.model, kind);
        }) as any;

        // Stub cds.services.db.run → return synthetic records
        const originalDb = cds.services.db;
        (cds.services as any).db = {
          run: async () => {
            const result = [...records];
            (result as any).$count = records.length;
            return result;
          },
        };

        // Stub cds.ql.SELECT → chainable builder
        const originalQL = cds.ql;
        (cds as any).ql = {
          ...originalQL,
          SELECT: {
            from: () => {
              const builder: any = {
                columns: () => builder,
                where: () => builder,
                orderBy: () => builder,
                limit: (l: number, o: number) => {
                  builder.SELECT = { limit: { offset: { val: o } }, count: true };
                  return builder;
                },
                SELECT: { limit: { offset: { val: 0 } }, count: true },
              };
              return builder;
            },
          },
        };

        // Stub cds.parse.expr
        const originalParse = cds.parse;
        (cds as any).parse = {
          ...originalParse,
          expr: () => ({
            xpr: [{ ref: ["entityName"] }, "=", { val: entityName }],
          }),
        };

        try {
          const reader = new DataReader();
          const req = buildDataReadRequest(entityName);
          await reader.read(req as any);
        } finally {
          cds.model.all = originalAll;
          (cds.services as any).db = originalDb;
          (cds as any).ql = originalQL;
          (cds as any).parse = originalParse;
        }
      })
    );

    // B2: Audit log emission — sensitive data fields, stubbed audit-log service
    results.push(
      await benchmarkAsync("DataReader._emitAuditlogs (stubbed audit-log)", async (size) => {
        const records = recordsBySize.get(size)!;

        const syntheticEntity: any = {
          name: "perf.test.SensitiveEntity",
          "@PersonalData.DataSubjectRole": "Customer",
          elements: {
            id: { type: "cds.UUID", key: true },
            email: { type: "cds.String", key: false, "@PersonalData.IsPotentiallySensitive": true },
            phone: { type: "cds.String", key: false, "@PersonalData.IsPotentiallySensitive": true },
            name: { type: "cds.String", key: false },
          },
          get keyElements4DataInspector() {
            return ["id"];
          },
        };

        // Use the map index rather than records.indexOf(r): indexOf made this setup O(n²)
        // inside the timed function, which skews a benchmark meant to verify linear scaling.
        const sensitiveRecords = records.map((r: any, index: number) => ({
          ...r,
          email: `user_${r.id}@example.com`,
          phone: `+1-555-${String(index).padStart(4, "0")}`,
          name: `User ${r.id}`,
        }));

        // Stub cds.env.requires to include audit-log. Only `requires` is saved and restored:
        // re-assigning cds.env to a spread copy would replace the real env object with a
        // plain object for the rest of the process.
        const originalRequires = cds.env.requires;
        (cds.env as any).requires = {
          ...originalRequires,
          "audit-log": { kind: "audit-log-to-console" },
        };

        // Stub cds.connect.to → return stubbed audit-log service
        const originalConnect = cds.connect;
        (cds as any).connect = {
          ...originalConnect,
          to: async (serviceName: string) => {
            if (serviceName === "audit-log") return { log: async () => {} };
            return originalConnect.to(serviceName);
          },
        };

        try {
          const reader = new DataReader();
          await (reader as any)._emitAuditlogs(syntheticEntity, sensitiveRecords);
        } finally {
          (cds.env as any).requires = originalRequires;
          (cds as any).connect = originalConnect;
        }
      })
    );

    // -------------------------------------------------------------------
    // Build report and check regressions
    // -------------------------------------------------------------------
    report = buildReport(results);

    if (report.systemWarnings.length > 0) {
      console.log("\n System Warnings:");
      report.systemWarnings.forEach((w) => console.log(` ! ${w}`));
      console.log("");
    }

    // Load baseline and check for regressions
    let baseline: BaselineData | undefined;
    if (fs.existsSync(BASELINE_PATH)) {
      baseline = JSON.parse(fs.readFileSync(BASELINE_PATH, "utf8")) as BaselineData;
      report.baseline = baseline;
    }

    expect(results).to.have.length.greaterThan(0);
    checkRegressions(results, baseline);
  });
});
/**
 * Builds the Report object from benchmark results and current environment.
 *
 * @param results Benchmark results collected by the test run.
 * @returns The report, including the thresholds and run configuration in effect,
 *          host information, and any system-state warnings.
 */
function buildReport(results: BenchmarkResult[]): Report {
  return {
    timestamp: new Date().toISOString(),
    sizes: [...sizes],
    results,
    regressionThreshold: MAX_REGRESSION,
    slopeVarianceThreshold: MAX_SLOPE_VARIANCE,
    testConfig: {
      warmupRuns: WARMUP_RUNS,
      measurementRuns: MEASUREMENT_RUNS,
      outlierTrimPercent: OUTLIER_TRIM_PERCENT,
      totalRunsPerSize: MEASUREMENT_RUNS + Math.ceil(MEASUREMENT_RUNS * OUTLIER_TRIM_PERCENT),
    },
    environment: {
      node: process.version,
      platform: `${process.platform} ${os.release()}`,
      // os.cpus() can return an empty array when CPU info is unavailable;
      // fall back to a placeholder instead of throwing while building the report.
      cpus: os.cpus()[0]?.model ?? "unknown",
      totalMemoryGB: os.totalmem() / 1024 ** 3,
      cpuLoad: os.loadavg(),
    },
    systemWarnings: checkSystemState(),
  };
}

/**
 * Checks each result against the baseline and emits warnings for regressions.
 * Warnings are advisory only — they do not fail the test (see strategy doc §6).
 *
 * @param results  Current benchmark results.
 * @param baseline Stored baseline keyed by benchmark name; when absent, or when the run
 *                 is updating the baseline, no check is performed.
 */
function checkRegressions(results: BenchmarkResult[], baseline?: BaselineData): void {
  if (!baseline || UPDATE_BASELINE) return;

  for (const result of results) {
    const entry = baseline[result.name];
    if (!entry) {
      console.warn(
        ` ⚠️ WARNING: ${result.name} baseline entry missing — skipping regression check`
      );
      continue;
    }

    // Slope ratio regression (only evaluated when the baseline ratio is above 0.5
    // and the current ratio is positive)
    if (entry.slopeRatio > 0.5 && result.slopeRatio > 0) {
      const slopeAllowed = entry.slopeRatio * (1 + MAX_SLOPE_VARIANCE);
      if (result.slopeRatio > slopeAllowed) {
        console.warn(
          ` ⚠️ WARNING: ${result.name} slope ratio regression: ` +
            `${result.slopeRatio.toFixed(4)} > allowed ${slopeAllowed.toFixed(4)} ` +
            `(baseline: ${entry.slopeRatio.toFixed(4)}, threshold: +${(MAX_SLOPE_VARIANCE * 100).toFixed(0)}%)`
        );
      }
    }

    // Per-item time regression (compared at the largest input size)
    const currentPerItem = result.perItemMs[result.perItemMs.length - 1];
    const allowed = entry.perItemMsAtMax * (1 + MAX_REGRESSION);
    if (currentPerItem > allowed) {
      console.warn(
        ` ⚠️ WARNING: ${result.name} per-item time regression: ` +
          `${currentPerItem.toFixed(7)}ms > allowed ${allowed.toFixed(7)}ms ` +
          `(baseline: ${entry.perItemMsAtMax.toFixed(7)}ms, threshold: +${(MAX_REGRESSION * 100).toFixed(0)}%)`
      );
    }
  }
}

/**
 * Builds a synthetic entity definition for DataReader benchmarks (B1).
 *
 * @param entityName Name assigned to the synthetic entity.
 * @returns An entity with a UUID key `id` and nine string fields `field_1`…`field_9`.
 */
function buildSyntheticEntityForDataReader(entityName: string): any {
  return {
    name: entityName,
    "@HideFromDataInspector": false,
    "@cds.query.limit.default": 1000,
    "@cds.query.limit.max": 1000,
    elements: {
      id: { type: "cds.UUID", key: true },
      ...Object.fromEntries(
        Array.from({ length: 9 }, (_, j) => [`field_${j + 1}`, { type: "cds.String", key: false }])
      ),
    },
    get keyElements4DataInspector() {
      return ["id"];
    },
    get dataSource4DataInspector() {
      return "db";
    },
  };
}
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------

/**
 * Runs a command and returns its trimmed stdout, or null when it fails.
 *
 * @param {string|string[]} cmd Either a shell command line (executed through the shell),
 *   or an argv array `[file, ...args]` executed directly without a shell. Prefer the
 *   array form for anything containing user-supplied values such as paths: arguments
 *   are passed verbatim, so spaces and shell metacharacters cannot alter the command.
 * @returns {string|null} Trimmed stdout, or null on any failure (best-effort by design:
 *   callers treat "no output" and "command failed" the same way).
 */
function run(cmd) {
  const options = { encoding: "utf8", stdio: ["pipe", "pipe", "pipe"] };
  try {
    if (Array.isArray(cmd)) {
      const { execFileSync } = require("child_process");
      const [file, ...args] = cmd;
      return execFileSync(file, args, options).trim();
    }
    return execSync(cmd, options).trim();
  } catch {
    return null;
  }
}

/**
 * Returns commits that touched the baseline file, most recent first.
 * Each entry: { hash: string, date: string }
 */
function getCommitHistory() {
  // argv form: BASELINE_GIT_PATH comes from an env var and may contain spaces.
  const raw = run(["git", "log", "--follow", "--format=%H %aI", "--", BASELINE_GIT_PATH]);
  if (!raw) return [];
  return raw
    .split("\n")
    .map((line) => {
      // Defensive: drop any literal quotes that ended up in the output.
      const cleaned = line.replace(/"/g, "");
      const spaceIdx = cleaned.indexOf(" ");
      if (spaceIdx === -1) return null;
      return {
        hash: cleaned.slice(0, spaceIdx),
        date: cleaned.slice(spaceIdx + 1),
      };
    })
    .filter((c) => c && c.hash && c.date);
}

/**
 * Reads and parses the baseline file as it existed at the given commit hash.
 * Returns null when the file cannot be read at that commit or is not valid JSON.
 */
function readBaselineAtCommit(hash) {
  const raw = run(["git", "show", `${hash}:${BASELINE_GIT_PATH}`]);
  if (!raw) return null;
  try {
    return JSON.parse(raw);
  } catch {
    return null;
  }
}

/**
 * Returns the Ordinary Least Squares slope for `values` indexed 0..n-1.
 * Returns 0 when fewer than two values are given.
 */
function olsSlope(values) {
  const n = values.length;
  if (n < 2) return 0;
  const meanX = (n - 1) / 2;
  const meanY = values.reduce((a, b) => a + b, 0) / n;
  let num = 0,
    den = 0;
  for (let i = 0; i < n; i++) {
    const dx = i - meanX;
    num += dx * (values[i] - meanY);
    den += dx * dx;
  }
  return den === 0 ? 0 : num / den;
}

/**
 * Returns the number of consecutive strict increases at the end of `values`.
 * E.g. [1, 2, 1, 3, 4, 5] → 3 (the trailing steps 1→3, 3→4 and 4→5 all increase)
 */
function trailingIncreaseStreak(values) {
  let count = 0;
  for (let i = values.length - 1; i > 0; i--) {
    if (values[i] > values[i - 1]) count++;
    else break;
  }
  return count;
}
/** Left-pad / right-pad helpers for table formatting. Values longer than `w` are cut to `w`. */
const rpad = (s, w) => String(s).slice(0, w).padEnd(w);
const lpad = (s, w) => String(s).slice(0, w).padStart(w);

// ---------------------------------------------------------------------------
// Main
// ---------------------------------------------------------------------------

/**
 * Entry point: prints the per-commit history of `perItemMsAtMax` for every benchmark
 * found in the baseline file's git history, then a drift analysis per benchmark.
 *
 * The check is advisory: the process always exits with code 0, even when the total
 * increase exceeds DRIFT_MAX_TOTAL_INCREASE.
 */
function main() {
  console.log("=== Performance Baseline Drift Check ===\n");

  const commits = getCommitHistory();
  if (commits.length === 0) {
    console.log(`No git history found for: ${BASELINE_GIT_PATH}`);
    console.log(
      "Tip: the CI baseline is created by the `performance-rebaseline` workflow (manual trigger in GitHub Actions)."
    );
    console.log(
      " Run it at least twice to accumulate history. Until then, drift detection is not possible."
    );
    process.exit(0);
  }

  const window = commits.slice(0, DRIFT_WINDOW); // most recent first
  console.log(
    `Examining ${window.length} most recent commit(s) (${commits.length} total). DRIFT_WINDOW=${DRIFT_WINDOW}\n`
  );

  // Load snapshots in chronological order (oldest first) for trend analysis.
  const snapshots = [];
  for (const commit of [...window].reverse()) {
    const data = readBaselineAtCommit(commit.hash);
    if (data) snapshots.push({ ...commit, data });
  }

  if (snapshots.length < 2) {
    console.log(`Only ${snapshots.length} readable snapshot(s) — need at least 2 to detect drift.`);
    console.log(
      "Trigger the `performance-rebaseline` workflow again to accumulate a second snapshot."
    );
    process.exit(0);
  }

  // Collect all known benchmark names across all snapshots.
  const benchmarkNames = [...new Set(snapshots.flatMap((s) => Object.keys(s.data)))];

  // -------------------------------------------------------------------------
  // History table: perItemMsAtMax per benchmark per commit date
  // -------------------------------------------------------------------------
  const dateHeaders = snapshots.map((s) => s.date.slice(0, 10));
  // Size the name column to the longest benchmark name (minimum 36). A fixed width
  // truncated names that share a long prefix — e.g. "EntityDefinitionReader.read (collection)"
  // and "EntityDefinitionReader.read (filtered)" — into identical labels.
  const nameWidth = Math.max(36, ...benchmarkNames.map((n) => n.length + 2));
  const colWidth = 14;

  const headerRow =
    rpad("Benchmark (perItemMsAtMax)", nameWidth) +
    dateHeaders.map((d) => lpad(d, colWidth)).join("");
  console.log(headerRow);
  console.log("-".repeat(headerRow.length));

  for (const name of benchmarkNames) {
    const cells = snapshots.map((s) => {
      const v = s.data[name]?.perItemMsAtMax;
      return typeof v === "number" ? v.toExponential(3) : "n/a";
    });
    console.log(rpad(name, nameWidth) + cells.map((c) => lpad(c, colWidth)).join(""));
  }

  // -------------------------------------------------------------------------
  // Drift analysis per benchmark
  // -------------------------------------------------------------------------
  console.log("\n=== Drift Analysis ===\n");
  let hasViolation = false;

  for (const name of benchmarkNames) {
    // Snapshots that lack this benchmark are ignored for its trend.
    const values = snapshots
      .map((s) => s.data[name]?.perItemMsAtMax)
      .filter((v) => typeof v === "number");

    if (values.length < 2) continue;

    const oldest = values[0];
    const latest = values[values.length - 1];
    const totalIncrease = oldest > 0 ? (latest - oldest) / oldest : 0;
    const streak = trailingIncreaseStreak(values);
    const slope = olsSlope(values);
    // Normalized slope: fraction of oldest value per commit step.
    const slopeNorm = oldest > 0 ? slope / oldest : 0;

    const issues = [];
    if (totalIncrease > DRIFT_MAX_TOTAL) {
      issues.push(
        `total increase ${(totalIncrease * 100).toFixed(1)}% exceeds DRIFT_MAX_TOTAL_INCREASE=${(DRIFT_MAX_TOTAL * 100).toFixed(0)}%`
      );
      hasViolation = true;
    }
    if (streak >= DRIFT_CONSECUTIVE) {
      // Streak warnings are advisory only — not violations (could be noise).
      issues.push(
        `${streak} consecutive increases (DRIFT_CONSECUTIVE_WARN=${DRIFT_CONSECUTIVE}) — investigate, may be noise`
      );
    }

    const tag =
      issues.length > 0 && totalIncrease > DRIFT_MAX_TOTAL
        ? "FAIL"
        : issues.length > 0
          ? "WARN"
          : slopeNorm > 0
            ? "info"
            : "ok ";

    console.log(
      `[${tag}] ${rpad(name, nameWidth)}` +
        ` total=${lpad((totalIncrease * 100).toFixed(1) + "%", 7)}` +
        ` streak=${streak}` +
        ` slope=${slopeNorm >= 0 ? "+" : ""}${(slopeNorm * 100).toFixed(2)}%/commit`
    );
    for (const issue of issues) {
      console.log(` └─ ${issue}`);
    }
  }

  console.log("\n--- Thresholds ---");
  console.log(
    ` DRIFT_MAX_TOTAL_INCREASE = ${(DRIFT_MAX_TOTAL * 100).toFixed(0)}% (set via env var)`
  );
  console.log(
    ` DRIFT_CONSECUTIVE_WARN = ${DRIFT_CONSECUTIVE} consecutive increases (advisory, not a violation)`
  );
  console.log(
    "\nTo re-baseline after an intentional performance change: trigger the `performance-rebaseline` workflow in GitHub Actions."
  );
  console.log(
    " Developers: keep your local `performance-baseline.json` up to date with `npm run test:performance:update-baseline`."
  );

  if (hasViolation) {
    console.log(
      "\n[WARN] Baseline drift exceeds threshold(s). Either optimize the affected code path" +
        " and update the baseline, or raise DRIFT_MAX_TOTAL_INCREASE if the increase is intentional."
    );
  } else {
    console.log("\n[PASS] No significant drift detected.");
  }
  process.exit(0);
}

main();
/** Default input sizes used across all benchmarks. */
export const sizes = [10, 50, 100, 500, 1000];

// ---------------------------------------------------------------------------
// Low-level timing
// ---------------------------------------------------------------------------

/** Result of a single measurement pass (one input size). */
type MeasurementResult = {
  timings: number[];
  stats: MeasurementStats;
  /** Heap delta in MB across all runs. */
  memoryDeltaMB: number;
};

/**
 * Times an async function `runs` times (plus extra runs for outlier trimming).
 * Returns trimmed timings, descriptive stats, and heap memory delta.
 *
 * @param fn   Operation to time; each call is awaited before the next one starts.
 * @param runs Number of timings to keep after trimming. The function is actually
 *             executed `runs + ceil(runs * OUTLIER_TRIM_PERCENT)` times.
 * @returns The `runs` timings kept by `trimOutliers`, their stats, and the change in
 *          `heapUsed` (MB) between the start and the end of the whole pass.
 */
export const measureAsync = async (
  fn: () => Promise<unknown>,
  runs: number = MEASUREMENT_RUNS
): Promise<MeasurementResult> => {
  const extraRuns = Math.ceil(runs * OUTLIER_TRIM_PERCENT);
  const totalRuns = runs + extraRuns;
  const allTimings: number[] = [];
  const memBefore = process.memoryUsage();

  for (let i = 0; i < totalRuns; i++) {
    const start = performance.now();
    await fn();
    const end = performance.now();
    allTimings.push(end - start);
  }

  const memAfter = process.memoryUsage();
  const memoryDeltaMB = (memAfter.heapUsed - memBefore.heapUsed) / (1024 * 1024);

  return { ...trimOutliers(allTimings, runs), memoryDeltaMB };
};
/**
 * Synchronous counterpart of the timing pass: executes `fn` for `runs` kept samples
 * plus the extra samples reserved for outlier trimming, and reports the trimmed
 * timings, their stats, and the heap delta in MB over the whole pass.
 */
export const measureSync = (fn: () => void, runs: number = MEASUREMENT_RUNS): MeasurementResult => {
  const totalRuns = runs + Math.ceil(runs * OUTLIER_TRIM_PERCENT);
  const samples: number[] = [];
  const heapBefore = process.memoryUsage().heapUsed;

  let remaining = totalRuns;
  while (remaining-- > 0) {
    const startedAt = performance.now();
    fn();
    samples.push(performance.now() - startedAt);
  }

  const heapAfter = process.memoryUsage().heapUsed;
  const { timings, stats } = trimOutliers(samples, runs);

  return { timings, stats, memoryDeltaMB: (heapAfter - heapBefore) / (1024 * 1024) };
};

/**
 * Keeps the `keep` samples that lie closest to the mean of all samples and discards
 * the rest as outliers. The kept samples are returned in ascending order together
 * with their stats.
 */
function trimOutliers(
  allTimings: number[],
  keep: number
): { timings: number[]; stats: MeasurementStats } {
  const center = mean(allTimings);
  const distanceFromCenter = (value: number): number => Math.abs(value - center);

  // Order a copy by distance to the mean, cut to `keep`, then order by value.
  const kept = [...allTimings]
    .sort((a, b) => distanceFromCenter(a) - distanceFromCenter(b))
    .slice(0, keep)
    .sort((a, b) => a - b);

  return { timings: kept, stats: calculateStats(kept) };
}
// ---------------------------------------------------------------------------
// Scaling analysis
// ---------------------------------------------------------------------------

/**
 * Slope (Δtime / Δsize) of every segment between neighbouring measurement points.
 * Yields one value per consecutive pair, i.e. `times.length - 1` entries.
 */
export const computeSlopes = (times: number[], sizeValues: number[]): number[] =>
  times
    .slice(1)
    .map((time, prev) => (time - times[prev]) / (sizeValues[prev + 1] - sizeValues[prev]));

/**
 * R² (coefficient of determination) of the least-squares straight line fitted to
 * `times` as a function of `sizeValues`. A perfect straight line gives 1.0; fewer
 * than two points, or constant times, also give 1.
 */
export const computeR2 = (times: number[], sizeValues: number[]): number => {
  const n = times.length;
  if (n < 2) return 1;

  const meanX = sizeValues.reduce((sum, x) => sum + x, 0) / n;
  const meanY = times.reduce((sum, y) => sum + y, 0) / n;

  // Least-squares fit: slope = Σ(dx·dy) / Σ(dx²), intercept from the means.
  let covariance = 0;
  let varianceX = 0;
  times.forEach((y, i) => {
    const dx = sizeValues[i] - meanX;
    covariance += dx * (y - meanY);
    varianceX += dx * dx;
  });
  const slope = varianceX === 0 ? 0 : covariance / varianceX;
  const intercept = meanY - slope * meanX;

  // Residual scatter around the fitted line vs. total scatter around the mean.
  let residual = 0;
  let total = 0;
  times.forEach((y, i) => {
    const fitted = slope * sizeValues[i] + intercept;
    residual += (y - fitted) ** 2;
    total += (y - meanY) ** 2;
  });

  if (total === 0) return 1;
  return 1 - residual / total;
};
// ---------------------------------------------------------------------------
// High-level benchmark runners
// ---------------------------------------------------------------------------

/**
 * Benchmarks a synchronous workload at every entry of `sizes`.
 *
 * For each size, `runFn` is first called `WARMUP_RUNS` times untimed and then
 * measured with `measureSync`; the median, the full statistics and the heap
 * delta are recorded. Afterwards the per-item cost (median / size), the
 * consecutive slopes, the last-to-first slope ratio (1 when fewer than two
 * slopes exist) and R² are derived. Progress is written to stdout.
 *
 * @param name - Label used in the log output and in the returned result.
 * @param runFn - Workload to execute for a given input size.
 * @returns The aggregated measurements for all sizes.
 */
export const benchmarkSync = (name: string, runFn: (size: number) => void): BenchmarkResult => {
  // Identical for every size; only used in the progress message.
  const runsPerSize = MEASUREMENT_RUNS + Math.ceil(MEASUREMENT_RUNS * OUTLIER_TRIM_PERCENT);
  const medians: number[] = [];
  const statsPerSize: MeasurementStats[] = [];
  const heapDeltas: number[] = [];

  console.log(` Benchmarking ${name}...`);
  for (const size of sizes) {
    process.stdout.write(` Size ${size}: warmup (${WARMUP_RUNS} runs)...`);
    for (let remaining = WARMUP_RUNS; remaining > 0; remaining--) {
      runFn(size);
    }

    process.stdout.write(` measuring (${runsPerSize} runs)...`);
    const { stats, memoryDeltaMB } = measureSync(() => runFn(size), MEASUREMENT_RUNS);
    medians.push(stats.median);
    statsPerSize.push(stats);
    heapDeltas.push(memoryDeltaMB);

    // Coefficient of variation in percent; above 20% the run is flagged.
    const cvPercent = (stats.stdDev / stats.mean) * 100;
    const varianceFlag = cvPercent > 20 ? " ! HIGH VARIANCE" : "";
    console.log(
      ` ✓ (${stats.median.toFixed(2)}ms ±${stats.confidenceInterval.toFixed(2)}ms, CV: ${cvPercent.toFixed(1)}%${varianceFlag})`
    );
  }

  const slopes = computeSlopes(medians, sizes);
  let slopeRatio = 1;
  if (slopes.length >= 2) {
    slopeRatio = slopes[slopes.length - 1] / slopes[0];
  }

  return {
    name,
    sizes: [...sizes],
    timingsMs: medians,
    timingStats: statsPerSize,
    perItemMs: medians.map((ms, position) => ms / sizes[position]),
    slopes,
    slopeRatio,
    r2: computeR2(medians, sizes),
    memoryDeltaMB: heapDeltas,
  };
};
/**
 * Benchmarks an asynchronous workload at every entry of `sizes`.
 *
 * Async counterpart of `benchmarkSync`: for each size, `runFn` is awaited
 * `WARMUP_RUNS` times untimed and then measured with `measureAsync`; the
 * median, the full statistics and the heap delta are recorded. Afterwards the
 * per-item cost (median / size), the consecutive slopes, the last-to-first
 * slope ratio (1 when fewer than two slopes exist) and R² are derived.
 * Progress is written to stdout.
 *
 * @param name - Label used in the log output and in the returned result.
 * @param runFn - Async workload to execute for a given input size.
 * @returns A promise resolving to the aggregated measurements for all sizes.
 */
export const benchmarkAsync = async (
  name: string,
  runFn: (size: number) => Promise<void>
): Promise<BenchmarkResult> => {
  const timingsMs: number[] = [];
  const timingStats: MeasurementStats[] = [];
  const memoryDeltaMB: number[] = [];

  console.log(` Benchmarking ${name}...`);
  for (const size of sizes) {
    // Only used in the progress message; measureAsync derives its own count.
    const totalRuns = MEASUREMENT_RUNS + Math.ceil(MEASUREMENT_RUNS * OUTLIER_TRIM_PERCENT);
    process.stdout.write(` Size ${size}: warmup (${WARMUP_RUNS} runs)...`);

    // Warmup runs are awaited one after another so they never overlap.
    for (let w = 0; w < WARMUP_RUNS; w++) {
      await runFn(size);
    }

    process.stdout.write(` measuring (${totalRuns} runs)...`);
    const measurement = await measureAsync(() => runFn(size), MEASUREMENT_RUNS);

    timingsMs.push(measurement.stats.median);
    timingStats.push(measurement.stats);
    memoryDeltaMB.push(measurement.memoryDeltaMB);

    // Coefficient of variation in percent; above 20% the run is flagged.
    const cv = (measurement.stats.stdDev / measurement.stats.mean) * 100;
    const cvWarning = cv > 20 ? " ! HIGH VARIANCE" : "";
    console.log(
      ` ✓ (${measurement.stats.median.toFixed(2)}ms ±${measurement.stats.confidenceInterval.toFixed(2)}ms, CV: ${cv.toFixed(1)}%${cvWarning})`
    );
  }

  const perItemMs = timingsMs.map((time, index) => time / sizes[index]);
  const slopes = computeSlopes(timingsMs, sizes);
  const slopeRatio = slopes.length >= 2 ? slopes[slopes.length - 1] / slopes[0] : 1;
  const r2 = computeR2(timingsMs, sizes);

  return {
    name,
    sizes: [...sizes],
    timingsMs,
    timingStats,
    perItemMs,
    slopes,
    slopeRatio,
    r2,
    memoryDeltaMB,
  };
};

// Patch boundary: the next file added by this patch is
// test/performance/helpers/reporting.ts.
/**
 * Performance report generation (test/performance/helpers/reporting.ts).
 *
 * Turns benchmark results into a human-readable Markdown report containing
 * environment info, configuration, a results table with emoji-coded
 * indicators, and a legend.
 */

import type { Report } from "./types";

// ---------------------------------------------------------------------------
// Emoji indicators for report table cells
// ---------------------------------------------------------------------------

/**
 * Indicator for a slope ratio: 🟢 up to 2.0, 🟡 above 2.0 up to 4.0,
 * 🔴 otherwise (including `NaN`).
 */
export const slopeRatioEmoji = (ratio: number): string =>
  ratio <= 2.0 ? "🟢" : ratio <= 4.0 ? "🟡" : "🔴";

/**
 * Indicator for R²: 🟢 from 0.995, 🟡 from 0.98 up to 0.995,
 * 🔴 otherwise (including `NaN`).
 */
export const r2Emoji = (r2: number): string => (r2 >= 0.995 ? "🟢" : r2 >= 0.98 ? "🟡" : "🔴");

/**
 * Indicator for a CV percentage: 🟢 up to 5, 🟡 above 5 up to 15,
 * 🔴 otherwise (including `NaN`).
 */
export const cvEmoji = (cv: number): string => (cv <= 5 ? "🟢" : cv <= 15 ? "🟡" : "🔴");

// ---------------------------------------------------------------------------
// Formatting helpers
// ---------------------------------------------------------------------------

/** Renders `value` with exactly `digits` decimal places. */
const formatNumber = (value: number, digits: number): string => {
  return value.toFixed(digits);
};

/** Renders every value with `digits` decimal places, separated by ", ". */
const formatList = (values: number[], digits: number): string => {
  const rendered = values.map((entry) => formatNumber(entry, digits));
  return rendered.join(", ");
};
// ---------------------------------------------------------------------------
// Markdown report builder
// ---------------------------------------------------------------------------

/**
 * Renders a report as a Markdown document.
 *
 * The output consists of, in order: title, Environment, Test Configuration,
 * System Warnings (only when there are any), the Results table with one row
 * per benchmark and three baseline columns ("n/a" when no baseline value is
 * available), and a fixed Legend. Lines are joined with "\n".
 *
 * @param report - The report to render.
 * @returns The Markdown text.
 */
export const buildMarkdownReport = (report: Report): string => {
  const { environment: env, testConfig: cfg } = report;

  const preamble: string[] = [
    `# Performance Report (${report.timestamp})`,
    "",
    "## Environment",
    "",
    `- Node: ${env.node}`,
    `- Platform: ${env.platform}`,
    `- CPU: ${env.cpus}`,
    `- Memory: ${env.totalMemoryGB.toFixed(1)} GB`,
    `- CPU Load: ${env.cpuLoad.map((load) => load.toFixed(2)).join(", ")}`,
    "",
    "## Test Configuration",
    "",
    `- Warmup runs: ${cfg.warmupRuns}`,
    `- Measurement runs: ${cfg.measurementRuns}`,
    `- Outlier trim: ${(cfg.outlierTrimPercent * 100).toFixed(0)}% extra (${cfg.totalRunsPerSize - cfg.measurementRuns} trimmed)`,
    `- Total runs per size: ${cfg.totalRunsPerSize}`,
  ];

  // The warnings section is omitted entirely when nothing was reported.
  const warningSection: string[] = [];
  if (report.systemWarnings.length > 0) {
    warningSection.push("", "### System Warnings", "");
    for (const warning of report.systemWarnings) {
      warningSection.push(`- ${warning}`);
    }
  }

  const resultsHeading: string[] = [
    "",
    "## Results",
    "",
    "| Benchmark | Timings ms (median) | Variance (CV%) | Per-item ms | Memory ΔMB | Slope ratio | R² | Baseline per-item max | Baseline slope ratio | Baseline R² |",
    "| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |",
  ];

  const resultRows = report.results.map((result) => {
    const baseline = report.baseline?.[result.name];

    // Average of the per-size coefficients of variation, in percent.
    let cvTotal = 0;
    for (const stats of result.timingStats) {
      cvTotal += (stats.stdDev / stats.mean) * 100;
    }
    const avgCV = cvTotal / result.timingStats.length;

    const cells = [
      result.name,
      formatList(result.timingsMs, 2),
      `${formatNumber(avgCV, 1)}% ${cvEmoji(avgCV)}`,
      formatList(result.perItemMs, 7),
      formatList(result.memoryDeltaMB, 2),
      `${formatNumber(result.slopeRatio, 4)} ${slopeRatioEmoji(result.slopeRatio)}`,
      `${formatNumber(result.r2, 4)} ${r2Emoji(result.r2)}`,
      baseline ? formatNumber(baseline.perItemMsAtMax, 7) : "n/a",
      baseline ? formatNumber(baseline.slopeRatio, 4) : "n/a",
      baseline?.r2 !== undefined ? formatNumber(baseline.r2, 4) : "n/a",
    ];
    return cells.join(" | ");
  });

  const legend: string[] = [
    "",
    "## Legend",
    "",
    "### Slope ratio",
    "",
    "Ratio of the last slope segment to the first. A perfectly linear O(n) function scores 1.0.",
    "",
    "| Indicator | Range | Meaning |",
    "| --- | --- | --- |",
    "| 🟢 | ≤ 2.0 | Consistent with O(n) linear scaling |",
    "| 🟡 | 2.0 – 4.0 | Suspicious — possible mild super-linear growth |",
    "| 🔴 | > 4.0 | Clearly non-linear (O(n²) or worse) |",
    "",
    "### CV% (Coefficient of Variation)",
    "",
    "Average CV across all measured sizes. Measures measurement stability.",
    "",
    "| Indicator | Range | Meaning |",
    "| --- | --- | --- |",
    "| 🟢 | ≤ 5% | Stable — measurements are repeatable |",
    "| 🟡 | 5% – 15% | Acceptable for Node.js |",
    "| 🔴 | > 15% | High noise — results unreliable |",
    "",
    "### R² (Coefficient of Determination)",
    "",
    "1.0 = medians fall perfectly on a straight line.",
    "",
    "| Indicator | Range | Meaning |",
    "| --- | --- | --- |",
    "| 🟢 | ≥ 0.995 | Excellent linear fit |",
    "| 🟡 | 0.980 – 0.995 | Minor deviation from linearity |",
    "| 🔴 | < 0.980 | Clearly non-linear scaling |",
  ];

  return [...preamble, ...warningSection, ...resultsHeading, ...resultRows, ...legend].join("\n");
};

// Patch boundary: the next file added by this patch is
// test/performance/helpers/statistics.ts.

/**
 * Statistical functions for performance measurement analysis.
 *
 * Provides basic descriptive statistics (median, mean, standard deviation),
 * a composite statistics calculation, and system health checks.
 */

import os from "os";
import type { MeasurementStats } from "./types";

/**
 * Returns the median of `values` without modifying the input.
 * For an even count this is the average of the two middle values.
 */
export const median = (values: number[]): number => {
  const ascending = Array.from(values).sort((left, right) => left - right);
  const upperMiddle = Math.floor(ascending.length / 2);
  if (ascending.length % 2 === 0) {
    return (ascending[upperMiddle - 1] + ascending[upperMiddle]) / 2;
  }
  return ascending[upperMiddle];
};

/** Returns the arithmetic mean of `values` (`NaN` for an empty array). */
export const mean = (values: number[]): number => {
  const total = values.reduce((running, value) => running + value, 0);
  return total / values.length;
};
/**
 * Returns the population standard deviation of `values`: the square root of
 * the mean squared deviation from the mean.
 */
export const stdDev = (values: number[]): number => {
  const center = mean(values);
  const squaredDeviations = values.map((value) => (value - center) ** 2);
  return Math.sqrt(mean(squaredDeviations));
};

/**
 * Computes descriptive statistics for a set of timing values.
 *
 * @param values - The timings to summarize; not modified.
 * @returns Median, mean, population standard deviation, min, max, and the
 *   confidence-interval half-width computed as `1.96 * stdDev / sqrt(n)`.
 */
export const calculateStats = (values: number[]): MeasurementStats => {
  const ascending = Array.from(values).sort((left, right) => left - right);
  const average = mean(values);
  const spread = stdDev(values);
  const halfWidth = 1.96 * (spread / Math.sqrt(values.length));

  return {
    median: median(values),
    mean: average,
    stdDev: spread,
    min: ascending[0],
    max: ascending[ascending.length - 1],
    confidenceInterval: halfWidth,
  };
};

/**
 * Inspects the current system state and reports conditions that may make
 * benchmark results unreliable.
 *
 * A warning is added when the first load-average value exceeds 70% of the CPU
 * count, and another when used memory exceeds 85% of total memory.
 *
 * @returns Zero, one or two human-readable warning strings.
 */
export const checkSystemState = (): string[] => {
  const warnings: string[] = [];

  const [currentLoad] = os.loadavg();
  const cpuCount = os.cpus().length;
  if (currentLoad > cpuCount * 0.7) {
    warnings.push(
      `High CPU load detected: ${currentLoad.toFixed(2)} (${cpuCount} CPUs). Results may be unreliable.`
    );
  }

  const bytesPerGB = 1024 ** 3;
  const freeMemGB = os.freemem() / bytesPerGB;
  const totalMemGB = os.totalmem() / bytesPerGB;
  const memUsagePercent = ((totalMemGB - freeMemGB) / totalMemGB) * 100;
  if (memUsagePercent > 85) {
    warnings.push(
      `High memory usage: ${memUsagePercent.toFixed(1)}% (${freeMemGB.toFixed(1)}GB free of ${totalMemGB.toFixed(1)}GB).`
    );
  }

  return warnings;
};

// Patch boundary: the next file added by this patch is
// test/performance/helpers/synthetic-data.ts.
/**
 * Synthetic data generators and mock request builders
 * (test/performance/helpers/synthetic-data.ts).
 *
 * These functions produce deterministic, configurable test data so that
 * performance benchmarks can control input size precisely without real CDS
 * models or database queries.
 */

// ---------------------------------------------------------------------------
// Entity generators
// ---------------------------------------------------------------------------

/**
 * Builds an array of synthetic CDS-like entity definitions.
 *
 * Each entity `i` is named `perf.test.Entity_<i>` and carries these elements:
 * - a UUID key element `id_<i>`
 * - `elementsPerEntity - 1` typed fields `field_<i>_<j>` whose type and
 *   annotations vary with `j`
 * - one element `hidden_<i>` annotated `@HideFromDataInspector: true`
 * - one association element `assoc_<i>`
 *
 * The hidden and association elements are meant to be filtered out by
 * EntityDefinitionReader (per the original author's note; not verifiable here).
 *
 * @param count - Number of entities to generate.
 * @param elementsPerEntity - Key element plus typed fields per entity (default 10).
 * @returns The generated entity definitions.
 */
export function buildSyntheticEntities(count: number, elementsPerEntity: number = 10): any[] {
  const entities: any[] = [];
  for (let i = 0; i < count; i++) {
    const elements: Record<string, unknown> = {};

    // Key element
    elements[`id_${i}`] = {
      type: "cds.UUID",
      key: true,
      "@HideFromDataInspector": false,
    };

    // Regular elements; type cycles Integer/String/Boolean on j % 3.
    for (let j = 1; j < elementsPerEntity; j++) {
      elements[`field_${i}_${j}`] = {
        type: j % 3 === 0 ? "cds.Integer" : j % 3 === 1 ? "cds.String" : "cds.Boolean",
        key: false,
        length: j % 3 === 1 ? 255 : undefined,
        default: j % 5 === 0 ? { val: "default" } : undefined,
        notNull: j % 4 === 0,
        "@PersonalData.IsPotentiallySensitive": j % 7 === 0,
        "@Core.Computed": j % 9 === 0,
        "@HideFromDataInspector": false,
      };
    }

    // Hidden element
    elements[`hidden_${i}`] = {
      type: "cds.String",
      "@HideFromDataInspector": true,
    };

    // Association element
    elements[`assoc_${i}`] = {
      type: "cds.Association",
    };

    entities.push({
      name: `perf.test.Entity_${i}`,
      "@title": i % 3 === 0 ? `Entity ${i} Title` : undefined,
      "@HideFromDataInspector": false,
      elements,
      // Getters, so the values are computed on access like accessor properties.
      get dataSource4DataInspector() {
        return i % 2 === 0 ? "db" : "service";
      },
      get keyElements4DataInspector() {
        return [`id_${i}`];
      },
    });
  }
  return entities;
}

// ---------------------------------------------------------------------------
// Record generators
// ---------------------------------------------------------------------------

/**
 * Builds an array of synthetic records.
 *
 * Each record has an `id` of the form `uuid-<i>` and `fieldsPerRecord - 1`
 * fields `field_<j>` whose value kind (number, string, boolean) cycles on
 * `j % 3`. The returned array additionally carries a `$count` property set to
 * `count`.
 *
 * @param count - Number of records to generate.
 * @param fieldsPerRecord - `id` plus generated fields per record (default 10).
 * @returns The generated records, with `$count` attached to the array.
 */
export function buildSyntheticRecords(count: number, fieldsPerRecord: number = 10): any[] {
  const records: any[] = [];
  for (let i = 0; i < count; i++) {
    const record: Record<string, unknown> = { id: `uuid-${i}` };
    for (let j = 1; j < fieldsPerRecord; j++) {
      record[`field_${j}`] = j % 3 === 0 ? i * j : j % 3 === 1 ? `value_${i}_${j}` : i % 2 === 0;
    }
    records.push(record);
  }
  return Object.assign(records, { $count: count });
}

// ---------------------------------------------------------------------------
// Mock request builders
// ---------------------------------------------------------------------------

/**
 * Creates a mock request object for a collection read.
 *
 * The result carries a `SELECT` with `columns: ["*"]` and `count: true`, the
 * raw OData options under `req.query` (numbers are stringified), and a
 * `reject` function that throws.
 *
 * Only a single `$orderby` term (`<field> [asc|desc]`) is translated into
 * `SELECT.orderBy`; the direction defaults to "asc". Leading, trailing and
 * repeated whitespace in the expression is tolerated.
 *
 * @param options.filter - OData $filter expression (e.g. `contains(name, 'Foo')`)
 * @param options.orderby - OData $orderby expression (e.g. `name asc`)
 * @param options.skip - OData $skip value
 * @param options.top - OData $top value
 * @returns The mock request.
 */
export function buildEntityDefinitionRequest(options?: {
  filter?: string;
  orderby?: string;
  skip?: number;
  top?: number;
}): any {
  const columns = ["*"];
  // Split on whitespace runs so "name  desc" still yields the direction.
  const [orderField, orderDirection] = options?.orderby?.trim().split(/\s+/) ?? [];
  return {
    params: [],
    query: {
      SELECT: {
        columns,
        count: true,
        orderBy: options?.orderby
          ? [{ ref: [orderField], sort: orderDirection || "asc" }]
          : undefined,
      },
    },
    req: {
      query: {
        $filter: options?.filter,
        $orderby: options?.orderby,
        $skip: options?.skip !== undefined ? String(options.skip) : undefined,
        $top: options?.top !== undefined ? String(options.top) : undefined,
      },
    },
    reject: (code: number, msg: string) => {
      throw new Error(`Request rejected: ${code} ${msg}`);
    },
  };
}
/**
 * Creates a mock request object for a data read.
 *
 * The result carries a `SELECT` with `columns: ["*"]` and `count: true`, a
 * `$filter` on the given entity name, fixed paging (`$skip` "0", `$top`
 * "1000"), and a `reject` function that throws.
 *
 * @param entityName - The entity name to filter on (e.g. `perf.test.Entity_0`)
 * @returns The mock request.
 */
export function buildDataReadRequest(entityName: string): any {
  const reject = (code: number, msg: string) => {
    throw new Error(`Request rejected: ${code} ${msg}`);
  };
  const select = { columns: ["*"], count: true };
  const odataQuery = {
    $filter: `entityName = '${entityName}'`,
    $skip: "0",
    $top: "1000",
  };

  return {
    params: [],
    query: { SELECT: select },
    req: { query: odataQuery },
    reject,
  };
}

// Patch boundary: the next file added by this patch is
// test/performance/helpers/types.ts.

/**
 * Type definitions for the performance testing infrastructure.
 *
 * They describe the shape of measurement results, baseline data, and the
 * final performance report.
 */

/** Summary statistics over one set of timing samples. */
export type MeasurementStats = {
  median: number;
  mean: number;
  stdDev: number;
  min: number;
  max: number;
  /** Half-width of the 95% confidence interval (the ± value). */
  confidenceInterval: number;
};

/** Outcome of one benchmark across all input sizes. */
export type BenchmarkResult = {
  name: string;
  sizes: number[];
  /** Median timing in ms, one entry per size. */
  timingsMs: number[];
  /** Complete statistics, one entry per size. */
  timingStats: MeasurementStats[];
  /** Time per item (timing / size), one entry per size. */
  perItemMs: number[];
  /** Slope between consecutive sizes (ms per additional item). */
  slopes: number[];
  /** Last slope divided by first slope; 1.0 means perfectly linear. */
  slopeRatio: number;
  /** R² of the linear fit; 1.0 means a perfect fit. */
  r2: number;
  /** Heap memory delta in MB, one entry per size. */
  memoryDeltaMB: number[];
};
/** A single entry in the performance baseline file. */
export type BaselineEntry = {
  sizes: number[];
  perItemMsAtMax: number;
  slopeRatio: number;
  /** Optional; entries without it are reported as "n/a". */
  r2?: number;
};

/** The full baseline file: benchmark name → baseline entry. */
export type BaselineData = Record<string, BaselineEntry>;

/** Test configuration summary for the report. */
export type TestConfig = {
  warmupRuns: number;
  measurementRuns: number;
  outlierTrimPercent: number;
  totalRunsPerSize: number;
};

/** The complete performance report written to disk after a run. */
export type Report = {
  timestamp: string;
  sizes: number[];
  results: BenchmarkResult[];
  baseline?: BaselineData;
  regressionThreshold: number;
  slopeVarianceThreshold: number;
  testConfig: TestConfig;
  environment: {
    node: string;
    platform: string;
    cpus: string;
    totalMemoryGB: number;
    cpuLoad: number[];
  };
  systemWarnings: string[];
};

/*
 * Patch boundary: the next file added by this patch is
 * test/performance/performance-baseline.ci.json. Its data is reproduced
 * unchanged below; the value of the last key continues after this comment.
 *
 * {
 *   "EntityDefinitionReader.read (collection)": {
 *     "sizes": [10, 50, 100, 500, 1000],
 *     "perItemMsAtMax": 0.0008664794999999686,
 *     "slopeRatio": 1.0563725392233307,
 *     "r2": 0.9991454243695702
 *   },
 *   "EntityDefinitionReader.read (filtered)": {
 *     "sizes": [10, 50, 100, 500, 1000],
 *     "perItemMsAtMax": 0.000915937500000041,
 *     "slopeRatio": 0.8465581902233116,
 *     "r2": 0.9997151636234601
 *   },
 *   "EntityDefinitionReader._getEntityElements (via read)": {
 *     "sizes": [10, 50, 100, 500, 1000],
 *     "perItemMsAtMax": 0.00030279100000001334,
 *     "slopeRatio": 0.9524803034682242,
 *     "r2": 0.997822499293361
 *   },
 *   "DataReader.read (response construction, DB stubbed)": {
 *     "sizes": [10, 50, 100, 500, 1000],
 *     "perItemMsAtMax": 0.00016691650000001345,
 *     "slopeRatio": 0.5997224934519662,
 *     "r2": 0.9979562674639578
 *   },
 *   "DataReader._emitAuditlogs (stubbed audit-log)":
 */
{ + "sizes": [ + 10, + 50, + 100, + 500, + 1000 + ], + "perItemMsAtMax": 0.0007755829999999832, + "slopeRatio": 0.40819046558119343, + "r2": 0.9743144804610471 + } +} \ No newline at end of file diff --git a/test/tsconfig.json b/test/tsconfig.json index 93e467b..c833933 100644 --- a/test/tsconfig.json +++ b/test/tsconfig.json @@ -3,6 +3,6 @@ "include": ["**/*.ts"], "compilerOptions": { "noEmit": true, - "rootDir": "." + "rootDir": ".." } }