From e5455bb940f8a5864acf9b471676389b434c6e7d Mon Sep 17 00:00:00 2001
From: hexxyan <1027796553@qq.com>
Date: Sun, 31 May 2026 20:22:58 +0800
Subject: [PATCH] bench: add routed expert locality profiler

Opt-in profiler that records per-layer per-token MoE expert selections
during CPU decode.  Outputs JSON with per-layer statistics:

  - Expert frequency histogram and weight distribution
  - Top-10 experts with cumulative coverage curve
  - Shannon entropy / entropy ratio (uniformity measure)
  - Adjacent-token top-k overlap and Jaccard similarity
  - Position stability (same slot holds the same expert in adjacent tokens)
  - Hash-routing vs top-k-routing per layer

Usage:
  DS4_EXPERT_PROFILE=profile.json ./ds4-server -m <model> --cpu
  ./ds4-bench --cpu --expert-profile profile.json -m <model>

No performance impact when not activated.  CPU-only: expert selection
patterns are a model property, not backend-dependent.
---
 ds4.c       | 340 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 ds4_bench.c |  10 ++
 ds4_help.c  |   1 +
 3 files changed, 351 insertions(+)
diff --git a/ds4.c b/ds4.c
index 39694470d..8dde8884d 100644
--- a/ds4.c
+++ b/ds4.c
@@ -302,6 +302,313 @@ static uint32_t g_ds4_compress_ratios[DS4_MAX_LAYER] = {0};
 
 static int g_ds4_lock_fd = -1;
 
+static void *xcalloc(size_t n, size_t size); /* forward declaration: the profiler code below calls it before its definition */
+static void *xmalloc(size_t size);           /* forward declaration: the profiler code below calls it before its definition */
+
+/* Expert locality profiler (opt-in, DS4_EXPERT_PROFILE env var). */
+
+typedef struct {        /* one routing choice: a single expert slot of one token in one layer */
+    int    expert_id;   /* expert index passed to the recorder; -1 marks a slot never written */
+    float  weight;      /* weight passed to the recorder alongside expert_id */
+} ds4_profile_expert_slot;
+
+typedef struct {                      /* state of the opt-in expert locality profiler */
+    int n_layers;                     /* layers tracked; records for layer indices >= n_layers are dropped */
+    int n_experts;                    /* expert ids outside [0, n_experts) are skipped by the JSON writer */
+    int n_expert_used;                /* slots stored per token per layer */
+    int n_tokens;                     /* tokens committed so far (advanced by end_token) */
+    int capacity;                     /* maximum number of tokens that can be stored */
+    int current_token;                /* index of the token being recorded, -1 when none */
+    bool truncated;                   /* set when a token arrives after capacity tokens were stored */
+    ds4_profile_expert_slot *records; /* [n_layers * capacity * n_expert_used], indexed layer, token, slot */
+    bool *layer_is_hash;              /* [n_layers] set when a record for the layer carried is_hash */
+    bool *layer_recorded;             /* [n_layers] set once the layer stored at least one record */
+} ds4_expert_profile;
+
+static ds4_expert_profile g_expert_profile;            /* single profiler instance; zero-initialised static storage */
+static bool g_expert_profile_active = false;           /* set by ds4_expert_profile_init, cleared by cleanup */
+static bool g_expert_profile_recording_token = false;  /* set by begin_token when the token has a slot, cleared by end_token */
+
+/* (Re)arm the profiler; expert_id == -1 marks an unrecorded slot. Non-positive sizes are ignored. */
+static void ds4_expert_profile_init(int n_layers, int n_experts, int n_expert_used, int capacity) {
+    if (n_layers <= 0 || n_experts <= 0 || n_expert_used <= 0 || capacity <= 0) return;
+    ds4_expert_profile *p = &g_expert_profile;
+    free(p->records);        /* release buffers left by an earlier init; free(NULL) is a no-op */
+    free(p->layer_is_hash);
+    free(p->layer_recorded);
+    *p = (ds4_expert_profile){ .n_layers = n_layers, .n_experts = n_experts,
+        .n_expert_used = n_expert_used, .capacity = capacity, .current_token = -1 };
+    const size_t n_records = (size_t)n_layers * (size_t)capacity * (size_t)n_expert_used;
+    p->records = xmalloc(n_records * sizeof(p->records[0]));
+    for (size_t i = 0; i < n_records; i++) p->records[i] = (ds4_profile_expert_slot){ -1, 0.0f };
+    p->layer_is_hash = xcalloc((size_t)n_layers, sizeof(bool));
+    p->layer_recorded = xcalloc((size_t)n_layers, sizeof(bool));
+    g_expert_profile_recording_token = false;
+    g_expert_profile_active = true;
+}
+
+/* Free every profiler buffer and return the profiler to its zeroed, disabled state. */
+static void ds4_expert_profile_cleanup(void) {
+    free(g_expert_profile.layer_recorded);
+    free(g_expert_profile.layer_is_hash);
+    free(g_expert_profile.records);
+    g_expert_profile = (ds4_expert_profile){0};
+    g_expert_profile_recording_token = g_expert_profile_active = false;
+}
+
+/* Expert locality profiler JSON output. */
+
+typedef struct { int expert_id; uint64_t count; } ds4_profile_sort_entry; /* (expert, times selected) pair used to rank experts */
+
+/* Write per-layer routing statistics for every committed token to `path` as JSON.
+ * No-op when the profiler is inactive, `path` is empty, or no token was recorded. */
+static void ds4_expert_profile_write_json(const char *path) {
+    if (!g_expert_profile_active || !path || !path[0]) return;
+    ds4_expert_profile *p = &g_expert_profile;
+    const int NL = p->n_layers;
+    const int NE = p->n_experts;
+    const int NK = p->n_expert_used;
+    const int NT = p->n_tokens;
+    if (NT <= 0 || NE <= 0 || NK <= 0) { fprintf(stderr, "ds4: expert profiler has no data, skipping output\n"); return; }
+    FILE *fp = fopen(path, "w");
+    if (!fp) { fprintf(stderr, "ds4: cannot write expert profile to %s\n", path); return; }
+
+    fprintf(fp, "{\n");
+    fprintf(fp, "  \"n_layers\": %d,\n", NL);
+    fprintf(fp, "  \"n_experts\": %d,\n", NE);
+    fprintf(fp, "  \"n_expert_used\": %d,\n", NK);
+    fprintf(fp, "  \"capacity\": %d,\n", p->capacity);
+    fprintf(fp, "  \"truncated\": %s,\n", p->truncated ? "true" : "false");
+    fprintf(fp, "  \"n_tokens_profiled\": %d,\n", NT);
+    fprintf(fp, "  \"layers\": [\n");
+
+    uint64_t *hist       = xcalloc((size_t)NE, sizeof(uint64_t));
+    double   *weight_sum = xcalloc((size_t)NE, sizeof(double));
+    ds4_profile_sort_entry *sorted = xmalloc((size_t)NE * sizeof(sorted[0]));
+
+    for (int il = 0; il < NL; il++) {
+        fprintf(fp, "    {\n");
+        fprintf(fp, "      \"layer\": %d,\n", il);
+        fprintf(fp, "      \"recorded\": %s,\n", p->layer_recorded[il] ? "true" : "false");
+        fprintf(fp, "      \"is_hash_routed\": %s,\n", p->layer_is_hash[il] ? "true" : "false");
+
+        /* Histogram and weight distribution. */
+        memset(hist, 0, (size_t)NE * sizeof(uint64_t));
+        memset(weight_sum, 0, (size_t)NE * sizeof(double));
+
+        uint64_t total_selections = 0;
+        for (int t = 0; t < NT; t++) {
+            const ds4_profile_expert_slot *row =
+                p->records + ((size_t)il * (size_t)p->capacity + (size_t)t) * (size_t)NK;
+            for (int k = 0; k < NK; k++) {
+                int eid = row[k].expert_id;
+                if (eid >= 0 && eid < NE) {
+                    hist[eid]++;
+                    weight_sum[eid] += (double)row[k].weight;
+                    total_selections++;
+                }
+            }
+        }
+
+        /* unique expert count */
+        int unique_experts = 0;
+        for (int e = 0; e < NE; e++) {
+            if (hist[e] > 0) unique_experts++;
+        }
+        fprintf(fp, "      \"unique_experts\": %d,\n", unique_experts);
+
+        /* Histogram JSON. */
+        fprintf(fp, "      \"histogram\": {\n");
+        {
+            bool first = true;
+            for (int e = 0; e < NE; e++) {
+                if (hist[e] == 0) continue;
+                if (!first) fprintf(fp, ",\n");
+                first = false;
+                fprintf(fp, "        \"%d\": %llu", e, (unsigned long long)hist[e]);
+            }
+            fprintf(fp, "\n      },\n");
+        }
+
+        /* Weight distribution JSON. */
+        fprintf(fp, "      \"weight_distribution\": {\n");
+        {
+            bool first = true;
+            for (int e = 0; e < NE; e++) {
+                if (hist[e] == 0) continue;
+                if (!first) fprintf(fp, ",\n");
+                first = false;
+                fprintf(fp, "        \"%d\": %.4f", e, weight_sum[e] / (double)hist[e]);
+            }
+            fprintf(fp, "\n      },\n");
+        }
+
+        /* Sort experts descending by count (insertion sort, NE <= 256). */
+        for (int e = 0; e < NE; e++) {
+            sorted[e].expert_id = e;
+            sorted[e].count = hist[e];
+        }
+        for (int i = 1; i < NE; i++) {
+            ds4_profile_sort_entry tmp = sorted[i];
+            int j = i - 1;
+            while (j >= 0 && sorted[j].count < tmp.count) {
+                sorted[j + 1] = sorted[j];
+                j--;
+            }
+            sorted[j + 1] = tmp;
+        }
+
+        /* Top 10 experts. */
+        {
+            int top_n = unique_experts < 10 ? unique_experts : 10;
+            fprintf(fp, "      \"top_experts\": [\n");
+            for (int i = 0; i < top_n; i++) {
+                double pct = total_selections > 0
+                    ? 100.0 * (double)sorted[i].count / (double)total_selections : 0.0;
+                fprintf(fp, "        {\"expert\": %d, \"count\": %llu, \"pct\": %.2f}%s\n",
+                    sorted[i].expert_id,
+                    (unsigned long long)sorted[i].count,
+                    pct,
+                    (i < top_n - 1) ? "," : "");
+            }
+            fprintf(fp, "      ],\n");
+        }
+
+        /* Cumulative coverage. */
+        {
+            fprintf(fp, "      \"cumulative_coverage\": [");
+            uint64_t cum = 0;
+            for (int i = 0; i < unique_experts; i++) {
+                cum += sorted[i].count;
+                double frac = total_selections > 0
+                    ? (double)cum / (double)total_selections : 0.0;
+                if (i > 0) fprintf(fp, ", ");
+                fprintf(fp, "%.4f", frac);
+            }
+            fprintf(fp, "],\n");
+        }
+
+        /* Entropy. */
+        {
+            double entropy = 0.0;
+            for (int e = 0; e < NE; e++) {
+                if (hist[e] == 0) continue;
+                double pe = (double)hist[e] / (double)total_selections;
+                entropy -= pe * log2(pe);
+            }
+            double max_entropy = log2((double)NE);
+            double entropy_ratio = max_entropy > 0.0 ? entropy / max_entropy : 0.0;
+            fprintf(fp, "      \"entropy\": %.4f,\n", entropy);
+            fprintf(fp, "      \"max_entropy\": %.4f,\n", max_entropy);
+            fprintf(fp, "      \"entropy_ratio\": %.4f,\n", entropy_ratio);
+        }
+
+        /* Adjacent-token overlap and Jaccard similarity. */
+        {
+            double overlap_sum = 0.0;
+            double jaccard_sum = 0.0;
+            double pos_stab_sum = 0.0;
+            int n_pairs = 0;
+            for (int t = 0; t < NT - 1; t++) {
+                const ds4_profile_expert_slot *row_a =
+                    p->records + ((size_t)il * (size_t)p->capacity + (size_t)t) * (size_t)NK;
+                const ds4_profile_expert_slot *row_b =
+                    p->records + ((size_t)il * (size_t)p->capacity + (size_t)(t + 1)) * (size_t)NK;
+                int intersection = 0;
+                int pos_match = 0;
+                int valid_a = 0;
+                int valid_b = 0;
+                for (int ka = 0; ka < NK; ka++) {
+                    const int eid_a = row_a[ka].expert_id;
+                    const int eid_b = row_b[ka].expert_id;
+                    const bool a_valid = eid_a >= 0 && eid_a < NE;
+                    const bool b_valid = eid_b >= 0 && eid_b < NE;
+                    if (a_valid) valid_a++;
+                    if (b_valid) valid_b++;
+                    if (!a_valid) continue;
+                    for (int kb = 0; kb < NK; kb++) {
+                        const int eid_b_any = row_b[kb].expert_id;
+                        if (eid_b_any >= 0 && eid_b_any < NE && eid_a == eid_b_any) {
+                            intersection++;
+                            break;
+                        }
+                    }
+                    if (b_valid && eid_a == eid_b) {
+                        pos_match++;
+                    }
+                }
+
+                overlap_sum += (double)intersection / (double)NK;
+                int union_size = valid_a + valid_b - intersection;
+                jaccard_sum += (union_size > 0) ? (double)intersection / (double)union_size : 0.0;
+                pos_stab_sum += (double)pos_match / (double)NK;
+                n_pairs++;
+            }
+
+            double avg_overlap  = n_pairs > 0 ? overlap_sum  / (double)n_pairs : 0.0;
+            double avg_jaccard  = n_pairs > 0 ? jaccard_sum  / (double)n_pairs : 0.0;
+            double avg_pos_stab = n_pairs > 0 ? pos_stab_sum / (double)n_pairs : 0.0;
+            fprintf(fp, "      \"avg_overlap\": %.4f,\n", avg_overlap);
+            fprintf(fp, "      \"avg_jaccard\": %.4f,\n", avg_jaccard);
+            fprintf(fp, "      \"avg_position_stability\": %.4f\n", avg_pos_stab);
+        }
+
+        fprintf(fp, "    }%s\n", (il < NL - 1) ? "," : "");
+    }
+
+    fprintf(fp, "  ]\n");
+    fprintf(fp, "}\n");
+    const bool write_failed = ferror(fp) != 0;   /* must be sampled before fclose() */
+    if (fclose(fp) != 0 || write_failed) {
+        fprintf(stderr, "ds4: failed to write expert profile to %s\n", path);
+    } else {
+        fprintf(stderr, "ds4: expert profile written to %s (%d tokens, %d layers)\n", path, NT, NL);
+    }
+    free(hist);
+    free(weight_sum);
+    free(sorted);
+}
+
+/* Store one layer's routing decision for the token currently being profiled.
+ * selected[] and expert_weight[] are read for n_expert_used entries each.  Calls made
+ * while no token is being recorded, or for an out-of-range layer, are ignored. */
+static void ds4_expert_profile_record(uint32_t il, const int *selected,
+                                      const float *expert_weight, bool is_hash) {
+    ds4_expert_profile *prof = &g_expert_profile;
+    const bool armed = g_expert_profile_active && g_expert_profile_recording_token;
+    if (!armed || il >= (uint32_t)prof->n_layers) return;
+    const int tok = prof->current_token;
+    if (tok < 0 || tok >= prof->capacity) return;
+    const int width = prof->n_expert_used;
+    ds4_profile_expert_slot *dst =
+        prof->records + ((size_t)il * (size_t)prof->capacity + (size_t)tok) * (size_t)width;
+    for (int slot = 0; slot < width; slot++) {
+        dst[slot] = (ds4_profile_expert_slot){ selected[slot], expert_weight[slot] };
+    }
+    prof->layer_is_hash[il] = prof->layer_is_hash[il] || is_hash;
+    prof->layer_recorded[il] = true;
+}
+
+/* Open a recording window for the next decoded token.  Returns true when the token
+ * has a free slot; once capacity is reached the profile is flagged truncated instead. */
+static bool ds4_expert_profile_begin_token(void) {
+    ds4_expert_profile *prof = &g_expert_profile;
+    if (!g_expert_profile_active) return false;
+    const bool has_room = prof->n_tokens < prof->capacity;
+    /* A full buffer stops recording but keeps the tokens gathered so far. */
+    if (!has_room) prof->truncated = true;
+    prof->current_token = has_room ? prof->n_tokens : -1;
+    g_expert_profile_recording_token = has_room;
+    return has_room;
+}
+
+static void ds4_expert_profile_end_token(bool recorded) {
+    if (!g_expert_profile_active) return;          /* profiler off: leave state untouched */
+    g_expert_profile.n_tokens += recorded ? 1 : 0; /* only a recorded token consumes a slot */
+    g_expert_profile_recording_token = false;      /* close the window begin_token opened */
+    g_expert_profile.current_token = -1;
+}
+
 #if defined(__GNUC__) || defined(__clang__)
 #define DS4_MAYBE_UNUSED __attribute__((unused))
 #else
@@ -6422,6 +6729,11 @@ static void layer_routed_moe_one(
         layer_topk_selected_experts(selected, expert_weight, model, layer, x);
     }
 
+    if (g_expert_profile_active) {
+        const bool is_hash = (layer->ffn_gate_tid2eid != NULL);
+        ds4_expert_profile_record(il, selected, expert_weight, is_hash);
+    }
+
     if (!trace) {
         matvec_experts_mid_prequant(mid_all, model,
                                     layer->ffn_gate_exps,
@@ -6516,6 +6828,11 @@ static void layer_routed_moe_one_prealloc(
         layer_topk_selected_experts(selected, expert_weight, model, layer, x);
     }
 
+    if (g_expert_profile_active) {
+        const bool is_hash = (layer->ffn_gate_tid2eid != NULL);
+        ds4_expert_profile_record(il, selected, expert_weight, is_hash);
+    }
+
     matvec_experts_mid_prequant(mid_all, model,
                                 layer->ffn_gate_exps,
                                 layer->ffn_up_exps,
@@ -8815,6 +9132,7 @@ static void forward_token_raw_swa_cpu_decode_scratch(
         ds4_cpu_decode_scratch * scratch) {
     float *cur = scratch->cur;
     float *next = scratch->next;
+    const bool expert_profile_token = ds4_expert_profile_begin_token();
 
     embed_token_f16(model, weights, token, scratch->plain);
     hc_from_plain_embedding(cur, scratch->plain, DS4_N_EMBD, DS4_N_HC);
@@ -8830,6 +9148,7 @@ static void forward_token_raw_swa_cpu_decode_scratch(
         cur = next;
         next = tmp;
     }
+    ds4_expert_profile_end_token(expert_profile_token);
 
     if (logits) {
         output_logits_one_decode_scratch(logits, model, weights, cur, scratch);
@@ -19329,6 +19648,22 @@ int ds4_engine_open(ds4_engine **out, const ds4_engine_options *opt) {
         *out = e;
         return 0;
     }
+    const char *expert_profile_path = getenv("DS4_EXPERT_PROFILE");
+    if (expert_profile_path && expert_profile_path[0]) {
+        if (e->backend == DS4_BACKEND_CPU) {
+            ds4_expert_profile_init(
+                (int)g_ds4_shape.n_layer,
+                (int)g_ds4_shape.n_expert,
+                (int)g_ds4_shape.n_expert_used,
+                8192);
+            fprintf(stderr, "ds4: expert locality profiler active (output: %s)\n",
+                    expert_profile_path);
+        } else {
+            fprintf(stderr,
+                    "ds4: DS4_EXPERT_PROFILE is CPU-only; ignoring it for %s backend\n",
+                    ds4_backend_name(e->backend));
+        }
+    }
     if (e->backend == DS4_BACKEND_CPU && !cpu_load_directional_steering(e)) {
         ds4_engine_close(e);
         *out = NULL;
@@ -19546,6 +19881,11 @@ int ds4_engine_model_id(ds4_engine *e) {
 
 void ds4_engine_close(ds4_engine *e) {
     if (!e) return;
+    if (g_expert_profile_active) {
+        const char *expert_profile_path = getenv("DS4_EXPERT_PROFILE");
+        ds4_expert_profile_write_json(expert_profile_path);
+        ds4_expert_profile_cleanup();
+    }
     weights_free(&e->weights);
     vocab_free(&e->vocab);
     ds4_threads_shutdown();
diff --git a/ds4_bench.c b/ds4_bench.c
index 06da6e0eb..cb0daea38 100644
--- a/ds4_bench.c
+++ b/ds4_bench.c
@@ -38,6 +38,7 @@ typedef struct {
     int power_percent;
     double step_mul;
     const char *dump_frontier_logits_dir;
+    const char *expert_profile_path;
     ds4_dist_options dist;
     bool warm_weights;
     bool quality;
@@ -210,6 +211,8 @@ static bench_config parse_options(int argc, char **argv) {
             c.csv_path = need_arg(&i, argc, argv, arg);
         } else if (!strcmp(arg, "--dump-frontier-logits-dir")) {
             c.dump_frontier_logits_dir = need_arg(&i, argc, argv, arg);
+        } else if (!strcmp(arg, "--expert-profile")) {
+            c.expert_profile_path = need_arg(&i, argc, argv, arg);
         } else if (!strcmp(arg, "-t") || !strcmp(arg, "--threads")) {
             c.threads = parse_int(need_arg(&i, argc, argv, arg), arg);
         } else if (!strcmp(arg, "--backend")) {
@@ -463,6 +466,13 @@ int main(int argc, char **argv) {
         fprintf(stderr, "ds4-bench: %s\n", dist_err);
         return 2;
     }
+    if (cfg.expert_profile_path) {
+        if (cfg.backend != DS4_BACKEND_CPU) {
+            fprintf(stderr, "ds4-bench: --expert-profile requires --cpu\n");
+            return 2;
+        }
+        setenv("DS4_EXPERT_PROFILE", cfg.expert_profile_path, 1);
+    }
     ds4_engine *engine = NULL;
     if (ds4_engine_open(&engine, &opt) != 0) return 1;
     log_context_memory(cfg.backend, cfg.ctx_alloc);
diff --git a/ds4_help.c b/ds4_help.c
index 2ca13df4b..06260a721 100644
--- a/ds4_help.c
+++ b/ds4_help.c
@@ -335,6 +335,7 @@ static void print_bench_specific(FILE *fp, const help_colors *c) {
     opt(fp, c, "--gen-tokens N", "Greedy decode tokens per frontier. 0 for pure prefill. Default: 128");
     opt(fp, c, "--csv FILE", "Write CSV there instead of stdout.");
     opt(fp, c, "--dump-frontier-logits-dir DIR", "Write one full-logit JSON file per frontier.");
+    opt(fp, c, "--expert-profile FILE", "Record expert locality stats to JSON (CPU only).");
     fputc('\n', fp);
 }