From 7afa9a76b3f074ba5607baec9f84eb2faee60c7e Mon Sep 17 00:00:00 2001
From: Curtis Anderson <curtis@mlcommons.org>
Date: Tue, 23 Jun 2026 21:33:25 -0700
Subject: [PATCH] fix(kvcache): consolidate per-option workload params in
 mlpstorage (#498, #500)

The mlperf_wrapper.py script was unconditionally injecting WORKLOAD_PARAMS
on every kv-cache.py invocation, overriding user --config YAML and CLI
flags even in OPEN mode. CLOSED/OPEN gating already lives in
mlpstorage_py/cli/kvcache_args.py, so the wrapper-side squashing was
both redundant in CLOSED and actively harmful in OPEN.

WORKLOAD_PARAMS now lives only in mlpstorage_py.benchmarks.kvcache.
KVCacheBenchmark._build_option_kvcache_args emits the per-option flags:
verbatim in CLOSED, with user CLI flags superseding per-option defaults
in OPEN. max-concurrent-allocs stays at the option's mandated value
because it has no OPEN CLI flag.

mlperf_wrapper.py becomes a pure rank-aware launcher: --rank-output-base,
--rank-cache-base, --seed-base, --start-delay, --end-delay; everything
else forwards verbatim to kv-cache.py. Per-rank --seed/--output/--cache-dir
are appended last so argparse store-action takes them over forwarded
duplicates. allow_abbrev=False prevents a forwarded --seed from
collapsing into --seed-base.
---
 kv_cache_benchmark/mlperf_wrapper.py  | 129 +++++++-------------
 mlpstorage_py/benchmarks/kvcache.py   |  95 ++++++++++++++-
 tests/unit/test_benchmarks_kvcache.py | 167 +++++++++++++++++++++++++-
 tests/unit/test_mlperf_wrapper.py     |  90 ++++++++------
 4 files changed, 347 insertions(+), 134 deletions(-)

diff --git a/kv_cache_benchmark/mlperf_wrapper.py b/kv_cache_benchmark/mlperf_wrapper.py
index dc3a5296..896616ae 100755
--- a/kv_cache_benchmark/mlperf_wrapper.py
+++ b/kv_cache_benchmark/mlperf_wrapper.py
@@ -1,10 +1,12 @@
 #!/usr/bin/env python3
-"""MPI-rank-aware wrapper for MLPerf KV Cache benchmark.
+"""MPI-rank-aware launcher for the MLPerf KV Cache benchmark.
 
 Invoked by mpirun per-rank; reads OMPI_COMM_WORLD_RANK (OpenMPI) or PMI_RANK
-(MPICH) to determine this rank's index. Computes per-rank seed, output
-directory, and cache directory, then invokes kv-cache.py with the fixed MLPerf
-v3.0 parameters for the specified option.
+(MPICH) to determine this rank's index. The wrapper does NOT encode any
+workload parameters — it only computes per-rank values (seed, output file,
+cache directory) and forwards everything else to kv-cache.py. The caller
+(mlpstorage_py.benchmarks.kvcache.KVCacheBenchmark) owns the per-option
+workload parameter set and CLOSED/OPEN enforcement.
 """
 
 import argparse
@@ -18,40 +20,6 @@
 BASE_SEED = 42
 TEST_DELAY = 90
 
-# MLPerf v3.0 fixed parameters per option.
-# All numeric values stored as int/float; converted to str when building cmd.
-# 'generation-mode' is ALWAYS 'none' for MLPerf compliance — do NOT rely on
-# kv-cache.py defaults; the default may change in future versions.
-WORKLOAD_PARAMS = {
-    1: {
-        'model': 'llama3.1-8b',
-        'num-users': 200,
-        'duration': 300,
-        'gpu-mem-gb': 0,
-        'cpu-mem-gb': 0,
-        'max-concurrent-allocs': 16,
-        'generation-mode': 'none',
-    },
-    2: {
-        'model': 'llama3.1-8b',
-        'num-users': 100,
-        'duration': 300,
-        'gpu-mem-gb': 0,
-        'cpu-mem-gb': 4,
-        'max-concurrent-allocs': 16,
-        'generation-mode': 'none',
-    },
-    3: {
-        'model': 'llama3.1-70b-instruct',
-        'num-users': 70,
-        'duration': 300,
-        'gpu-mem-gb': 0,
-        'cpu-mem-gb': 0,
-        'max-concurrent-allocs': 4,
-        'generation-mode': 'none',
-    },
-}
-
 
 def get_rank() -> int:
     """Read global MPI rank from environment (no mpi4py).
@@ -79,50 +47,57 @@ def get_rank() -> int:
 
 
 def main():
+    # allow_abbrev=False so forwarded kv-cache.py flags like --seed are not
+    # silently swallowed by a prefix match against --seed-base.
     parser = argparse.ArgumentParser(
-        description="MLPerf KV Cache MPI-rank-aware wrapper"
+        description="MLPerf KV Cache MPI-rank-aware launcher",
+        allow_abbrev=False,
     )
     parser.add_argument(
-        '--option',
-        type=int,
-        choices=[1, 2, 3],
+        '--rank-output-base',
+        type=str,
         required=True,
-        help="MLPerf v3.0 option (1=Max Storage Stress, 2=Storage Throughput, 3=Large Model 70B).",
+        dest='rank_output_base',
+        help="Base output directory. Per-rank results written to <rank-output-base>/rank_<N>/.",
     )
     parser.add_argument(
-        '--seed',
+        '--rank-cache-base',
+        type=str,
+        required=True,
+        dest='rank_cache_base',
+        help="Base cache directory. Per-rank cache written to <rank-cache-base>/rank_<N>/.",
+    )
+    parser.add_argument(
+        '--seed-base',
         type=int,
         default=BASE_SEED,
+        dest='seed_base',
         help=f"Base random seed (default: {BASE_SEED}). Effective seed = base + rank.",
     )
     parser.add_argument(
-        '--base-output-dir',
-        type=str,
-        required=True,
-        dest='base_output_dir',
-        help="Base output directory. Per-rank results written to <base_output_dir>/rank_<N>/.",
-    )
-    parser.add_argument(
-        '--cache-dir',
-        type=str,
-        required=True,
-        dest='cache_dir',
-        help="Base cache directory. Per-rank cache written to <cache_dir>/rank_<N>/.",
+        '--start-delay',
+        type=int,
+        default=TEST_DELAY,
+        dest='start_delay',
+        help=f"Seconds to sleep before invoking kv-cache.py (default: {TEST_DELAY}).",
     )
     parser.add_argument(
-        '--config',
-        type=str,
-        default=None,
-        help="Path to YAML config file. Defaults to config.yaml adjacent to this script (D-02).",
+        '--end-delay',
+        type=int,
+        default=TEST_DELAY,
+        dest='end_delay',
+        help=f"Seconds to sleep after kv-cache.py exits (default: {TEST_DELAY}).",
     )
 
-    args = parser.parse_args()
+    # Wrapper-specific flags are consumed; everything else is forwarded to
+    # kv-cache.py verbatim (including --config, --model, --num-users, etc.).
+    args, forwarded = parser.parse_known_args()
 
     rank = get_rank()
-    effective_seed = args.seed + rank
+    effective_seed = args.seed_base + rank
 
-    rank_output_dir = Path(args.base_output_dir) / f"rank_{rank}"
-    rank_cache_dir = Path(args.cache_dir) / f"rank_{rank}"
+    rank_output_dir = Path(args.rank_output_base) / f"rank_{rank}"
+    rank_cache_dir = Path(args.rank_cache_base) / f"rank_{rank}"
 
     rank_output_dir.mkdir(parents=True, exist_ok=True)
     rank_cache_dir.mkdir(parents=True, exist_ok=True)
@@ -130,38 +105,26 @@ def main():
     ts = datetime.now().strftime('%Y%m%d_%H%M%S')
     output_file = rank_output_dir / f"kvcache_results_{ts}.json"
 
-    # D-01: kv-cache.py located relative to this script; both share kv_cache_benchmark/
     kvcache_script = Path(__file__).parent / 'kv-cache.py'
-    # D-02: config.yaml located relative to this script; user may override via --config
-    config_path = args.config or str(Path(__file__).parent / 'config.yaml')
-
-    params = WORKLOAD_PARAMS[args.option]
 
+    # Per-rank seed/output/cache are appended last so argparse store-action
+    # uses them over any duplicates that came in via --forwarded args.
     cmd = [
         sys.executable,
         str(kvcache_script),
-        '--config', config_path,
+        *forwarded,
         '--seed', str(effective_seed),
         '--output', str(output_file),
         '--cache-dir', str(rank_cache_dir),
-        '--model', params['model'],
-        '--num-users', str(params['num-users']),
-        '--duration', str(params['duration']),
-        '--gpu-mem-gb', str(params['gpu-mem-gb']),
-        '--cpu-mem-gb', str(params['cpu-mem-gb']),
-        '--max-concurrent-allocs', str(params['max-concurrent-allocs']),
-        # '--generation-mode' is passed EXPLICITLY — do NOT omit even though the
-        # value is always 'none'. kv-cache.py defaults may change in future versions.
-        '--generation-mode', params['generation-mode'],
     ]
 
-    print(f"KV Cache Wrapper - Start delay for {TEST_DELAY} seconds")
-    time.sleep(TEST_DELAY)
+    print(f"KV Cache Wrapper - Start delay for {args.start_delay} seconds")
+    time.sleep(args.start_delay)
     print(f"KV Cache Wrapper - Starting benchmark pass...")
 
     result = subprocess.run(cmd)
-    print(f"KV Cache Wrapper - End delay for {TEST_DELAY} seconds")
-    time.sleep(TEST_DELAY)
+    print(f"KV Cache Wrapper - End delay for {args.end_delay} seconds")
+    time.sleep(args.end_delay)
     print(f"KV Cache Wrapper - Finished benchmark pass")
 
     sys.exit(result.returncode)
diff --git a/mlpstorage_py/benchmarks/kvcache.py b/mlpstorage_py/benchmarks/kvcache.py
index 42dd423f..06c26919 100755
--- a/mlpstorage_py/benchmarks/kvcache.py
+++ b/mlpstorage_py/benchmarks/kvcache.py
@@ -33,6 +33,40 @@
 from mlpstorage_py.utils import generate_mpi_prefix_cmd, MLPSJsonEncoder
 
 
+# MLPerf v3.0 fixed per-option workload parameters. Single source of truth for
+# what option N means at the kv-cache.py level. In CLOSED these are mandated;
+# in OPEN they act as per-option defaults that user CLI flags supersede.
+WORKLOAD_PARAMS = {
+    1: {
+        'model': 'llama3.1-8b',
+        'num-users': 200,
+        'duration': 300,
+        'gpu-mem-gb': 0,
+        'cpu-mem-gb': 0,
+        'max-concurrent-allocs': 16,
+        'generation-mode': 'none',
+    },
+    2: {
+        'model': 'llama3.1-8b',
+        'num-users': 100,
+        'duration': 300,
+        'gpu-mem-gb': 0,
+        'cpu-mem-gb': 4,
+        'max-concurrent-allocs': 16,
+        'generation-mode': 'none',
+    },
+    3: {
+        'model': 'llama3.1-70b-instruct',
+        'num-users': 70,
+        'duration': 300,
+        'gpu-mem-gb': 0,
+        'cpu-mem-gb': 0,
+        'max-concurrent-allocs': 4,
+        'generation-mode': 'none',
+    },
+}
+
+
 class KVCacheBenchmark(Benchmark):
     """KV Cache benchmark for LLM inference storage.
 
@@ -234,6 +268,8 @@ def _execute_run(self) -> int:
         )
 
         wrapper_path = Path(self.kvcache_bin_path).parent / 'mlperf_wrapper.py'
+        # Wrapper-adjacent config.yaml is the default; CLOSED forbids overriding.
+        config_path = config or str(Path(self.kvcache_bin_path).parent / 'config.yaml')
 
         mpi_prefix = generate_mpi_prefix_cmd(
             mpi_cmd=getattr(self.args, 'mpi_bin', 'mpirun'),
@@ -248,6 +284,7 @@ def _execute_run(self) -> int:
 
         option_results = {}
         for option in [1, 2, 3]:
+            option_kv_args = self._build_option_kvcache_args(option, is_closed)
             trial_dirs = []
 
             for trial in range(trials):
@@ -258,13 +295,12 @@ def _execute_run(self) -> int:
 
                 wrapper_cmd = (
                     f"{mpi_prefix} {sys.executable} {wrapper_path}"
-                    f" --option {option}"
-                    f" --seed {seed}"
-                    f" --base-output-dir {option_trial_dir}"
-                    f" --cache-dir {cache_dir}"
+                    f" --rank-output-base {option_trial_dir}"
+                    f" --rank-cache-base {cache_dir}"
+                    f" --seed-base {seed}"
+                    f" --config {config_path}"
+                    f" {' '.join(option_kv_args)}"
                 )
-                if config:
-                    wrapper_cmd += f" --config {config}"
 
                 self.logger.status(f"Running option {option} trial {trial + 1}/{trials}...")
                 self._execute_command(
@@ -292,6 +328,53 @@ def _execute_run(self) -> int:
         self.write_cluster_info()
         return 0
 
+    def _build_option_kvcache_args(self, option: int, is_closed: bool) -> List[str]:
+        """Return the kv-cache.py CLI args for this option.
+
+        CLOSED: emits WORKLOAD_PARAMS[option] verbatim — MLPerf-mandated, no
+        user input can reach kv-cache.py through this path because the CLOSED
+        CLI does not expose the corresponding flags.
+
+        OPEN: user-set flags supersede WORKLOAD_PARAMS[option] one key at a
+        time. max-concurrent-allocs is not exposed by the OPEN CLI, so it
+        always comes from WORKLOAD_PARAMS.
+        """
+        defaults = WORKLOAD_PARAMS[option]
+        if is_closed:
+            params = dict(defaults)
+        else:
+            params = {
+                'model': getattr(self.args, 'model', None) or defaults['model'],
+                'num-users': (
+                    getattr(self.args, 'num_users', None)
+                    if getattr(self.args, 'num_users', None) is not None
+                    else defaults['num-users']
+                ),
+                'duration': (
+                    getattr(self.args, 'duration', None)
+                    if getattr(self.args, 'duration', None) is not None
+                    else defaults['duration']
+                ),
+                'gpu-mem-gb': (
+                    getattr(self.args, 'gpu_mem_gb', None)
+                    if getattr(self.args, 'gpu_mem_gb', None) is not None
+                    else defaults['gpu-mem-gb']
+                ),
+                'cpu-mem-gb': (
+                    getattr(self.args, 'cpu_mem_gb', None)
+                    if getattr(self.args, 'cpu_mem_gb', None) is not None
+                    else defaults['cpu-mem-gb']
+                ),
+                'max-concurrent-allocs': defaults['max-concurrent-allocs'],
+                'generation-mode': (
+                    getattr(self.args, 'generation_mode', None) or defaults['generation-mode']
+                ),
+            }
+        out = []
+        for key, value in params.items():
+            out.extend([f'--{key}', str(value)])
+        return out
+
     def _execute_datasize(self) -> int:
         """Calculate memory requirements for KV cache.
 
diff --git a/tests/unit/test_benchmarks_kvcache.py b/tests/unit/test_benchmarks_kvcache.py
index 818cf404..c4158033 100755
--- a/tests/unit/test_benchmarks_kvcache.py
+++ b/tests/unit/test_benchmarks_kvcache.py
@@ -835,8 +835,13 @@ def fake_execute(cmd, **kwargs):
         assert '--npernode 2' in executed_cmds[0], \
             f"Missing --npernode 2 in: {executed_cmds[0]}"
 
-    def test_wrapper_receives_option_seed_output_cache(self, bm, fake_agg_result):
-        """Wrapper command must include --option, --seed, --base-output-dir, --cache-dir (DIST-07)."""
+    def test_wrapper_receives_rank_bases_and_seed_base(self, bm, fake_agg_result):
+        """Wrapper command must include --rank-output-base, --rank-cache-base, --seed-base (DIST-07).
+
+        The wrapper API was reshaped for #498/#500: the wrapper no longer takes
+        --option and no longer encodes WORKLOAD_PARAMS. Per-option kv-cache.py
+        args are emitted by mlpstorage_py.benchmarks.kvcache and pass through
+        the wrapper via parse_known_args."""
         bm.args.trials = 1
         bm.args.inter_option_delay = 0
         bm.args.seed = 42
@@ -852,10 +857,11 @@ def fake_execute(cmd, **kwargs):
              patch.object(bm, 'write_metadata'):
             bm._execute_run()
         cmd0 = executed_cmds[0]
-        assert '--option 1' in cmd0, f"Missing --option 1 in: {cmd0}"
-        assert '--seed 42' in cmd0, f"Missing --seed 42 in: {cmd0}"
-        assert '--base-output-dir' in cmd0, f"Missing --base-output-dir in: {cmd0}"
-        assert '--cache-dir /tmp/kv' in cmd0, f"Missing --cache-dir in: {cmd0}"
+        assert '--seed-base 42' in cmd0, f"Missing --seed-base 42 in: {cmd0}"
+        assert '--rank-output-base' in cmd0, f"Missing --rank-output-base in: {cmd0}"
+        assert '--rank-cache-base /tmp/kv' in cmd0, f"Missing --rank-cache-base in: {cmd0}"
+        # The legacy --option flag is no longer part of the wrapper API.
+        assert '--option ' not in cmd0, f"Stale --option flag in: {cmd0}"
 
     def test_per_option_trial_dirs_created(self, bm, fake_agg_result, tmp_path):
         """option_{N}/trial_{T}/ directories must be created."""
@@ -1131,3 +1137,152 @@ def test_open_seed_override_allowed(self, tmp_path):
              patch.object(bm, 'write_metadata'):
             rc = bm._execute_run()
         assert rc == 0
+
+
+class TestWorkloadParamsConstant:
+    """WORKLOAD_PARAMS lives in mlpstorage_py.benchmarks.kvcache — the single
+    source of truth for per-option MLPerf v3.0 workloads. Previously it lived
+    in kv_cache_benchmark/mlperf_wrapper.py; centralizing it here is what
+    closes issues #498 and #500."""
+
+    def test_options_are_1_2_3(self):
+        from mlpstorage_py.benchmarks.kvcache import WORKLOAD_PARAMS
+        assert set(WORKLOAD_PARAMS.keys()) == {1, 2, 3}
+
+    def test_option1_model_is_8b(self):
+        from mlpstorage_py.benchmarks.kvcache import WORKLOAD_PARAMS
+        assert WORKLOAD_PARAMS[1]['model'] == 'llama3.1-8b'
+
+    def test_option3_model_is_70b(self):
+        from mlpstorage_py.benchmarks.kvcache import WORKLOAD_PARAMS
+        assert WORKLOAD_PARAMS[3]['model'] == 'llama3.1-70b-instruct'
+
+    def test_generation_mode_always_none(self):
+        from mlpstorage_py.benchmarks.kvcache import WORKLOAD_PARAMS
+        for opt in (1, 2, 3):
+            assert WORKLOAD_PARAMS[opt]['generation-mode'] == 'none'
+
+
+class TestBuildOptionKvcacheArgs:
+    """Per-option kv-cache.py CLI args returned by _build_option_kvcache_args.
+
+    Verifies CLOSED uses the mandated WORKLOAD_PARAMS verbatim and OPEN lets
+    user-set CLI flags supersede the per-option defaults — the behavior the
+    old in-wrapper squashing prevented (issues #498 and #500)."""
+
+    def test_closed_emits_workload_params_verbatim_for_option1(self, tmp_path):
+        from mlpstorage_py.benchmarks.kvcache import WORKLOAD_PARAMS
+        bm = _make_run_benchmark(tmp_path)
+        # Even if args carry non-default values, CLOSED ignores them.
+        bm.args.model = 'llama3.1-70b-instruct'
+        bm.args.num_users = 999
+        out = bm._build_option_kvcache_args(1, is_closed=True)
+        assert '--model' in out and out[out.index('--model') + 1] == WORKLOAD_PARAMS[1]['model']
+        assert '--num-users' in out and out[out.index('--num-users') + 1] == str(WORKLOAD_PARAMS[1]['num-users'])
+
+    def test_closed_emits_option3_70b_model(self, tmp_path):
+        bm = _make_run_benchmark(tmp_path)
+        out = bm._build_option_kvcache_args(3, is_closed=True)
+        assert out[out.index('--model') + 1] == 'llama3.1-70b-instruct'
+
+    def test_open_user_args_override_defaults(self, tmp_path):
+        """Issue #498: in OPEN, user CLI args must reach kv-cache.py."""
+        bm = _make_run_benchmark(tmp_path)
+        bm.args.model = 'llama3.1-70b-instruct'
+        bm.args.num_users = 42
+        bm.args.duration = 600
+        bm.args.gpu_mem_gb = 8
+        bm.args.cpu_mem_gb = 16
+        bm.args.generation_mode = 'realistic'
+        out = bm._build_option_kvcache_args(1, is_closed=False)
+        assert out[out.index('--model') + 1] == 'llama3.1-70b-instruct'
+        assert out[out.index('--num-users') + 1] == '42'
+        assert out[out.index('--duration') + 1] == '600'
+        assert out[out.index('--gpu-mem-gb') + 1] == '8'
+        assert out[out.index('--cpu-mem-gb') + 1] == '16'
+        assert out[out.index('--generation-mode') + 1] == 'realistic'
+
+    def test_open_falls_back_to_workload_params_when_args_missing(self, tmp_path):
+        """In OPEN, attributes the user did not set come from WORKLOAD_PARAMS."""
+        from mlpstorage_py.benchmarks.kvcache import WORKLOAD_PARAMS
+        bm = _make_run_benchmark(tmp_path)
+        for attr in ('model', 'num_users', 'duration', 'gpu_mem_gb',
+                     'cpu_mem_gb', 'generation_mode'):
+            if hasattr(bm.args, attr):
+                delattr(bm.args, attr)
+        out = bm._build_option_kvcache_args(2, is_closed=False)
+        assert out[out.index('--model') + 1] == WORKLOAD_PARAMS[2]['model']
+        assert out[out.index('--num-users') + 1] == str(WORKLOAD_PARAMS[2]['num-users'])
+        assert out[out.index('--cpu-mem-gb') + 1] == str(WORKLOAD_PARAMS[2]['cpu-mem-gb'])
+
+    def test_max_concurrent_allocs_always_from_workload_params(self, tmp_path):
+        """max-concurrent-allocs is not user-exposed even in OPEN — it
+        always tracks the per-option WORKLOAD_PARAMS value."""
+        from mlpstorage_py.benchmarks.kvcache import WORKLOAD_PARAMS
+        bm = _make_run_benchmark(tmp_path)
+        out_open = bm._build_option_kvcache_args(3, is_closed=False)
+        out_closed = bm._build_option_kvcache_args(3, is_closed=True)
+        expected = str(WORKLOAD_PARAMS[3]['max-concurrent-allocs'])
+        assert out_open[out_open.index('--max-concurrent-allocs') + 1] == expected
+        assert out_closed[out_closed.index('--max-concurrent-allocs') + 1] == expected
+
+
+class TestWrapperCommandForwardsPerOptionArgs:
+    """End-to-end check that the wrapper invocation built by _execute_run
+    contains the per-option model/num-users/etc., so kv-cache.py receives
+    them downstream of mlperf_wrapper.py."""
+
+    def _capture_first_cmd(self, bm, fake_agg_result):
+        executed = []
+        def fake_execute(cmd, **kwargs):
+            executed.append(cmd)
+            return ('', '', 0)
+        with patch.object(bm, '_execute_command', side_effect=fake_execute), \
+             patch.object(bm, '_interruptible_sleep'), \
+             patch.object(bm, '_aggregate_option_results', return_value=fake_agg_result), \
+             patch.object(bm, '_write_run_summary'), \
+             patch.object(bm, 'write_metadata'):
+            bm._execute_run()
+        return executed[0]
+
+    def _fake_agg(self):
+        return {
+            'option': 1, 'aggregated_read_bandwidth_gbps': 0.0,
+            'aggregated_write_bandwidth_gbps': 0.0,
+            'aggregated_avg_throughput_tokens_per_sec': 0.0,
+            'aggregated_storage_throughput_tokens_per_sec': 0.0,
+            'aggregated_p95_latency_ms': 0.0,
+            'rank_count': 1, 'trial_count': 1,
+            'partial_failure': False, 'missing_files': [], 'cpu_tier_ranks': [],
+        }
+
+    def test_closed_option1_wrapper_cmd_contains_8b_model(self, tmp_path):
+        bm = _make_run_benchmark(tmp_path)
+        bm.args.mode = 'closed'
+        # CLOSED requires trials=3 and inter_option_delay=20 (the fixture's
+        # values); overriding them would trigger CLOSED enforcement and
+        # short-circuit before any command is built.
+        cmd0 = self._capture_first_cmd(bm, self._fake_agg())
+        assert '--model llama3.1-8b' in cmd0
+        assert '--num-users 200' in cmd0
+        assert '--max-concurrent-allocs 16' in cmd0
+
+    def test_open_user_model_appears_in_wrapper_cmd(self, tmp_path):
+        """Issue #498: user --model must reach the wrapper-built command."""
+        bm = _make_run_benchmark(tmp_path)
+        bm.args.mode = 'open'
+        bm.args.model = 'llama3.1-70b-instruct'
+        bm.args.num_users = 33
+        bm.args.trials = 1
+        bm.args.inter_option_delay = 0
+        cmd0 = self._capture_first_cmd(bm, self._fake_agg())
+        assert '--model llama3.1-70b-instruct' in cmd0
+        assert '--num-users 33' in cmd0
+
+    def test_wrapper_cmd_contains_config_path(self, tmp_path):
+        """mlpstorage now owns the config path; it must be passed to the wrapper."""
+        bm = _make_run_benchmark(tmp_path)
+        bm.args.trials = 1
+        bm.args.inter_option_delay = 0
+        cmd0 = self._capture_first_cmd(bm, self._fake_agg())
+        assert '--config' in cmd0
diff --git a/tests/unit/test_mlperf_wrapper.py b/tests/unit/test_mlperf_wrapper.py
index 30fa622d..228e3d48 100755
--- a/tests/unit/test_mlperf_wrapper.py
+++ b/tests/unit/test_mlperf_wrapper.py
@@ -2,8 +2,9 @@
 
 Tests cover:
 - get_rank(): env var reading, OMPI precedence over PMI, invalid value fallback
-- BASE_SEED and WORKLOAD_PARAMS constants
-- main(): rank dir creation, effective seed, output flag, option params, config path
+- BASE_SEED constant
+- main(): rank dir creation, effective seed, output flag, forwarded args
+  pass-through, per-rank overrides come last (argparse-style override)
 """
 
 import importlib.util
@@ -60,32 +61,17 @@ class TestBaseConstants:
     def test_base_seed_is_42(self, wrapper_module):
         assert wrapper_module.BASE_SEED == 42
 
-
-class TestOptionParams:
-    """Tests for WORKLOAD_PARAMS fixed MLPerf values."""
-
-    def test_option_keys_are_1_2_3(self, wrapper_module):
-        assert set(wrapper_module.WORKLOAD_PARAMS.keys()) == {1, 2, 3}
-
-    def test_option1_model_is_8b(self, wrapper_module):
-        assert wrapper_module.WORKLOAD_PARAMS[1]['model'] == 'llama3.1-8b'
-
-    def test_option3_model_is_70b(self, wrapper_module):
-        assert wrapper_module.WORKLOAD_PARAMS[3]['model'] == 'llama3.1-70b-instruct'
-
-    def test_generation_mode_always_none(self, wrapper_module):
-        for opt in [1, 2, 3]:
-            assert wrapper_module.WORKLOAD_PARAMS[opt]['generation-mode'] == 'none'
-
-    def test_option2_cpu_mem_gb_is_4(self, wrapper_module):
-        assert wrapper_module.WORKLOAD_PARAMS[2]['cpu-mem-gb'] == 4
+    def test_wrapper_does_not_define_workload_params(self, wrapper_module):
+        """WORKLOAD_PARAMS lives in mlpstorage_py.benchmarks.kvcache now;
+        the wrapper must remain a dumb rank-aware launcher."""
+        assert not hasattr(wrapper_module, 'WORKLOAD_PARAMS')
 
 
 class TestMain:
     """Tests for main() — invokes subprocess and exits."""
 
     def _run_main(self, wrapper_module, monkeypatch, tmp_path, extra_argv=None,
-                  ompi_rank=None):
+                  ompi_rank=None, start_delay=0, end_delay=0):
         """Helper: sets up env, patches argv+subprocess, calls main(), captures cmd."""
         monkeypatch.delenv('OMPI_COMM_WORLD_RANK', raising=False)
         monkeypatch.delenv('PMI_RANK', raising=False)
@@ -96,10 +82,11 @@ def _run_main(self, wrapper_module, monkeypatch, tmp_path, extra_argv=None,
         cache_dir = tmp_path / 'cache'
         argv = [
             'mlperf_wrapper.py',
-            '--option', '1',
-            '--seed', '42',
-            '--base-output-dir', str(output_dir),
-            '--cache-dir', str(cache_dir),
+            '--rank-output-base', str(output_dir),
+            '--rank-cache-base', str(cache_dir),
+            '--seed-base', '42',
+            '--start-delay', str(start_delay),
+            '--end-delay', str(end_delay),
         ]
         if extra_argv:
             argv.extend(extra_argv)
@@ -130,7 +117,7 @@ def test_creates_rank_cache_dir(self, wrapper_module, monkeypatch, tmp_path):
     def test_effective_seed_is_base_plus_rank(self, wrapper_module, monkeypatch, tmp_path):
         cmd, _, _ = self._run_main(
             wrapper_module, monkeypatch, tmp_path,
-            extra_argv=['--seed', '10'],
+            extra_argv=['--seed-base', '10'],
             ompi_rank=2,
         )
         seed_idx = cmd.index('--seed')
@@ -143,19 +130,44 @@ def test_output_flag_points_to_rank_dir(self, wrapper_module, monkeypatch, tmp_p
         assert 'rank_0' in output_val
         assert 'kvcache_results_' in output_val
 
-    def test_workload_params_injected_for_option1(self, wrapper_module, monkeypatch, tmp_path):
-        cmd, _, _ = self._run_main(wrapper_module, monkeypatch, tmp_path)
-        assert '--model' in cmd
-        model_idx = cmd.index('--model')
-        assert cmd[model_idx + 1] == 'llama3.1-8b'
+    def test_cache_dir_points_to_rank_cache_dir(self, wrapper_module, monkeypatch, tmp_path):
+        cmd, _, cache_dir = self._run_main(wrapper_module, monkeypatch, tmp_path)
+        cache_idx = cmd.index('--cache-dir')
+        cache_val = cmd[cache_idx + 1]
+        assert str(cache_dir / 'rank_0') == cache_val
 
-    def test_config_default_is_adjacent_config_yaml(self, wrapper_module, monkeypatch, tmp_path):
-        cmd, _, _ = self._run_main(wrapper_module, monkeypatch, tmp_path)
-        assert any('config.yaml' in arg for arg in cmd)
-
-    def test_config_override_accepted(self, wrapper_module, monkeypatch, tmp_path):
+    def test_forwarded_args_passed_through_verbatim(self, wrapper_module, monkeypatch, tmp_path):
+        """Wrapper forwards unrecognized args to kv-cache.py without modification."""
         cmd, _, _ = self._run_main(
             wrapper_module, monkeypatch, tmp_path,
-            extra_argv=['--config', '/custom/config.yaml'],
+            extra_argv=['--model', 'llama3.1-8b', '--num-users', '200',
+                        '--config', '/tmp/cfg.yaml'],
         )
-        assert '/custom/config.yaml' in cmd
+        assert '--model' in cmd
+        assert cmd[cmd.index('--model') + 1] == 'llama3.1-8b'
+        assert '--num-users' in cmd
+        assert cmd[cmd.index('--num-users') + 1] == '200'
+        assert '--config' in cmd
+        assert cmd[cmd.index('--config') + 1] == '/tmp/cfg.yaml'
+
+    def test_per_rank_overrides_come_last(self, wrapper_module, monkeypatch, tmp_path):
+        """Per-rank --seed/--output/--cache-dir are appended after forwarded
+        args so argparse store-action picks them over any duplicates."""
+        cmd, _, _ = self._run_main(
+            wrapper_module, monkeypatch, tmp_path,
+            extra_argv=['--seed', '999', '--cache-dir', '/forwarded/cache'],
+        )
+        # Last --seed wins; wrapper's effective seed (42 + rank 0 = 42)
+        # must appear AFTER the forwarded --seed.
+        seed_positions = [i for i, x in enumerate(cmd) if x == '--seed']
+        assert len(seed_positions) == 2
+        assert cmd[seed_positions[-1] + 1] == '42'
+
+        cache_positions = [i for i, x in enumerate(cmd) if x == '--cache-dir']
+        assert len(cache_positions) == 2
+        assert '/forwarded/cache' not in cmd[cache_positions[-1] + 1]
+
+    def test_no_config_default_injected(self, wrapper_module, monkeypatch, tmp_path):
+        """Wrapper no longer falls back to an adjacent config.yaml — caller owns the path."""
+        cmd, _, _ = self._run_main(wrapper_module, monkeypatch, tmp_path)
+        assert '--config' not in cmd