From fbfccd9a6e2fee9206d839f232b57cb11678ea01 Mon Sep 17 00:00:00 2001
From: Rashid Kaleem <230885705+arekay-nv@users.noreply.github.com>
Date: Sat, 6 Jun 2026 14:35:51 -0700
Subject: [PATCH 1/2] Enable per dataset max-osl

Signed-off-by: Rashid Kaleem <230885705+arekay-nv@users.noreply.github.com>
---
 .../commands/benchmark/execute.py             |  21 +++-
 src/inference_endpoint/config/schema.py       |  39 ++++++
 .../templates/concurrency_template_full.yaml  |   2 +
 .../templates/offline_template_full.yaml      |   2 +
 .../templates/online_template_full.yaml       |   2 +
 .../openai/completions_adapter.py             |   1 +
 tests/unit/commands/test_benchmark.py         | 119 ++++++++++++++++++
 tests/unit/config/test_schema.py              |  59 +++++++++
 8 files changed, 240 insertions(+), 5 deletions(-)

diff --git a/src/inference_endpoint/commands/benchmark/execute.py b/src/inference_endpoint/commands/benchmark/execute.py
index e3c5505b9..2eeda9ec1 100644
--- a/src/inference_endpoint/commands/benchmark/execute.py
+++ b/src/inference_endpoint/commands/benchmark/execute.py
@@ -288,9 +288,13 @@ def _load_datasets(
                 acc_cfg.accuracy_config.extras or {},
             )
         )
-        ds.load(
-            api_type=config.endpoint_config.api_type, model_params=config.model_params
-        )
+        try:
+            ds_model_params = acc_cfg.effective_model_params(config.model_params)
+        except Exception as e:
+            raise InputValidationError(
+                f"Dataset '{acc_cfg.name}': invalid model_params_override: {e}"
+            ) from e
+        ds.load(api_type=config.endpoint_config.api_type, model_params=ds_model_params)
         logger.info(f"Loaded {ds} - {ds.num_samples()} samples")
 
     if not accuracy_cfgs:
@@ -298,10 +302,17 @@ def _load_datasets(
     if len(performance_cfgs) > 1:
         raise InputValidationError("Multiple performance datasets not supported")
 
+    perf_cfg = performance_cfgs[0]
+    try:
+        perf_model_params = perf_cfg.effective_model_params(config.model_params)
+    except Exception as e:
+        raise InputValidationError(
+            f"Dataset '{perf_cfg.name}': invalid model_params_override: {e}"
+        ) from e
     try:
-        dataloader = DataLoaderFactory.create_loader(performance_cfgs[0])
+        dataloader = DataLoaderFactory.create_loader(perf_cfg)
         dataloader.load(
-            api_type=config.endpoint_config.api_type, model_params=config.model_params
+            api_type=config.endpoint_config.api_type, model_params=perf_model_params
         )
         logger.info(f"Loaded {dataloader.num_samples()} samples")
     except FileNotFoundError as e:
diff --git a/src/inference_endpoint/config/schema.py b/src/inference_endpoint/config/schema.py
index 2cfa35d73..3ecd3d5bc 100644
--- a/src/inference_endpoint/config/schema.py
+++ b/src/inference_endpoint/config/schema.py
@@ -313,6 +313,17 @@ class Dataset(BaseModel):
     multi_turn: MultiTurnConfig | None = Field(
         None, description="Multi-turn conversation configuration"
     )
+    model_params_override: dict[str, Any] | None = Field(
+        None,
+        description=(
+            "Per-dataset overrides for the top-level model_params (sparse — "
+            "only the fields you want to override). Merged on top of "
+            "BenchmarkConfig.model_params at dataset-load time. Useful for "
+            "MLPerf-style runs where accuracy and performance use different "
+            "output budgets in the same fleet, e.g. "
+            "model_params_override: {max_new_tokens: 32768, streaming: 'on'}."
+        ),
+    )
 
     @model_validator(mode="after")
     def _auto_derive_name(self) -> Self:
@@ -321,6 +332,34 @@ def _auto_derive_name(self) -> Self:
             object.__setattr__(self, "name", Path(self.path).stem)
         return self
 
+    @model_validator(mode="after")
+    def _validate_model_params_override(self) -> Self:
+        """Fail fast on unknown keys; we cannot validate values here because
+        merging requires the base model_params, which lives on BenchmarkConfig.
+        """
+        if self.model_params_override:
+            valid = set(ModelParams.model_fields)
+            bad = sorted(set(self.model_params_override) - valid)
+            if bad:
+                raise ValueError(
+                    f"Dataset '{self.name}': unknown keys in "
+                    f"model_params_override: {bad}. "
+                    f"Valid keys: {sorted(valid)}"
+                )
+        return self
+
+    def effective_model_params(self, base: ModelParams) -> ModelParams:
+        """Return base merged with this dataset's overrides.
+
+        Re-validates the merged dict through ``ModelParams.model_validate``
+        so that cross-field constraints (e.g. value ranges) catch bad
+        overrides before downstream code touches them.
+        """
+        if not self.model_params_override:
+            return base
+        merged = {**base.model_dump(), **self.model_params_override}
+        return ModelParams.model_validate(merged)
+
 
 class AccuracyConfig(BaseModel):
     """Accuracy configuration.
diff --git a/src/inference_endpoint/config/templates/concurrency_template_full.yaml b/src/inference_endpoint/config/templates/concurrency_template_full.yaml
index 4fef4afcb..8bec418d0 100644
--- a/src/inference_endpoint/config/templates/concurrency_template_full.yaml
+++ b/src/inference_endpoint/config/templates/concurrency_template_full.yaml
@@ -27,6 +27,7 @@ datasets:  # Dataset configs
     prompt: text_input
   accuracy_config: null  # Accuracy evaluation settings
   multi_turn: null  # Multi-turn conversation configuration
+  model_params_override: null  # Per-dataset overrides for the top-level model_params (sparse — only the fields you want to override). Merged on top of BenchmarkConfig.model_params at dataset-load time. Useful for MLPerf-style runs where accuracy and performance use different output budgets in the same fleet, e.g. model_params_override: {max_new_tokens: 32768, streaming: 'on'}.
 - name: accuracy
   type: accuracy  # Dataset purpose: performance or accuracy | options: performance, accuracy
   path: '<DATASET_PATH eg: tests/assets/datasets/ds_samples.jsonl>'  # Dataset file path
@@ -42,6 +43,7 @@ datasets:  # Dataset configs
     extractor: boxed_math_extractor  # Answer extractor (abcd_extractor, boxed_math_extractor, identity_extractor, python_code_extractor)
     num_repeats: 1  # Repeat dataset N times for evaluation
   multi_turn: null  # Multi-turn conversation configuration
+  model_params_override: null  # Per-dataset overrides for the top-level model_params (sparse — only the fields you want to override). Merged on top of BenchmarkConfig.model_params at dataset-load time. Useful for MLPerf-style runs where accuracy and performance use different output budgets in the same fleet, e.g. model_params_override: {max_new_tokens: 32768, streaming: 'on'}.
 settings:
   runtime:
     min_duration_ms: 600000  # Min duration (ms, or with suffix: 600s, 10m)
diff --git a/src/inference_endpoint/config/templates/offline_template_full.yaml b/src/inference_endpoint/config/templates/offline_template_full.yaml
index 1f61837fe..1047e7867 100644
--- a/src/inference_endpoint/config/templates/offline_template_full.yaml
+++ b/src/inference_endpoint/config/templates/offline_template_full.yaml
@@ -27,6 +27,7 @@ datasets:  # Dataset configs
     prompt: text_input
   accuracy_config: null  # Accuracy evaluation settings
   multi_turn: null  # Multi-turn conversation configuration
+  model_params_override: null  # Per-dataset overrides for the top-level model_params (sparse — only the fields you want to override). Merged on top of BenchmarkConfig.model_params at dataset-load time. Useful for MLPerf-style runs where accuracy and performance use different output budgets in the same fleet, e.g. model_params_override: {max_new_tokens: 32768, streaming: 'on'}.
 - name: accuracy
   type: accuracy  # Dataset purpose: performance or accuracy | options: performance, accuracy
   path: '<DATASET_PATH eg: tests/assets/datasets/ds_samples.jsonl>'  # Dataset file path
@@ -42,6 +43,7 @@ datasets:  # Dataset configs
     extractor: boxed_math_extractor  # Answer extractor (abcd_extractor, boxed_math_extractor, identity_extractor, python_code_extractor)
     num_repeats: 1  # Repeat dataset N times for evaluation
   multi_turn: null  # Multi-turn conversation configuration
+  model_params_override: null  # Per-dataset overrides for the top-level model_params (sparse — only the fields you want to override). Merged on top of BenchmarkConfig.model_params at dataset-load time. Useful for MLPerf-style runs where accuracy and performance use different output budgets in the same fleet, e.g. model_params_override: {max_new_tokens: 32768, streaming: 'on'}.
 settings:
   runtime:
     min_duration_ms: 600000  # Min duration (ms, or with suffix: 600s, 10m)
diff --git a/src/inference_endpoint/config/templates/online_template_full.yaml b/src/inference_endpoint/config/templates/online_template_full.yaml
index a212fa95b..702858644 100644
--- a/src/inference_endpoint/config/templates/online_template_full.yaml
+++ b/src/inference_endpoint/config/templates/online_template_full.yaml
@@ -27,6 +27,7 @@ datasets:  # Dataset configs
     prompt: text_input
   accuracy_config: null  # Accuracy evaluation settings
   multi_turn: null  # Multi-turn conversation configuration
+  model_params_override: null  # Per-dataset overrides for the top-level model_params (sparse — only the fields you want to override). Merged on top of BenchmarkConfig.model_params at dataset-load time. Useful for MLPerf-style runs where accuracy and performance use different output budgets in the same fleet, e.g. model_params_override: {max_new_tokens: 32768, streaming: 'on'}.
 - name: accuracy
   type: accuracy  # Dataset purpose: performance or accuracy | options: performance, accuracy
   path: '<DATASET_PATH eg: tests/assets/datasets/ds_samples.jsonl>'  # Dataset file path
@@ -42,6 +43,7 @@ datasets:  # Dataset configs
     extractor: boxed_math_extractor  # Answer extractor (abcd_extractor, boxed_math_extractor, identity_extractor, python_code_extractor)
     num_repeats: 1  # Repeat dataset N times for evaluation
   multi_turn: null  # Multi-turn conversation configuration
+  model_params_override: null  # Per-dataset overrides for the top-level model_params (sparse — only the fields you want to override). Merged on top of BenchmarkConfig.model_params at dataset-load time. Useful for MLPerf-style runs where accuracy and performance use different output budgets in the same fleet, e.g. model_params_override: {max_new_tokens: 32768, streaming: 'on'}.
 settings:
   runtime:
     min_duration_ms: 600000  # Min duration (ms, or with suffix: 600s, 10m)
diff --git a/src/inference_endpoint/openai/completions_adapter.py b/src/inference_endpoint/openai/completions_adapter.py
index 22b374136..548edac85 100644
--- a/src/inference_endpoint/openai/completions_adapter.py
+++ b/src/inference_endpoint/openai/completions_adapter.py
@@ -62,6 +62,7 @@ def dataset_transforms(cls, model_params: ModelParams) -> list[Transform]:
             "repetition_penalty": model_params.repetition_penalty,
             "presence_penalty": model_params.presence_penalty,
             "frequency_penalty": model_params.frequency_penalty,
+            "skip_special_tokens": False,
         }
         return [
             Harmonize(),
diff --git a/tests/unit/commands/test_benchmark.py b/tests/unit/commands/test_benchmark.py
index 1c90554fb..5be94d667 100644
--- a/tests/unit/commands/test_benchmark.py
+++ b/tests/unit/commands/test_benchmark.py
@@ -16,6 +16,7 @@
 """Tests for benchmark CLI models, config building, and command handlers."""
 
 import asyncio
+import json
 import random
 import tempfile
 from pathlib import Path
@@ -34,6 +35,7 @@
     BenchmarkContext,
     ResponseCollector,
     _build_phases,
+    _load_datasets,
     _run_benchmark_async,
     setup_benchmark,
 )
@@ -1277,3 +1279,120 @@ def test_no_override_yields_none_when_model_has_no_tokenizer(
             ctx = setup_benchmark(config, TestMode.PERF)
 
         assert ctx.tokenizer_name is None
+
+
+class TestLoadDatasetsModelParamsOverride:
+    """End-to-end check that _load_datasets honors per-dataset
+    model_params_override and propagates the overridden value down to the
+    static columns the adapter adds to each row."""
+
+    def _write_jsonl(self, path: Path, rows: list[dict]) -> None:
+        path.write_text("\n".join(json.dumps(r) for r in rows) + "\n")
+
+    def _build_config(
+        self,
+        perf_path: Path,
+        acc_path: Path,
+        acc_override: dict | None,
+        perf_override: dict | None = None,
+    ) -> BenchmarkConfig:
+        return BenchmarkConfig(
+            type=TestType.OFFLINE,
+            model_params={"name": "test-model", "max_new_tokens": 1024},
+            endpoint_config={
+                "endpoints": ["http://localhost:8000"],
+                "api_type": "openai",
+            },
+            datasets=[
+                {
+                    "name": "perf",
+                    "type": "performance",
+                    "path": str(perf_path),
+                    **(
+                        {"model_params_override": perf_override}
+                        if perf_override
+                        else {}
+                    ),
+                },
+                {
+                    "name": "acc",
+                    "type": "accuracy",
+                    "path": str(acc_path),
+                    "accuracy_config": {
+                        "eval_method": "pass_at_1",
+                        "ground_truth": "ground_truth",
+                        "extractor": "boxed_math_extractor",
+                    },
+                    **({"model_params_override": acc_override} if acc_override else {}),
+                },
+            ],
+        )
+
+    @pytest.mark.unit
+    def test_override_propagates_to_loaded_rows(self, tmp_path):
+        """Override on accuracy dataset → its rows get max_completion_tokens=32768;
+        unmodified perf dataset keeps the global 1024."""
+        perf_path = tmp_path / "perf.jsonl"
+        acc_path = tmp_path / "acc.jsonl"
+        self._write_jsonl(perf_path, [{"prompt": "perf-prompt"}])
+        self._write_jsonl(acc_path, [{"prompt": "acc-prompt", "ground_truth": "42"}])
+
+        config = self._build_config(
+            perf_path, acc_path, acc_override={"max_new_tokens": 32768}
+        )
+        perf_ds, acc_datasets, _ = _load_datasets(config, tmp_path)
+
+        # openai (chat completions) adapter emits the key `max_completion_tokens`
+        # via AddStaticColumns. Each loaded row should carry its dataset's value.
+        assert perf_ds.load_sample(0)["max_completion_tokens"] == 1024
+        assert acc_datasets[0].load_sample(0)["max_completion_tokens"] == 32768
+
+    @pytest.mark.unit
+    def test_no_override_inherits_global(self, tmp_path):
+        """Without overrides, both datasets use the global model_params."""
+        perf_path = tmp_path / "perf.jsonl"
+        acc_path = tmp_path / "acc.jsonl"
+        self._write_jsonl(perf_path, [{"prompt": "perf-prompt"}])
+        self._write_jsonl(acc_path, [{"prompt": "acc-prompt", "ground_truth": "42"}])
+
+        config = self._build_config(perf_path, acc_path, acc_override=None)
+        perf_ds, acc_datasets, _ = _load_datasets(config, tmp_path)
+
+        assert perf_ds.load_sample(0)["max_completion_tokens"] == 1024
+        assert acc_datasets[0].load_sample(0)["max_completion_tokens"] == 1024
+
+    @pytest.mark.unit
+    def test_perf_dataset_override_also_honored(self, tmp_path):
+        """Symmetric check: overrides on the performance entry also flow
+        through (relevant for MLPerf-style perf with shorter max_new_tokens)."""
+        perf_path = tmp_path / "perf.jsonl"
+        acc_path = tmp_path / "acc.jsonl"
+        self._write_jsonl(perf_path, [{"prompt": "perf-prompt"}])
+        self._write_jsonl(acc_path, [{"prompt": "acc-prompt", "ground_truth": "42"}])
+
+        config = self._build_config(
+            perf_path,
+            acc_path,
+            acc_override={"max_new_tokens": 32768},
+            perf_override={"max_new_tokens": 10240},
+        )
+        perf_ds, acc_datasets, _ = _load_datasets(config, tmp_path)
+
+        assert perf_ds.load_sample(0)["max_completion_tokens"] == 10240
+        assert acc_datasets[0].load_sample(0)["max_completion_tokens"] == 32768
+
+    @pytest.mark.unit
+    def test_invalid_override_value_raises_input_validation_error(self, tmp_path):
+        """A value-level invalidity (e.g. bad streaming enum) is caught at
+        load time and surfaces as InputValidationError, not a generic
+        SetupError, so the user sees a clear actionable message."""
+        perf_path = tmp_path / "perf.jsonl"
+        acc_path = tmp_path / "acc.jsonl"
+        self._write_jsonl(perf_path, [{"prompt": "perf-prompt"}])
+        self._write_jsonl(acc_path, [{"prompt": "acc-prompt", "ground_truth": "42"}])
+
+        config = self._build_config(
+            perf_path, acc_path, acc_override={"streaming": "garbage"}
+        )
+        with pytest.raises(InputValidationError, match="invalid model_params_override"):
+            _load_datasets(config, tmp_path)
diff --git a/tests/unit/config/test_schema.py b/tests/unit/config/test_schema.py
index a60770121..d19dce713 100644
--- a/tests/unit/config/test_schema.py
+++ b/tests/unit/config/test_schema.py
@@ -123,6 +123,65 @@ def test_auto_derive_name(self):
         ds = Dataset(path="datasets/my_data.jsonl")
         assert ds.name == "my_data"
 
+    @pytest.mark.unit
+    def test_model_params_override_accepts_known_keys(self):
+        ds = Dataset(
+            name="acc",
+            type=DatasetType.ACCURACY,
+            path="acc.jsonl",
+            model_params_override={"max_new_tokens": 32768, "streaming": "on"},
+        )
+        assert ds.model_params_override == {
+            "max_new_tokens": 32768,
+            "streaming": "on",
+        }
+
+    @pytest.mark.unit
+    def test_model_params_override_rejects_unknown_key(self):
+        with pytest.raises(
+            ValueError, match=r"unknown keys in model_params_override.*bogus"
+        ):
+            Dataset(
+                name="acc",
+                path="a.jsonl",
+                model_params_override={"bogus": 1},
+            )
+
+    @pytest.mark.unit
+    def test_model_params_override_none_is_noop(self):
+        base = ModelParams(name="m", max_new_tokens=1024, streaming=StreamingMode.ON)
+        ds = Dataset(name="x", path="x.jsonl")
+        assert ds.effective_model_params(base) is base
+
+    @pytest.mark.unit
+    def test_effective_model_params_merges_sparse_dict(self):
+        base = ModelParams(name="m", temperature=0.5, top_p=0.9, max_new_tokens=1024)
+        ds = Dataset(
+            name="x",
+            path="x.jsonl",
+            model_params_override={"max_new_tokens": 32768},
+        )
+        merged = ds.effective_model_params(base)
+        # overridden field changes...
+        assert merged.max_new_tokens == 32768
+        # ...everything else is preserved from base
+        assert merged.name == "m"
+        assert merged.temperature == 0.5
+        assert merged.top_p == 0.9
+
+    @pytest.mark.unit
+    def test_effective_model_params_validates_value(self):
+        """ModelParams.model_validate is invoked on the merged dict, so a
+        type-invalid override is rejected (e.g. wrong type for streaming)."""
+        base = ModelParams(name="m")
+        ds = Dataset(
+            name="x",
+            path="x.jsonl",
+            model_params_override={"streaming": "garbage"},
+        )
+        with pytest.raises(ValueError):
+            ds.effective_model_params(base)
+
 
 class TestBenchmarkConfig:
     @pytest.mark.unit

From 553a6d2a2d16f9c9c1488d2a4f7a1937e279d797 Mon Sep 17 00:00:00 2001
From: Rashid Kaleem <230885705+arekay-nv@users.noreply.github.com>
Date: Sat, 6 Jun 2026 23:47:24 -0700
Subject: [PATCH 2/2] Address comments

Signed-off-by: Rashid Kaleem <230885705+arekay-nv@users.noreply.github.com>
---
 .../commands/benchmark/execute.py             |  17 ++-
 src/inference_endpoint/config/schema.py       |  76 +++++++++---
 .../templates/concurrency_template_full.yaml  |   4 +-
 .../templates/offline_template_full.yaml      |   4 +-
 .../templates/online_template_full.yaml       |   4 +-
 .../openai/completions_adapter.py             |   1 -
 tests/unit/commands/test_benchmark.py         | 109 +++++++++++-------
 tests/unit/config/test_schema.py              |  78 ++++++++++---
 8 files changed, 206 insertions(+), 87 deletions(-)

diff --git a/src/inference_endpoint/commands/benchmark/execute.py b/src/inference_endpoint/commands/benchmark/execute.py
index 2eeda9ec1..8b9ee0499 100644
--- a/src/inference_endpoint/commands/benchmark/execute.py
+++ b/src/inference_endpoint/commands/benchmark/execute.py
@@ -42,6 +42,7 @@
 import msgspec
 import msgspec.json
 from huggingface_hub import model_info
+from pydantic import ValidationError
 from tqdm import tqdm
 from transformers import AutoTokenizer
 from transformers.utils import logging as transformers_logging
@@ -289,10 +290,10 @@ def _load_datasets(
             )
         )
         try:
-            ds_model_params = acc_cfg.effective_model_params(config.model_params)
-        except Exception as e:
+            ds_model_params = acc_cfg.effective_generation_config(config.model_params)
+        except (ValidationError, ValueError) as e:
             raise InputValidationError(
-                f"Dataset '{acc_cfg.name}': invalid model_params_override: {e}"
+                f"Dataset '{acc_cfg.name}': invalid generation_config_override: {e}"
             ) from e
         ds.load(api_type=config.endpoint_config.api_type, model_params=ds_model_params)
         logger.info(f"Loaded {ds} - {ds.num_samples()} samples")
@@ -304,10 +305,10 @@ def _load_datasets(
 
     perf_cfg = performance_cfgs[0]
     try:
-        perf_model_params = perf_cfg.effective_model_params(config.model_params)
-    except Exception as e:
+        perf_model_params = perf_cfg.effective_generation_config(config.model_params)
+    except (ValidationError, ValueError) as e:
         raise InputValidationError(
-            f"Dataset '{perf_cfg.name}': invalid model_params_override: {e}"
+            f"Dataset '{perf_cfg.name}': invalid generation_config_override: {e}"
         ) from e
     try:
         dataloader = DataLoaderFactory.create_loader(perf_cfg)
@@ -316,9 +317,7 @@ def _load_datasets(
         )
         logger.info(f"Loaded {dataloader.num_samples()} samples")
     except FileNotFoundError as e:
-        raise InputValidationError(
-            f"Dataset file not found: {performance_cfgs[0].path}"
-        ) from e
+        raise InputValidationError(f"Dataset file not found: {perf_cfg.path}") from e
     except Exception as e:
         raise SetupError(f"Failed to load dataset: {e}") from e
 
diff --git a/src/inference_endpoint/config/schema.py b/src/inference_endpoint/config/schema.py
index 3ecd3d5bc..6fc6d3afc 100644
--- a/src/inference_endpoint/config/schema.py
+++ b/src/inference_endpoint/config/schema.py
@@ -54,6 +54,23 @@ class SystemDefaults(BaseModel):
     DEFAULT_METRIC: ClassVar[metrics.Metric] = metrics.Throughput(0.0)
 
 
+def _deep_merge(base: dict[str, Any], override: dict[str, Any]) -> dict[str, Any]:
+    """Recursively merge ``override`` into ``base`` and return the result.
+
+    For overlapping keys whose values are both dicts, recurse; otherwise the
+    override value wins. Mutates a *copy* — callers can safely pass model_dump()
+    output. Used by ``Dataset.effective_generation_config`` so a sparse nested
+    override (e.g. ``{osl_distribution: {max: 512}}``) preserves siblings.
+    """
+    out = dict(base)
+    for k, v in override.items():
+        if isinstance(v, dict) and isinstance(out.get(k), dict):
+            out[k] = _deep_merge(out[k], v)
+        else:
+            out[k] = v
+    return out
+
+
 class LoadPatternType(str, Enum):
     """Load pattern types."""
 
@@ -313,7 +330,25 @@ class Dataset(BaseModel):
     multi_turn: MultiTurnConfig | None = Field(
         None, description="Multi-turn conversation configuration"
     )
-    model_params_override: dict[str, Any] | None = Field(
+    # TODO(post-mortem): generation config is per-phase (perf vs. accuracy),
+    # not per-dataset — phases are derived from datasets and the override is
+    # keyed to dataset identity. This lives on Dataset as a short-term WAR
+    # so MLPerf-style accuracy + perf can share one fleet. The proper fix is
+    # a first-class GenerationConfig carried on PhaseConfig, decoupled from
+    # the dataset entry. Field/method names use "generation_config" to keep
+    # the eventual migration mechanical.
+    #
+    # Caveats on per-dataset overrides today:
+    #   - `name` flows into the request `model` field but the tokenizer and
+    #     aggregator are launched from the global `model_params.name`, so a
+    #     per-dataset rename mismatches ISL/OSL accounting.
+    #   - `streaming` flows into the request but the single MetricsAggregator
+    #     is launched with the global `model_params.streaming` flag, so a
+    #     per-dataset streaming flip will not produce TTFT/TPOT for that
+    #     phase. Keep streaming on `model_params` (per-run) for now.
+    #   - Nested dicts (`osl_distribution`, `chat_template_kwargs`) are
+    #     deep-merged so sparse overrides preserve sibling defaults.
+    generation_config_override: dict[str, Any] | None = Field(
         None,
         description=(
             "Per-dataset overrides for the top-level model_params (sparse — "
@@ -321,7 +356,10 @@ class Dataset(BaseModel):
             "BenchmarkConfig.model_params at dataset-load time. Useful for "
             "MLPerf-style runs where accuracy and performance use different "
             "output budgets in the same fleet, e.g. "
-            "model_params_override: {max_new_tokens: 32768, streaming: 'on'}."
+            "generation_config_override: {max_new_tokens: 32768, "
+            "temperature: 0.0}. NOTE: per-dataset `streaming` and `name` are "
+            "accepted (kwargs-style) but not honored by the single-aggregator "
+            "metrics path — set those on top-level model_params."
         ),
     )
 
@@ -333,31 +371,37 @@ def _auto_derive_name(self) -> Self:
         return self
 
     @model_validator(mode="after")
-    def _validate_model_params_override(self) -> Self:
-        """Fail fast on unknown keys; we cannot validate values here because
-        merging requires the base model_params, which lives on BenchmarkConfig.
+    def _validate_generation_config_override(self) -> Self:
+        """Fail fast on unknown keys; values are validated at merge time
+        (see ``effective_generation_config``) because cross-field validation
+        needs the base ``ModelParams`` from ``BenchmarkConfig``.
         """
-        if self.model_params_override:
+        if self.generation_config_override:
             valid = set(ModelParams.model_fields)
-            bad = sorted(set(self.model_params_override) - valid)
+            bad = sorted(set(self.generation_config_override) - valid)
             if bad:
                 raise ValueError(
                     f"Dataset '{self.name}': unknown keys in "
-                    f"model_params_override: {bad}. "
+                    f"generation_config_override: {bad}. "
                     f"Valid keys: {sorted(valid)}"
                 )
         return self
 
-    def effective_model_params(self, base: ModelParams) -> ModelParams:
-        """Return base merged with this dataset's overrides.
-
-        Re-validates the merged dict through ``ModelParams.model_validate``
-        so that cross-field constraints (e.g. value ranges) catch bad
-        overrides before downstream code touches them.
+    def effective_generation_config(self, base: ModelParams) -> ModelParams:
+        """Return base merged with this dataset's generation-config overrides.
+
+        Nested dicts are deep-merged so a sparse nested override preserves
+        sibling defaults (e.g. ``{osl_distribution: {max: 512}}`` keeps the
+        base ``type/mean/std/min``). The merged dict is re-validated through
+        ``ModelParams.model_validate`` so type-invalid scalar overrides (e.g.
+        ``temperature: 'hot'``) are rejected. Note that this only catches
+        scalar invalidity — a sparse nested override whose merged result
+        passes default-validation will not raise (callers that need stricter
+        nested validation should set ``base`` to an explicit instance).
         """
-        if not self.model_params_override:
+        if not self.generation_config_override:
             return base
-        merged = {**base.model_dump(), **self.model_params_override}
+        merged = _deep_merge(base.model_dump(), self.generation_config_override)
         return ModelParams.model_validate(merged)
 
 
diff --git a/src/inference_endpoint/config/templates/concurrency_template_full.yaml b/src/inference_endpoint/config/templates/concurrency_template_full.yaml
index 8bec418d0..4308a860f 100644
--- a/src/inference_endpoint/config/templates/concurrency_template_full.yaml
+++ b/src/inference_endpoint/config/templates/concurrency_template_full.yaml
@@ -27,7 +27,7 @@ datasets:  # Dataset configs
     prompt: text_input
   accuracy_config: null  # Accuracy evaluation settings
   multi_turn: null  # Multi-turn conversation configuration
-  model_params_override: null  # Per-dataset overrides for the top-level model_params (sparse — only the fields you want to override). Merged on top of BenchmarkConfig.model_params at dataset-load time. Useful for MLPerf-style runs where accuracy and performance use different output budgets in the same fleet, e.g. model_params_override: {max_new_tokens: 32768, streaming: 'on'}.
+  generation_config_override: null  # Per-dataset overrides for the top-level model_params (sparse — only the fields you want to override). Merged on top of BenchmarkConfig.model_params at dataset-load time. Useful for MLPerf-style runs where accuracy and performance use different output budgets in the same fleet, e.g. generation_config_override: {max_new_tokens: 32768, temperature: 0.0}. NOTE: per-dataset `streaming` and `name` are accepted (kwargs-style) but not honored by the single-aggregator metrics path — set those on top-level model_params.
 - name: accuracy
   type: accuracy  # Dataset purpose: performance or accuracy | options: performance, accuracy
   path: '<DATASET_PATH eg: tests/assets/datasets/ds_samples.jsonl>'  # Dataset file path
@@ -43,7 +43,7 @@ datasets:  # Dataset configs
     extractor: boxed_math_extractor  # Answer extractor (abcd_extractor, boxed_math_extractor, identity_extractor, python_code_extractor)
     num_repeats: 1  # Repeat dataset N times for evaluation
   multi_turn: null  # Multi-turn conversation configuration
-  model_params_override: null  # Per-dataset overrides for the top-level model_params (sparse — only the fields you want to override). Merged on top of BenchmarkConfig.model_params at dataset-load time. Useful for MLPerf-style runs where accuracy and performance use different output budgets in the same fleet, e.g. model_params_override: {max_new_tokens: 32768, streaming: 'on'}.
+  generation_config_override: null  # Per-dataset overrides for the top-level model_params (sparse — only the fields you want to override). Merged on top of BenchmarkConfig.model_params at dataset-load time. Useful for MLPerf-style runs where accuracy and performance use different output budgets in the same fleet, e.g. generation_config_override: {max_new_tokens: 32768, temperature: 0.0}. NOTE: per-dataset `streaming` and `name` are accepted (kwargs-style) but not honored by the single-aggregator metrics path — set those on top-level model_params.
 settings:
   runtime:
     min_duration_ms: 600000  # Min duration (ms, or with suffix: 600s, 10m)
diff --git a/src/inference_endpoint/config/templates/offline_template_full.yaml b/src/inference_endpoint/config/templates/offline_template_full.yaml
index 1047e7867..ad307f7db 100644
--- a/src/inference_endpoint/config/templates/offline_template_full.yaml
+++ b/src/inference_endpoint/config/templates/offline_template_full.yaml
@@ -27,7 +27,7 @@ datasets:  # Dataset configs
     prompt: text_input
   accuracy_config: null  # Accuracy evaluation settings
   multi_turn: null  # Multi-turn conversation configuration
-  model_params_override: null  # Per-dataset overrides for the top-level model_params (sparse — only the fields you want to override). Merged on top of BenchmarkConfig.model_params at dataset-load time. Useful for MLPerf-style runs where accuracy and performance use different output budgets in the same fleet, e.g. model_params_override: {max_new_tokens: 32768, streaming: 'on'}.
+  generation_config_override: null  # Per-dataset overrides for the top-level model_params (sparse — only the fields you want to override). Merged on top of BenchmarkConfig.model_params at dataset-load time. Useful for MLPerf-style runs where accuracy and performance use different output budgets in the same fleet, e.g. generation_config_override: {max_new_tokens: 32768, temperature: 0.0}. NOTE: per-dataset `streaming` and `name` are accepted (kwargs-style) but not honored by the single-aggregator metrics path — set those on top-level model_params.
 - name: accuracy
   type: accuracy  # Dataset purpose: performance or accuracy | options: performance, accuracy
   path: '<DATASET_PATH eg: tests/assets/datasets/ds_samples.jsonl>'  # Dataset file path
@@ -43,7 +43,7 @@ datasets:  # Dataset configs
     extractor: boxed_math_extractor  # Answer extractor (abcd_extractor, boxed_math_extractor, identity_extractor, python_code_extractor)
     num_repeats: 1  # Repeat dataset N times for evaluation
   multi_turn: null  # Multi-turn conversation configuration
-  model_params_override: null  # Per-dataset overrides for the top-level model_params (sparse — only the fields you want to override). Merged on top of BenchmarkConfig.model_params at dataset-load time. Useful for MLPerf-style runs where accuracy and performance use different output budgets in the same fleet, e.g. model_params_override: {max_new_tokens: 32768, streaming: 'on'}.
+  generation_config_override: null  # Per-dataset overrides for the top-level model_params (sparse — only the fields you want to override). Merged on top of BenchmarkConfig.model_params at dataset-load time. Useful for MLPerf-style runs where accuracy and performance use different output budgets in the same fleet, e.g. generation_config_override: {max_new_tokens: 32768, temperature: 0.0}. NOTE: per-dataset `streaming` and `name` are accepted (kwargs-style) but not honored by the single-aggregator metrics path — set those on top-level model_params.
 settings:
   runtime:
     min_duration_ms: 600000  # Min duration (ms, or with suffix: 600s, 10m)
diff --git a/src/inference_endpoint/config/templates/online_template_full.yaml b/src/inference_endpoint/config/templates/online_template_full.yaml
index 702858644..2230eaa7a 100644
--- a/src/inference_endpoint/config/templates/online_template_full.yaml
+++ b/src/inference_endpoint/config/templates/online_template_full.yaml
@@ -27,7 +27,7 @@ datasets:  # Dataset configs
     prompt: text_input
   accuracy_config: null  # Accuracy evaluation settings
   multi_turn: null  # Multi-turn conversation configuration
-  model_params_override: null  # Per-dataset overrides for the top-level model_params (sparse — only the fields you want to override). Merged on top of BenchmarkConfig.model_params at dataset-load time. Useful for MLPerf-style runs where accuracy and performance use different output budgets in the same fleet, e.g. model_params_override: {max_new_tokens: 32768, streaming: 'on'}.
+  generation_config_override: null  # Per-dataset overrides for the top-level model_params (sparse — only the fields you want to override). Merged on top of BenchmarkConfig.model_params at dataset-load time. Useful for MLPerf-style runs where accuracy and performance use different output budgets in the same fleet, e.g. generation_config_override: {max_new_tokens: 32768, temperature: 0.0}. NOTE: per-dataset `streaming` and `name` are accepted (kwargs-style) but not honored by the single-aggregator metrics path — set those on top-level model_params.
 - name: accuracy
   type: accuracy  # Dataset purpose: performance or accuracy | options: performance, accuracy
   path: '<DATASET_PATH eg: tests/assets/datasets/ds_samples.jsonl>'  # Dataset file path
@@ -43,7 +43,7 @@ datasets:  # Dataset configs
     extractor: boxed_math_extractor  # Answer extractor (abcd_extractor, boxed_math_extractor, identity_extractor, python_code_extractor)
     num_repeats: 1  # Repeat dataset N times for evaluation
   multi_turn: null  # Multi-turn conversation configuration
-  model_params_override: null  # Per-dataset overrides for the top-level model_params (sparse — only the fields you want to override). Merged on top of BenchmarkConfig.model_params at dataset-load time. Useful for MLPerf-style runs where accuracy and performance use different output budgets in the same fleet, e.g. model_params_override: {max_new_tokens: 32768, streaming: 'on'}.
+  generation_config_override: null  # Per-dataset overrides for the top-level model_params (sparse — only the fields you want to override). Merged on top of BenchmarkConfig.model_params at dataset-load time. Useful for MLPerf-style runs where accuracy and performance use different output budgets in the same fleet, e.g. generation_config_override: {max_new_tokens: 32768, temperature: 0.0}. NOTE: per-dataset `streaming` and `name` are accepted (kwargs-style) but not honored by the single-aggregator metrics path — set those on top-level model_params.
 settings:
   runtime:
     min_duration_ms: 600000  # Min duration (ms, or with suffix: 600s, 10m)
diff --git a/src/inference_endpoint/openai/completions_adapter.py b/src/inference_endpoint/openai/completions_adapter.py
index 548edac85..22b374136 100644
--- a/src/inference_endpoint/openai/completions_adapter.py
+++ b/src/inference_endpoint/openai/completions_adapter.py
@@ -62,7 +62,6 @@ def dataset_transforms(cls, model_params: ModelParams) -> list[Transform]:
             "repetition_penalty": model_params.repetition_penalty,
             "presence_penalty": model_params.presence_penalty,
             "frequency_penalty": model_params.frequency_penalty,
-            "skip_special_tokens": False,
         }
         return [
             Harmonize(),
diff --git a/tests/unit/commands/test_benchmark.py b/tests/unit/commands/test_benchmark.py
index 5be94d667..83c4fd9d5 100644
--- a/tests/unit/commands/test_benchmark.py
+++ b/tests/unit/commands/test_benchmark.py
@@ -1281,14 +1281,29 @@ def test_no_override_yields_none_when_model_has_no_tokenizer(
         assert ctx.tokenizer_name is None
 
 
-class TestLoadDatasetsModelParamsOverride:
-    """End-to-end check that _load_datasets honors per-dataset
-    model_params_override and propagates the overridden value down to the
-    static columns the adapter adds to each row."""
+class _OverrideTestBase:
+    """Shared helpers for the two end-to-end ``_load_datasets`` override classes
+    below (parametrized over the chat vs text-completions adapter)."""
+
+    # Subclasses set these:
+    api_type: str = ""
+    max_tokens_key: str = ""  # static column name AddStaticColumns adds
 
     def _write_jsonl(self, path: Path, rows: list[dict]) -> None:
         path.write_text("\n".join(json.dumps(r) for r in rows) + "\n")
 
+    def _prompt_rows(self, prompt: str, ground_truth: str | None = None) -> list[dict]:
+        """Adapter-shaped row. Chat adapter wants a 'prompt' column; the
+        completions adapter wants pre-tokenized 'input_tokens' (so the
+        Harmonize transform early-exits and we avoid the HF tokenizer
+        dependency in unit tests)."""
+        row: dict = {"prompt": prompt}
+        if self.api_type == "openai_completions":
+            row = {"input_tokens": [1, 2, 3, 4]}
+        if ground_truth is not None:
+            row["ground_truth"] = ground_truth
+        return [row]
+
     def _build_config(
         self,
         perf_path: Path,
@@ -1301,7 +1316,7 @@ def _build_config(
             model_params={"name": "test-model", "max_new_tokens": 1024},
             endpoint_config={
                 "endpoints": ["http://localhost:8000"],
-                "api_type": "openai",
+                "api_type": self.api_type,
             },
             datasets=[
                 {
@@ -1309,7 +1324,7 @@ def _build_config(
                     "type": "performance",
                     "path": str(perf_path),
                     **(
-                        {"model_params_override": perf_override}
+                        {"generation_config_override": perf_override}
                         if perf_override
                         else {}
                     ),
@@ -1323,53 +1338,48 @@ def _build_config(
                         "ground_truth": "ground_truth",
                         "extractor": "boxed_math_extractor",
                     },
-                    **({"model_params_override": acc_override} if acc_override else {}),
+                    **(
+                        {"generation_config_override": acc_override}
+                        if acc_override
+                        else {}
+                    ),
                 },
             ],
         )
 
-    @pytest.mark.unit
-    def test_override_propagates_to_loaded_rows(self, tmp_path):
-        """Override on accuracy dataset → its rows get max_completion_tokens=32768;
-        unmodified perf dataset keeps the global 1024."""
+    def _write_fixture(self, tmp_path: Path) -> tuple[Path, Path]:
         perf_path = tmp_path / "perf.jsonl"
         acc_path = tmp_path / "acc.jsonl"
-        self._write_jsonl(perf_path, [{"prompt": "perf-prompt"}])
-        self._write_jsonl(acc_path, [{"prompt": "acc-prompt", "ground_truth": "42"}])
+        self._write_jsonl(perf_path, self._prompt_rows("perf-prompt"))
+        self._write_jsonl(acc_path, self._prompt_rows("acc-prompt", ground_truth="42"))
+        return perf_path, acc_path
 
+    @pytest.mark.unit
+    def test_override_propagates_to_loaded_rows(self, tmp_path):
+        """Override on accuracy dataset → its rows get the overridden value;
+        unmodified perf dataset keeps the global 1024."""
+        perf_path, acc_path = self._write_fixture(tmp_path)
         config = self._build_config(
             perf_path, acc_path, acc_override={"max_new_tokens": 32768}
         )
         perf_ds, acc_datasets, _ = _load_datasets(config, tmp_path)
-
-        # openai (chat completions) adapter emits the key `max_completion_tokens`
-        # via AddStaticColumns. Each loaded row should carry its dataset's value.
-        assert perf_ds.load_sample(0)["max_completion_tokens"] == 1024
-        assert acc_datasets[0].load_sample(0)["max_completion_tokens"] == 32768
+        assert perf_ds.load_sample(0)[self.max_tokens_key] == 1024
+        assert acc_datasets[0].load_sample(0)[self.max_tokens_key] == 32768
 
     @pytest.mark.unit
     def test_no_override_inherits_global(self, tmp_path):
         """Without overrides, both datasets use the global model_params."""
-        perf_path = tmp_path / "perf.jsonl"
-        acc_path = tmp_path / "acc.jsonl"
-        self._write_jsonl(perf_path, [{"prompt": "perf-prompt"}])
-        self._write_jsonl(acc_path, [{"prompt": "acc-prompt", "ground_truth": "42"}])
-
+        perf_path, acc_path = self._write_fixture(tmp_path)
         config = self._build_config(perf_path, acc_path, acc_override=None)
         perf_ds, acc_datasets, _ = _load_datasets(config, tmp_path)
-
-        assert perf_ds.load_sample(0)["max_completion_tokens"] == 1024
-        assert acc_datasets[0].load_sample(0)["max_completion_tokens"] == 1024
+        assert perf_ds.load_sample(0)[self.max_tokens_key] == 1024
+        assert acc_datasets[0].load_sample(0)[self.max_tokens_key] == 1024
 
     @pytest.mark.unit
     def test_perf_dataset_override_also_honored(self, tmp_path):
         """Symmetric check: overrides on the performance entry also flow
         through (relevant for MLPerf-style perf with shorter max_new_tokens)."""
-        perf_path = tmp_path / "perf.jsonl"
-        acc_path = tmp_path / "acc.jsonl"
-        self._write_jsonl(perf_path, [{"prompt": "perf-prompt"}])
-        self._write_jsonl(acc_path, [{"prompt": "acc-prompt", "ground_truth": "42"}])
-
+        perf_path, acc_path = self._write_fixture(tmp_path)
         config = self._build_config(
             perf_path,
             acc_path,
@@ -1377,22 +1387,41 @@ def test_perf_dataset_override_also_honored(self, tmp_path):
             perf_override={"max_new_tokens": 10240},
         )
         perf_ds, acc_datasets, _ = _load_datasets(config, tmp_path)
-
-        assert perf_ds.load_sample(0)["max_completion_tokens"] == 10240
-        assert acc_datasets[0].load_sample(0)["max_completion_tokens"] == 32768
+        assert perf_ds.load_sample(0)[self.max_tokens_key] == 10240
+        assert acc_datasets[0].load_sample(0)[self.max_tokens_key] == 32768
 
     @pytest.mark.unit
     def test_invalid_override_value_raises_input_validation_error(self, tmp_path):
         """A value-level invalidity (e.g. bad streaming enum) is caught at
         load time and surfaces as InputValidationError, not a generic
         SetupError, so the user sees a clear actionable message."""
-        perf_path = tmp_path / "perf.jsonl"
-        acc_path = tmp_path / "acc.jsonl"
-        self._write_jsonl(perf_path, [{"prompt": "perf-prompt"}])
-        self._write_jsonl(acc_path, [{"prompt": "acc-prompt", "ground_truth": "42"}])
-
+        perf_path, acc_path = self._write_fixture(tmp_path)
         config = self._build_config(
             perf_path, acc_path, acc_override={"streaming": "garbage"}
         )
-        with pytest.raises(InputValidationError, match="invalid model_params_override"):
+        with pytest.raises(
+            InputValidationError, match="invalid generation_config_override"
+        ):
             _load_datasets(config, tmp_path)
+
+
+class TestLoadDatasetsGenerationConfigOverrideChat(_OverrideTestBase):
+    """End-to-end ``_load_datasets`` check against the OpenAI **chat**
+    completions adapter, which emits ``max_completion_tokens``."""
+
+    api_type = "openai"
+    max_tokens_key = "max_completion_tokens"
+
+
+class TestLoadDatasetsGenerationConfigOverrideCompletions(_OverrideTestBase):
+    """End-to-end ``_load_datasets`` check against the OpenAI **text**
+    completions adapter (``/v1/completions``), which emits ``max_tokens``.
+
+    This is the headline target of PR #344 — MLPerf-style runs use
+    ``api_type: openai_completions`` for pre-tokenized inputs — so an
+    integration test on this code path is essential. Rows carry pre-baked
+    ``input_tokens`` so the adapter's ``Harmonize()`` transform early-exits
+    and the test stays free of HF tokenizer downloads."""
+
+    api_type = "openai_completions"
+    max_tokens_key = "max_tokens"
diff --git a/tests/unit/config/test_schema.py b/tests/unit/config/test_schema.py
index d19dce713..0c99565c1 100644
--- a/tests/unit/config/test_schema.py
+++ b/tests/unit/config/test_schema.py
@@ -124,44 +124,44 @@ def test_auto_derive_name(self):
         assert ds.name == "my_data"
 
     @pytest.mark.unit
-    def test_model_params_override_accepts_known_keys(self):
+    def test_generation_config_override_accepts_known_keys(self):
         ds = Dataset(
             name="acc",
             type=DatasetType.ACCURACY,
             path="acc.jsonl",
-            model_params_override={"max_new_tokens": 32768, "streaming": "on"},
+            generation_config_override={"max_new_tokens": 32768, "temperature": 0.0},
         )
-        assert ds.model_params_override == {
+        assert ds.generation_config_override == {
             "max_new_tokens": 32768,
-            "streaming": "on",
+            "temperature": 0.0,
         }
 
     @pytest.mark.unit
-    def test_model_params_override_rejects_unknown_key(self):
+    def test_generation_config_override_rejects_unknown_key(self):
         with pytest.raises(
-            ValueError, match=r"unknown keys in model_params_override.*bogus"
+            ValueError, match=r"unknown keys in generation_config_override.*bogus"
         ):
             Dataset(
                 name="acc",
                 path="a.jsonl",
-                model_params_override={"bogus": 1},
+                generation_config_override={"bogus": 1},
             )
 
     @pytest.mark.unit
-    def test_model_params_override_none_is_noop(self):
+    def test_generation_config_override_none_is_noop(self):
         base = ModelParams(name="m", max_new_tokens=1024, streaming=StreamingMode.ON)
         ds = Dataset(name="x", path="x.jsonl")
-        assert ds.effective_model_params(base) is base
+        assert ds.effective_generation_config(base) is base
 
     @pytest.mark.unit
-    def test_effective_model_params_merges_sparse_dict(self):
+    def test_effective_generation_config_merges_sparse_dict(self):
         base = ModelParams(name="m", temperature=0.5, top_p=0.9, max_new_tokens=1024)
         ds = Dataset(
             name="x",
             path="x.jsonl",
-            model_params_override={"max_new_tokens": 32768},
+            generation_config_override={"max_new_tokens": 32768},
         )
-        merged = ds.effective_model_params(base)
+        merged = ds.effective_generation_config(base)
         # overridden field changes...
         assert merged.max_new_tokens == 32768
         # ...everything else is preserved from base
@@ -170,17 +170,65 @@ def test_effective_model_params_merges_sparse_dict(self):
         assert merged.top_p == 0.9
 
     @pytest.mark.unit
-    def test_effective_model_params_validates_value(self):
+    def test_effective_generation_config_validates_value(self):
         """ModelParams.model_validate is invoked on the merged dict, so a
         type-invalid override is rejected (e.g. wrong type for streaming)."""
         base = ModelParams(name="m")
         ds = Dataset(
             name="x",
             path="x.jsonl",
-            model_params_override={"streaming": "garbage"},
+            generation_config_override={"streaming": "garbage"},
         )
         with pytest.raises(ValueError):
-            ds.effective_model_params(base)
+            ds.effective_generation_config(base)
+
+    @pytest.mark.unit
+    def test_effective_generation_config_deep_merges_nested_dict(self):
+        """Sparse overrides of nested fields (osl_distribution,
+        chat_template_kwargs) preserve sibling defaults from the base rather
+        than wholesale-replacing the nested object. Pins the deep-merge
+        behavior added in response to PR review feedback.
+        """
+        base = ModelParams(
+            name="m",
+            osl_distribution=OSLDistribution(
+                type=OSLDistributionType.NORMAL, mean=1000, std=200, min=512, max=2048
+            ),
+        )
+        ds = Dataset(
+            name="x",
+            path="x.jsonl",
+            generation_config_override={"osl_distribution": {"max": 512}},
+        )
+        merged = ds.effective_generation_config(base)
+        # the explicitly overridden nested field changes...
+        assert merged.osl_distribution.max == 512
+        # ...and the unspecified siblings are preserved from base
+        assert merged.osl_distribution.type == OSLDistributionType.NORMAL
+        assert merged.osl_distribution.mean == 1000
+        assert merged.osl_distribution.std == 200
+        assert merged.osl_distribution.min == 512
+
+    @pytest.mark.unit
+    def test_effective_generation_config_deep_merges_chat_template_kwargs(self):
+        """Deep-merge also applies to free-form nested dicts like
+        chat_template_kwargs; sparse overrides preserve sibling entries.
+        """
+        base = ModelParams(
+            name="m", chat_template_kwargs={"enable_thinking": True, "tools": []}
+        )
+        ds = Dataset(
+            name="x",
+            path="x.jsonl",
+            generation_config_override={
+                "chat_template_kwargs": {"enable_thinking": False}
+            },
+        )
+        merged = ds.effective_generation_config(base)
+        assert merged.chat_template_kwargs == {
+            "enable_thinking": False,
+            "tools": [],
+        }
 
 
 class TestBenchmarkConfig: