From fbfccd9a6e2fee9206d839f232b57cb11678ea01 Mon Sep 17 00:00:00 2001 From: Rashid Kaleem <230885705+arekay-nv@users.noreply.github.com> Date: Sat, 6 Jun 2026 14:35:51 -0700 Subject: [PATCH 1/2] Enable per dataset max-osl Signed-off-by: Rashid Kaleem <230885705+arekay-nv@users.noreply.github.com> --- .../commands/benchmark/execute.py | 21 +++- src/inference_endpoint/config/schema.py | 39 ++++++ .../templates/concurrency_template_full.yaml | 2 + .../templates/offline_template_full.yaml | 2 + .../templates/online_template_full.yaml | 2 + .../openai/completions_adapter.py | 1 + tests/unit/commands/test_benchmark.py | 119 ++++++++++++++++++ tests/unit/config/test_schema.py | 59 +++++++++ 8 files changed, 240 insertions(+), 5 deletions(-) diff --git a/src/inference_endpoint/commands/benchmark/execute.py b/src/inference_endpoint/commands/benchmark/execute.py index e3c5505b9..2eeda9ec1 100644 --- a/src/inference_endpoint/commands/benchmark/execute.py +++ b/src/inference_endpoint/commands/benchmark/execute.py @@ -288,9 +288,13 @@ def _load_datasets( acc_cfg.accuracy_config.extras or {}, ) ) - ds.load( - api_type=config.endpoint_config.api_type, model_params=config.model_params - ) + try: + ds_model_params = acc_cfg.effective_model_params(config.model_params) + except Exception as e: + raise InputValidationError( + f"Dataset '{acc_cfg.name}': invalid model_params_override: {e}" + ) from e + ds.load(api_type=config.endpoint_config.api_type, model_params=ds_model_params) logger.info(f"Loaded {ds} - {ds.num_samples()} samples") if not accuracy_cfgs: @@ -298,10 +302,17 @@ def _load_datasets( if len(performance_cfgs) > 1: raise InputValidationError("Multiple performance datasets not supported") + perf_cfg = performance_cfgs[0] + try: + perf_model_params = perf_cfg.effective_model_params(config.model_params) + except Exception as e: + raise InputValidationError( + f"Dataset '{perf_cfg.name}': invalid model_params_override: {e}" + ) from e try: - dataloader = DataLoaderFactory.create_loader(performance_cfgs[0]) + dataloader = DataLoaderFactory.create_loader(perf_cfg) dataloader.load( - api_type=config.endpoint_config.api_type, model_params=config.model_params + api_type=config.endpoint_config.api_type, model_params=perf_model_params ) logger.info(f"Loaded {dataloader.num_samples()} samples") except FileNotFoundError as e: diff --git a/src/inference_endpoint/config/schema.py b/src/inference_endpoint/config/schema.py index 2cfa35d73..3ecd3d5bc 100644 --- a/src/inference_endpoint/config/schema.py +++ b/src/inference_endpoint/config/schema.py @@ -313,6 +313,17 @@ class Dataset(BaseModel): multi_turn: MultiTurnConfig | None = Field( None, description="Multi-turn conversation configuration" ) + model_params_override: dict[str, Any] | None = Field( + None, + description=( + "Per-dataset overrides for the top-level model_params (sparse — " + "only the fields you want to override). Merged on top of " + "BenchmarkConfig.model_params at dataset-load time. Useful for " + "MLPerf-style runs where accuracy and performance use different " + "output budgets in the same fleet, e.g. " + "model_params_override: {max_new_tokens: 32768, streaming: 'on'}." + ), + ) @model_validator(mode="after") def _auto_derive_name(self) -> Self: @@ -321,6 +332,34 @@ def _auto_derive_name(self) -> Self: object.__setattr__(self, "name", Path(self.path).stem) return self + @model_validator(mode="after") + def _validate_model_params_override(self) -> Self: + """Fail fast on unknown keys; we cannot validate values here because + merging requires the base model_params, which lives on BenchmarkConfig. + """ + if self.model_params_override: + valid = set(ModelParams.model_fields) + bad = sorted(set(self.model_params_override) - valid) + if bad: + raise ValueError( + f"Dataset '{self.name}': unknown keys in " + f"model_params_override: {bad}. " + f"Valid keys: {sorted(valid)}" + ) + return self + + def effective_model_params(self, base: ModelParams) -> ModelParams: + """Return base merged with this dataset's overrides. + + Re-validates the merged dict through ``ModelParams.model_validate`` + so that cross-field constraints (e.g. value ranges) catch bad + overrides before downstream code touches them. + """ + if not self.model_params_override: + return base + merged = {**base.model_dump(), **self.model_params_override} + return ModelParams.model_validate(merged) + class AccuracyConfig(BaseModel): """Accuracy configuration. diff --git a/src/inference_endpoint/config/templates/concurrency_template_full.yaml b/src/inference_endpoint/config/templates/concurrency_template_full.yaml index 4fef4afcb..8bec418d0 100644 --- a/src/inference_endpoint/config/templates/concurrency_template_full.yaml +++ b/src/inference_endpoint/config/templates/concurrency_template_full.yaml @@ -27,6 +27,7 @@ datasets: # Dataset configs prompt: text_input accuracy_config: null # Accuracy evaluation settings multi_turn: null # Multi-turn conversation configuration + model_params_override: null # Per-dataset overrides for the top-level model_params (sparse — only the fields you want to override). Merged on top of BenchmarkConfig.model_params at dataset-load time. Useful for MLPerf-style runs where accuracy and performance use different output budgets in the same fleet, e.g. model_params_override: {max_new_tokens: 32768, streaming: 'on'}. - name: accuracy type: accuracy # Dataset purpose: performance or accuracy | options: performance, accuracy path: '' # Dataset file path @@ -42,6 +43,7 @@ datasets: # Dataset configs extractor: boxed_math_extractor # Answer extractor (abcd_extractor, boxed_math_extractor, identity_extractor, python_code_extractor) num_repeats: 1 # Repeat dataset N times for evaluation multi_turn: null # Multi-turn conversation configuration + model_params_override: null # Per-dataset overrides for the top-level model_params (sparse — only the fields you want to override). Merged on top of BenchmarkConfig.model_params at dataset-load time. Useful for MLPerf-style runs where accuracy and performance use different output budgets in the same fleet, e.g. model_params_override: {max_new_tokens: 32768, streaming: 'on'}. settings: runtime: min_duration_ms: 600000 # Min duration (ms, or with suffix: 600s, 10m) diff --git a/src/inference_endpoint/config/templates/offline_template_full.yaml b/src/inference_endpoint/config/templates/offline_template_full.yaml index 1f61837fe..1047e7867 100644 --- a/src/inference_endpoint/config/templates/offline_template_full.yaml +++ b/src/inference_endpoint/config/templates/offline_template_full.yaml @@ -27,6 +27,7 @@ datasets: # Dataset configs prompt: text_input accuracy_config: null # Accuracy evaluation settings multi_turn: null # Multi-turn conversation configuration + model_params_override: null # Per-dataset overrides for the top-level model_params (sparse — only the fields you want to override). Merged on top of BenchmarkConfig.model_params at dataset-load time. Useful for MLPerf-style runs where accuracy and performance use different output budgets in the same fleet, e.g. model_params_override: {max_new_tokens: 32768, streaming: 'on'}. - name: accuracy type: accuracy # Dataset purpose: performance or accuracy | options: performance, accuracy path: '' # Dataset file path @@ -42,6 +43,7 @@ datasets: # Dataset configs extractor: boxed_math_extractor # Answer extractor (abcd_extractor, boxed_math_extractor, identity_extractor, python_code_extractor) num_repeats: 1 # Repeat dataset N times for evaluation multi_turn: null # Multi-turn conversation configuration + model_params_override: null # Per-dataset overrides for the top-level model_params (sparse — only the fields you want to override). Merged on top of BenchmarkConfig.model_params at dataset-load time. Useful for MLPerf-style runs where accuracy and performance use different output budgets in the same fleet, e.g. model_params_override: {max_new_tokens: 32768, streaming: 'on'}. settings: runtime: min_duration_ms: 600000 # Min duration (ms, or with suffix: 600s, 10m) diff --git a/src/inference_endpoint/config/templates/online_template_full.yaml b/src/inference_endpoint/config/templates/online_template_full.yaml index a212fa95b..702858644 100644 --- a/src/inference_endpoint/config/templates/online_template_full.yaml +++ b/src/inference_endpoint/config/templates/online_template_full.yaml @@ -27,6 +27,7 @@ datasets: # Dataset configs prompt: text_input accuracy_config: null # Accuracy evaluation settings multi_turn: null # Multi-turn conversation configuration + model_params_override: null # Per-dataset overrides for the top-level model_params (sparse — only the fields you want to override). Merged on top of BenchmarkConfig.model_params at dataset-load time. Useful for MLPerf-style runs where accuracy and performance use different output budgets in the same fleet, e.g. model_params_override: {max_new_tokens: 32768, streaming: 'on'}. - name: accuracy type: accuracy # Dataset purpose: performance or accuracy | options: performance, accuracy path: '' # Dataset file path @@ -42,6 +43,7 @@ datasets: # Dataset configs extractor: boxed_math_extractor # Answer extractor (abcd_extractor, boxed_math_extractor, identity_extractor, python_code_extractor) num_repeats: 1 # Repeat dataset N times for evaluation multi_turn: null # Multi-turn conversation configuration + model_params_override: null # Per-dataset overrides for the top-level model_params (sparse — only the fields you want to override). Merged on top of BenchmarkConfig.model_params at dataset-load time. Useful for MLPerf-style runs where accuracy and performance use different output budgets in the same fleet, e.g. model_params_override: {max_new_tokens: 32768, streaming: 'on'}. settings: runtime: min_duration_ms: 600000 # Min duration (ms, or with suffix: 600s, 10m) diff --git a/src/inference_endpoint/openai/completions_adapter.py b/src/inference_endpoint/openai/completions_adapter.py index 22b374136..548edac85 100644 --- a/src/inference_endpoint/openai/completions_adapter.py +++ b/src/inference_endpoint/openai/completions_adapter.py @@ -62,6 +62,7 @@ def dataset_transforms(cls, model_params: ModelParams) -> list[Transform]: "repetition_penalty": model_params.repetition_penalty, "presence_penalty": model_params.presence_penalty, "frequency_penalty": model_params.frequency_penalty, + "skip_special_tokens": False, } return [ Harmonize(), diff --git a/tests/unit/commands/test_benchmark.py b/tests/unit/commands/test_benchmark.py index 1c90554fb..5be94d667 100644 --- a/tests/unit/commands/test_benchmark.py +++ b/tests/unit/commands/test_benchmark.py @@ -16,6 +16,7 @@ """Tests for benchmark CLI models, config building, and command handlers.""" import asyncio +import json import random import tempfile from pathlib import Path @@ -34,6 +35,7 @@ BenchmarkContext, ResponseCollector, _build_phases, + _load_datasets, _run_benchmark_async, setup_benchmark, ) @@ -1277,3 +1279,120 @@ def test_no_override_yields_none_when_model_has_no_tokenizer( ctx = setup_benchmark(config, TestMode.PERF) assert ctx.tokenizer_name is None + + +class TestLoadDatasetsModelParamsOverride: + """End-to-end check that _load_datasets honors per-dataset + model_params_override and propagates the overridden value down to the + static columns the adapter adds to each row.""" + + def _write_jsonl(self, path: Path, rows: list[dict]) -> None: + path.write_text("\n".join(json.dumps(r) for r in rows) + "\n") + + def _build_config( + self, + perf_path: Path, + acc_path: Path, + acc_override: dict | None, + perf_override: dict | None = None, + ) -> BenchmarkConfig: + return BenchmarkConfig( + type=TestType.OFFLINE, + model_params={"name": "test-model", "max_new_tokens": 1024}, + endpoint_config={ + "endpoints": ["http://localhost:8000"], + "api_type": "openai", + }, + datasets=[ + { + "name": "perf", + "type": "performance", + "path": str(perf_path), + **( + {"model_params_override": perf_override} + if perf_override + else {} + ), + }, + { + "name": "acc", + "type": "accuracy", + "path": str(acc_path), + "accuracy_config": { + "eval_method": "pass_at_1", + "ground_truth": "ground_truth", + "extractor": "boxed_math_extractor", + }, + **({"model_params_override": acc_override} if acc_override else {}), + }, + ], + ) + + @pytest.mark.unit + def test_override_propagates_to_loaded_rows(self, tmp_path): + """Override on accuracy dataset → its rows get max_completion_tokens=32768; + unmodified perf dataset keeps the global 1024.""" + perf_path = tmp_path / "perf.jsonl" + acc_path = tmp_path / "acc.jsonl" + self._write_jsonl(perf_path, [{"prompt": "perf-prompt"}]) + self._write_jsonl(acc_path, [{"prompt": "acc-prompt", "ground_truth": "42"}]) + + config = self._build_config( + perf_path, acc_path, acc_override={"max_new_tokens": 32768} + ) + perf_ds, acc_datasets, _ = _load_datasets(config, tmp_path) + + # openai (chat completions) adapter emits the key `max_completion_tokens` + # via AddStaticColumns. Each loaded row should carry its dataset's value. + assert perf_ds.load_sample(0)["max_completion_tokens"] == 1024 + assert acc_datasets[0].load_sample(0)["max_completion_tokens"] == 32768 + + @pytest.mark.unit + def test_no_override_inherits_global(self, tmp_path): + """Without overrides, both datasets use the global model_params.""" + perf_path = tmp_path / "perf.jsonl" + acc_path = tmp_path / "acc.jsonl" + self._write_jsonl(perf_path, [{"prompt": "perf-prompt"}]) + self._write_jsonl(acc_path, [{"prompt": "acc-prompt", "ground_truth": "42"}]) + + config = self._build_config(perf_path, acc_path, acc_override=None) + perf_ds, acc_datasets, _ = _load_datasets(config, tmp_path) + + assert perf_ds.load_sample(0)["max_completion_tokens"] == 1024 + assert acc_datasets[0].load_sample(0)["max_completion_tokens"] == 1024 + + @pytest.mark.unit + def test_perf_dataset_override_also_honored(self, tmp_path): + """Symmetric check: overrides on the performance entry also flow + through (relevant for MLPerf-style perf with shorter max_new_tokens).""" + perf_path = tmp_path / "perf.jsonl" + acc_path = tmp_path / "acc.jsonl" + self._write_jsonl(perf_path, [{"prompt": "perf-prompt"}]) + self._write_jsonl(acc_path, [{"prompt": "acc-prompt", "ground_truth": "42"}]) + + config = self._build_config( + perf_path, + acc_path, + acc_override={"max_new_tokens": 32768}, + perf_override={"max_new_tokens": 10240}, + ) + perf_ds, acc_datasets, _ = _load_datasets(config, tmp_path) + + assert perf_ds.load_sample(0)["max_completion_tokens"] == 10240 + assert acc_datasets[0].load_sample(0)["max_completion_tokens"] == 32768 + + @pytest.mark.unit + def test_invalid_override_value_raises_input_validation_error(self, tmp_path): + """A value-level invalidity (e.g. bad streaming enum) is caught at + load time and surfaces as InputValidationError, not a generic + SetupError, so the user sees a clear actionable message.""" + perf_path = tmp_path / "perf.jsonl" + acc_path = tmp_path / "acc.jsonl" + self._write_jsonl(perf_path, [{"prompt": "perf-prompt"}]) + self._write_jsonl(acc_path, [{"prompt": "acc-prompt", "ground_truth": "42"}]) + + config = self._build_config( + perf_path, acc_path, acc_override={"streaming": "garbage"} + ) + with pytest.raises(InputValidationError, match="invalid model_params_override"): + _load_datasets(config, tmp_path) diff --git a/tests/unit/config/test_schema.py b/tests/unit/config/test_schema.py index a60770121..d19dce713 100644 --- a/tests/unit/config/test_schema.py +++ b/tests/unit/config/test_schema.py @@ -123,6 +123,65 @@ def test_auto_derive_name(self): ds = Dataset(path="datasets/my_data.jsonl") assert ds.name == "my_data" + @pytest.mark.unit + def test_model_params_override_accepts_known_keys(self): + ds = Dataset( + name="acc", + type=DatasetType.ACCURACY, + path="acc.jsonl", + model_params_override={"max_new_tokens": 32768, "streaming": "on"}, + ) + assert ds.model_params_override == { + "max_new_tokens": 32768, + "streaming": "on", + } + + @pytest.mark.unit + def test_model_params_override_rejects_unknown_key(self): + with pytest.raises( + ValueError, match=r"unknown keys in model_params_override.*bogus" + ): + Dataset( + name="acc", + path="a.jsonl", + model_params_override={"bogus": 1}, + ) + + @pytest.mark.unit + def test_model_params_override_none_is_noop(self): + base = ModelParams(name="m", max_new_tokens=1024, streaming=StreamingMode.ON) + ds = Dataset(name="x", path="x.jsonl") + assert ds.effective_model_params(base) is base + + @pytest.mark.unit + def test_effective_model_params_merges_sparse_dict(self): + base = ModelParams(name="m", temperature=0.5, top_p=0.9, max_new_tokens=1024) + ds = Dataset( + name="x", + path="x.jsonl", + model_params_override={"max_new_tokens": 32768}, + ) + merged = ds.effective_model_params(base) + # overridden field changes... + assert merged.max_new_tokens == 32768 + # ...everything else is preserved from base + assert merged.name == "m" + assert merged.temperature == 0.5 + assert merged.top_p == 0.9 + + @pytest.mark.unit + def test_effective_model_params_validates_value(self): + """ModelParams.model_validate is invoked on the merged dict, so a + type-invalid override is rejected (e.g. wrong type for streaming).""" + base = ModelParams(name="m") + ds = Dataset( + name="x", + path="x.jsonl", + model_params_override={"streaming": "garbage"}, + ) + with pytest.raises(ValueError): + ds.effective_model_params(base) + class TestBenchmarkConfig: @pytest.mark.unit From 553a6d2a2d16f9c9c1488d2a4f7a1937e279d797 Mon Sep 17 00:00:00 2001 From: Rashid Kaleem <230885705+arekay-nv@users.noreply.github.com> Date: Sat, 6 Jun 2026 23:47:24 -0700 Subject: [PATCH 2/2] Address comments Signed-off-by: Rashid Kaleem <230885705+arekay-nv@users.noreply.github.com> --- .../commands/benchmark/execute.py | 17 ++- src/inference_endpoint/config/schema.py | 76 +++++++++--- .../templates/concurrency_template_full.yaml | 4 +- .../templates/offline_template_full.yaml | 4 +- .../templates/online_template_full.yaml | 4 +- .../openai/completions_adapter.py | 1 - tests/unit/commands/test_benchmark.py | 109 +++++++++++------- tests/unit/config/test_schema.py | 78 ++++++++++--- 8 files changed, 206 insertions(+), 87 deletions(-) diff --git a/src/inference_endpoint/commands/benchmark/execute.py b/src/inference_endpoint/commands/benchmark/execute.py index 2eeda9ec1..8b9ee0499 100644 --- a/src/inference_endpoint/commands/benchmark/execute.py +++ b/src/inference_endpoint/commands/benchmark/execute.py @@ -42,6 +42,7 @@ import msgspec import msgspec.json from huggingface_hub import model_info +from pydantic import ValidationError from tqdm import tqdm from transformers import AutoTokenizer from transformers.utils import logging as transformers_logging @@ -289,10 +290,10 @@ def _load_datasets( ) ) try: - ds_model_params = acc_cfg.effective_model_params(config.model_params) - except Exception as e: + ds_model_params = acc_cfg.effective_generation_config(config.model_params) + except (ValidationError, ValueError) as e: raise InputValidationError( - f"Dataset '{acc_cfg.name}': invalid model_params_override: {e}" + f"Dataset '{acc_cfg.name}': invalid generation_config_override: {e}" ) from e ds.load(api_type=config.endpoint_config.api_type, model_params=ds_model_params) logger.info(f"Loaded {ds} - {ds.num_samples()} samples") @@ -304,10 +305,10 @@ def _load_datasets( perf_cfg = performance_cfgs[0] try: - perf_model_params = perf_cfg.effective_model_params(config.model_params) - except Exception as e: + perf_model_params = perf_cfg.effective_generation_config(config.model_params) + except (ValidationError, ValueError) as e: raise InputValidationError( - f"Dataset '{perf_cfg.name}': invalid model_params_override: {e}" + f"Dataset '{perf_cfg.name}': invalid generation_config_override: {e}" ) from e try: dataloader = DataLoaderFactory.create_loader(perf_cfg) @@ -316,9 +317,7 @@ def _load_datasets( ) logger.info(f"Loaded {dataloader.num_samples()} samples") except FileNotFoundError as e: - raise InputValidationError( - f"Dataset file not found: {performance_cfgs[0].path}" - ) from e + raise InputValidationError(f"Dataset file not found: {perf_cfg.path}") from e except Exception as e: raise SetupError(f"Failed to load dataset: {e}") from e diff --git a/src/inference_endpoint/config/schema.py b/src/inference_endpoint/config/schema.py index 3ecd3d5bc..6fc6d3afc 100644 --- a/src/inference_endpoint/config/schema.py +++ b/src/inference_endpoint/config/schema.py @@ -54,6 +54,23 @@ class SystemDefaults(BaseModel): DEFAULT_METRIC: ClassVar[metrics.Metric] = metrics.Throughput(0.0) +def _deep_merge(base: dict[str, Any], override: dict[str, Any]) -> dict[str, Any]: + """Recursively merge ``override`` into ``base`` and return the result. + + For overlapping keys whose values are both dicts, recurse; otherwise the + override value wins. Mutates a *copy* — callers can safely pass model_dump() + output. Used by ``Dataset.effective_generation_config`` so a sparse nested + override (e.g. ``{osl_distribution: {max: 512}}``) preserves siblings. + """ + out = dict(base) + for k, v in override.items(): + if isinstance(v, dict) and isinstance(out.get(k), dict): + out[k] = _deep_merge(out[k], v) + else: + out[k] = v + return out + + class LoadPatternType(str, Enum): """Load pattern types.""" @@ -313,7 +330,25 @@ class Dataset(BaseModel): multi_turn: MultiTurnConfig | None = Field( None, description="Multi-turn conversation configuration" ) - model_params_override: dict[str, Any] | None = Field( + # TODO(post-mortem): generation config is per-phase (perf vs. accuracy), + # not per-dataset — phases are derived from datasets and the override is + # keyed to dataset identity. This lives on Dataset as a short-term WAR + # so MLPerf-style accuracy + perf can share one fleet. The proper fix is + # a first-class GenerationConfig carried on PhaseConfig, decoupled from + # the dataset entry. Field/method names use "generation_config" to keep + # the eventual migration mechanical. + # + # Caveats on per-dataset overrides today: + # - `name` flows into the request `model` field but the tokenizer and + # aggregator are launched from the global `model_params.name`, so a + # per-dataset rename mismatches ISL/OSL accounting. + # - `streaming` flows into the request but the single MetricsAggregator + # is launched with the global `model_params.streaming` flag, so a + # per-dataset streaming flip will not produce TTFT/TPOT for that + # phase. Keep streaming on `model_params` (per-run) for now. + # - Nested dicts (`osl_distribution`, `chat_template_kwargs`) are + # deep-merged so sparse overrides preserve sibling defaults. + generation_config_override: dict[str, Any] | None = Field( None, description=( "Per-dataset overrides for the top-level model_params (sparse — " @@ -321,7 +356,10 @@ class Dataset(BaseModel): "BenchmarkConfig.model_params at dataset-load time. Useful for " "MLPerf-style runs where accuracy and performance use different " "output budgets in the same fleet, e.g. " - "model_params_override: {max_new_tokens: 32768, streaming: 'on'}." + "generation_config_override: {max_new_tokens: 32768, " + "temperature: 0.0}. NOTE: per-dataset `streaming` and `name` are " + "accepted (kwargs-style) but not honored by the single-aggregator " + "metrics path — set those on top-level model_params." ), ) @@ -333,31 +371,37 @@ def _auto_derive_name(self) -> Self: return self @model_validator(mode="after") - def _validate_model_params_override(self) -> Self: - """Fail fast on unknown keys; we cannot validate values here because - merging requires the base model_params, which lives on BenchmarkConfig. + def _validate_generation_config_override(self) -> Self: + """Fail fast on unknown keys; values are validated at merge time + (see ``effective_generation_config``) because cross-field validation + needs the base ``ModelParams`` from ``BenchmarkConfig``. """ - if self.model_params_override: + if self.generation_config_override: valid = set(ModelParams.model_fields) - bad = sorted(set(self.model_params_override) - valid) + bad = sorted(set(self.generation_config_override) - valid) if bad: raise ValueError( f"Dataset '{self.name}': unknown keys in " - f"model_params_override: {bad}. " + f"generation_config_override: {bad}. " f"Valid keys: {sorted(valid)}" ) return self - def effective_model_params(self, base: ModelParams) -> ModelParams: - """Return base merged with this dataset's overrides. - - Re-validates the merged dict through ``ModelParams.model_validate`` - so that cross-field constraints (e.g. value ranges) catch bad - overrides before downstream code touches them. + def effective_generation_config(self, base: ModelParams) -> ModelParams: + """Return base merged with this dataset's generation-config overrides. + + Nested dicts are deep-merged so a sparse nested override preserves + sibling defaults (e.g. ``{osl_distribution: {max: 512}}`` keeps the + base ``type/mean/std/min``). The merged dict is re-validated through + ``ModelParams.model_validate`` so type-invalid scalar overrides (e.g. + ``temperature: 'hot'``) are rejected. Note that this only catches + scalar invalidity — a sparse nested override whose merged result + passes default-validation will not raise (callers that need stricter + nested validation should set ``base`` to an explicit instance). """ - if not self.model_params_override: + if not self.generation_config_override: return base - merged = {**base.model_dump(), **self.model_params_override} + merged = _deep_merge(base.model_dump(), self.generation_config_override) return ModelParams.model_validate(merged) diff --git a/src/inference_endpoint/config/templates/concurrency_template_full.yaml b/src/inference_endpoint/config/templates/concurrency_template_full.yaml index 8bec418d0..4308a860f 100644 --- a/src/inference_endpoint/config/templates/concurrency_template_full.yaml +++ b/src/inference_endpoint/config/templates/concurrency_template_full.yaml @@ -27,7 +27,7 @@ datasets: # Dataset configs prompt: text_input accuracy_config: null # Accuracy evaluation settings multi_turn: null # Multi-turn conversation configuration - model_params_override: null # Per-dataset overrides for the top-level model_params (sparse — only the fields you want to override). Merged on top of BenchmarkConfig.model_params at dataset-load time. Useful for MLPerf-style runs where accuracy and performance use different output budgets in the same fleet, e.g. model_params_override: {max_new_tokens: 32768, streaming: 'on'}. + generation_config_override: null # Per-dataset overrides for the top-level model_params (sparse — only the fields you want to override). Merged on top of BenchmarkConfig.model_params at dataset-load time. Useful for MLPerf-style runs where accuracy and performance use different output budgets in the same fleet, e.g. generation_config_override: {max_new_tokens: 32768, temperature: 0.0}. NOTE: per-dataset `streaming` and `name` are accepted (kwargs-style) but not honored by the single-aggregator metrics path — set those on top-level model_params. - name: accuracy type: accuracy # Dataset purpose: performance or accuracy | options: performance, accuracy path: '' # Dataset file path @@ -43,7 +43,7 @@ datasets: # Dataset configs extractor: boxed_math_extractor # Answer extractor (abcd_extractor, boxed_math_extractor, identity_extractor, python_code_extractor) num_repeats: 1 # Repeat dataset N times for evaluation multi_turn: null # Multi-turn conversation configuration - model_params_override: null # Per-dataset overrides for the top-level model_params (sparse — only the fields you want to override). Merged on top of BenchmarkConfig.model_params at dataset-load time. Useful for MLPerf-style runs where accuracy and performance use different output budgets in the same fleet, e.g. model_params_override: {max_new_tokens: 32768, streaming: 'on'}. + generation_config_override: null # Per-dataset overrides for the top-level model_params (sparse — only the fields you want to override). Merged on top of BenchmarkConfig.model_params at dataset-load time. Useful for MLPerf-style runs where accuracy and performance use different output budgets in the same fleet, e.g. generation_config_override: {max_new_tokens: 32768, temperature: 0.0}. NOTE: per-dataset `streaming` and `name` are accepted (kwargs-style) but not honored by the single-aggregator metrics path — set those on top-level model_params. settings: runtime: min_duration_ms: 600000 # Min duration (ms, or with suffix: 600s, 10m) diff --git a/src/inference_endpoint/config/templates/offline_template_full.yaml b/src/inference_endpoint/config/templates/offline_template_full.yaml index 1047e7867..ad307f7db 100644 --- a/src/inference_endpoint/config/templates/offline_template_full.yaml +++ b/src/inference_endpoint/config/templates/offline_template_full.yaml @@ -27,7 +27,7 @@ datasets: # Dataset configs prompt: text_input accuracy_config: null # Accuracy evaluation settings multi_turn: null # Multi-turn conversation configuration - model_params_override: null # Per-dataset overrides for the top-level model_params (sparse — only the fields you want to override). Merged on top of BenchmarkConfig.model_params at dataset-load time. Useful for MLPerf-style runs where accuracy and performance use different output budgets in the same fleet, e.g. model_params_override: {max_new_tokens: 32768, streaming: 'on'}. + generation_config_override: null # Per-dataset overrides for the top-level model_params (sparse — only the fields you want to override). Merged on top of BenchmarkConfig.model_params at dataset-load time. Useful for MLPerf-style runs where accuracy and performance use different output budgets in the same fleet, e.g. generation_config_override: {max_new_tokens: 32768, temperature: 0.0}. NOTE: per-dataset `streaming` and `name` are accepted (kwargs-style) but not honored by the single-aggregator metrics path — set those on top-level model_params. - name: accuracy type: accuracy # Dataset purpose: performance or accuracy | options: performance, accuracy path: '' # Dataset file path @@ -43,7 +43,7 @@ datasets: # Dataset configs extractor: boxed_math_extractor # Answer extractor (abcd_extractor, boxed_math_extractor, identity_extractor, python_code_extractor) num_repeats: 1 # Repeat dataset N times for evaluation multi_turn: null # Multi-turn conversation configuration - model_params_override: null # Per-dataset overrides for the top-level model_params (sparse — only the fields you want to override). Merged on top of BenchmarkConfig.model_params at dataset-load time. Useful for MLPerf-style runs where accuracy and performance use different output budgets in the same fleet, e.g. model_params_override: {max_new_tokens: 32768, streaming: 'on'}. + generation_config_override: null # Per-dataset overrides for the top-level model_params (sparse — only the fields you want to override). Merged on top of BenchmarkConfig.model_params at dataset-load time. Useful for MLPerf-style runs where accuracy and performance use different output budgets in the same fleet, e.g. generation_config_override: {max_new_tokens: 32768, temperature: 0.0}. NOTE: per-dataset `streaming` and `name` are accepted (kwargs-style) but not honored by the single-aggregator metrics path — set those on top-level model_params. settings: runtime: min_duration_ms: 600000 # Min duration (ms, or with suffix: 600s, 10m) diff --git a/src/inference_endpoint/config/templates/online_template_full.yaml b/src/inference_endpoint/config/templates/online_template_full.yaml index 702858644..2230eaa7a 100644 --- a/src/inference_endpoint/config/templates/online_template_full.yaml +++ b/src/inference_endpoint/config/templates/online_template_full.yaml @@ -27,7 +27,7 @@ datasets: # Dataset configs prompt: text_input accuracy_config: null # Accuracy evaluation settings multi_turn: null # Multi-turn conversation configuration - model_params_override: null # Per-dataset overrides for the top-level model_params (sparse — only the fields you want to override). Merged on top of BenchmarkConfig.model_params at dataset-load time. Useful for MLPerf-style runs where accuracy and performance use different output budgets in the same fleet, e.g. model_params_override: {max_new_tokens: 32768, streaming: 'on'}. + generation_config_override: null # Per-dataset overrides for the top-level model_params (sparse — only the fields you want to override). Merged on top of BenchmarkConfig.model_params at dataset-load time. Useful for MLPerf-style runs where accuracy and performance use different output budgets in the same fleet, e.g. generation_config_override: {max_new_tokens: 32768, temperature: 0.0}. NOTE: per-dataset `streaming` and `name` are accepted (kwargs-style) but not honored by the single-aggregator metrics path — set those on top-level model_params. - name: accuracy type: accuracy # Dataset purpose: performance or accuracy | options: performance, accuracy path: '' # Dataset file path @@ -43,7 +43,7 @@ datasets: # Dataset configs extractor: boxed_math_extractor # Answer extractor (abcd_extractor, boxed_math_extractor, identity_extractor, python_code_extractor) num_repeats: 1 # Repeat dataset N times for evaluation multi_turn: null # Multi-turn conversation configuration - model_params_override: null # Per-dataset overrides for the top-level model_params (sparse — only the fields you want to override). Merged on top of BenchmarkConfig.model_params at dataset-load time. Useful for MLPerf-style runs where accuracy and performance use different output budgets in the same fleet, e.g. model_params_override: {max_new_tokens: 32768, streaming: 'on'}. + generation_config_override: null # Per-dataset overrides for the top-level model_params (sparse — only the fields you want to override). Merged on top of BenchmarkConfig.model_params at dataset-load time. Useful for MLPerf-style runs where accuracy and performance use different output budgets in the same fleet, e.g. generation_config_override: {max_new_tokens: 32768, temperature: 0.0}. NOTE: per-dataset `streaming` and `name` are accepted (kwargs-style) but not honored by the single-aggregator metrics path — set those on top-level model_params. settings: runtime: min_duration_ms: 600000 # Min duration (ms, or with suffix: 600s, 10m) diff --git a/src/inference_endpoint/openai/completions_adapter.py b/src/inference_endpoint/openai/completions_adapter.py index 548edac85..22b374136 100644 --- a/src/inference_endpoint/openai/completions_adapter.py +++ b/src/inference_endpoint/openai/completions_adapter.py @@ -62,7 +62,6 @@ def dataset_transforms(cls, model_params: ModelParams) -> list[Transform]: "repetition_penalty": model_params.repetition_penalty, "presence_penalty": model_params.presence_penalty, "frequency_penalty": model_params.frequency_penalty, - "skip_special_tokens": False, } return [ Harmonize(), diff --git a/tests/unit/commands/test_benchmark.py b/tests/unit/commands/test_benchmark.py index 5be94d667..83c4fd9d5 100644 --- a/tests/unit/commands/test_benchmark.py +++ b/tests/unit/commands/test_benchmark.py @@ -1281,14 +1281,29 @@ def test_no_override_yields_none_when_model_has_no_tokenizer( assert ctx.tokenizer_name is None -class TestLoadDatasetsModelParamsOverride: - """End-to-end check that _load_datasets honors per-dataset - model_params_override and propagates the overridden value down to the - static columns the adapter adds to each row.""" +class _OverrideTestBase: + """Shared helpers for the two end-to-end ``_load_datasets`` override classes + below (parametrized over the chat vs text-completions adapter).""" + + # Subclasses set these: + api_type: str = "" + max_tokens_key: str = "" # static column name AddStaticColumns adds def _write_jsonl(self, path: Path, rows: list[dict]) -> None: path.write_text("\n".join(json.dumps(r) for r in rows) + "\n") + def _prompt_rows(self, prompt: str, ground_truth: str | None = None) -> list[dict]: + """Adapter-shaped row. Chat adapter wants a 'prompt' column; the + completions adapter wants pre-tokenized 'input_tokens' (so the + Harmonize transform early-exits and we avoid the HF tokenizer + dependency in unit tests).""" + row: dict = {"prompt": prompt} + if self.api_type == "openai_completions": + row = {"input_tokens": [1, 2, 3, 4]} + if ground_truth is not None: + row["ground_truth"] = ground_truth + return [row] + def _build_config( self, perf_path: Path, @@ -1301,7 +1316,7 @@ def _build_config( model_params={"name": "test-model", "max_new_tokens": 1024}, endpoint_config={ "endpoints": ["http://localhost:8000"], - "api_type": "openai", + "api_type": self.api_type, }, datasets=[ { @@ -1309,7 +1324,7 @@ def _build_config( "type": "performance", "path": str(perf_path), **( - {"model_params_override": perf_override} + {"generation_config_override": perf_override} if perf_override else {} ), @@ -1323,53 +1338,48 @@ def _build_config( "ground_truth": "ground_truth", "extractor": "boxed_math_extractor", }, - **({"model_params_override": acc_override} if acc_override else {}), + **( + {"generation_config_override": acc_override} + if acc_override + else {} + ), }, ], ) - @pytest.mark.unit - def test_override_propagates_to_loaded_rows(self, tmp_path): - """Override on accuracy dataset → its rows get max_completion_tokens=32768; - unmodified perf dataset keeps the global 1024.""" + def _write_fixture(self, tmp_path: Path) -> tuple[Path, Path]: perf_path = tmp_path / "perf.jsonl" acc_path = tmp_path / "acc.jsonl" - self._write_jsonl(perf_path, [{"prompt": "perf-prompt"}]) - self._write_jsonl(acc_path, [{"prompt": "acc-prompt", "ground_truth": "42"}]) + self._write_jsonl(perf_path, self._prompt_rows("perf-prompt")) + self._write_jsonl(acc_path, self._prompt_rows("acc-prompt", ground_truth="42")) + return perf_path, acc_path + @pytest.mark.unit + def test_override_propagates_to_loaded_rows(self, tmp_path): + """Override on accuracy dataset → its rows get the overridden value; + unmodified perf dataset keeps the global 1024.""" + perf_path, acc_path = self._write_fixture(tmp_path) config = self._build_config( perf_path, acc_path, acc_override={"max_new_tokens": 32768} ) perf_ds, acc_datasets, _ = _load_datasets(config, tmp_path) - - # openai (chat completions) adapter emits the key `max_completion_tokens` - # via AddStaticColumns. Each loaded row should carry its dataset's value. - assert perf_ds.load_sample(0)["max_completion_tokens"] == 1024 - assert acc_datasets[0].load_sample(0)["max_completion_tokens"] == 32768 + assert perf_ds.load_sample(0)[self.max_tokens_key] == 1024 + assert acc_datasets[0].load_sample(0)[self.max_tokens_key] == 32768 @pytest.mark.unit def test_no_override_inherits_global(self, tmp_path): """Without overrides, both datasets use the global model_params.""" - perf_path = tmp_path / "perf.jsonl" - acc_path = tmp_path / "acc.jsonl" - self._write_jsonl(perf_path, [{"prompt": "perf-prompt"}]) - self._write_jsonl(acc_path, [{"prompt": "acc-prompt", "ground_truth": "42"}]) - + perf_path, acc_path = self._write_fixture(tmp_path) config = self._build_config(perf_path, acc_path, acc_override=None) perf_ds, acc_datasets, _ = _load_datasets(config, tmp_path) - - assert perf_ds.load_sample(0)["max_completion_tokens"] == 1024 - assert acc_datasets[0].load_sample(0)["max_completion_tokens"] == 1024 + assert perf_ds.load_sample(0)[self.max_tokens_key] == 1024 + assert acc_datasets[0].load_sample(0)[self.max_tokens_key] == 1024 @pytest.mark.unit def test_perf_dataset_override_also_honored(self, tmp_path): """Symmetric check: overrides on the performance entry also flow through (relevant for MLPerf-style perf with shorter max_new_tokens).""" - perf_path = tmp_path / "perf.jsonl" - acc_path = tmp_path / "acc.jsonl" - self._write_jsonl(perf_path, [{"prompt": "perf-prompt"}]) - self._write_jsonl(acc_path, [{"prompt": "acc-prompt", "ground_truth": "42"}]) - + perf_path, acc_path = self._write_fixture(tmp_path) config = self._build_config( perf_path, acc_path, @@ -1377,22 +1387,41 @@ def test_perf_dataset_override_also_honored(self, tmp_path): perf_override={"max_new_tokens": 10240}, ) perf_ds, acc_datasets, _ = _load_datasets(config, tmp_path) - - assert perf_ds.load_sample(0)["max_completion_tokens"] == 10240 - assert acc_datasets[0].load_sample(0)["max_completion_tokens"] == 32768 + assert perf_ds.load_sample(0)[self.max_tokens_key] == 10240 + assert acc_datasets[0].load_sample(0)[self.max_tokens_key] == 32768 @pytest.mark.unit def test_invalid_override_value_raises_input_validation_error(self, tmp_path): """A value-level invalidity (e.g. bad streaming enum) is caught at load time and surfaces as InputValidationError, not a generic SetupError, so the user sees a clear actionable message.""" - perf_path = tmp_path / "perf.jsonl" - acc_path = tmp_path / "acc.jsonl" - self._write_jsonl(perf_path, [{"prompt": "perf-prompt"}]) - self._write_jsonl(acc_path, [{"prompt": "acc-prompt", "ground_truth": "42"}]) - + perf_path, acc_path = self._write_fixture(tmp_path) config = self._build_config( perf_path, acc_path, acc_override={"streaming": "garbage"} ) - with pytest.raises(InputValidationError, match="invalid model_params_override"): + with pytest.raises( + InputValidationError, match="invalid generation_config_override" + ): _load_datasets(config, tmp_path) + + +class TestLoadDatasetsGenerationConfigOverrideChat(_OverrideTestBase): + """End-to-end ``_load_datasets`` check against the OpenAI **chat** + completions adapter, which emits ``max_completion_tokens``.""" + + api_type = "openai" + max_tokens_key = "max_completion_tokens" + + +class TestLoadDatasetsGenerationConfigOverrideCompletions(_OverrideTestBase): + """End-to-end ``_load_datasets`` check against the OpenAI **text** + completions adapter (``/v1/completions``), which emits ``max_tokens``. + + This is the headline target of PR #344 — MLPerf-style runs use + ``api_type: openai_completions`` for pre-tokenized inputs — so an + integration test on this code path is essential. Rows carry pre-baked + ``input_tokens`` so the adapter's ``Harmonize()`` transform early-exits + and the test stays free of HF tokenizer downloads.""" + + api_type = "openai_completions" + max_tokens_key = "max_tokens" diff --git a/tests/unit/config/test_schema.py b/tests/unit/config/test_schema.py index d19dce713..0c99565c1 100644 --- a/tests/unit/config/test_schema.py +++ b/tests/unit/config/test_schema.py @@ -124,44 +124,44 @@ def test_auto_derive_name(self): assert ds.name == "my_data" @pytest.mark.unit - def test_model_params_override_accepts_known_keys(self): + def test_generation_config_override_accepts_known_keys(self): ds = Dataset( name="acc", type=DatasetType.ACCURACY, path="acc.jsonl", - model_params_override={"max_new_tokens": 32768, "streaming": "on"}, + generation_config_override={"max_new_tokens": 32768, "temperature": 0.0}, ) - assert ds.model_params_override == { + assert ds.generation_config_override == { "max_new_tokens": 32768, - "streaming": "on", + "temperature": 0.0, } @pytest.mark.unit - def test_model_params_override_rejects_unknown_key(self): + def test_generation_config_override_rejects_unknown_key(self): with pytest.raises( - ValueError, match=r"unknown keys in model_params_override.*bogus" + ValueError, match=r"unknown keys in generation_config_override.*bogus" ): Dataset( name="acc", path="a.jsonl", - model_params_override={"bogus": 1}, + generation_config_override={"bogus": 1}, ) @pytest.mark.unit - def test_model_params_override_none_is_noop(self): + def test_generation_config_override_none_is_noop(self): base = ModelParams(name="m", max_new_tokens=1024, streaming=StreamingMode.ON) ds = Dataset(name="x", path="x.jsonl") - assert ds.effective_model_params(base) is base + assert ds.effective_generation_config(base) is base @pytest.mark.unit - def test_effective_model_params_merges_sparse_dict(self): + def test_effective_generation_config_merges_sparse_dict(self): base = ModelParams(name="m", temperature=0.5, top_p=0.9, max_new_tokens=1024) ds = Dataset( name="x", path="x.jsonl", - model_params_override={"max_new_tokens": 32768}, + generation_config_override={"max_new_tokens": 32768}, ) - merged = ds.effective_model_params(base) + merged = ds.effective_generation_config(base) # overridden field changes... assert merged.max_new_tokens == 32768 # ...everything else is preserved from base @@ -170,17 +170,65 @@ def test_effective_model_params_merges_sparse_dict(self): assert merged.top_p == 0.9 @pytest.mark.unit - def test_effective_model_params_validates_value(self): + def test_effective_generation_config_validates_value(self): """ModelParams.model_validate is invoked on the merged dict, so a type-invalid override is rejected (e.g. wrong type for streaming).""" base = ModelParams(name="m") ds = Dataset( name="x", path="x.jsonl", - model_params_override={"streaming": "garbage"}, + generation_config_override={"streaming": "garbage"}, ) with pytest.raises(ValueError): - ds.effective_model_params(base) + ds.effective_generation_config(base) + + @pytest.mark.unit + def test_effective_generation_config_deep_merges_nested_dict(self): + """Sparse overrides of nested fields (osl_distribution, + chat_template_kwargs) preserve sibling defaults from the base rather + than wholesale-replacing the nested object. Pins the deep-merge + behavior added in response to PR review feedback. + """ + base = ModelParams( + name="m", + osl_distribution=OSLDistribution( + type=OSLDistributionType.NORMAL, mean=1000, std=200, min=512, max=2048 + ), + ) + ds = Dataset( + name="x", + path="x.jsonl", + generation_config_override={"osl_distribution": {"max": 512}}, + ) + merged = ds.effective_generation_config(base) + # the explicitly overridden nested field changes... + assert merged.osl_distribution.max == 512 + # ...and the unspecified siblings are preserved from base + assert merged.osl_distribution.type == OSLDistributionType.NORMAL + assert merged.osl_distribution.mean == 1000 + assert merged.osl_distribution.std == 200 + assert merged.osl_distribution.min == 512 + + @pytest.mark.unit + def test_effective_generation_config_deep_merges_chat_template_kwargs(self): + """Deep-merge also applies to free-form nested dicts like + chat_template_kwargs; sparse overrides preserve sibling entries. + """ + base = ModelParams( + name="m", chat_template_kwargs={"enable_thinking": True, "tools": []} + ) + ds = Dataset( + name="x", + path="x.jsonl", + generation_config_override={ + "chat_template_kwargs": {"enable_thinking": False} + }, + ) + merged = ds.effective_generation_config(base) + assert merged.chat_template_kwargs == { + "enable_thinking": False, + "tools": [], + } class TestBenchmarkConfig: