apple · Lewis300 · Jul 2, 2026 · Jul 2, 2026 · Jul 2, 2026
diff --git a/python/src/coreai_models/export/_constants.py b/python/src/coreai_models/export/_constants.py
@@ -17,3 +17,7 @@
 # Trace-time `input_ids` length and `position_ids` offset for export/quantization
 QUANT_TRACE_QUERY_LEN = 16
 QUANT_TRACE_OFFSET = 8
+
+# Default max context length for iOS exports. Users can raise it via
+# --max-context-length (up to the model's max_position_embeddings).
+IOS_DEFAULT_MAX_CONTEXT_LENGTH = 4096
diff --git a/python/src/coreai_models/export/pipeline.py b/python/src/coreai_models/export/pipeline.py
@@ -25,6 +25,7 @@
 from transformers import AutoConfig, AutoTokenizer
 
 from coreai_models.export._constants import (
+    IOS_DEFAULT_MAX_CONTEXT_LENGTH,
     QUANT_TRACE_OFFSET,
     QUANT_TRACE_QUERY_LEN,
     TRACE_KV_CACHE_SEQ_LEN,
@@ -151,7 +152,26 @@ async def _async_export_model(config: ExportConfig) -> str:
     # ---- 2. Load model ----
     target_dtype = _resolve_precision(config.compute_precision)
 
+    # The model's native context window from its HuggingFace config. Any
+    # user-provided override must not exceed it.
+    native_max_ctx = getattr(hf_config, "max_position_embeddings", None)
+
     max_context_length = config.max_context_length
+    if max_context_length is None and config.variant == "iOS":
+        max_context_length = min(
+            IOS_DEFAULT_MAX_CONTEXT_LENGTH, native_max_ctx or IOS_DEFAULT_MAX_CONTEXT_LENGTH
+        )
+
+    if (
+        max_context_length is not None
+        and native_max_ctx is not None
+        and max_context_length > native_max_ctx
+    ):
+        raise ValueError(
+            f"--max-context-length ({max_context_length}) exceeds the model's "
+            f"max_position_embeddings ({native_max_ctx}). Choose a value <= {native_max_ctx}."
+        )
+
     if max_context_length is not None:
         hf_config.max_position_embeddings = max_context_length
     if config.num_layers is not None:

diff --git a/python/src/coreai_models/model_registry.py b/python/src/coreai_models/model_registry.py
@@ -27,6 +27,8 @@
 from dataclasses import asdict, dataclass
 from pathlib import Path
 
+from coreai_models.export._constants import IOS_DEFAULT_MAX_CONTEXT_LENGTH
+
 # ---------------------------------------------------------------------------
 # Data model
 # ---------------------------------------------------------------------------
@@ -138,7 +140,7 @@ class UtilityModel:
         "iOS",
         "none",
         "float16",
-        4096,
+        IOS_DEFAULT_MAX_CONTEXT_LENGTH,
         compression_config="models/qwen3/qwen3_0_6b_mixed_4bit_8bit.yaml",
     ),
     ModelPreset(
@@ -149,7 +151,7 @@ class UtilityModel:
         "iOS",
         "4bit_weight_palettized_group8",
         "float16",
-        4096,
+        IOS_DEFAULT_MAX_CONTEXT_LENGTH,
     ),
     ModelPreset(
         "qwen3-4b",
@@ -159,7 +161,7 @@ class UtilityModel:
         "iOS",
         "none",
         "float16",
-        4096,
+        IOS_DEFAULT_MAX_CONTEXT_LENGTH,
         compression_config="models/qwen3/qwen3_4b_mixed_4bit_8bit.yaml",
     ),
 ]

diff --git a/python/tests/test_model_conversion/test_infra.py b/python/tests/test_model_conversion/test_infra.py
@@ -0,0 +1,131 @@
+# Copyright 2026 Apple Inc.
+#
+# Use of this source code is governed by a BSD-3-clause license that can
+# be found in the LICENSE file or at https://opensource.org/licenses/BSD-3-Clause
+
+"""Export-pipeline infrastructure tests"""
+
+import asyncio
+from pathlib import Path
+
+import pytest
+from coreai.authoring import AIModelAsset
+from transformers.models.qwen3.modeling_qwen3 import (
+    Qwen3Config,
+)
+from transformers.models.qwen3.modeling_qwen3 import (
+    Qwen3ForCausalLM as HFQwen3ForCausalLM,
+)
+
+from coreai_models.export import pipeline as export_pipeline
+from coreai_models.export._constants import IOS_DEFAULT_MAX_CONTEXT_LENGTH
+from coreai_models.export.ios import KEY_CACHE_INPUT_NAME, VALUE_CACHE_INPUT_NAME
+from coreai_models.export.pipeline import ExportConfig, _async_export_model
+
+
+def _tiny_qwen3_config(max_position_embeddings: int) -> Qwen3Config:
+    """A small, randomly-initialized Qwen3 config for fast pipeline exports."""
+    return Qwen3Config(
+        vocab_size=256,
+        hidden_size=64,
+        intermediate_size=128,
+        num_hidden_layers=2,
+        num_attention_heads=4,
+        num_key_value_heads=2,
+        head_dim=16,
+        max_position_embeddings=max_position_embeddings,
+        tie_word_embeddings=True,
+    )
+
+
+def _save_tiny_qwen3(config: Qwen3Config, dest: Path) -> str:
+    """Instantiate a Qwen3 model from ``config`` and save it so the pipeline can
+    load it via ``from_pretrained``. Returns the model directory path."""
+    model = HFQwen3ForCausalLM(config)
+    model.save_pretrained(str(dest))
+    return str(dest)
+
+
+def _cache_context_length(descriptor_type: str) -> int:
+    """Extract the context-length (last) dimension from a state descriptor's
+    type string, e.g. ``"NDArray (Float16, 2 × 1 × 32 × 1 × 2048)"`` -> 2048."""
+    inside = descriptor_type[descriptor_type.index("(") + 1 : descriptor_type.rindex(")")]
+    shape_part = inside.split(",", 1)[1]
+    dims = [int(dim.strip()) for dim in shape_part.split("×")]
+    return dims[-1]
+
+
+class TestIOSPipelineMaxContextLength:
+    @staticmethod
+    def test_rejects_context_length_above_hf_config(tmp_path: Path) -> None:
+        """The pipeline must reject a --max-context-length larger than the
+        model's ``max_position_embeddings`` from its HuggingFace config."""
+        native_max = 4096
+        config = _tiny_qwen3_config(max_position_embeddings=native_max)
+        model_dir = _save_tiny_qwen3(config, tmp_path / "model")
+
+        export_config = ExportConfig(
+            hf_model_id=model_dir,
+            variant="iOS",
+            max_context_length=native_max + 1,
+            compute_precision="float16",
+            compression="none",
+            output_dir=str(tmp_path / "out"),
+            overwrite=True,
+        )
+
+        with pytest.raises(ValueError, match="max_position_embeddings"):
+            asyncio.run(_async_export_model(export_config))
+
+    @staticmethod
+    def test_defaults_ios_context_length_to_4096(
+        tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+    ) -> None:
+        """With no explicit --max-context-length, iOS exports default to 4096.
+
+        Loads the dumped program and checks every function's KV-cache states:
+        the context-length dim must match between key/value caches, and the
+        maximum context length across all cache-bearing functions must be 4096.
+        """
+        # Native context window well above the iOS default so we know the cap
+        # (4096) — not the model's own limit — is what bounds the export.
+        config = _tiny_qwen3_config(max_position_embeddings=8192)
+        model_dir = _save_tiny_qwen3(config, tmp_path / "model")
+
+        # The tokenizer/metadata bundling step is unrelated to what we assert
+        # here and would require tokenizer files for this synthetic model.
+        monkeypatch.setattr(export_pipeline, "bundle_llm_asset", lambda **kwargs: None)
+
+        export_config = ExportConfig(
+            hf_model_id=model_dir,
+            variant="iOS",
+            max_context_length=None,
+            compute_precision="float16",
+            compression="none",
+            output_dir=str(tmp_path / "out"),
+            overwrite=True,
+        )
+
+        bundle_path = asyncio.run(_async_export_model(export_config))
+
+        aimodel_path = next(Path(bundle_path).glob("*.aimodel"))
+        summary = AIModelAsset.load(aimodel_path).summary(include_statistics=False)
+
+        max_context_length = 0
+        functions_with_cache = 0
+        for function_name in summary.function_names:
+            states = dict(summary.function_states(function_name))
+            if KEY_CACHE_INPUT_NAME not in states or VALUE_CACHE_INPUT_NAME not in states:
+                continue
+            functions_with_cache += 1
+
+            key_ctx = _cache_context_length(states[KEY_CACHE_INPUT_NAME])
+            value_ctx = _cache_context_length(states[VALUE_CACHE_INPUT_NAME])
+            assert key_ctx == value_ctx, (
+                f"{function_name}: key/value cache context-length dims differ "
+                f"({key_ctx} vs {value_ctx})"
+            )
+            max_context_length = max(max_context_length, key_ctx)
+
+        assert functions_with_cache > 0, "expected at least one function with KV-cache states"
+        assert max_context_length == IOS_DEFAULT_MAX_CONTEXT_LENGTH == 4096