Move cache types and optimizer state helpers into local engine

Kion · claude · Kion · commit 166a16dbfca8 · 2026-03-06T16:20:40.000-08:00
Relocate LoraCacheEntry, LoraAdapterConfig, DistillStepResult, and the
cpu/gpu_optimizer_state helpers from the shared training module into
claas/training/engine/local/cache.py since they are only used by the
local engine's CPU caching path. The Modal worker never uses caching.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/claas/training/cache.py b/claas/training/cache.py
diff --git a/claas/training/distillation.py b/claas/training/distillation.py
@@ -2,7 +2,6 @@
 
 from __future__ import annotations
 
-import copy
 import json
 import logging
 import os
@@ -13,10 +12,12 @@
 import torch
 
 from claas.core.types import DistillBatchRequestPayload, DistillResponse, SDPOLossInput
-from claas.training.cache import (
+from claas.training.engine.local.cache import (
     DistillStepResult,
     LoraAdapterConfig,
     LoraCacheEntry,
+    cpu_optimizer_state,
+    gpu_optimizer_state,
 )
 from claas.training.sdpo_loss import compute_sdpo_loss
 from claas.training.storage import (
@@ -52,50 +53,6 @@ class PreparedSample(TypedDict):
     behavior_logprobs: torch.Tensor
 
 
-def _cpu_optimizer_state(state_dict: dict[str, object]) -> dict[str, object]:
-    """Deep-copy optimizer state with all tensors moved to CPU."""
-    result: dict[str, object] = {}
-    for key, value in state_dict.items():
-        if key == "state":
-            param_states = cast("dict[int, dict[str, object]]", value)
-            cpu_states: dict[int, dict[str, object]] = {}
-            for param_id, param_state in param_states.items():
-                cpu_param: dict[str, object] = {}
-                for k, v in param_state.items():
-                    if isinstance(v, torch.Tensor):
-                        cpu_param[k] = v.detach().cpu().clone()
-                    else:
-                        cpu_param[k] = copy.deepcopy(v)
-                cpu_states[param_id] = cpu_param
-            result[key] = cpu_states
-        else:
-            result[key] = copy.deepcopy(value)
-    return result
-
-
-def _gpu_optimizer_state(
-    state_dict: dict[str, object],
-    device: torch.device,
-) -> dict[str, object]:
-    """Deep-copy optimizer state with all tensors moved to a target device."""
-    result: dict[str, object] = {}
-    for key, value in state_dict.items():
-        if key == "state":
-            param_states = cast("dict[int, dict[str, object]]", value)
-            gpu_states: dict[int, dict[str, object]] = {}
-            for param_id, param_state in param_states.items():
-                gpu_param: dict[str, object] = {}
-                for k, v in param_state.items():
-                    if isinstance(v, torch.Tensor):
-                        gpu_param[k] = v.detach().to(device).clone()
-                    else:
-                        gpu_param[k] = copy.deepcopy(v)
-                gpu_states[param_id] = gpu_param
-            result[key] = gpu_states
-        else:
-            result[key] = copy.deepcopy(value)
-    return result
-
 
 class DistillationTrainer:
     """Runs one SDPO distillation update using a loaded base model."""
@@ -357,7 +314,7 @@ def _build_cache_entry(
             raw_state = model.state_dict()
 
         lora_state = {k: v.detach().cpu().clone() for k, v in raw_state.items()}
-        opt_state = _cpu_optimizer_state(optimizer.state_dict())
+        opt_state = cpu_optimizer_state(optimizer.state_dict())
 
         return LoraCacheEntry(
             lora_state_dict=lora_state,
@@ -421,7 +378,7 @@ def distill(
 
             if cached is not None:
                 optimizer.load_state_dict(
-                    _gpu_optimizer_state(cached.optimizer_state_dict, self.device)
+                    gpu_optimizer_state(cached.optimizer_state_dict, self.device)
                 )
             elif lora_local_path is not None:
                 self._load_optimizer_state(lora_local_path, optimizer)
diff --git a/claas/training/engine/local/cache.py b/claas/training/engine/local/cache.py
@@ -0,0 +1,85 @@
+"""Typed cache structures and helpers for CPU-resident LoRA state between training steps."""
+
+from __future__ import annotations
+
+import copy
+from dataclasses import dataclass
+from typing import cast
+
+import torch
+
+from claas.core.types import DistillResponse
+
+
+@dataclass(frozen=True, slots=True)
+class LoraAdapterConfig:
+    """Typed representation of LoRA adapter configuration."""
+
+    r: int
+    lora_alpha: int
+    target_modules: list[str]
+    lora_dropout: float
+    bias: str
+    task_type: str
+
+
+@dataclass(frozen=True, slots=True)
+class LoraCacheEntry:
+    """CPU-resident snapshot of LoRA adapter state between training steps."""
+
+    lora_state_dict: dict[str, torch.Tensor]
+    optimizer_state_dict: dict[str, object]
+    adapter_config: LoraAdapterConfig
+
+
+@dataclass(frozen=True, slots=True)
+class DistillStepResult:
+    """Result of a distillation step with both response and cache entry."""
+
+    response: DistillResponse
+    cache_entry: LoraCacheEntry
+
+
+def cpu_optimizer_state(state_dict: dict[str, object]) -> dict[str, object]:
+    """Deep-copy optimizer state with all tensors moved to CPU."""
+    result: dict[str, object] = {}
+    for key, value in state_dict.items():
+        if key == "state":
+            param_states = cast("dict[int, dict[str, object]]", value)
+            cpu_states: dict[int, dict[str, object]] = {}
+            for param_id, param_state in param_states.items():
+                cpu_param: dict[str, object] = {}
+                for k, v in param_state.items():
+                    if isinstance(v, torch.Tensor):
+                        cpu_param[k] = v.detach().cpu().clone()
+                    else:
+                        cpu_param[k] = copy.deepcopy(v)
+                cpu_states[param_id] = cpu_param
+            result[key] = cpu_states
+        else:
+            result[key] = copy.deepcopy(value)
+    return result
+
+
+def gpu_optimizer_state(
+    state_dict: dict[str, object],
+    device: torch.device,
+) -> dict[str, object]:
+    """Deep-copy optimizer state with all tensors moved to a target device."""
+    result: dict[str, object] = {}
+    for key, value in state_dict.items():
+        if key == "state":
+            param_states = cast("dict[int, dict[str, object]]", value)
+            gpu_states: dict[int, dict[str, object]] = {}
+            for param_id, param_state in param_states.items():
+                gpu_param: dict[str, object] = {}
+                for k, v in param_state.items():
+                    if isinstance(v, torch.Tensor):
+                        gpu_param[k] = v.detach().to(device).clone()
+                    else:
+                        gpu_param[k] = copy.deepcopy(v)
+                gpu_states[param_id] = gpu_param
+            result[key] = gpu_states
+        else:
+            result[key] = copy.deepcopy(value)
+    return result
diff --git a/claas/training/engine/local/engine.py b/claas/training/engine/local/engine.py
@@ -20,9 +20,9 @@
     LoraRuntimeRef,
     ServiceHealth,
 )
-from claas.training.cache import LoraCacheEntry
 from claas.training.distillation import DistillationTrainer
 from claas.training.engine.base import TrainingEngine
+from claas.training.engine.local.cache import LoraCacheEntry
 from claas.training.storage import (
     configure_storage_backend,
     create_initial_lora,
diff --git a/tests/test_distillation_optimizer_state.py b/tests/test_distillation_optimizer_state.py
@@ -6,11 +6,12 @@
 
 torch = pytest.importorskip("torch")
 
-from claas.training.cache import LoraAdapterConfig, LoraCacheEntry  # noqa: E402
-from claas.training.distillation import (  # noqa: E402
-    DistillationTrainer,
-    _cpu_optimizer_state,
-    _gpu_optimizer_state,
+from claas.training.distillation import DistillationTrainer  # noqa: E402
+from claas.training.engine.local.cache import (  # noqa: E402
+    LoraAdapterConfig,
+    LoraCacheEntry,
+    cpu_optimizer_state,
+    gpu_optimizer_state,
 )
 
 
@@ -97,34 +98,34 @@ def test_optimizer_state_missing_gracefully_skips(trainer: DistillationTrainer,
     assert len(optimizer.state) == 0
 
 
-def test_cpu_optimizer_state_moves_tensors_to_cpu() -> None:
-    """_cpu_optimizer_state produces a state dict with all tensors on CPU."""
+def test_cpu_optimizer_state_moves_tensors_to_cpu() -> None:
+    """cpu_optimizer_state produces a state dict with all tensors on CPU."""
     model = _SimpleLoraModel()
     optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
     loss = model.first.sum()
     loss.backward()
     optimizer.step()
 
     original = optimizer.state_dict()
-    cpu_state = _cpu_optimizer_state(original)
+    cpu_state = cpu_optimizer_state(original)
 
     for param_state in cpu_state["state"].values():
         for v in param_state.values():
             if isinstance(v, torch.Tensor):
                 assert v.device == torch.device("cpu")
 
 
-def test_cpu_gpu_optimizer_state_roundtrip() -> None:
-    """_cpu_optimizer_state / _gpu_optimizer_state round-trip preserves values."""
+def test_cpu_gpu_optimizer_state_roundtrip() -> None:
+    """cpu_optimizer_state / gpu_optimizer_state round-trip preserves values."""
     model = _SimpleLoraModel()
     optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
     loss = model.first.sum()
     loss.backward()
     optimizer.step()
 
     original = optimizer.state_dict()
-    cpu_state = _cpu_optimizer_state(original)
-    roundtripped = _gpu_optimizer_state(cpu_state, torch.device("cpu"))
+    cpu_state = cpu_optimizer_state(original)
+    roundtripped = gpu_optimizer_state(cpu_state, torch.device("cpu"))
 
     # Step counts match
     for param_id in original["state"]:
@@ -138,8 +139,8 @@ def test_cpu_gpu_optimizer_state_roundtrip() -> None:
             assert torch.equal(orig_tensor, rt_tensor)
 
 
-def test_cpu_optimizer_state_does_not_mutate_original() -> None:
-    """_cpu_optimizer_state deep-copies — mutating the copy leaves the original intact."""
+def test_cpu_optimizer_state_does_not_mutate_original() -> None:
+    """cpu_optimizer_state deep-copies — mutating the copy leaves the original intact."""
     model = _SimpleLoraModel()
     optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
     loss = model.first.sum()
@@ -149,7 +150,7 @@ def test_cpu_optimizer_state_does_not_mutate_original() -> None:
     original = optimizer.state_dict()
     original_exp_avg = original["state"][0]["exp_avg"].clone()
 
-    cpu_state = _cpu_optimizer_state(original)
+    cpu_state = cpu_optimizer_state(original)
     # Mutate the copy
     cpu_state["state"][0]["exp_avg"].zero_()
 
diff --git a/tests/test_local_training_engine.py b/tests/test_local_training_engine.py
@@ -13,7 +13,7 @@
     DistillResponse,
     TrainingConfig,
 )
-from claas.training.cache import (  # noqa: E402
+from claas.training.engine.local.cache import (  # noqa: E402
     DistillStepResult,
     LoraAdapterConfig,
     LoraCacheEntry,

Original file line number	Diff line number	Diff line change
`@@ -13,7 +13,7 @@`
`13`	`13`	`DistillResponse,`
`14`	`14`	`TrainingConfig,`
`15`	`15`	`)`
`16`		`-from claas.training.cache import ( # noqa: E402`
	`16`	`+from claas.training.engine.local.cache import ( # noqa: E402`
`17`	`17`	`DistillStepResult,`
`18`	`18`	`LoraAdapterConfig,`
`19`	`19`	`LoraCacheEntry,`