diff --git a/AUDIT_RECHECK.md b/AUDIT_RECHECK.md new file mode 100644 index 00000000..b82ee5ac --- /dev/null +++ b/AUDIT_RECHECK.md @@ -0,0 +1,22 @@ +# Post-PR Recheck Audit + +Date: 2026-05-17 (UTC) + +## Why this file exists +The user requested visible, commit-level proof that the repository was rechecked for mistakes/conflicts after the VEM integration PR. + +## Recheck steps performed +1. Searched for unresolved merge markers (`<<<<<<<`, `=======`, `>>>>>>>`). +2. Ran a repository-wide compile sanity check (`python -m compileall -q .`). +3. Ran the integration smoke test (`python check_integrations.py`). +4. Ran the world-model evaluation smoke test (`evaluate_world_model.py`). +5. Ran the perception evaluation smoke test (`evaluate_perception.py`). + +## Outcome +- No merge conflict markers found. +- Compile sanity check passed. +- Integration smoke test passed. +- Both evaluation scripts executed and emitted metrics. + +## Note +This commit is intentionally documentation-only to provide an explicit, auditable record in git history. diff --git a/CODE_ADDRESS_INDEX.md b/CODE_ADDRESS_INDEX.md new file mode 100644 index 00000000..7568c669 --- /dev/null +++ b/CODE_ADDRESS_INDEX.md @@ -0,0 +1,677 @@ +# CODE ADDRESS INDEX + +Comprehensive repository address map. Updated to current line-level state. 
+ +Total files indexed: **59** + +## `.github/workflows/sync-from-upstream.yml` +- Type: text +- Total lines: 19 +- Address anchors: + - L1: `name: Auto Sync from Upstream` + - L2: `on:` + - L6: `jobs:` + +## `.gitignore` +- Type: text +- Total lines: 169 +- Address anchors: none detected + +## `.gitmodules` +- Type: text +- Total lines: 9 +- Address anchors: none detected + +## `.vscode/launch.json` +- Type: text +- Total lines: 26 +- Address anchors: none detected + +## `.vscode/settings.json` +- Type: text +- Total lines: 3 +- Address anchors: none detected + +## `LICENSE` +- Type: text +- Total lines: 202 +- Address anchors: none detected + +## `README.md` +- Type: text +- Total lines: 112 +- Address anchors: + - L1: `# Visual Execution Model (VEM)` + - L9: `## Purpose` + - L17: `## Vision` + - L20: `## Goal` + - L28: `## Technical Architecture (Concept Map)` + - L30: `### 1) Spatio-Temporal Representation (Vision Encoder)` + - L35: `### 2) Geometric Inductive Biases` + - L40: `### 3) Continuous-Time Latent Dynamics` + - L45: `### 4) Hierarchical Predictive Reasoning` + - L50: `### 5) World Rendering and Latent Scene Composition` + - L54: `### 6) Latent Planning & Decision Support` + - L58: `### 7) Multi-Modal and Robustness Extensions` + - L62: `### 8) Training Stack` + - L70: `## Repository Workflow (Single Framework)` + - L72: `### Configurations` + - L76: `### Training Entrypoint` + - L79: `# or` + - L86: `### Practical Notes` + - L96: `### Final Execution Checklist (Do This)` + - L104: `## Roadmap Direction` + +## `arc_eval.ipynb` +- Type: text +- Total lines: 252 +- Address anchors: none detected + +## `assets/hrm.png` +- Type: binary/non-utf8 +- Total lines: 0 +- Address anchors: n/a + +## `assets/npyjs.js` +- Type: text +- Total lines: 176 +- Address anchors: none detected + +## `check_integrations.py` +- Type: text +- Total lines: 56 +- Address anchors: + - L8: `def load_config(path: str):` + - L13: `def main():` + +## `config/arch/hrm_v1.yaml` +- 
Type: text +- Total lines: 21 +- Address anchors: + - L1: `name: hrm.hrm_act_v1@HierarchicalReasoningModel_ACTV1` + - L2: `loss:` + - L6: `halt_exploration_prob: 0.1` + - L7: `halt_max_steps: 16` + - L9: `H_cycles: 2` + - L10: `L_cycles: 2` + - L12: `H_layers: 4` + - L13: `L_layers: 4` + - L15: `hidden_size: 512` + - L16: `num_heads: 8 # min(2, hidden_size // 64)` + - L17: `expansion: 4` + - L19: `puzzle_emb_ndim: ${.hidden_size}` + - L21: `pos_encodings: rope` + +## `config/cfg_pretrain.yaml` +- Type: text +- Total lines: 31 +- Address anchors: + - L3: `defaults:` + - L7: `hydra:` + - L11: `data_path: data/arc-aug-1000` + - L14: `global_batch_size: 768` + - L16: `epochs: 100000` + - L17: `eval_interval: 10000` + - L18: `checkpoint_every_eval: True` + - L20: `lr: 1e-4` + - L21: `lr_min_ratio: 1.0` + - L22: `lr_warmup_steps: 2000` + - L25: `beta1: 0.9` + - L26: `beta2: 0.95` + - L27: `weight_decay: 0.1` + - L28: `puzzle_emb_weight_decay: 0.1` + - L31: `puzzle_emb_lr: 1e-2` + +## `config/vjepa_10b.yaml` +- Type: text +- Total lines: 85 +- Address anchors: + - L2: `encoder:` + - L14: `predictor:` + - L72: `training:` + +## `config/vjepa_micro.yaml` +- Type: text +- Total lines: 34 +- Address anchors: + - L2: `encoder:` + - L14: `predictor:` + - L27: `training:` + +## `dataset/build_arc_dataset.py` +- Type: text +- Total lines: 291 +- Address anchors: + - L19: `class DataProcessConfig(BaseModel):` + - L37: `class ARCPuzzle:` + - L43: `def arc_grid_to_np(grid: List[List[int]]):` + - L54: `def np_grid_to_seq_translational_augment(inp: np.ndarray, out: np.ndarray, do_translation: bool):` + - L81: `def puzzle_hash(puzzle: dict):` + - L83: `def _grid_hash(grid: np.ndarray):` + - L98: `def convert_single_arc_puzzle(results: dict, default_name: str, puzzle: dict, aug_count: int, dest_mapping: Dict[str, Tuple[str, str]]):` + - L122: `def _map_grid(grid: np.ndarray):` + - L148: `def load_puzzles_arcagi(results: dict, dataset_path: str, config: DataProcessConfig):` + - L184: 
`def convert_dataset(config: DataProcessConfig):` + - L286: `def main(config: DataProcessConfig):` + +## `dataset/build_maze_dataset.py` +- Type: text +- Total lines: 142 +- Address anchors: + - L22: `class DataProcessConfig(BaseModel):` + - L30: `def convert_subset(set_name: str, config: DataProcessConfig):` + - L89: `def _seq_to_numpy(seq):` + - L136: `def preprocess_data(config: DataProcessConfig):` + +## `dataset/common.py` +- Type: text +- Total lines: 51 +- Address anchors: + - L12: `class PuzzleDatasetMetadata(pydantic.BaseModel):` + - L27: `def dihedral_transform(arr: np.ndarray, tid: int) -> np.ndarray:` + - L50: `def inverse_dihedral_transform(arr: np.ndarray, tid: int) -> np.ndarray:` + +## `dataset/generate_dummy_data.py` +- Type: text +- Total lines: 29 +- Address anchors: + - L5: `def generate_dummy_video(path, frames=32, res=(224, 224)):` + +## `dataset/video_dataset.py` +- Type: text +- Total lines: 108 +- Address anchors: + - L8: `class AdvancedVideoDataset(IterableDataset):` + - L13: `def __init__(self,` + - L33: `def _get_video_stream(self, path):` + - L42: `def _generate_3d_block_mask(self):` + - L66: `def __iter__(self):` + - L106: `def get_dataloader(video_paths, batch_size=1, **kwargs):` + +## `docs/FRONTIER_GAP_ANALYSIS.md` +- Type: text +- Total lines: 35 +- Address anchors: + - L1: `# Frontier Capability Gap Analysis and Implementation Plan` + - L3: `## Scope` + - L9: `## Comparison Matrix` + - L20: `## Implemented in this change` + - L22: `### 1) MCTS action prior upgrade` + - L28: `## Why this is prioritized first` + - L31: `## Next technical steps (ordered)` + +## `docs/HUMAN_VISION_EXECUTION_EVAL_SPEC.md` +- Type: text +- Total lines: 48 +- Address anchors: + - L1: `# Human-Vision + Execution Evaluation Spec (Initial)` + - L5: `## Purpose Alignment` + - L13: `## Track A: Perception Robustness (implemented baseline)` + - L27: `## Track B: World-Model Dynamics (implemented baseline)` + - L39: `## Track C: Execution/Cognition (next)` + - 
L44: `## Promotion Rule (Phase-1/2)` + +## `docs/RIGOROUS_DEVELOPMENT_PROTOCOL.md` +- Type: text +- Total lines: 33 +- Address anchors: + - L1: `# Rigorous Development Protocol (Phase-1)` + - L5: `## Gate A — Sanity / Determinism` + - L10: `## Gate B — World-model Metrics` + - L25: `## Gate C — Change Promotion` + - L31: `## Notes` + +## `evaluate.py` +- Type: text +- Total lines: 68 +- Address anchors: + - L13: `class EvalConfig(pydantic.BaseModel):` + - L19: `def launch():` + +## `evaluate_perception.py` +- Type: text +- Total lines: 90 +- Address anchors: + - L26: `def apply_perturbation(video: torch.Tensor, mode: str) -> torch.Tensor:` + - L39: `def latent_consistency(model: VJEPA, video: torch.Tensor, perturbed: torch.Tensor) -> float:` + - L46: `def main() -> None:` + +## `evaluate_world_model.py` +- Type: text +- Total lines: 166 +- Address anchors: + - L27: `def set_seed(seed: int) -> None:` + - L34: `class EvalManifest:` + - L46: `def get_commit_hash(default: str = "unknown") -> str:` + - L62: `def latent_rollout(` + - L76: `def evaluate_metrics(model: VJEPA, device: torch.device, rollout_steps: int, num_actions: int) -> Dict[str, float]:` + - L116: `def main() -> None:` + +## `models/adaptive_depth.py` +- Type: text +- Total lines: 195 +- Address anchors: + - L24: `class AdaptiveDepthController(nn.Module):` + - L41: `def __init__(` + - L54: `def should_continue(` + - L116: `class AdaptiveDepthWrapper(nn.Module):` + - L131: `def __init__(` + - L146: `def forward(` + +## `models/common.py` +- Type: text +- Total lines: 32 +- Address anchors: + - L7: `def trunc_normal_init_(tensor: torch.Tensor, std: float = 1.0, lower: float = -2.0, upper: float = 2.0):` + +## `models/hrm/hrm_act_v1.py` +- Type: text +- Total lines: 283 +- Address anchors: + - L16: `class HierarchicalReasoningModel_ACTV1InnerCarry:` + - L22: `class HierarchicalReasoningModel_ACTV1Carry:` + - L31: `class HierarchicalReasoningModel_ACTV1Config(BaseModel):` + - L60: `class 
HierarchicalReasoningModel_ACTV1Block(nn.Module):` + - L61: `def __init__(self, config: HierarchicalReasoningModel_ACTV1Config) -> None:` + - L77: `def forward(self, cos_sin: CosSin, hidden_states: torch.Tensor) -> torch.Tensor:` + - L86: `class HierarchicalReasoningModel_ACTV1ReasoningModule(nn.Module):` + - L87: `def __init__(self, layers: List[HierarchicalReasoningModel_ACTV1Block]):` + - L92: `def forward(self, hidden_states: torch.Tensor, input_injection: torch.Tensor, **kwargs) -> torch.Tensor:` + - L102: `class HierarchicalReasoningModel_ACTV1_Inner(nn.Module):` + - L103: `def __init__(self, config: HierarchicalReasoningModel_ACTV1Config) -> None:` + - L146: `def _input_embeddings(self, input: torch.Tensor, puzzle_identifiers: torch.Tensor):` + - L168: `def empty_carry(self, batch_size: int):` + - L174: `def reset_carry(self, reset_flag: torch.Tensor, carry: HierarchicalReasoningModel_ACTV1InnerCarry):` + - L180: `def forward(self, carry: HierarchicalReasoningModel_ACTV1InnerCarry, batch: Dict[str, torch.Tensor]) -> Tuple[HierarchicalReasoningModel_ACTV1InnerCarry, torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:` + - L216: `class HierarchicalReasoningModel_ACTV1(nn.Module):` + - L219: `def __init__(self, config_dict: dict):` + - L225: `def puzzle_emb(self):` + - L228: `def initial_carry(self, batch: Dict[str, torch.Tensor]):` + - L240: `def forward(self, carry: HierarchicalReasoningModel_ACTV1Carry, batch: Dict[str, torch.Tensor]) -> Tuple[HierarchicalReasoningModel_ACTV1Carry, Dict[str, torch.Tensor]]:` + +## `models/hybrid_ssm.py` +- Type: text +- Total lines: 228 +- Address anchors: + - L26: `class SelectiveSSM(nn.Module):` + - L41: `def __init__(` + - L85: `def forward(self, x: torch.Tensor) -> torch.Tensor:` + - L141: `class HybridSSMAttentionBlock(nn.Module):` + - L161: `def __init__(` + - L199: `def forward(` + +## `models/information_bottleneck.py` +- Type: text +- Total lines: 227 +- Address anchors: + - L28: `class 
VariationalInformationBottleneck(nn.Module):` + - L42: `def __init__(` + - L71: `def _reparameterize(` + - L90: `def forward(` + - L135: `class InformationBottleneckAttention(nn.Module):` + - L150: `def __init__(` + - L179: `def forward(` + +## `models/layers.py` +- Type: text +- Total lines: 167 +- Address anchors: + - L13: `def flash_attn_func(q, k, v, causal=False):` + - L29: `def _find_multiple(a, b):` + - L33: `def rotate_half(x: torch.Tensor):` + - L40: `def apply_rotary_pos_emb(q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor):` + - L53: `class CastedLinear(nn.Module):` + - L54: `def __init__(self,` + - L68: `def forward(self, input: torch.Tensor) -> torch.Tensor:` + - L72: `class CastedEmbedding(nn.Module):` + - L73: `def __init__(self,` + - L86: `def forward(self, input: torch.Tensor) -> torch.Tensor:` + - L90: `class RotaryEmbedding(nn.Module):` + - L91: `def __init__(self, dim, max_position_embeddings, base, device=None):` + - L104: `def forward(self):` + - L108: `class Attention(nn.Module):` + - L109: `def __init__(self, hidden_size, head_dim, num_heads, num_key_value_heads, causal=False):` + - L122: `def forward(self, cos_sin: CosSin, hidden_states: torch.Tensor) -> torch.Tensor:` + - L148: `class SwiGLU(nn.Module):` + - L149: `def __init__(self, hidden_size: int, expansion: float):` + - L156: `def forward(self, x):` + - L161: `def rms_norm(hidden_states: torch.Tensor, variance_epsilon: float) -> torch.Tensor:` + +## `models/losses.py` +- Type: text +- Total lines: 101 +- Address anchors: + - L11: `def s(x, epsilon=1e-30):` + - L19: `def log_stablemax(x, dim=-1):` + - L24: `def stablemax_cross_entropy(logits, labels, ignore_index: int = -100):` + - L34: `def softmax_cross_entropy(logits, labels, ignore_index: int = -100):` + - L40: `class ACTLossHead(nn.Module):` + - L41: `def __init__(self, model: nn.Module, loss_type: str):` + - L46: `def initial_carry(self, *args, **kwargs):` + - L49: `def forward(` + +## 
`models/multimodal_grounding.py` +- Type: text +- Total lines: 258 +- Address anchors: + - L24: `class ModalityEncoder(nn.Module):` + - L37: `def __init__(` + - L60: `def forward(` + - L88: `class CrossModalAttention(nn.Module):` + - L101: `def __init__(` + - L122: `def forward(` + - L158: `class MultiModalGrounding(nn.Module):` + - L180: `def __init__(` + - L213: `def forward(` + +## `models/muon_optimizer.py` +- Type: text +- Total lines: 191 +- Address anchors: + - L26: `class Muon(Optimizer):` + - L44: `def __init__(` + - L67: `def step(self, closure=None):` + - L137: `def _newton_schulz_orthogonalize(G: torch.Tensor, steps: int = 5) -> torch.Tensor:` + - L170: `def _distributed_allreduce_grads(` + +## `models/proper_equivariance.py` +- Type: text +- Total lines: 335 +- Address anchors: + - L27: `class SO3Rotation(nn.Module):` + - L37: `def axis_angle_to_matrix(axis_angle: torch.Tensor) -> torch.Tensor:` + - L74: `def matrix_to_quaternion(R: torch.Tensor) -> torch.Tensor:` + - L93: `class WignerDMatrices(nn.Module):` + - L107: `def wigner_d_small(beta: torch.Tensor, l: int) -> torch.Tensor:` + - L170: `def rotation_to_euler(R: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:` + - L187: `class ProperSE3EquivariantLayer(nn.Module):` + - L204: `def __init__(` + - L243: `def _positional_encoding(self, positions: torch.Tensor) -> torch.Tensor:` + - L259: `def forward(` + +## `models/sparse_embedding.py` +- Type: text +- Total lines: 132 +- Address anchors: + - L11: `class CastedSparseEmbedding(nn.Module):` + - L12: `def __init__(self, num_embeddings: int, embedding_dim: int, batch_size: int, init_std: float, cast_to: torch.dtype):` + - L28: `def forward(self, inputs: torch.Tensor) -> torch.Tensor:` + - L41: `class CastedSparseEmbeddingSignSGD_Distributed(Optimizer):` + - L42: `def __init__(` + - L63: `def step(self, closure=None): # type: ignore` + - L98: `def _sparse_emb_signsgd_dist(` + +## `models/spectral_conv.py` +- Type: text +- Total lines: 
193 +- Address anchors: + - L25: `class GraphLaplacian(nn.Module):` + - L34: `def __init__(self, k_neighbors: int = 8):` + - L38: `def forward(self, x: torch.Tensor) -> torch.Tensor:` + - L74: `class SpectralGraphConv(nn.Module):` + - L88: `def __init__(` + - L120: `def _chebyshev_polynomials(` + - L159: `def forward(self, x: torch.Tensor) -> torch.Tensor:` + +## `models/topological.py` +- Type: text +- Total lines: 215 +- Address anchors: + - L25: `class DifferentiableBettiNumbers(nn.Module):` + - L41: `def __init__(` + - L68: `def _compute_distance_matrix(self, x: torch.Tensor) -> torch.Tensor:` + - L74: `def _soft_threshold(` + - L87: `def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:` + - L156: `class TopologicalAwareness(nn.Module):` + - L171: `def __init__(` + - L191: `def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:` + +## `models/ttt_layer.py` +- Type: text +- Total lines: 206 +- Address anchors: + - L25: `class TTTLinear(nn.Module):` + - L45: `def __init__(` + - L74: `def forward(` + - L148: `class TTTLinearWithAttention(nn.Module):` + - L163: `def __init__(` + - L191: `def forward(` + +## `models/uncertainty.py` +- Type: text +- Total lines: 206 +- Address anchors: + - L24: `class VariationalLinear(nn.Module):` + - L41: `def __init__(` + - L61: `def forward(self, x: torch.Tensor) -> torch.Tensor:` + - L78: `def kl_divergence(self) -> torch.Tensor:` + - L106: `class UncertaintyQuantification(nn.Module):` + - L121: `def __init__(` + - L151: `def forward(` + +## `models/vjepa/flow_matching.py` +- Type: text +- Total lines: 281 +- Address anchors: + - L28: `class SinusoidalTimeEmbedding(nn.Module):` + - L31: `def __init__(self, dim: int):` + - L35: `def forward(self, t: torch.Tensor) -> torch.Tensor:` + - L53: `class VelocityField(nn.Module):` + - L61: `def __init__(self, dim: int, hidden_dim: int, condition_dim: int):` + - L89: `def forward(` + - L125: `class 
ConditionalFlowMatching(nn.Module):` + - L150: `def __init__(` + - L165: `def forward(` + - L211: `def sample(` + - L257: `def sample_rectified(` + +## `models/vjepa/gaussian_splatting.py` +- Type: text +- Total lines: 203 +- Address anchors: + - L28: `class LatentGaussianSplatting(nn.Module):` + - L40: `def __init__(self, dim: int, num_gaussians: int = 256):` + - L75: `def _parse_gaussians(self, params: torch.Tensor) -> dict:` + - L118: `def _quaternion_to_rotation_matrix(q: torch.Tensor) -> torch.Tensor:` + - L146: `def forward(` + +## `models/vjepa/layers.py` +- Type: text +- Total lines: 157 +- Address anchors: + - L8: `class LieGroupEquivariantLayer(nn.Module):` + - L14: `def __init__(self, dim: int, rank: int = 8):` + - L31: `def forward(self, x: torch.Tensor, group_element: torch.Tensor) -> torch.Tensor:` + - L50: `class LatentRayMarcher(nn.Module):` + - L56: `def __init__(self, dim: int, num_samples: int = 16):` + - L71: `def forward(self, latents: torch.Tensor, ray_dirs: torch.Tensor) -> torch.Tensor:` + - L113: `def apply_rotary_pos_emb_3d(q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor):` + - L114: `def rotate_half(x):` + - L128: `class RotaryEmbedding3D(nn.Module):` + - L129: `def __init__(self, dim: int, max_t: int, max_h: int, max_w: int, base: float = 10000.0, device=None):` + - L141: `def _get_freqs(self, length: int, dim: int, device):` + - L147: `def _build_cache(self, device):` + - L152: `def forward(self, t: int, h: int, w: int) -> Tuple[torch.Tensor, torch.Tensor]:` + +## `models/vjepa/losses.py` +- Type: text +- Total lines: 31 +- Address anchors: + - L4: `def vicreg_loss(x, y, sim_coeff=25.0, std_coeff=25.0, cov_coeff=1.0):` + - L22: `def covariance_loss(z):` + +## `models/vjepa/memory.py` +- Type: text +- Total lines: 392 +- Address anchors: + - L28: `class ResonatorNetwork(nn.Module):` + - L49: `def __init__(` + - L66: `def set_cleanup_memory(self, memory: torch.Tensor) -> None:` + - L70: `def cleanup(self, x: 
torch.Tensor) -> torch.Tensor:` + - L96: `def resonator_step(` + - L119: `def _unbind(self, composite: torch.Tensor, key: torch.Tensor) -> torch.Tensor:` + - L125: `def forward(` + - L178: `class HolographicMemory(nn.Module):` + - L198: `def __init__(` + - L219: `def _bind_hrr(self, key: torch.Tensor, value: torch.Tensor) -> torch.Tensor:` + - L225: `def _unbind_hrr(self, composite: torch.Tensor, key: torch.Tensor) -> torch.Tensor:` + - L231: `def _bind_fhrr(self, key: torch.Tensor, value: torch.Tensor) -> torch.Tensor:` + - L243: `def _unbind_fhrr(self, composite: torch.Tensor, key: torch.Tensor) -> torch.Tensor:` + - L249: `def bind(self, key: torch.Tensor, value: torch.Tensor) -> torch.Tensor:` + - L255: `def unbind(self, composite: torch.Tensor, key: torch.Tensor) -> torch.Tensor:` + - L261: `def superpose(self, vectors: torch.Tensor, dim: int = 1) -> torch.Tensor:` + - L284: `def forward(` + - L312: `def retrieve(self, memory: torch.Tensor, key: torch.Tensor) -> torch.Tensor:` + - L327: `def retrieve_with_cleanup(` + - L348: `def multi_retrieve(` + - L385: `def set_cleanup_memory(self, memory: torch.Tensor) -> None:` + +## `models/vjepa/physics_engine.py` +- Type: text +- Total lines: 107 +- Address anchors: + - L6: `class HRMPhysicsODE(nn.Module):` + - L12: `def __init__(self, dim: int, action_dim: Optional[int] = 128):` + - L30: `def forward(self, t: float, z: torch.Tensor) -> torch.Tensor:` + - L61: `class ContinuousTimeHRM(nn.Module):` + - L67: `def __init__(self, dim: int, action_dim: int = 128):` + - L71: `def forward(self, z: torch.Tensor, delta_t: torch.Tensor | float = 1.0, action: Optional[torch.Tensor] = None):` + +## `models/vjepa/planning.py` +- Type: text +- Total lines: 467 +- Address anchors: + - L28: `class MCTSNode:` + - L49: `def __init__(` + - L67: `def mean_value(self) -> float:` + - L74: `def is_expanded(self) -> bool:` + - L79: `def effective_visits(self) -> int:` + - L83: `def puct_score(self, parent_visits: int, c_puct: float = 1.41) 
-> float:` + - L120: `class MCTS:` + - L139: `def __init__(` + - L160: `def _imagine_future(` + - L192: `def _select(self, node: MCTSNode) -> MCTSNode:` + - L222: `def _expand(` + - L289: `def _backpropagate(self, node: MCTSNode, value: float) -> None:` + - L315: `def _get_action_probabilities(self, root: MCTSNode, num_actions: int) -> torch.Tensor:` + - L349: `def plan(` + - L402: `class LatentPlannerMCTS:` + - L417: `def __init__(` + - L433: `def plan(` + - L451: `def plan_with_uncertainty(` + +## `models/vjepa/predictor.py` +- Type: text +- Total lines: 260 +- Address anchors: + - L22: `class VJEPAPredictorInner(nn.Module):` + - L28: `def __init__(self,` + - L156: `def forward(self,` + +## `models/vjepa/symplectic_integrator.py` +- Type: text +- Total lines: 137 +- Address anchors: + - L31: `class SymplecticEulerIntegrator(nn.Module):` + - L48: `def __init__(self, dim: int, action_dim: Optional[int] = None):` + - L67: `def set_action(self, action: Optional[torch.Tensor]) -> None:` + - L71: `def hamiltonian(self, q: torch.Tensor, p: torch.Tensor) -> torch.Tensor:` + - L87: `def forward(` + - L126: `def compute_energy(self, z: torch.Tensor) -> torch.Tensor:` + +## `models/vjepa/utils.py` +- Type: text +- Total lines: 44 +- Address anchors: + - L3: `def get_block_mask(t, h, w, mask_ratio=0.6):` + - L21: `def apply_mask(x, mask):` + +## `models/vjepa/vit.py` +- Type: text +- Total lines: 94 +- Address anchors: + - L9: `class PatchEmbed3D(nn.Module):` + - L15: `def __init__(self, patch_size=(2, 16, 16), in_chans=3, embed_dim=768):` + - L20: `def forward(self, x):` + - L30: `class VisionTransformerBlock(nn.Module):` + - L31: `def __init__(self, dim, num_heads, expansion, norm_eps=1e-5):` + - L43: `def _forward_inner(self, x, cos_sin):` + - L48: `def forward(self, x, cos_sin):` + - L54: `class VisionEncoder(nn.Module):` + - L55: `def __init__(self,` + - L86: `def forward(self, x):` + +## `models/vjepa/vjepa_model.py` +- Type: text +- Total lines: 141 +- Address 
anchors: + - L12: `class VJEPA(nn.Module):` + - L23: `def __init__(self,` + - L81: `def update_target_encoder(self):` + - L86: `def forward(self, batch: Dict[str, torch.Tensor]):` + - L139: `class VisualExecutionModel(VJEPA):` + +## `pretrain.py` +- Type: text +- Total lines: 453 +- Address anchors: + - L26: `class LossConfig(pydantic.BaseModel):` + - L32: `class ArchConfig(pydantic.BaseModel):` + - L39: `class PretrainConfig(pydantic.BaseModel):` + - L74: `class TrainState:` + - L84: `def create_dataloader(config: PretrainConfig, split: str, rank: int, world_size: int, **kwargs):` + - L108: `def create_model(config: PretrainConfig, train_metadata: PuzzleDatasetMetadata, world_size: int):` + - L162: `def cosine_schedule_with_warmup_lr_lambda(` + - L172: `def init_train_state(config: PretrainConfig, train_metadata: PuzzleDatasetMetadata, world_size: int):` + - L190: `def save_train_state(config: PretrainConfig, train_state: TrainState):` + - L199: `def compute_lr(base_lr: float, config: PretrainConfig, train_state: TrainState):` + - L209: `def train_batch(config: PretrainConfig, train_state: TrainState, batch: Any, global_batch_size: int, rank: int, world_size: int):` + - L266: `def evaluate(config: PretrainConfig, train_state: TrainState, eval_loader: torch.utils.data.DataLoader, eval_metadata: PuzzleDatasetMetadata, rank: int, world_size: int):` + - L333: `def save_code_and_config(config: PretrainConfig):` + - L359: `def load_synced_config(hydra_config: DictConfig, rank: int, world_size: int) -> PretrainConfig:` + - L381: `def launch(hydra_config: DictConfig):` + +## `puzzle_dataset.py` +- Type: text +- Total lines: 199 +- Address anchors: + - L14: `def _sample_batch(rng: np.random.Generator, group_order: np.ndarray, puzzle_indices: np.ndarray, group_indices: np.ndarray, start_index: int, global_batch_size: int):` + - L41: `class PuzzleDatasetConfig(pydantic.BaseModel):` + - L53: `class PuzzleDataset(IterableDataset):` + - L54: `def __init__(self, config: 
PuzzleDatasetConfig, split: str = "train"):` + - L68: `def _load_metadata(self) -> PuzzleDatasetMetadata:` + - L72: `def _lazy_load_dataset(self):` + - L95: `def _collate_batch(self, batch):` + - L118: `def _iter_test(self):` + - L151: `def _iter_train(self):` + - L189: `def __iter__(self):` + +## `puzzle_visualizer.html` +- Type: text +- Total lines: 426 +- Address anchors: none detected + +## `requirements.txt` +- Type: text +- Total lines: 16 +- Address anchors: none detected + +## `utils/functions.py` +- Type: text +- Total lines: 19 +- Address anchors: + - L5: `def load_model_class(identifier: str, prefix: str = "models."):` + - L15: `def get_model_source_path(identifier: str, prefix: str = "models."):` + +## `vjepa_train.py` +- Type: text +- Total lines: 221 +- Address anchors: + - L19: `def build_optimizer(model, config):` + - L93: `class CombinedOptimizer:` + - L98: `def __init__(self, optimizers):` + - L101: `def zero_grad(self, set_to_none=False):` + - L105: `def step(self, closure=None):` + - L110: `def param_groups(self):` + - L117: `def train(config_path="config/vjepa_micro.yaml"):` + diff --git a/README.md b/README.md index a6b77cc2..1ed8f2a5 100644 --- a/README.md +++ b/README.md @@ -1,49 +1,112 @@ -# Hierarchical Reasoning Model - V-JEPA Integration (AGI Scale) +# Visual Execution Model (VEM) -This repository hosts the advanced integration of the Hierarchical Reasoning Model (HRM) with the Video Joint-Embedding Predictive Architecture (V-JEPA), scaled to a **10 Billion parameter** architecture for deep physical world understanding. +A single integrated framework for **continuous-time world modeling** from video. -## Core Vision -Transitioning from discrete puzzle-solving to continuous-time, latent-space reasoning. The model is designed to learn **intuitive physics** (depth, shadows, object permanence, continuity) autonomously from raw video data, achieving a human-like understanding of the physical world. 
+The Visual Execution Model (VEM) unifies hierarchical reasoning, predictive representation learning, dynamics, geometry, planning, uncertainty, and multimodal grounding inside one model stack (not separate models). -## Key Architectural Pillars +--- + +## Purpose +Build a practical foundation for models that can: +1. Learn physical regularities directly from raw video, +2. Reason over future latent trajectories, +3. Support intervention-aware planning in latent space. + +This repository is organized as one unified model pipeline with scalable sizes and optional modules, so every capability is part of the same framework and execution graph. + +## Vision +Our vision is a model that develops robust **intuitive physics** (e.g., continuity, object permanence, motion consistency, and causal effects of actions) by combining representation learning, geometric priors, and dynamics-aware objectives. + +## Goal +Deliver a scalable and analyzable training stack that can evolve from micro-scale experiments to large configurations (including 10B-class settings) while preserving: +- modularity, +- mathematical interpretability, +- and a reproducible workflow. + +--- + +## Technical Architecture (Concept Map) + +### 1) Spatio-Temporal Representation (Vision Encoder) +- **3D patch embedding** over `(T, H, W)` video volumes. +- **3D-RoPE** positional encoding in time-height-width coordinates. +- ViT-style latent tokenization for downstream predictive modeling. -### 1. Vision Encoder (The Eyes) -* **3D Patch Embedding**: Processes video clips as spatio-temporal volumes. -* **3D-RoPE**: 3D Rotary Positional Embeddings that natively encode Time, Height, and Width coordinates. -* **10B Scale ViT**: A massive Vision Transformer designed to capture high-density visual information. +### 2) Geometric Inductive Biases +- **Lie-group / equivariance-oriented layers** for transformation-aware latent features. 
+- **Stiefel-manifold-style orthogonality constraints/projections** to stabilize relational geometry. +- **Proper SE(3)-inspired processing** for physically meaningful transformations. -### 2. Physical Relativity (Lie Group Equivariance) -* **Stiefel Manifold Projections**: Implements $O(D)$ complexity equivariant transformations using Cayley transforms, ensuring physical laws are relative across 10B parameter manifolds. +### 3) Continuous-Time Latent Dynamics +- **Hamiltonian-style latent dynamics** components. +- **Neural ODE adjoint** pathway (`torchdiffeq`) for memory-efficient continuous-time learning. +- **Symplectic integration path** for structure-preserving latent evolution during inference-style rollouts. -### 3. Continuous-Time Brain (Hamiltonian Neural ODEs & Predictive Coding) -* **Symplectic Physics Engine (HNN)**: Uses **Hamiltonian Neural Networks** to compute the continuous-time dynamics ($dq/dt$, $dp/dt$). This guarantees absolute energy conservation and strict adherence to classical mechanics within the latent imagination space. -* **Adjoint Neural ODEs**: Uses the **Neural ODE Adjoint Method** (`torchdiffeq`) for constant-memory backpropagation, enabling infinite-depth continuous reasoning at 10B scale. -* **Top-Down Predictive Coding**: A hierarchical "handshake" where High-Level planning ($z_H$) suppresses error signals from Low-Level sensors ($z_L$), mimicking the human visual cortex. -* **Holographic Memory**: Vector Symbolic Architecture (VSA) based memory that binds and stores complex physical experiences into dense, high-dimensional holographic states. +### 4) Hierarchical Predictive Reasoning +- High/Low cycle interaction (`H_cycles`, `L_cycles`) for iterative latent refinement. +- Predictive-coding-style refinement with top-down influence and bottom-up correction pressure. +- Adaptive compute hooks (e.g., ACT/depth controller) for confidence-aware depth. -### 4. 
Light & Shadow Intuition (Neural Radiance Latents) -* **Volumetric Ray-Marching**: Treats the latent space as a **Differentiable Continuous Radiance Field (NeRF)**. The model "traces" light and reflections through its imagined 3D manifold. +### 5) World Rendering and Latent Scene Composition +- **Latent Gaussian Splatting** path for explicit scene primitive aggregation. +- NeRF-inspired latent rendering concepts for geometry/appearance reasoning. -### 5. Latent Planning (The Imagination) -* **Latent MCTS**: Monte Carlo Tree Search operating entirely in latent space, allowing the model to "imagine" and evaluate thousands of future physical outcomes. -* **Action Conditioning**: Future states predicted conditioned on specific physical actions/interventions. +### 6) Latent Planning & Decision Support +- **Latent MCTS** module for action-conditioned future evaluation. +- Value estimation head for ranking latent future states. -### 6. Advanced Training Engine -* **VICReg Objective**: Variance-Covariance regularization to prevent representation collapse. -* **3D Block Masking**: Spatio-temporal masking that forces the model to infer large missing segments of the world. +### 7) Multi-Modal and Robustness Extensions +- Hooks for **audio** and **tactile/proprioceptive** grounding. +- **Uncertainty estimation**, **information bottleneck**, **topology-aware**, and **spectral** auxiliary modules. -## Getting Started +### 8) Training Stack +- **VICReg** objective (invariance + variance/covariance regularization). +- Spatio-temporal masking regime. +- Optimizer backends: **AdamW**, **Muon**, or **Hybrid Muon+AdamW**. +- EMA target encoder for stable JEPA-style targets. -### Configuration -Adjust the 10B parameter specs in `config/vjepa_10b.yaml`. 
+--- + +## Repository Workflow (Single Framework) + +### Configurations +- **Micro scale profile (same model, small size)**: `config/vjepa_micro.yaml` +- **Large scale profile (same model, 10B-class size target)**: `config/vjepa_10b.yaml` -### Training +### Training Entrypoint ```bash -python vjepa_train.py +python vjepa_train.py --config config/vjepa_micro.yaml +# or +python vjepa_train.py --config config/vjepa_10b.yaml ``` -## Future Multimodal Grounding -The architecture is designed to be modality-agnostic, with hooks ready for future integration of **Audio** and **Tactile (Proprioceptive)** data. +`vjepa_train.py` accepts `--config` and loads runtime behavior from YAML. Both configs run the same Visual Execution Model framework at different scales. +`training.epochs` can be set in YAML (defaults to `100` if omitted). + +### Practical Notes +- Place video files in `data/` for training. +- If `data/` is absent, the script attempts to create it and generate a small synthetic test video via `ffmpeg`. +- For phase-1 rigorous world-model checks, run: + `python evaluate_world_model.py --config config/vjepa_micro.yaml --seed 42` + (saves JSON manifests in `eval_runs/`). +- For perception robustness checks (color/shadow/noise/shift), run: + `python evaluate_perception.py --config config/vjepa_micro.yaml --seed 42` + (saves JSON manifests in `eval_runs/`). + +### Final Execution Checklist (Do This) +1. `python -m compileall -q .` +2. `python evaluate_world_model.py --config config/vjepa_micro.yaml --seed 42` +3. `python evaluate_perception.py --config config/vjepa_micro.yaml --seed 42` +4. `python vjepa_train.py --config config/vjepa_micro.yaml` (with real videos in `data/`, or with `ffmpeg` installed) --- -*This project is dedicated to pushing the boundaries of artificial general intelligence through the lens of hierarchical physical reasoning.* + +## Roadmap Direction +- Stronger experiment tracking and benchmark reports. +- Expanded multimodal pretraining/evaluation. 
+- Systematic ablations on dynamics engines (ODE vs. flow matching vs. symplectic rollout). +- Better reproducibility packaging for large-scale distributed runs. + +--- + +This project is focused on pushing **hierarchical physical reasoning** toward robust, scalable world models with clear technical structure and research extensibility. diff --git a/RECTIFICATION_STATUS.md b/RECTIFICATION_STATUS.md new file mode 100644 index 00000000..c3914cba --- /dev/null +++ b/RECTIFICATION_STATUS.md @@ -0,0 +1,34 @@ +# Rectification Status (2026-05-17 UTC) + +This file records the concrete rectifications requested in conversation. + +## 1) Naming status +- Repository top-level name in README is **Visual Execution Model (VEM)**. + +## 2) Restoration status (important files) +The following previously removed files are restored and present: +- `puzzle_dataset.py` +- `pretrain.py` +- `evaluate.py` +- `dataset/build_arc_dataset.py` +- `dataset/build_maze_dataset.py` +- `models/hrm/hrm_act_v1.py` +- `models/sparse_embedding.py` +- `config/cfg_pretrain.yaml` +- `config/arch/hrm_v1.yaml` +- `puzzle_visualizer.html` +- `arc_eval.ipynb` + +## 3) Integration and conflict checks rerun +- Merge conflict marker scan +- Python compile sanity +- Integration smoke (`check_integrations.py`) +- World-model eval smoke +- Perception eval smoke + +## 4) Address map status +- `CODE_ADDRESS_INDEX.md` is synchronized to current repository layout and line-level anchors. + +## 5) Mistake rectification summary +- Reversal of accidental over-deletion has been completed. +- Current branch preserves both VEM integration work and restored legacy components. 
diff --git a/check_integrations.py b/check_integrations.py new file mode 100644 index 00000000..9d8363f7 --- /dev/null +++ b/check_integrations.py @@ -0,0 +1,56 @@ +import torch +import yaml + +from models.vjepa.vjepa_model import VisualExecutionModel +from models.vjepa.planning import MCTS + + +def load_config(path: str): + with open(path, "r", encoding="utf-8") as f: + return yaml.safe_load(f) + + +def main(): + cfg = load_config("config/vjepa_micro.yaml") + model = VisualExecutionModel( + encoder_config=cfg["encoder"], + predictor_config=cfg["predictor"], + ema_momentum=cfg["training"].get("ema_momentum", 0.996), + action_dim=cfg.get("action_dim", 128), + ) + model.eval() + + bsz = 1 + t = cfg["encoder"].get("max_t", 8) + h = cfg["encoder"].get("img_size", 64) + w = cfg["encoder"].get("img_size", 64) + video = torch.randn(bsz, t, 3, h, w) + + pt, ph, pw = cfg["encoder"]["patch_size"] + seq_len = (t // pt) * (h // ph) * (w // pw) + num_mask = max(1, seq_len // 4) + mask = torch.randperm(seq_len)[:num_mask] + + batch = { + "video": video, + "mask": mask, + "delta_t": torch.ones(bsz, 1), + "action": torch.randn(bsz, cfg.get("action_dim", 128)), + } + + out = model(batch) + assert "predicted" in out and "target" in out and "value" in out + + mcts = MCTS(model=model, n_simulations=4) + root_state = out["all_context"].mean(dim=1) + actions = torch.randn(8, cfg.get("action_dim", 128)) + chosen = mcts.plan(root_state, actions) + if isinstance(chosen, tuple): + chosen = chosen[0] + assert chosen.shape[-1] == cfg.get("action_dim", 128) + + print("Integration check passed: model forward + MCTS planning wired correctly.") + + +if __name__ == "__main__": + main() diff --git a/docs/FRONTIER_GAP_ANALYSIS.md b/docs/FRONTIER_GAP_ANALYSIS.md new file mode 100644 index 00000000..c3626aeb --- /dev/null +++ b/docs/FRONTIER_GAP_ANALYSIS.md @@ -0,0 +1,35 @@ +# Frontier Capability Gap Analysis and Implementation Plan + +## Scope +This document compares: +1. 
Existing repository capabilities. +2. State-of-the-art (frontier labs + top academic trends) capability expectations. +3. Immediate implementation decisions. + +## Comparison Matrix + +| Area | Existing in repo | Frontier expectation | Gap | Action | +|---|---|---|---|---| +| Latent world modeling | V-JEPA-style masked latent prediction and EMA target path | Long-horizon stable latent rollouts with robust eval | Partial | Add dedicated world-model evaluation harness (next step) | +| Continuous-time dynamics | Hamiltonian/ODE/symplectic modules present | Quantitative invariance + long-horizon stability metrics | Missing benchmarks | Add metrics + ablations (next step) | +| Latent planning (MCTS) | MCTS scaffold existed with placeholder action priors | Policy-informed planning priors and uncertainty-aware scoring | Prior quality gap | Implemented policy-query action priors in MCTS | +| Uncertainty | Uncertainty module present | Planning/calibration integration | Partial | Integrate uncertainty into planner scoring (planned) | +| Multimodal grounding | Audio/tactile hooks + cross-modal attention | Curriculum and modality-drop robustness metrics | Partial | Add modality-drop ablations (planned) | +| Reproducible workflow | Configurable training entrypoint | Evaluation protocol + run manifests + acceptance gates | Partial | Add benchmark specs and run manifests (planned) | + +## Implemented in this change + +### 1) MCTS action prior upgrade +Previously, planning used a placeholder prior logits tensor. We replaced that with a learned action-prior mechanism: +- A new `policy_query_head` in `VJEPA` maps latent state to an action-space query vector. +- MCTS computes action priors from dot-product similarity between candidate actions and the learned query. +- This upgrades search from uniform/placeholder priors to model-informed priors. + +## Why this is prioritized first +Planning quality depends heavily on action priors. 
Replacing placeholder priors is a high-leverage improvement that directly improves practical controllable rollout search. + +## Next technical steps (ordered) +1. Extend `evaluate_world_model.py` (baseline rollout-drift, trajectory-divergence, and action-prior metrics are implemented) with uncertainty-calibration metrics. +2. Wire uncertainty estimates into PUCT scoring (risk-aware planning). +3. Add ablation configs for dynamics engines and multimodal drop robustness. +4. Define acceptance thresholds for promotion of each advanced module. diff --git a/docs/HUMAN_VISION_EXECUTION_EVAL_SPEC.md b/docs/HUMAN_VISION_EXECUTION_EVAL_SPEC.md new file mode 100644 index 00000000..21141f88 --- /dev/null +++ b/docs/HUMAN_VISION_EXECUTION_EVAL_SPEC.md @@ -0,0 +1,48 @@ +# Human-Vision + Execution Evaluation Spec (Initial) + +This document translates project purpose into executable evaluation tracks. + +## Purpose Alignment +Target capabilities: +- color and illumination robustness +- depth/geometry continuity +- shadow/reflectance stability +- action-conditioned future consistency +- long-horizon cognitive execution + +## Track A: Perception Robustness (implemented baseline) +Command: +```bash +python evaluate_perception.py --config config/vjepa_micro.yaml --seed 42 +``` + +Outputs: +- `color_jitter_latent_l2` +- `brightness_shadow_latent_l2` +- `gaussian_noise_latent_l2` +- `spatial_shift_latent_l2` + +Lower is better (more invariant latent representations). + +## Track B: World-Model Dynamics (implemented baseline) +Command: +```bash +python evaluate_world_model.py --config config/vjepa_micro.yaml --seed 42 +``` + +Outputs: +- `rollout_drift_l2` +- `trajectory_divergence_l2` +- `max_action_prior` +- `action_prior_entropy` + +## Track C: Execution/Cognition (next) +- goal-conditioned planning success@k +- action counterfactual consistency +- uncertainty-aware risk-return tradeoff + +## Promotion Rule (Phase-1/2) +A change is promoted only if: +1. no regression in compile/smoke execution, +2.
no major degradation in Track A/B metrics for same seed/config, +3. rationale + ablation switch is documented. diff --git a/docs/RIGOROUS_DEVELOPMENT_PROTOCOL.md b/docs/RIGOROUS_DEVELOPMENT_PROTOCOL.md new file mode 100644 index 00000000..1c7c8fbe --- /dev/null +++ b/docs/RIGOROUS_DEVELOPMENT_PROTOCOL.md @@ -0,0 +1,33 @@ +# Rigorous Development Protocol (Phase-1) + +This protocol defines minimum rigor gates for model and planner changes. + +## Gate A — Sanity / Determinism +1. Python compile sanity for repository modules. +2. Seeded execution for evaluation scripts. +3. Basic tensor-shape and NaN safety in smoke runs. + +## Gate B — World-model Metrics +Run: + +```bash +python evaluate_world_model.py --config config/vjepa_micro.yaml --seed 42 +``` + +Required outputs: +- `rollout_drift_l2` +- `trajectory_divergence_l2` +- `max_action_prior` +- `action_prior_entropy` + +All metrics are persisted as a JSON run manifest in `eval_runs/`. + +## Gate C — Change Promotion +Any feature PR must include: +1. Before/after metric table (same config + seed). +2. Ablation switch (enable/disable path). +3. Short rationale for metric movement. + +## Notes +- This is a phase-1 lightweight protocol and will be extended with calibration + and long-horizon benchmark suites in subsequent iterations. diff --git a/evaluate_perception.py b/evaluate_perception.py new file mode 100644 index 00000000..01c382a5 --- /dev/null +++ b/evaluate_perception.py @@ -0,0 +1,90 @@ +""" +Phase-2 perception robustness evaluation for HRM + V-JEPA. + +Focus: +- color robustness +- brightness/shadow robustness +- noise robustness +- geometric perturbation robustness + +This uses latent consistency between original and perturbed clips as an +early proxy for perceptual invariance before task-specific benchmarks. 
+""" + +import argparse +import json +import os +from datetime import datetime, timezone +from typing import Dict + +import torch +import yaml + +from models.vjepa.vjepa_model import VJEPA + + +def apply_perturbation(video: torch.Tensor, mode: str) -> torch.Tensor: + if mode == "color_jitter": + scale = torch.tensor([1.1, 0.9, 1.05], device=video.device).view(1, 1, 3, 1, 1) + return (video * scale).clamp(-3.0, 3.0) + if mode == "brightness_shadow": + return (video * 0.7).clamp(-3.0, 3.0) + if mode == "gaussian_noise": + return video + 0.05 * torch.randn_like(video) + if mode == "spatial_shift": + return torch.roll(video, shifts=2, dims=-1) + raise ValueError(f"Unknown perturbation mode: {mode}") + + +def latent_consistency(model: VJEPA, video: torch.Tensor, perturbed: torch.Tensor) -> float: + with torch.no_grad(): + z_ref = model.context_encoder(video) + z_alt = model.context_encoder(perturbed) + return float((z_ref - z_alt).pow(2).mean().sqrt().item()) + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("--config", default="config/vjepa_micro.yaml") + parser.add_argument("--seed", type=int, default=42) + parser.add_argument("--save-dir", default="eval_runs") + args = parser.parse_args() + + torch.manual_seed(args.seed) + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + with open(args.config, "r", encoding="utf-8") as f: + cfg = yaml.safe_load(f) + + model = VJEPA( + cfg["encoder"], + cfg["predictor"], + cfg["training"]["ema_momentum"], + action_dim=128, + ).to(device).eval() + + # synthetic clip for deterministic smoke-evaluation + bs, t, c, h, w = 1, cfg["encoder"]["max_t"] * cfg["encoder"]["patch_size"][0], 3, cfg["encoder"]["img_size"], cfg["encoder"]["img_size"] + video = torch.randn(bs, t, c, h, w, device=device) + + metrics: Dict[str, float] = {} + for mode in ["color_jitter", "brightness_shadow", "gaussian_noise", "spatial_shift"]: + pert = apply_perturbation(video, mode) + metrics[f"{mode}_latent_l2"] 
= latent_consistency(model, video, pert) + + os.makedirs(args.save_dir, exist_ok=True) + out = { + "timestamp_utc": datetime.now(timezone.utc).isoformat(), + "config_path": args.config, + "seed": args.seed, + "device": str(device), + "metrics": metrics, + } + path = os.path.join(args.save_dir, f"perception_eval_seed_{args.seed}.json") + with open(path, "w", encoding="utf-8") as f: + json.dump(out, f, indent=2) + print(json.dumps(out, indent=2)) + + +if __name__ == "__main__": + main() + diff --git a/evaluate_world_model.py b/evaluate_world_model.py new file mode 100644 index 00000000..ac981b1d --- /dev/null +++ b/evaluate_world_model.py @@ -0,0 +1,166 @@ +""" +World-model evaluation harness for V-JEPA/HRM. + +Phase-1 rigorous evaluation focuses on: + 1) rollout drift (latent-state drift over repeated imagination) + 2) action consistency (whether action-conditioned futures are distinct) + 3) calibration-oriented proxy metrics (uncertainty magnitude + confidence proxy) + +This script is intentionally lightweight and self-contained so it can be +used early in development before full benchmark infrastructure is added. 
+""" + +import argparse +import json +import os +import random +from dataclasses import dataclass, asdict +from datetime import datetime, timezone +from typing import Dict, List + +import torch +import yaml + +from models.vjepa.vjepa_model import VJEPA + + +def set_seed(seed: int) -> None: + random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + +@dataclass +class EvalManifest: + timestamp_utc: str + commit: str + config_path: str + seed: int + device: str + batch_size: int + rollout_steps: int + num_actions: int + metrics: Dict[str, float] + + +def get_commit_hash(default: str = "unknown") -> str: + head = os.path.join(".git", "HEAD") + if not os.path.exists(head): + return default + try: + with open(head, "r", encoding="utf-8") as f: + ref = f.read().strip() + if ref.startswith("ref: "): + ref_path = os.path.join(".git", ref.split(" ", 1)[1]) + with open(ref_path, "r", encoding="utf-8") as f: + return f.read().strip()[:12] + return ref[:12] + except Exception: + return default + + +def latent_rollout( + model: VJEPA, + state: torch.Tensor, + actions: List[torch.Tensor], +) -> List[torch.Tensor]: + states = [state] + cur = state + for action in actions: + dt = torch.ones(cur.shape[0], device=cur.device) + cur = model.predictor.physics_engine(cur, dt, action=action) + states.append(cur) + return states + + +def evaluate_metrics(model: VJEPA, device: torch.device, rollout_steps: int, num_actions: int) -> Dict[str, float]: + model.eval() + with torch.no_grad(): + dim = model.value_head[0].in_features + bs = 1 + seq_len = 16 + z0 = torch.randn(bs, seq_len, dim, device=device) + + actions_a = [torch.randn(bs, seq_len, 128, device=device) for _ in range(rollout_steps)] + actions_b = [torch.randn(bs, seq_len, 128, device=device) for _ in range(rollout_steps)] + + traj_a = latent_rollout(model, z0, actions_a) + traj_b = latent_rollout(model, z0, actions_b) + + # 1) rollout drift: average step-to-step displacement in a rollout + step_drifts = [] 
+ for t in range(1, len(traj_a)): + step_drifts.append((traj_a[t] - traj_a[t - 1]).pow(2).mean().sqrt().item()) + rollout_drift = float(sum(step_drifts) / max(len(step_drifts), 1)) + + # 2) action consistency proxy: trajectories from distinct actions should diverge + trajectory_divergence = float((traj_a[-1] - traj_b[-1]).pow(2).mean().sqrt().item()) + + # 3) planner prior concentration (confidence proxy) + available_actions = torch.randn(num_actions, 128, device=device) + pooled = traj_a[-1].mean(dim=1) + query = model.policy_query_head(pooled).squeeze(0) + logits = torch.matmul(available_actions, query) + probs = torch.softmax(logits, dim=0) + max_prior = float(probs.max().item()) + prior_entropy = float(-(probs * (probs + 1e-9).log()).sum().item()) + + return { + "rollout_drift_l2": rollout_drift, + "trajectory_divergence_l2": trajectory_divergence, + "max_action_prior": max_prior, + "action_prior_entropy": prior_entropy, + } + + +def main() -> None: + parser = argparse.ArgumentParser(description="Evaluate V-JEPA/HRM world-model metrics") + parser.add_argument("--config", default="config/vjepa_micro.yaml") + parser.add_argument("--seed", type=int, default=42) + parser.add_argument("--rollout-steps", type=int, default=8) + parser.add_argument("--num-actions", type=int, default=32) + parser.add_argument("--save-dir", default="eval_runs") + args = parser.parse_args() + + set_seed(args.seed) + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + with open(args.config, "r", encoding="utf-8") as f: + config = yaml.safe_load(f) + + model = VJEPA( + config["encoder"], + config["predictor"], + config["training"]["ema_momentum"], + action_dim=128, + ).to(device) + + metrics = evaluate_metrics( + model=model, + device=device, + rollout_steps=args.rollout_steps, + num_actions=args.num_actions, + ) + + os.makedirs(args.save_dir, exist_ok=True) + manifest = EvalManifest( + timestamp_utc=datetime.now(timezone.utc).isoformat(), + commit=get_commit_hash(), + 
config_path=args.config, + seed=args.seed, + device=str(device), + batch_size=1, + rollout_steps=args.rollout_steps, + num_actions=args.num_actions, + metrics=metrics, + ) + + out_path = os.path.join(args.save_dir, f"world_model_eval_{manifest.commit}_{args.seed}.json") + with open(out_path, "w", encoding="utf-8") as f: + json.dump(asdict(manifest), f, indent=2) + + print("Evaluation complete.") + print(json.dumps(asdict(manifest), indent=2)) + + +if __name__ == "__main__": + main() diff --git a/models/adaptive_depth.py b/models/adaptive_depth.py index caddaf7c..600d43c6 100644 --- a/models/adaptive_depth.py +++ b/models/adaptive_depth.py @@ -154,8 +154,6 @@ def forward( Runs the model iteratively, checking confidence at each step. Halts early for confident samples, continues for uncertain ones. """ - from models.hrm.hrm_act_v1 import HierarchicalReasoningModel_ACTV1Carry - # Initialize new_inner_carry = self.model.inner.reset_carry(carry.halted, carry.inner_carry) new_steps = torch.where(carry.halted, 0, carry.steps) @@ -192,8 +190,6 @@ def forward( outputs["depth_info"] = depth_info return ( - HierarchicalReasoningModel_ACTV1Carry( - new_inner_carry, new_steps, halted, new_current_data - ), + type(carry)(new_inner_carry, new_steps, halted, new_current_data), outputs, ) diff --git a/models/topological.py b/models/topological.py index 22b28b00..eea473bb 100644 --- a/models/topological.py +++ b/models/topological.py @@ -120,17 +120,17 @@ def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Te components = (degree / (degree.sum(dim=-1, keepdim=True) + 1e-6)).sum(dim=-1) component_counts.append(components) - component_counts = torch.stack(component_counts, dim=-1) # (bs, n, num_steps) + component_counts = torch.stack(component_counts, dim=-1) # (bs, num_steps) # Betti-0: max components across filtration (persistent) - betti_0 = component_counts.max(dim=-1).values.mean(dim=-1, keepdim=True) + betti_0 = component_counts.max(dim=-1, 
keepdim=True).values # Betti-1: loops appear when components merge but don't fill in # Approximate: count "births" of 1-cycles # A 1-cycle is born when two previously separate components connect # but the enclosed region is not yet filled - diffs = component_counts[:, :, 1:] - component_counts[:, :, :-1] - loop_evidence = (diffs < 0).float().sum(dim=-1).mean(dim=-1, keepdim=True) + diffs = component_counts[:, 1:] - component_counts[:, :-1] + loop_evidence = (diffs < 0).float().sum(dim=-1, keepdim=True) betti_1 = torch.sigmoid(loop_evidence) # Topological feature vector: combines local and global topology diff --git a/models/vjepa/planning.py b/models/vjepa/planning.py index 50ab55f0..9b2f811c 100644 --- a/models/vjepa/planning.py +++ b/models/vjepa/planning.py @@ -172,7 +172,7 @@ def _imagine_future( Returns: next_state: (1, D) predicted next state. value: scalar value estimate of the next state. - policy_logits: (1, num_actions) action prior logits. + policy_query: (1, action_dim) action-prior query embedding. """ # Use the physics engine for dynamics prediction delta_t = torch.ones(state.shape[0], device=state.device) @@ -183,11 +183,11 @@ def _imagine_future( next_state.mean(dim=1) if next_state.ndim > 2 else next_state ).item() - # Estimate action priors (simple: use cosine similarity with available actions) - # In a full implementation, this would come from a policy network - policy_logits = torch.zeros(1, device=state.device) # placeholder + # Estimate action priors from a learned policy-query head. 
+ pooled_next_state = next_state.mean(dim=1) if next_state.ndim > 2 else next_state + policy_query = self.model.policy_query_head(pooled_next_state) - return next_state, value, policy_logits + return next_state, value, policy_query def _select(self, node: MCTSNode) -> MCTSNode: """ @@ -223,7 +223,7 @@ def _expand( self, node: MCTSNode, available_actions: torch.Tensor, - policy_logits: Optional[torch.Tensor] = None, + policy_query: Optional[torch.Tensor] = None, ) -> float: """ Expand a node by creating children for available actions. @@ -234,7 +234,7 @@ def _expand( Args: node: the node to expand. available_actions: (num_actions, action_dim) action set. - policy_logits: (num_actions,) optional prior logits. + policy_query: (1, action_dim) optional action-prior query vector. Returns: Value estimate of the expanded node. @@ -248,9 +248,18 @@ def _expand( max_children = max(1, int(self.pw_c * (node.visits + 1) ** self.pw_alpha)) max_children = min(max_children, num_actions) - # Compute priors from policy logits - if policy_logits is not None and policy_logits.numel() > 0: - priors = F.softmax(policy_logits[:num_actions] / self.temperature, dim=0) + # Compute priors from policy query vector. 
+ if policy_query is None and hasattr(self.model, "policy_query_head"): + with torch.no_grad(): + pooled_state = node.state.mean(dim=1) if node.state.ndim > 2 else node.state + policy_query = self.model.policy_query_head(pooled_state) + + if policy_query is not None and policy_query.numel() > 0: + # Similarity(action_i, query) -> prior logit + # available_actions: (num_actions, action_dim) + # policy_query: (1, action_dim) + logits = torch.matmul(available_actions, policy_query.squeeze(0)) + priors = F.softmax(logits / self.temperature, dim=0) else: # Uniform prior if no policy network priors = torch.ones(num_actions, device=available_actions.device) / num_actions diff --git a/models/vjepa/vjepa_model.py b/models/vjepa/vjepa_model.py index f11a6174..1ca0e2fd 100644 --- a/models/vjepa/vjepa_model.py +++ b/models/vjepa/vjepa_model.py @@ -11,7 +11,7 @@ class VJEPA(nn.Module): """ - Unified V-JEPA Model with HRM-ODE Predictor and Holographic Memory. + Visual Execution Model (VEM): unified single-framework model. Designed for 10B parameter physical world modeling. Enhancements over base: @@ -61,6 +61,15 @@ def __init__(self, nn.Linear(predictor_config["hidden_size"], 1) ) + # 5b. Policy query head for action-prior scoring in latent MCTS. + # Produces an action-space query vector that can be matched against + # candidate action vectors via dot-product similarity. + self.policy_query_head = nn.Sequential( + nn.Linear(predictor_config["hidden_size"], predictor_config["hidden_size"]), + nn.SiLU(), + nn.Linear(predictor_config["hidden_size"], action_dim) + ) + # 6. 
Adaptive depth controller for test-time compute scaling self.depth_controller = AdaptiveDepthController( max_depth=predictor_config.get("halt_max_steps", 8), @@ -125,3 +134,8 @@ def forward(self, batch: Dict[str, torch.Tensor]): "all_context": all_latents, "value": value } + + +class VisualExecutionModel(VJEPA): + """Backward-compatible alias for the unified Visual Execution Model name.""" + pass diff --git a/vjepa_train.py b/vjepa_train.py index e90cd0c7..927b87b5 100644 --- a/vjepa_train.py +++ b/vjepa_train.py @@ -1,4 +1,6 @@ import os +import argparse +import shutil import yaml import torch from torch import nn @@ -148,10 +150,14 @@ def train(config_path="config/vjepa_micro.yaml"): os.makedirs(video_dir) print(f"Created directory {video_dir}. Please add videos here.") import subprocess - subprocess.run([ - 'ffmpeg', '-f', 'lavfi', '-i', 'testsrc=duration=5:size=224x224:rate=15', - os.path.join(video_dir, 'test_video.mp4'), '-y' - ], capture_output=True) + ffmpeg_bin = shutil.which("ffmpeg") + if ffmpeg_bin is None: + print("ffmpeg not found; skipping synthetic video generation. Add videos manually to data/.") + else: + subprocess.run([ + ffmpeg_bin, '-f', 'lavfi', '-i', 'testsrc=duration=5:size=224x224:rate=15', + os.path.join(video_dir, 'test_video.mp4'), '-y' + ], capture_output=True, check=False) video_paths = [os.path.join(video_dir, f) for f in os.listdir(video_dir) if f.endswith(('.mp4', '.avi', '.mov'))] if not video_paths: @@ -169,7 +175,8 @@ def train(config_path="config/vjepa_micro.yaml"): # 5. 
Training Loop model.train() - for epoch in range(100): + epochs = int(config.get("training", {}).get("epochs", 100)) + for epoch in range(epochs): for i, batch in enumerate(dataloader): # Move batch to device batch = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in batch.items()} @@ -204,4 +211,11 @@ def train(config_path="config/vjepa_micro.yaml"): # wandb.log({"loss": loss.item()}) if __name__ == "__main__": - train() + parser = argparse.ArgumentParser(description="Train V-JEPA/HRM model") + parser.add_argument( + "--config", + default="config/vjepa_micro.yaml", + help="Path to YAML config file (e.g., config/vjepa_micro.yaml or config/vjepa_10b.yaml)", + ) + args = parser.parse_args() + train(args.config)