diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..3a2413a --- /dev/null +++ b/.gitignore @@ -0,0 +1,17 @@ +# Python cache/artifacts +__pycache__/ +*.pyc + +# Local datasets and weights +AudioVisualText/AVE_data/ +AudioVisualText/pre-trained/ +AudioVisualText/google-bert-base-uncased/ + +# Local outputs/logs +AudioVisualText/results/ +AudioVisualText/slurm-*.out +AudioVisualText/slurm-*.err + +# Local helper scripts +AudioVisualText/run_infer_ave.sbatch +AudioVisualText/run_ft_ave.sbatch diff --git a/AudioVisualText/README_CHANGES.md b/AudioVisualText/README_CHANGES.md new file mode 100644 index 0000000..831caee --- /dev/null +++ b/AudioVisualText/README_CHANGES.md @@ -0,0 +1,91 @@ +# AudioVisualText Local Changes (Run/Eval/Analysis) + +This document summarizes the practical changes made to run AVE finetuning locally, evaluate results, and add gradient-sensitivity analysis. + +## 1) Runtime/Path Updates + +- Updated local checkpoint paths in finetune and inference scripts: + - `AudioVisualText/scripts/finetune/ft_ave.sh` + - `AudioVisualText/scripts/finetune/infer_ave.sh` +- Added dynamic GPU process detection: + - `NPROC_PER_NODE=${NPROC_PER_NODE:-$(nvidia-smi -L 2>/dev/null | wc -l)}` + - Fallback to `1` when detection returns `0`. +- Set AVE scripts to use local weights under: + - `/nethome/rkhan96/flash/weights/...` + +## 2) Precision/Compatibility Fixes + +- Training and inference were aligned to FP32 (`bf16=False`) to avoid dtype mismatch issues observed with BF16 on this setup. +- `AudioVisualText/deepspeed/stage2-offload.json` + - `bf16.enabled` set to `false`. + +## 3) Gradient Sensitivity Instrumentation + +### Config Flags + +- Added training flags in: + - `AudioVisualText/configs/unified_config.py` +- New fields: + - `grad_sensitivity_enable` + - `grad_sensitivity_include_projectors` + +### Trainer Logging + +- Extended `UnifiedTrainer` in: + - `AudioVisualText/trainer.py` +- Added per-step logging for: + - `lora_A_text`, `lora_A_visual`, `lora_A_audio`, `lora_B_shared` + - optional `vl_projector`, `al_projector` +- Logged metrics: + - `*_grad_norm` + - `*_param_norm` + - `*_relative_grad_norm` + - `*_num_params` +- Output file: + - `/grad_sensitivity.jsonl` + +### DeepSpeed/ZeRO Reliability Fix + +- Initial implementation used `param.grad` and produced near-zero gradients in logs. +- Updated implementation to use parameter backward hooks to accumulate grad norms reliably under DeepSpeed. + +## 4) Script Controls for Clean Runs + +- Updated `AudioVisualText/scripts/finetune/ft_ave.sh` run naming: + - If `RUN_NAME` is set, use it. + - Else if `GRAD_SENS_RUN=1`, use `llama_ave_gradsens`. + - Else use `llama_ave`. +- This avoids accidental resume collisions with existing checkpoint directories. 
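+
+As a concrete illustration, the selection logic described above can be sketched in shell as follows (the variable name `EXP_NAME` is illustrative; the actual names used inside `ft_ave.sh` may differ):
+
+```bash
+# Hypothetical sketch of the run-name selection in ft_ave.sh
+if [ -n "${RUN_NAME:-}" ]; then
+  EXP_NAME="$RUN_NAME"            # explicit override wins
+elif [ "${GRAD_SENS_RUN:-0}" = "1" ]; then
+  EXP_NAME="llama_ave_gradsens"   # dedicated name for gradient-sensitivity runs
+else
+  EXP_NAME="llama_ave"            # default finetuning run
+fi
+```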
+ +## 5) Evaluation Summary (AVE) + +- Finetune result (3 epochs) was reproduced near paper-level: + - AVE accuracy: **77.24%** + - Reported reference: **77.06%** +- Parse-valid samples differed due to format strictness in evaluator: + - Local run: `394/402` + - Reference: `397/402` + +## 6) Gradient Analysis Artifacts + +### Analysis Script + +- Added: + - `AudioVisualText/scripts/analysis/plot_grad_sensitivity.py` +- Script outputs: + - `grad_sensitivity_long.csv` + - `grad_sensitivity_summary.csv` + - PNG plots (if `matplotlib` is installed): + - `lora_grad.png` + - `lora_rel.png` + - `projector_rel.png` + +### Current Analysis Output Location + +- `AudioVisualText/results/finetune/llama_ave_gradsens_v2/analysis/` + +## 7) Notes on Job Interruptions + +- One long run was preempted by scheduler, but partial gradient logs were captured. +- The partial `grad_sensitivity.jsonl` still confirms non-zero gradient signals after hook-based fix. + diff --git a/AudioVisualText/configs/unified_config.py b/AudioVisualText/configs/unified_config.py index 5d765b9..9849b40 100644 --- a/AudioVisualText/configs/unified_config.py +++ b/AudioVisualText/configs/unified_config.py @@ -97,6 +97,18 @@ class TrainingArguments(transformers.TrainingArguments): ## my reserved_modality: str = field(default=None) loramethod: str = field(default=None) + cross_attn_kv_mode: str = field( + default="question", + metadata={"help": "Cross-attn KV source for LoRA branches: question or full_text."}, + ) + cross_modal_mode: str = field( + default="trilinear", + metadata={"help": "Cross-modal fusion mode for LoRA: pairwise or trilinear."}, + ) + trilinear_pack_tokens: bool = field( + default=False, + metadata={"help": "If True, compact active tokens before Triton trilinear attention."}, + ) blc_alpha: float = field(default=0.5) blc_weight: float = field(default=0.5) @@ -106,3 +118,9 @@ class TrainingArguments(transformers.TrainingArguments): save_modules: str = field(default='vl_projector,al_projector,lora') exp_desc: str = field(default='exp') + + # Gradient sensitivity analysis toggles. + # When enabled, UnifiedTrainer logs per-modality gradient statistics for + # LoRA branches (text/visual/audio A, shared B) and optional projectors. 
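+    # A hypothetical way to toggle these from the launch command, assuming the
+    # usual HfArgumentParser-style parsing of TrainingArguments fields, e.g.:
+    #   --grad_sensitivity_enable True --grad_sensitivity_include_projectors False
+    # (exact flag handling depends on the finetune script's argument parsing).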
+ grad_sensitivity_enable: bool = field(default=False) + grad_sensitivity_include_projectors: bool = field(default=True) diff --git a/AudioVisualText/dataset/unified_dataset.py b/AudioVisualText/dataset/unified_dataset.py index 0d40aae..5e0f653 100644 --- a/AudioVisualText/dataset/unified_dataset.py +++ b/AudioVisualText/dataset/unified_dataset.py @@ -54,7 +54,6 @@ def __init__( print(f'tot training sample nums: {self.tot}') - def add_avqa_task_samples(self): avqa_annotation_path = 'MUSIC_AVQA_data/train_samples_with_reasoning_avqa.json' tot = 0 diff --git a/AudioVisualText/deepspeed/stage2-offload-torch25.json b/AudioVisualText/deepspeed/stage2-offload-torch25.json new file mode 100644 index 0000000..0aaac8e --- /dev/null +++ b/AudioVisualText/deepspeed/stage2-offload-torch25.json @@ -0,0 +1,51 @@ +{ + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "betas": "auto", + "eps": "auto", + "weight_decay": "auto", + "torch_adam": true + } + }, + "scheduler": { + "type": "WarmupDecayLR", + "params": { + "total_num_steps": "auto", + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto" + } + }, + "bf16": { + "enabled": "auto" + }, + "fp16": { + "enabled": false, + "loss_scale": 0, + "initial_scale_power": 16, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "zero_optimization": { + "stage": 2, + "allgather_partitions": true, + "allgather_bucket_size": 100000000.0, + "reduce_scatter": true, + "reduce_bucket_size": 100000000.0, + "overlap_comm": true, + "contiguous_gradients": true, + "offload_optimizer": { + "device": "cpu" + }, + "round_robin_gradients": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false +} \ No newline at end of file diff --git a/AudioVisualText/deepspeed/stage2-offload.json b/AudioVisualText/deepspeed/stage2-offload.json index 85ec8e1..aba1816 100644 --- a/AudioVisualText/deepspeed/stage2-offload.json +++ b/AudioVisualText/deepspeed/stage2-offload.json @@ -19,7 +19,7 @@ } }, "bf16": { - "enabled": "auto", + "enabled": true, "loss_scale": 0, "initial_scale_power": 16, "loss_scale_window": 1000, diff --git a/AudioVisualText/docs/moka.pdf b/AudioVisualText/docs/moka.pdf new file mode 100644 index 0000000..8feed6d Binary files /dev/null and b/AudioVisualText/docs/moka.pdf differ diff --git a/AudioVisualText/docs/moka_math_from_code.pdf b/AudioVisualText/docs/moka_math_from_code.pdf new file mode 100644 index 0000000..6629a98 Binary files /dev/null and b/AudioVisualText/docs/moka_math_from_code.pdf differ diff --git a/AudioVisualText/docs/moka_math_from_code.tex b/AudioVisualText/docs/moka_math_from_code.tex new file mode 100644 index 0000000..8645f85 --- /dev/null +++ b/AudioVisualText/docs/moka_math_from_code.tex @@ -0,0 +1,281 @@ +\documentclass[11pt]{article} +\usepackage[margin=1in]{geometry} +\usepackage{amsmath,amssymb,mathtools,bm} +\usepackage{booktabs} +\usepackage{hyperref} + +\title{MokA: Forward/Backward Math From Code} +\author{Derived directly from repository implementation} +\date{\today} + +\begin{document} +\maketitle + +\section{Scope and Grounding} +This note is derived from these files: +\begin{itemize} +\item \texttt{models/unified\_arch.py}: multimodal embedding and modality masks. +\item \texttt{models/modeling\_llama.py}: masked LoRA projections used by attention/MLP and LM loss. 
+\item \texttt{peft\_hyper/tuners/lora.py}: custom \texttt{Linear} with \texttt{lora\_A0/A1/A2} and shared \texttt{lora\_B0}. +\item \texttt{scripts/finetune/finetune.py}: which modules are wrapped and which parameters are trainable. +\item \texttt{trainer.py}: gradient-sensitivity metric definitions. +\end{itemize} + +\section{Notation and Shapes} +Per adapted linear layer: +\begin{itemize} +\item Input: $X \in \mathbb{R}^{B\times T\times d_{\text{in}}}$. +\item Base weight: $W_0 \in \mathbb{R}^{d_{\text{out}}\times d_{\text{in}}}$, bias $b$. +\item LoRA branch matrices: +\[ +A_0\in\mathbb{R}^{r\times d_{\text{in}}},\quad +A_1\in\mathbb{R}^{r\times d_{\text{in}}},\quad +A_2\in\mathbb{R}^{r\times d_{\text{in}}},\quad +B_0\in\mathbb{R}^{d_{\text{out}}\times r}. +\] +\item Scale $s=\alpha/r$ (code uses \texttt{self.scaling[0]} for all branches). +\item Binary masks from input construction: +\[ +M_t,M_v,M_a,M_q\in\{0,1\}^{B\times T\times 1}. +\] +\end{itemize} + +\paragraph{Important implementation constraint.} +In code, \texttt{r} is parsed digit-wise (\texttt{str(r)}), and branch outputs are later summed. Therefore branch ranks must match in practice (or the sum would fail). + +\section{Multimodal Input Construction} +The model builds one embedding sequence and four aligned masks: +\[ +E\in\mathbb{R}^{B\times T\times d_{\text{model}}},\quad +M_t,M_v,M_a,M_q\in\{0,1\}^{B\times T\times 1}. +\] +Text tokens map through token embedding; image/video/audio special tokens are replaced by projector features; labels for non-text inserted modal features are set to $-100$ (ignored in CE). Then sequences are left-padded to common $T$. + +\section{Forward: One Adapted Linear Layer} +PyTorch linear convention is +\[ +Y_{\text{base}} = XW_0^\top + b. +\] + +Define masked inputs: +\[ +X_t=X\odot M_t,\quad X_v=X\odot M_v,\quad X_a=X\odot M_a. +\] +With dropout operator $D(\cdot)$: +\[ +U_i = s\,A_i(D(X_i)),\quad i\in\{0,1,2\}, +\] +where $A_i(\cdot)$ means linear map with weight $A_i$. + +\subsection{Cross-modal branch coupling} +For each sample $b$, video branch: +\[ +Q_v=U_1^{(b)},\quad K_q=V_q=U_0^{(b)}\odot M_q^{(b)}, +\] +\[ +S_v=\operatorname{softmax}\!\left(\frac{Q_vK_q^\top}{\sqrt{r}}\right),\quad +O_v=S_vV_q, +\] +\[ +\widetilde U_1^{(b)} = U_1^{(b)} + \beta\,(M_v^{(b)}\odot O_v), +\] +with $\beta=\texttt{blc\_weight}$. + +Similarly for audio: +\[ +\widetilde U_2^{(b)} = U_2^{(b)} + \beta\,(M_a^{(b)}\odot O_a), +\] +where $O_a$ uses query $U_2^{(b)}$ and key/value from question slice of $U_0^{(b)}$. + +Then +\[ +U_{\Sigma}=U_0+\widetilde U_1+\widetilde U_2,\qquad +Y_{\text{lora}} = B_0(U_{\Sigma}), +\] +and final layer output: +\[ +Y = Y_{\text{base}} + Y_{\text{lora}}. +\] + +\paragraph{What does ``$B$ multiply $A$'' mean?} +Ignoring masking/cross-attn and using one branch: +\[ +Y_{\text{lora},i}=B_0\big(sA_i(X_i)\big) += X_i(s\,B_0A_i)^\top. +\] +So the effective low-rank update to base weight is +\[ +\Delta W_i = s\,B_0A_i,\quad +W_{\text{eff}} = W_0 + \sum_i \Delta W_i. +\] + +\section{Where It Is Used} +The wrapped linear layers are injected into +\[ +\{q\_proj,k\_proj,v\_proj,o\_proj,gate\_proj,up\_proj,down\_proj\} +\] +across decoder layers. In attention and MLP forward passes, modality masks are passed to these projections. + +\section{Language Modeling Loss} +Given logits $Z\in\mathbb{R}^{B\times T\times V}$ and labels $y$: +\[ +Z_{\text{shift}}=Z_{:,1:T-1,:},\quad +y_{\text{shift}}=y_{:,2:T}. 
+\] +Loss is cross-entropy over flattened tokens with ignore index $-100$: +\[ +\mathcal L = -\frac{1}{N_{\text{valid}}}\sum_{n\in\mathcal I} +\log\operatorname{softmax}(Z_n)_{y_n}. +\] + +\section{Backward Propagation} +\subsection{From CE to logits} +For each valid token row $n$: +\[ +\frac{\partial \mathcal L}{\partial Z_n} += +\frac{1}{N_{\text{valid}}}\left(\operatorname{softmax}(Z_n)-e_{y_n}\right), +\] +and $0$ for ignored rows. + +\subsection{Adapter local gradients} +Let $G=\partial\mathcal L/\partial Y_{\text{lora}}$. +For $Y_{\text{lora}}=B_0(U_{\Sigma})$: +\[ +\frac{\partial\mathcal L}{\partial B_0} += +\sum_{b,t} G_{b,t}^{\top}\,U_{\Sigma,b,t}, +\qquad +\frac{\partial\mathcal L}{\partial U_{\Sigma}} += +G\,B_0. +\] + +Define split upstreams: +\[ +G_0=\frac{\partial\mathcal L}{\partial U_0},\; +G_1=\frac{\partial\mathcal L}{\partial U_1},\; +G_2=\frac{\partial\mathcal L}{\partial U_2}, +\] +where $G_1,G_2$ include direct residual path and query-through-attention path. + +For branch $i$ with $U_i=s\,A_i(D(X_i))$: +\[ +\frac{\partial\mathcal L}{\partial A_i} += +s\sum_{b,t} G_{i,b,t}^{\top}\,D(X_{i,b,t}). +\] +This gives explicitly: +\[ +\boxed{ +\frac{\partial\mathcal L}{\partial A_1} += +s\sum_{b,t} G_{1,b,t}^{\top}\,D(X_{v,b,t}) +} +\] +\[ +\boxed{ +\frac{\partial\mathcal L}{\partial A_2} += +s\sum_{b,t} G_{2,b,t}^{\top}\,D(X_{a,b,t}) +} +\] +with +\[ +X_v=X\odot M_v,\quad X_a=X\odot M_a. +\] + +\paragraph{What is inside $G_1,G_2$?} +For video (analogous audio): +\[ +\widetilde U_1 = U_1 + \beta(M_v\odot O_v),\quad O_v=\text{Attn}(U_1,U_0\odot M_q,U_0\odot M_q). +\] +Hence +\[ +\frac{\partial\mathcal L}{\partial U_1} += +\frac{\partial\mathcal L}{\partial \widetilde U_1} +\left[ +I ++ +\beta\,\frac{\partial (M_v\odot O_v)}{\partial U_1} +\right]. +\] +So $A_1$ receives gradient from both: +\begin{itemize} +\item direct residual term ($I$), +\item attention-query Jacobian term. +\end{itemize} + +\subsection{How many gradients are there?} +Per adapted linear module, there are parameter-gradient tensors: +\[ +\frac{\partial\mathcal L}{\partial A_0},\; +\frac{\partial\mathcal L}{\partial A_1},\; +\frac{\partial\mathcal L}{\partial A_2},\; +\frac{\partial\mathcal L}{\partial B_0} +\] +(plus any trainable non-LoRA parameters you kept in \texttt{save\_modules}). + +If $N_{\text{adapt}}$ modules are wrapped, LoRA matrix-gradient tensor count is: +\[ +4N_{\text{adapt}}. +\] +With current target list (7 projections per decoder layer), for $L$ layers: +\[ +N_{\text{adapt}} = 7L,\quad \text{LoRA gradient tensors}=28L. +\] + +\section{What Gets Updated} +\begin{itemize} +\item Base wrapped weight is frozen in custom \texttt{Linear} (\texttt{weight.requires\_grad=False}). +\item \texttt{mark\_only\_lora\_as\_trainable} first disables non-LoRA params. +\item Then finetune script applies final filter: a parameter is trainable iff its name contains one of \texttt{save\_modules}. +\end{itemize} +So the \emph{actual} update set is: +\[ +\Theta_{\text{update}}=\{\theta:\texttt{requires\_grad}(\theta)=\texttt{True after final filter}\}. +\] + +\section{Optimizer Update (conceptual)} +For each $\theta\in\Theta_{\text{update}}$, AdamW-style step: +\[ +m_t=\beta_1m_{t-1}+(1-\beta_1)g_t,\quad +v_t=\beta_2v_{t-1}+(1-\beta_2)g_t^2, +\] +\[ +\hat m_t=\frac{m_t}{1-\beta_1^t},\quad +\hat v_t=\frac{v_t}{1-\beta_2^t}, +\] +\[ +\theta_{t+1}=\theta_t-\eta\left(\frac{\hat m_t}{\sqrt{\hat v_t}+\epsilon}+\lambda\theta_t\right). 
+\] + +\section{Gradient Sensitivity Metrics in Trainer} +For each group $g\in\{\text{A0,A1,A2,B0,projectors}\}$: +\[ +G_g^2 = \sum_{\theta\in g}\left\|\frac{\partial\mathcal L}{\partial\theta}\right\|_F^2,\quad +P_g^2 = \sum_{\theta\in g}\|\theta\|_F^2, +\] +\[ +\text{grad\_norm}_g=\sqrt{G_g^2},\quad +\text{param\_norm}_g=\sqrt{P_g^2},\quad +\text{relative\_grad\_norm}_g=\frac{\text{grad\_norm}_g}{\text{param\_norm}_g+\varepsilon}. +\] +These are exactly what hooks/logging in \texttt{trainer.py} compute. + +\section{Code-to-Math Trace Checklist} +For any adapted layer in your run: +\begin{enumerate} +\item Build $X,M_t,M_v,M_a,M_q$ from packed multimodal input. +\item Compute base output $XW_0^\top+b$. +\item Compute three masked low-rank branch outputs $U_0,U_1,U_2$. +\item Fuse question-conditioned information into video/audio branches. +\item Sum branches, project by shared $B_0$, add to base. +\item Continue through attention/MLP/stack, then LM head and shifted CE. +\item Backprop: CE $\rightarrow$ logits $\rightarrow$ decoder $\rightarrow$ each adapted linear: + obtain $\partial\mathcal L/\partial A_0,\partial\mathcal L/\partial A_1,\partial\mathcal L/\partial A_2,\partial\mathcal L/\partial B_0$. +\item Optimizer updates only parameters still marked trainable. +\end{enumerate} + +\end{document} diff --git a/AudioVisualText/models/modeling_llama.py b/AudioVisualText/models/modeling_llama.py index 2004e81..155255b 100644 --- a/AudioVisualText/models/modeling_llama.py +++ b/AudioVisualText/models/modeling_llama.py @@ -860,6 +860,8 @@ def forward( ) hidden_states = outputs[0] + # Cast to lm_head dtype to avoid float vs bf16 mismatch at inference + hidden_states = hidden_states.to(self.lm_head.weight.dtype) if self.config.pretraining_tp > 1: lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0) logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)] diff --git a/AudioVisualText/models/unified_arch.py b/AudioVisualText/models/unified_arch.py index c1b1119..b4333ce 100644 --- a/AudioVisualText/models/unified_arch.py +++ b/AudioVisualText/models/unified_arch.py @@ -149,7 +149,7 @@ def prepare_multimodal_inputs( pre_indice=0 for idx,indice in enumerate(X_token_indices): special_token = self.IDS_2_SPECIAL_TOKEN[input_ids[indice].item()] - + if special_token == '': # token size * emb size tmp=self.encode_ids(input_ids[pre_indice:indice]) diff --git a/AudioVisualText/notebooks/ave_dataset_explorer.ipynb b/AudioVisualText/notebooks/ave_dataset_explorer.ipynb new file mode 100644 index 0000000..51f2fa5 --- /dev/null +++ b/AudioVisualText/notebooks/ave_dataset_explorer.ipynb @@ -0,0 +1,750 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# AVE Dataset Explorer (MokA)\n", + "\n", + "This notebook mirrors MokA's AVE loading path for a few samples, renders question/audio/video/output, and adds a small `M_q` vs full-text cross-attention ablation demo." + ] + }, + { + "cell_type": "markdown", + "id": "748180e7", + "metadata": {}, + "source": [ + "## 1) Environment Setup and Imports\n", + "\n", + "This section resolves the project paths, imports the same libraries used by MokA data loading, and verifies that AVE metadata files are available.\n", + "\n", + "Why this matters:\n", + "- Keeps notebook behavior aligned with the training code path.\n", + "- Fails early if the notebook is launched from the wrong directory." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "ba73044e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "**AVT root:** `/coc/flash5/rkhan96/MokA/AudioVisualText` \n", + "**AVE root:** `/coc/flash5/rkhan96/MokA/AudioVisualText/AVE_data`" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "train json exists: True test json exists: True\n" + ] + } + ], + "source": [ + "from pathlib import Path\n", + "import json\n", + "import numpy as np\n", + "import torch\n", + "\n", + "from IPython.display import display, Markdown, Audio, Video\n", + "\n", + "# Optional imports used in feature extraction (same path as unified_dataset.py)\n", + "import librosa\n", + "from PIL import Image\n", + "from decord import VideoReader\n", + "from transformers import CLIPImageProcessor\n", + "\n", + "# MokA audio fbank preprocessing\n", + "import sys\n", + "\n", + "# Resolve project root robustly for both execution locations:\n", + "# - MokA/AudioVisualText\n", + "# - MokA/AudioVisualText/notebooks\n", + "REPO_ROOT = Path.cwd().resolve().parent if Path.cwd().name == 'notebooks' else Path.cwd().resolve()\n", + "if (REPO_ROOT / 'dataset').exists():\n", + " AVT_ROOT = REPO_ROOT\n", + "elif (REPO_ROOT / 'AudioVisualText' / 'dataset').exists():\n", + " AVT_ROOT = REPO_ROOT / 'AudioVisualText'\n", + "else:\n", + " raise RuntimeError('Run this notebook from MokA or MokA/AudioVisualText.')\n", + "\n", + "# Add AVT root to Python path so we can import local dataset utilities\n", + "sys.path.insert(0, str(AVT_ROOT))\n", + "from dataset.audio_processor import preprocess\n", + "\n", + "# Canonical AVE paths used by MokA\n", + "AVE_ROOT = AVT_ROOT / 'AVE_data'\n", + "TRAIN_JSON = AVE_ROOT / 'train_samples_ave.json'\n", + "TEST_JSON = AVE_ROOT / 'test_samples_ave.json'\n", + "\n", + "# Candidate CLIP processor paths (cluster + local fallback)\n", + "CLIP_PATH_CANDIDATES = [\n", + " Path('/coc/flash5/rkhan96/weights/clip-vit-large-patch14'),\n", + " Path('/nethome/rkhan96/flash/weights/clip-vit-large-patch14'),\n", + " Path('clip-vit-large-patch14'),\n", + "]\n", + "\n", + "display(Markdown(f'**AVT root:** `{AVT_ROOT}` \\n**AVE root:** `{AVE_ROOT}`'))\n", + "print('train json exists:', TRAIN_JSON.exists(), 'test json exists:', TEST_JSON.exists())\n" + ] + }, + { + "cell_type": "markdown", + "id": "e63aa374", + "metadata": {}, + "source": [ + "## 2) Code Provenance: Where MokA Loads and Uses AVE Data\n", + "\n", + "This section prints exact line ranges from the repository so the notebook output is traceable back to source code.\n", + "\n", + "We inspect:\n", + "- AVE sample construction and modal loading in `unified_dataset.py`\n", + "- Question-mask construction in `unified_arch.py`\n", + "- Question-only cross-attention in `peft_hyper/tuners/lora.py`" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "83751507", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--- unified_dataset.py :: AVE sample construction ---\n", + " 90: def add_ave_task_samples(self):\n", + " 91: ave_annotation_path = 'AVE_data/train_samples_ave.json'\n", + " 92: ave_data_root = 'AVE_data'\n", + " 93: tot = 0\n", + " 94: with open(ave_annotation_path,'r') as f:\n", + " 95: samples = json.load(f)\n", + " 96: for sample in samples:\n", + " 97: event = sample['event']\n", + " 98: vid = sample['vid']\n", + " 99: start_time = 
sample['start_time']\n", + " 100: end_time = sample['end_time']\n", + " 101: audio_path = join(ave_data_root,'audio_data',vid+'.mp3')\n", + " 102: video_path = join(ave_data_root,'AVE',vid+'.mp4')\n", + " 103: label_path = join(ave_data_root,'converted_label',vid+'.txt')\n", + " 104: output = self.read_label(label_path)\n", + " 105: instruction = f'This is a video:\\n