diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..3a2413a --- /dev/null +++ b/.gitignore @@ -0,0 +1,17 @@ +# Python cache/artifacts +__pycache__/ +*.pyc + +# Local datasets and weights +AudioVisualText/AVE_data/ +AudioVisualText/pre-trained/ +AudioVisualText/google-bert-base-uncased/ + +# Local outputs/logs +AudioVisualText/results/ +AudioVisualText/slurm-*.out +AudioVisualText/slurm-*.err + +# Local helper scripts +AudioVisualText/run_infer_ave.sbatch +AudioVisualText/run_ft_ave.sbatch diff --git a/AudioVisualText/README_CHANGES.md b/AudioVisualText/README_CHANGES.md new file mode 100644 index 0000000..831caee --- /dev/null +++ b/AudioVisualText/README_CHANGES.md @@ -0,0 +1,91 @@ +# AudioVisualText Local Changes (Run/Eval/Analysis) + +This document summarizes the practical changes made to run AVE finetuning locally, evaluate results, and add gradient-sensitivity analysis. + +## 1) Runtime/Path Updates + +- Updated local checkpoint paths in finetune and inference scripts: + - `AudioVisualText/scripts/finetune/ft_ave.sh` + - `AudioVisualText/scripts/finetune/infer_ave.sh` +- Added dynamic GPU process detection: + - `NPROC_PER_NODE=${NPROC_PER_NODE:-$(nvidia-smi -L 2>/dev/null | wc -l)}` + - Fallback to `1` when detection returns `0`. +- Set AVE scripts to use local weights under: + - `/nethome/rkhan96/flash/weights/...` + +## 2) Precision/Compatibility Fixes + +- Training and inference were aligned to FP32 (`bf16=False`) to avoid dtype mismatch issues observed with BF16 on this setup. +- `AudioVisualText/deepspeed/stage2-offload.json` + - `bf16.enabled` set to `false`. + +## 3) Gradient Sensitivity Instrumentation + +### Config Flags + +- Added training flags in: + - `AudioVisualText/configs/unified_config.py` +- New fields: + - `grad_sensitivity_enable` + - `grad_sensitivity_include_projectors` + +### Trainer Logging + +- Extended `UnifiedTrainer` in: + - `AudioVisualText/trainer.py` +- Added per-step logging for: + - `lora_A_text`, `lora_A_visual`, `lora_A_audio`, `lora_B_shared` + - optional `vl_projector`, `al_projector` +- Logged metrics: + - `*_grad_norm` + - `*_param_norm` + - `*_relative_grad_norm` + - `*_num_params` +- Output file: + - `/grad_sensitivity.jsonl` + +### DeepSpeed/ZeRO Reliability Fix + +- Initial implementation used `param.grad` and produced near-zero gradients in logs. +- Updated implementation to use parameter backward hooks to accumulate grad norms reliably under DeepSpeed. + +## 4) Script Controls for Clean Runs + +- Updated `AudioVisualText/scripts/finetune/ft_ave.sh` run naming: + - If `RUN_NAME` is set, use it. + - Else if `GRAD_SENS_RUN=1`, use `llama_ave_gradsens`. + - Else use `llama_ave`. +- This avoids accidental resume collisions with existing checkpoint directories. 
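+
+As a concrete illustration, the selection logic described above can be sketched in shell as follows (the variable name `EXP_NAME` is illustrative; the actual names used inside `ft_ave.sh` may differ):
+
+```bash
+# Hypothetical sketch of the run-name selection in ft_ave.sh
+if [ -n "${RUN_NAME:-}" ]; then
+  EXP_NAME="$RUN_NAME"            # explicit override wins
+elif [ "${GRAD_SENS_RUN:-0}" = "1" ]; then
+  EXP_NAME="llama_ave_gradsens"   # dedicated name for gradient-sensitivity runs
+else
+  EXP_NAME="llama_ave"            # default finetuning run
+fi
+```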
+ +## 5) Evaluation Summary (AVE) + +- Finetune result (3 epochs) was reproduced near paper-level: + - AVE accuracy: **77.24%** + - Reported reference: **77.06%** +- Parse-valid samples differed due to format strictness in evaluator: + - Local run: `394/402` + - Reference: `397/402` + +## 6) Gradient Analysis Artifacts + +### Analysis Script + +- Added: + - `AudioVisualText/scripts/analysis/plot_grad_sensitivity.py` +- Script outputs: + - `grad_sensitivity_long.csv` + - `grad_sensitivity_summary.csv` + - PNG plots (if `matplotlib` is installed): + - `lora_grad.png` + - `lora_rel.png` + - `projector_rel.png` + +### Current Analysis Output Location + +- `AudioVisualText/results/finetune/llama_ave_gradsens_v2/analysis/` + +## 7) Notes on Job Interruptions + +- One long run was preempted by scheduler, but partial gradient logs were captured. +- The partial `grad_sensitivity.jsonl` still confirms non-zero gradient signals after hook-based fix. + diff --git a/AudioVisualText/configs/unified_config.py b/AudioVisualText/configs/unified_config.py index 5d765b9..9849b40 100644 --- a/AudioVisualText/configs/unified_config.py +++ b/AudioVisualText/configs/unified_config.py @@ -97,6 +97,18 @@ class TrainingArguments(transformers.TrainingArguments): ## my reserved_modality: str = field(default=None) loramethod: str = field(default=None) + cross_attn_kv_mode: str = field( + default="question", + metadata={"help": "Cross-attn KV source for LoRA branches: question or full_text."}, + ) + cross_modal_mode: str = field( + default="trilinear", + metadata={"help": "Cross-modal fusion mode for LoRA: pairwise or trilinear."}, + ) + trilinear_pack_tokens: bool = field( + default=False, + metadata={"help": "If True, compact active tokens before Triton trilinear attention."}, + ) blc_alpha: float = field(default=0.5) blc_weight: float = field(default=0.5) @@ -106,3 +118,9 @@ class TrainingArguments(transformers.TrainingArguments): save_modules: str = field(default='vl_projector,al_projector,lora') exp_desc: str = field(default='exp') + + # Gradient sensitivity analysis toggles. + # When enabled, UnifiedTrainer logs per-modality gradient statistics for + # LoRA branches (text/visual/audio A, shared B) and optional projectors. 
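+    # A hypothetical way to toggle these from the launch command, assuming the
+    # usual HfArgumentParser-style parsing of TrainingArguments fields, e.g.:
+    #   --grad_sensitivity_enable True --grad_sensitivity_include_projectors False
+    # (exact flag handling depends on the finetune script's argument parsing).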
+ grad_sensitivity_enable: bool = field(default=False) + grad_sensitivity_include_projectors: bool = field(default=True) diff --git a/AudioVisualText/dataset/unified_dataset.py b/AudioVisualText/dataset/unified_dataset.py index 0d40aae..5e0f653 100644 --- a/AudioVisualText/dataset/unified_dataset.py +++ b/AudioVisualText/dataset/unified_dataset.py @@ -54,7 +54,6 @@ def __init__( print(f'tot training sample nums: {self.tot}') - def add_avqa_task_samples(self): avqa_annotation_path = 'MUSIC_AVQA_data/train_samples_with_reasoning_avqa.json' tot = 0 diff --git a/AudioVisualText/deepspeed/stage2-offload-torch25.json b/AudioVisualText/deepspeed/stage2-offload-torch25.json new file mode 100644 index 0000000..0aaac8e --- /dev/null +++ b/AudioVisualText/deepspeed/stage2-offload-torch25.json @@ -0,0 +1,51 @@ +{ + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "betas": "auto", + "eps": "auto", + "weight_decay": "auto", + "torch_adam": true + } + }, + "scheduler": { + "type": "WarmupDecayLR", + "params": { + "total_num_steps": "auto", + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto" + } + }, + "bf16": { + "enabled": "auto" + }, + "fp16": { + "enabled": false, + "loss_scale": 0, + "initial_scale_power": 16, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "zero_optimization": { + "stage": 2, + "allgather_partitions": true, + "allgather_bucket_size": 100000000.0, + "reduce_scatter": true, + "reduce_bucket_size": 100000000.0, + "overlap_comm": true, + "contiguous_gradients": true, + "offload_optimizer": { + "device": "cpu" + }, + "round_robin_gradients": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false +} \ No newline at end of file diff --git a/AudioVisualText/deepspeed/stage2-offload.json b/AudioVisualText/deepspeed/stage2-offload.json index 85ec8e1..aba1816 100644 --- a/AudioVisualText/deepspeed/stage2-offload.json +++ b/AudioVisualText/deepspeed/stage2-offload.json @@ -19,7 +19,7 @@ } }, "bf16": { - "enabled": "auto", + "enabled": true, "loss_scale": 0, "initial_scale_power": 16, "loss_scale_window": 1000, diff --git a/AudioVisualText/docs/moka.pdf b/AudioVisualText/docs/moka.pdf new file mode 100644 index 0000000..8feed6d Binary files /dev/null and b/AudioVisualText/docs/moka.pdf differ diff --git a/AudioVisualText/docs/moka_math_from_code.pdf b/AudioVisualText/docs/moka_math_from_code.pdf new file mode 100644 index 0000000..6629a98 Binary files /dev/null and b/AudioVisualText/docs/moka_math_from_code.pdf differ diff --git a/AudioVisualText/docs/moka_math_from_code.tex b/AudioVisualText/docs/moka_math_from_code.tex new file mode 100644 index 0000000..8645f85 --- /dev/null +++ b/AudioVisualText/docs/moka_math_from_code.tex @@ -0,0 +1,281 @@ +\documentclass[11pt]{article} +\usepackage[margin=1in]{geometry} +\usepackage{amsmath,amssymb,mathtools,bm} +\usepackage{booktabs} +\usepackage{hyperref} + +\title{MokA: Forward/Backward Math From Code} +\author{Derived directly from repository implementation} +\date{\today} + +\begin{document} +\maketitle + +\section{Scope and Grounding} +This note is derived from these files: +\begin{itemize} +\item \texttt{models/unified\_arch.py}: multimodal embedding and modality masks. +\item \texttt{models/modeling\_llama.py}: masked LoRA projections used by attention/MLP and LM loss. 
+\item \texttt{peft\_hyper/tuners/lora.py}: custom \texttt{Linear} with \texttt{lora\_A0/A1/A2} and shared \texttt{lora\_B0}. +\item \texttt{scripts/finetune/finetune.py}: which modules are wrapped and which parameters are trainable. +\item \texttt{trainer.py}: gradient-sensitivity metric definitions. +\end{itemize} + +\section{Notation and Shapes} +Per adapted linear layer: +\begin{itemize} +\item Input: $X \in \mathbb{R}^{B\times T\times d_{\text{in}}}$. +\item Base weight: $W_0 \in \mathbb{R}^{d_{\text{out}}\times d_{\text{in}}}$, bias $b$. +\item LoRA branch matrices: +\[ +A_0\in\mathbb{R}^{r\times d_{\text{in}}},\quad +A_1\in\mathbb{R}^{r\times d_{\text{in}}},\quad +A_2\in\mathbb{R}^{r\times d_{\text{in}}},\quad +B_0\in\mathbb{R}^{d_{\text{out}}\times r}. +\] +\item Scale $s=\alpha/r$ (code uses \texttt{self.scaling[0]} for all branches). +\item Binary masks from input construction: +\[ +M_t,M_v,M_a,M_q\in\{0,1\}^{B\times T\times 1}. +\] +\end{itemize} + +\paragraph{Important implementation constraint.} +In code, \texttt{r} is parsed digit-wise (\texttt{str(r)}), and branch outputs are later summed. Therefore branch ranks must match in practice (or the sum would fail). + +\section{Multimodal Input Construction} +The model builds one embedding sequence and four aligned masks: +\[ +E\in\mathbb{R}^{B\times T\times d_{\text{model}}},\quad +M_t,M_v,M_a,M_q\in\{0,1\}^{B\times T\times 1}. +\] +Text tokens map through token embedding; image/video/audio special tokens are replaced by projector features; labels for non-text inserted modal features are set to $-100$ (ignored in CE). Then sequences are left-padded to common $T$. + +\section{Forward: One Adapted Linear Layer} +PyTorch linear convention is +\[ +Y_{\text{base}} = XW_0^\top + b. +\] + +Define masked inputs: +\[ +X_t=X\odot M_t,\quad X_v=X\odot M_v,\quad X_a=X\odot M_a. +\] +With dropout operator $D(\cdot)$: +\[ +U_i = s\,A_i(D(X_i)),\quad i\in\{0,1,2\}, +\] +where $A_i(\cdot)$ means linear map with weight $A_i$. + +\subsection{Cross-modal branch coupling} +For each sample $b$, video branch: +\[ +Q_v=U_1^{(b)},\quad K_q=V_q=U_0^{(b)}\odot M_q^{(b)}, +\] +\[ +S_v=\operatorname{softmax}\!\left(\frac{Q_vK_q^\top}{\sqrt{r}}\right),\quad +O_v=S_vV_q, +\] +\[ +\widetilde U_1^{(b)} = U_1^{(b)} + \beta\,(M_v^{(b)}\odot O_v), +\] +with $\beta=\texttt{blc\_weight}$. + +Similarly for audio: +\[ +\widetilde U_2^{(b)} = U_2^{(b)} + \beta\,(M_a^{(b)}\odot O_a), +\] +where $O_a$ uses query $U_2^{(b)}$ and key/value from question slice of $U_0^{(b)}$. + +Then +\[ +U_{\Sigma}=U_0+\widetilde U_1+\widetilde U_2,\qquad +Y_{\text{lora}} = B_0(U_{\Sigma}), +\] +and final layer output: +\[ +Y = Y_{\text{base}} + Y_{\text{lora}}. +\] + +\paragraph{What does ``$B$ multiply $A$'' mean?} +Ignoring masking/cross-attn and using one branch: +\[ +Y_{\text{lora},i}=B_0\big(sA_i(X_i)\big) += X_i(s\,B_0A_i)^\top. +\] +So the effective low-rank update to base weight is +\[ +\Delta W_i = s\,B_0A_i,\quad +W_{\text{eff}} = W_0 + \sum_i \Delta W_i. +\] + +\section{Where It Is Used} +The wrapped linear layers are injected into +\[ +\{q\_proj,k\_proj,v\_proj,o\_proj,gate\_proj,up\_proj,down\_proj\} +\] +across decoder layers. In attention and MLP forward passes, modality masks are passed to these projections. + +\section{Language Modeling Loss} +Given logits $Z\in\mathbb{R}^{B\times T\times V}$ and labels $y$: +\[ +Z_{\text{shift}}=Z_{:,1:T-1,:},\quad +y_{\text{shift}}=y_{:,2:T}. 
+\] +Loss is cross-entropy over flattened tokens with ignore index $-100$: +\[ +\mathcal L = -\frac{1}{N_{\text{valid}}}\sum_{n\in\mathcal I} +\log\operatorname{softmax}(Z_n)_{y_n}. +\] + +\section{Backward Propagation} +\subsection{From CE to logits} +For each valid token row $n$: +\[ +\frac{\partial \mathcal L}{\partial Z_n} += +\frac{1}{N_{\text{valid}}}\left(\operatorname{softmax}(Z_n)-e_{y_n}\right), +\] +and $0$ for ignored rows. + +\subsection{Adapter local gradients} +Let $G=\partial\mathcal L/\partial Y_{\text{lora}}$. +For $Y_{\text{lora}}=B_0(U_{\Sigma})$: +\[ +\frac{\partial\mathcal L}{\partial B_0} += +\sum_{b,t} G_{b,t}^{\top}\,U_{\Sigma,b,t}, +\qquad +\frac{\partial\mathcal L}{\partial U_{\Sigma}} += +G\,B_0. +\] + +Define split upstreams: +\[ +G_0=\frac{\partial\mathcal L}{\partial U_0},\; +G_1=\frac{\partial\mathcal L}{\partial U_1},\; +G_2=\frac{\partial\mathcal L}{\partial U_2}, +\] +where $G_1,G_2$ include direct residual path and query-through-attention path. + +For branch $i$ with $U_i=s\,A_i(D(X_i))$: +\[ +\frac{\partial\mathcal L}{\partial A_i} += +s\sum_{b,t} G_{i,b,t}^{\top}\,D(X_{i,b,t}). +\] +This gives explicitly: +\[ +\boxed{ +\frac{\partial\mathcal L}{\partial A_1} += +s\sum_{b,t} G_{1,b,t}^{\top}\,D(X_{v,b,t}) +} +\] +\[ +\boxed{ +\frac{\partial\mathcal L}{\partial A_2} += +s\sum_{b,t} G_{2,b,t}^{\top}\,D(X_{a,b,t}) +} +\] +with +\[ +X_v=X\odot M_v,\quad X_a=X\odot M_a. +\] + +\paragraph{What is inside $G_1,G_2$?} +For video (analogous audio): +\[ +\widetilde U_1 = U_1 + \beta(M_v\odot O_v),\quad O_v=\text{Attn}(U_1,U_0\odot M_q,U_0\odot M_q). +\] +Hence +\[ +\frac{\partial\mathcal L}{\partial U_1} += +\frac{\partial\mathcal L}{\partial \widetilde U_1} +\left[ +I ++ +\beta\,\frac{\partial (M_v\odot O_v)}{\partial U_1} +\right]. +\] +So $A_1$ receives gradient from both: +\begin{itemize} +\item direct residual term ($I$), +\item attention-query Jacobian term. +\end{itemize} + +\subsection{How many gradients are there?} +Per adapted linear module, there are parameter-gradient tensors: +\[ +\frac{\partial\mathcal L}{\partial A_0},\; +\frac{\partial\mathcal L}{\partial A_1},\; +\frac{\partial\mathcal L}{\partial A_2},\; +\frac{\partial\mathcal L}{\partial B_0} +\] +(plus any trainable non-LoRA parameters you kept in \texttt{save\_modules}). + +If $N_{\text{adapt}}$ modules are wrapped, LoRA matrix-gradient tensor count is: +\[ +4N_{\text{adapt}}. +\] +With current target list (7 projections per decoder layer), for $L$ layers: +\[ +N_{\text{adapt}} = 7L,\quad \text{LoRA gradient tensors}=28L. +\] + +\section{What Gets Updated} +\begin{itemize} +\item Base wrapped weight is frozen in custom \texttt{Linear} (\texttt{weight.requires\_grad=False}). +\item \texttt{mark\_only\_lora\_as\_trainable} first disables non-LoRA params. +\item Then finetune script applies final filter: a parameter is trainable iff its name contains one of \texttt{save\_modules}. +\end{itemize} +So the \emph{actual} update set is: +\[ +\Theta_{\text{update}}=\{\theta:\texttt{requires\_grad}(\theta)=\texttt{True after final filter}\}. +\] + +\section{Optimizer Update (conceptual)} +For each $\theta\in\Theta_{\text{update}}$, AdamW-style step: +\[ +m_t=\beta_1m_{t-1}+(1-\beta_1)g_t,\quad +v_t=\beta_2v_{t-1}+(1-\beta_2)g_t^2, +\] +\[ +\hat m_t=\frac{m_t}{1-\beta_1^t},\quad +\hat v_t=\frac{v_t}{1-\beta_2^t}, +\] +\[ +\theta_{t+1}=\theta_t-\eta\left(\frac{\hat m_t}{\sqrt{\hat v_t}+\epsilon}+\lambda\theta_t\right). 
+\] + +\section{Gradient Sensitivity Metrics in Trainer} +For each group $g\in\{\text{A0,A1,A2,B0,projectors}\}$: +\[ +G_g^2 = \sum_{\theta\in g}\left\|\frac{\partial\mathcal L}{\partial\theta}\right\|_F^2,\quad +P_g^2 = \sum_{\theta\in g}\|\theta\|_F^2, +\] +\[ +\text{grad\_norm}_g=\sqrt{G_g^2},\quad +\text{param\_norm}_g=\sqrt{P_g^2},\quad +\text{relative\_grad\_norm}_g=\frac{\text{grad\_norm}_g}{\text{param\_norm}_g+\varepsilon}. +\] +These are exactly what hooks/logging in \texttt{trainer.py} compute. + +\section{Code-to-Math Trace Checklist} +For any adapted layer in your run: +\begin{enumerate} +\item Build $X,M_t,M_v,M_a,M_q$ from packed multimodal input. +\item Compute base output $XW_0^\top+b$. +\item Compute three masked low-rank branch outputs $U_0,U_1,U_2$. +\item Fuse question-conditioned information into video/audio branches. +\item Sum branches, project by shared $B_0$, add to base. +\item Continue through attention/MLP/stack, then LM head and shifted CE. +\item Backprop: CE $\rightarrow$ logits $\rightarrow$ decoder $\rightarrow$ each adapted linear: + obtain $\partial\mathcal L/\partial A_0,\partial\mathcal L/\partial A_1,\partial\mathcal L/\partial A_2,\partial\mathcal L/\partial B_0$. +\item Optimizer updates only parameters still marked trainable. +\end{enumerate} + +\end{document} diff --git a/AudioVisualText/models/modeling_llama.py b/AudioVisualText/models/modeling_llama.py index 2004e81..155255b 100644 --- a/AudioVisualText/models/modeling_llama.py +++ b/AudioVisualText/models/modeling_llama.py @@ -860,6 +860,8 @@ def forward( ) hidden_states = outputs[0] + # Cast to lm_head dtype to avoid float vs bf16 mismatch at inference + hidden_states = hidden_states.to(self.lm_head.weight.dtype) if self.config.pretraining_tp > 1: lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0) logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)] diff --git a/AudioVisualText/models/unified_arch.py b/AudioVisualText/models/unified_arch.py index c1b1119..b4333ce 100644 --- a/AudioVisualText/models/unified_arch.py +++ b/AudioVisualText/models/unified_arch.py @@ -149,7 +149,7 @@ def prepare_multimodal_inputs( pre_indice=0 for idx,indice in enumerate(X_token_indices): special_token = self.IDS_2_SPECIAL_TOKEN[input_ids[indice].item()] - + if special_token == '': # token size * emb size tmp=self.encode_ids(input_ids[pre_indice:indice]) diff --git a/AudioVisualText/notebooks/ave_dataset_explorer.ipynb b/AudioVisualText/notebooks/ave_dataset_explorer.ipynb new file mode 100644 index 0000000..51f2fa5 --- /dev/null +++ b/AudioVisualText/notebooks/ave_dataset_explorer.ipynb @@ -0,0 +1,750 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# AVE Dataset Explorer (MokA)\n", + "\n", + "This notebook mirrors MokA's AVE loading path for a few samples, renders question/audio/video/output, and adds a small `M_q` vs full-text cross-attention ablation demo." + ] + }, + { + "cell_type": "markdown", + "id": "748180e7", + "metadata": {}, + "source": [ + "## 1) Environment Setup and Imports\n", + "\n", + "This section resolves the project paths, imports the same libraries used by MokA data loading, and verifies that AVE metadata files are available.\n", + "\n", + "Why this matters:\n", + "- Keeps notebook behavior aligned with the training code path.\n", + "- Fails early if the notebook is launched from the wrong directory." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "ba73044e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "**AVT root:** `/coc/flash5/rkhan96/MokA/AudioVisualText` \n", + "**AVE root:** `/coc/flash5/rkhan96/MokA/AudioVisualText/AVE_data`" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "train json exists: True test json exists: True\n" + ] + } + ], + "source": [ + "from pathlib import Path\n", + "import json\n", + "import numpy as np\n", + "import torch\n", + "\n", + "from IPython.display import display, Markdown, Audio, Video\n", + "\n", + "# Optional imports used in feature extraction (same path as unified_dataset.py)\n", + "import librosa\n", + "from PIL import Image\n", + "from decord import VideoReader\n", + "from transformers import CLIPImageProcessor\n", + "\n", + "# MokA audio fbank preprocessing\n", + "import sys\n", + "\n", + "# Resolve project root robustly for both execution locations:\n", + "# - MokA/AudioVisualText\n", + "# - MokA/AudioVisualText/notebooks\n", + "REPO_ROOT = Path.cwd().resolve().parent if Path.cwd().name == 'notebooks' else Path.cwd().resolve()\n", + "if (REPO_ROOT / 'dataset').exists():\n", + " AVT_ROOT = REPO_ROOT\n", + "elif (REPO_ROOT / 'AudioVisualText' / 'dataset').exists():\n", + " AVT_ROOT = REPO_ROOT / 'AudioVisualText'\n", + "else:\n", + " raise RuntimeError('Run this notebook from MokA or MokA/AudioVisualText.')\n", + "\n", + "# Add AVT root to Python path so we can import local dataset utilities\n", + "sys.path.insert(0, str(AVT_ROOT))\n", + "from dataset.audio_processor import preprocess\n", + "\n", + "# Canonical AVE paths used by MokA\n", + "AVE_ROOT = AVT_ROOT / 'AVE_data'\n", + "TRAIN_JSON = AVE_ROOT / 'train_samples_ave.json'\n", + "TEST_JSON = AVE_ROOT / 'test_samples_ave.json'\n", + "\n", + "# Candidate CLIP processor paths (cluster + local fallback)\n", + "CLIP_PATH_CANDIDATES = [\n", + " Path('/coc/flash5/rkhan96/weights/clip-vit-large-patch14'),\n", + " Path('/nethome/rkhan96/flash/weights/clip-vit-large-patch14'),\n", + " Path('clip-vit-large-patch14'),\n", + "]\n", + "\n", + "display(Markdown(f'**AVT root:** `{AVT_ROOT}` \\n**AVE root:** `{AVE_ROOT}`'))\n", + "print('train json exists:', TRAIN_JSON.exists(), 'test json exists:', TEST_JSON.exists())\n" + ] + }, + { + "cell_type": "markdown", + "id": "e63aa374", + "metadata": {}, + "source": [ + "## 2) Code Provenance: Where MokA Loads and Uses AVE Data\n", + "\n", + "This section prints exact line ranges from the repository so the notebook output is traceable back to source code.\n", + "\n", + "We inspect:\n", + "- AVE sample construction and modal loading in `unified_dataset.py`\n", + "- Question-mask construction in `unified_arch.py`\n", + "- Question-only cross-attention in `peft_hyper/tuners/lora.py`" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "83751507", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--- unified_dataset.py :: AVE sample construction ---\n", + " 90: def add_ave_task_samples(self):\n", + " 91: ave_annotation_path = 'AVE_data/train_samples_ave.json'\n", + " 92: ave_data_root = 'AVE_data'\n", + " 93: tot = 0\n", + " 94: with open(ave_annotation_path,'r') as f:\n", + " 95: samples = json.load(f)\n", + " 96: for sample in samples:\n", + " 97: event = sample['event']\n", + " 98: vid = sample['vid']\n", + " 99: start_time = 
sample['start_time']\n", + " 100: end_time = sample['end_time']\n", + " 101: audio_path = join(ave_data_root,'audio_data',vid+'.mp3')\n", + " 102: video_path = join(ave_data_root,'AVE',vid+'.mp4')\n", + " 103: label_path = join(ave_data_root,'converted_label',vid+'.txt')\n", + " 104: output = self.read_label(label_path)\n", + " 105: instruction = f'This is a video:\\n