diff --git a/.gitignore b/.gitignore index 2641667..187e562 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,16 @@ *.mp4 +**/__pycache__/ +**/*.pyc +**/*.pyo + +InternVL3-2B-Instruct/ +Yume-5B-720P/ +_vbench_caption.txt +_vbench_tmp/ +outputs/ +requirements-cog.txt +temp_caption_3th person.txt +temp_caption_bus.txt +temp_caption_default.txt +temp_caption_gc.txt +temp_caption_kitchen.txt diff --git a/download_model.py b/download_model.py new file mode 100644 index 0000000..cb3e402 --- /dev/null +++ b/download_model.py @@ -0,0 +1,14 @@ +"""Download Yume model weights from HuggingFace.""" + +from huggingface_hub import snapshot_download + +REPOS = [ + "stdstu123/Yume-5B-720P", + "OpenGVLab/InternVL3-2B-Instruct", +] + +for repo_id in REPOS: + local_dir = f"./{repo_id.split('/')[-1]}" + print(f"Downloading {repo_id} -> {local_dir}") + snapshot_download(repo_id=repo_id, local_dir=local_dir) + print(f"Done: {local_dir}") diff --git a/fastvideo/__pycache__/__init__.cpython-312.pyc b/fastvideo/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000..4f89924 Binary files /dev/null and b/fastvideo/__pycache__/__init__.cpython-312.pyc differ diff --git a/fastvideo/distill/__pycache__/__init__.cpython-312.pyc b/fastvideo/distill/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000..28b5a5b Binary files /dev/null and b/fastvideo/distill/__pycache__/__init__.cpython-312.pyc differ diff --git a/fastvideo/distill/__pycache__/solver.cpython-312.pyc b/fastvideo/distill/__pycache__/solver.cpython-312.pyc new file mode 100644 index 0000000..b084a04 Binary files /dev/null and b/fastvideo/distill/__pycache__/solver.cpython-312.pyc differ diff --git a/fastvideo/models/__pycache__/flash_attn_no_pad.cpython-312.pyc b/fastvideo/models/__pycache__/flash_attn_no_pad.cpython-312.pyc new file mode 100644 index 0000000..5c28256 Binary files /dev/null and b/fastvideo/models/__pycache__/flash_attn_no_pad.cpython-312.pyc differ diff --git 
a/fastvideo/models/hunyuan/__pycache__/__init__.cpython-312.pyc b/fastvideo/models/hunyuan/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000..13622f5 Binary files /dev/null and b/fastvideo/models/hunyuan/__pycache__/__init__.cpython-312.pyc differ diff --git a/fastvideo/models/hunyuan/__pycache__/constants.cpython-312.pyc b/fastvideo/models/hunyuan/__pycache__/constants.cpython-312.pyc new file mode 100644 index 0000000..5cb432d Binary files /dev/null and b/fastvideo/models/hunyuan/__pycache__/constants.cpython-312.pyc differ diff --git a/fastvideo/models/hunyuan/text_encoder/__pycache__/__init__.cpython-312.pyc b/fastvideo/models/hunyuan/text_encoder/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000..1b034e8 Binary files /dev/null and b/fastvideo/models/hunyuan/text_encoder/__pycache__/__init__.cpython-312.pyc differ diff --git a/fastvideo/models/hunyuan/vae/__pycache__/__init__.cpython-312.pyc b/fastvideo/models/hunyuan/vae/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000..f02fad1 Binary files /dev/null and b/fastvideo/models/hunyuan/vae/__pycache__/__init__.cpython-312.pyc differ diff --git a/fastvideo/models/hunyuan/vae/__pycache__/autoencoder_kl_causal_3d.cpython-312.pyc b/fastvideo/models/hunyuan/vae/__pycache__/autoencoder_kl_causal_3d.cpython-312.pyc new file mode 100644 index 0000000..86cddc5 Binary files /dev/null and b/fastvideo/models/hunyuan/vae/__pycache__/autoencoder_kl_causal_3d.cpython-312.pyc differ diff --git a/fastvideo/models/hunyuan/vae/__pycache__/unet_causal_3d_blocks.cpython-312.pyc b/fastvideo/models/hunyuan/vae/__pycache__/unet_causal_3d_blocks.cpython-312.pyc new file mode 100644 index 0000000..89ee2a6 Binary files /dev/null and b/fastvideo/models/hunyuan/vae/__pycache__/unet_causal_3d_blocks.cpython-312.pyc differ diff --git a/fastvideo/models/hunyuan/vae/__pycache__/vae.cpython-312.pyc b/fastvideo/models/hunyuan/vae/__pycache__/vae.cpython-312.pyc new file mode 100644 
index 0000000..7a3478b Binary files /dev/null and b/fastvideo/models/hunyuan/vae/__pycache__/vae.cpython-312.pyc differ diff --git a/fastvideo/models/hunyuan_hf/__pycache__/modeling_hunyuan.cpython-312.pyc b/fastvideo/models/hunyuan_hf/__pycache__/modeling_hunyuan.cpython-312.pyc new file mode 100644 index 0000000..1e42ded Binary files /dev/null and b/fastvideo/models/hunyuan_hf/__pycache__/modeling_hunyuan.cpython-312.pyc differ diff --git a/fastvideo/models/mochi_hf/__pycache__/mochi_latents_utils.cpython-312.pyc b/fastvideo/models/mochi_hf/__pycache__/mochi_latents_utils.cpython-312.pyc new file mode 100644 index 0000000..2a7da29 Binary files /dev/null and b/fastvideo/models/mochi_hf/__pycache__/mochi_latents_utils.cpython-312.pyc differ diff --git a/fastvideo/models/mochi_hf/__pycache__/modeling_mochi.cpython-312.pyc b/fastvideo/models/mochi_hf/__pycache__/modeling_mochi.cpython-312.pyc new file mode 100644 index 0000000..642abad Binary files /dev/null and b/fastvideo/models/mochi_hf/__pycache__/modeling_mochi.cpython-312.pyc differ diff --git a/fastvideo/models/mochi_hf/__pycache__/norm.cpython-312.pyc b/fastvideo/models/mochi_hf/__pycache__/norm.cpython-312.pyc new file mode 100644 index 0000000..da49503 Binary files /dev/null and b/fastvideo/models/mochi_hf/__pycache__/norm.cpython-312.pyc differ diff --git a/fastvideo/models/mochi_hf/__pycache__/pipeline_mochi.cpython-312.pyc b/fastvideo/models/mochi_hf/__pycache__/pipeline_mochi.cpython-312.pyc new file mode 100644 index 0000000..7a5b8ed Binary files /dev/null and b/fastvideo/models/mochi_hf/__pycache__/pipeline_mochi.cpython-312.pyc differ diff --git a/fastvideo/models/mochi_hf/modeling_mochi.py b/fastvideo/models/mochi_hf/modeling_mochi.py index 330f31a..5fd498e 100644 --- a/fastvideo/models/mochi_hf/modeling_mochi.py +++ b/fastvideo/models/mochi_hf/modeling_mochi.py @@ -28,7 +28,14 @@ from diffusers.utils import (USE_PEFT_BACKEND, is_torch_version, logging, scale_lora_layers, unscale_lora_layers) from 
diffusers.utils.torch_utils import maybe_allow_in_graph -from liger_kernel.ops.swiglu import LigerSiLUMulFunction +try: + from liger_kernel.ops.swiglu import LigerSiLUMulFunction +except ModuleNotFoundError: + import torch.nn.functional as _F + class LigerSiLUMulFunction: + @staticmethod + def apply(gate, hidden_states): + return _F.silu(gate) * hidden_states from fastvideo.models.flash_attn_no_pad import flash_attn_no_pad from fastvideo.models.mochi_hf.norm import (MochiLayerNormContinuous, diff --git a/fastvideo/sample/sample.py b/fastvideo/sample/sample.py index 1d0f66a..5860462 100644 --- a/fastvideo/sample/sample.py +++ b/fastvideo/sample/sample.py @@ -912,7 +912,8 @@ def main(args): local_rank = int(os.environ["LOCAL_RANK"]) rank = int(os.environ["RANK"]) world_size = int(os.environ["WORLD_SIZE"]) - dist.init_process_group("nccl", rank=rank, world_size=world_size) + backend = "gloo" if os.name == "nt" else "nccl" + dist.init_process_group(backend, rank=rank, world_size=world_size) # Set independent cache directories for each rank os.environ["TRITON_CACHE_DIR"] = f"/tmp/triton_cache_{rank}" diff --git a/fastvideo/sample/sample_5b.py b/fastvideo/sample/sample_5b.py index cdd1e4e..8f1067a 100644 --- a/fastvideo/sample/sample_5b.py +++ b/fastvideo/sample/sample_5b.py @@ -3,6 +3,7 @@ import argparse import math import os +import re import sys import torchvision import time @@ -867,7 +868,7 @@ def sample_one( # Generate diverse output videos from identical input conditions - max_area = 704 * 1280 + max_area = args.height * args.width # pixel_values_vid = torch.nn.functional.interpolate(pixel_values_vid, size=(544, 960), mode='bilinear', align_corners=False) repeat_nums = 1 @@ -889,14 +890,16 @@ def sample_one( frame = model_input.shape[1] + main_print(f"[SAMPLE] VAE encoding input ({model_input.shape}) ...") model_input = torch.cat([wan_i2v.vae.encode([model_input.to(device)[:,:-32].to(device)])[0], \ - 
wan_i2v.vae.encode([model_input.to(device)[:,-32:].to(device)])[0]],dim=1) + wan_i2v.vae.encode([model_input.to(device)[:,-32:].to(device)])[0]],dim=1) + main_print(f"[SAMPLE] VAE encode done -> latent shape {model_input.shape}") latents = model_input img = model_input[:,:-latent_frame_zero] - + main_print(f"[SAMPLE] wan_i2v.generate (i2v, frame_num={frame}) ...") with torch.no_grad(): arg_c, arg_null, noise, mask2, img = wan_i2v.generate( caption[0], @@ -904,14 +907,17 @@ def sample_one( max_area=max_area, latent_frame_zero=latent_frame_zero, img=img) + main_print("[SAMPLE] wan_i2v.generate done") else: frame = 32 + main_print(f"[SAMPLE] wan_i2v.generate (t2v, frame_num={frame}) ...") with torch.no_grad(): arg_c, arg_null, noise = wan_i2v.generate( caption[0], frame_num=32, max_area=max_area, latent_frame_zero=latent_frame_zero,) + main_print("[SAMPLE] wan_i2v.generate done") @@ -949,15 +955,15 @@ def sample_one( import time start_time = time.time() - + main_print(f"[SAMPLE] Denoising step_sample={step_sample}/{sample_num-1} steps={sample_step} ...") + if not t2v or step_sample > 0: latent = torch.cat([img[0][:, :-latent_frame_zero, :, :], latent[:, -latent_frame_zero:, :, :]], dim=1) #(1. 
- mask2[0]) * img[0] + mask2[0] * latent - print(latent.shape, "nbxkasbcna090-") with torch.no_grad(): with torch.autocast("cuda", dtype=torch.bfloat16): - for i in range(sample_step): + for i in tqdm(range(sample_step), desc="Sampling", unit="step"): latent_model_input = [latent.squeeze(0)] if not t2v or step_sample>0: @@ -981,7 +987,6 @@ def sample_one( # ]) # timestep = temp_ts.unsqueeze(0) - print(latent_model_input[0].shape,"0-2=ffje0r=----------a") noise_pred_cond = transformer(latent_model_input, t=timestep, **arg_c)[0] if i+1 == sample_step: @@ -1010,7 +1015,6 @@ def sample_one( # timestep = torch.stack(timestep) # temp_ts = timestep.flatten() # timestep = temp_ts#.unsqueeze(0) - print(latent_model_input[0].shape,"0-2=ffje0r=----------a") noise_pred_cond = transformer(latent_model_input, t=timestep, flag=False, **arg_c)[0] # # UniPC @@ -1048,11 +1052,12 @@ def sample_one( else: model_input = latent + main_print(f"[SAMPLE] VAE decoding latents ...") + torch.cuda.empty_cache() with torch.autocast("cuda", dtype=torch.bfloat16): - video_cat = scale(vae, model_input[:,-latent_frame_zero:,:,:]) + video_cat = scale(vae, model_input[:,-latent_frame_zero:,:,:]) video = video_cat[:,-frame_zero:] video_all.append(video) - if step_sample > 0: #if video.shape[1] < frame_zero: # video = torch.cat([video[:,0].unsqueeze(1).repeat(1,frame_zero-video.shape[1],1,1),video],dim=1) @@ -1073,11 +1078,16 @@ def sample_one( else: videoid_str = str(videoid) + os.makedirs(video_output_dir, exist_ok=True) + caption_safe = re.sub(r'[\\/:*?"<>|→←↑↓·\s]+', '_', str(caption_ori))[:60] filename = os.path.join( - video_output_dir, - videoid_str+"_"+str(caption_ori)+"_"+str(repeat_num)+"_"+str(rank)+"_"+str(step_sample)+".mp4", - ) - export_to_video(video[0] , filename, fps=16) + video_output_dir, + f"{videoid_str}_{caption_safe}_{repeat_num}_{rank}_{step_sample}.mp4", + ) + main_print(f"[SAMPLE] VAE decode done -> video shape {video[0].shape if hasattr(video[0], 'shape') else len(video[0])} 
frames") + main_print(f"[SAVE] Output path: {filename}") + export_to_video(video[0], filename, fps=args.fps) + main_print(f"[SAVE] Saved: {filename}") if step_sample + 1 < sample_num: @@ -1124,7 +1134,10 @@ def main(args): local_rank = int(os.environ["LOCAL_RANK"]) rank = int(os.environ["RANK"]) world_size = int(os.environ["WORLD_SIZE"]) - dist.init_process_group("nccl", rank=rank, world_size=world_size) + print(f"[rank {rank}] dist init (backend={'gloo' if sys.platform == 'win32' else 'nccl'}) ...") + backend = "gloo" if sys.platform == "win32" else "nccl" + dist.init_process_group(backend, rank=rank, world_size=world_size) + print(f"[rank {rank}] dist init done") # Set independent cache directories for each rank os.environ["TRITON_CACHE_DIR"] = f"/tmp/triton_cache_{rank}" @@ -1146,20 +1159,20 @@ def main(args): ckpt_dir = "./Yume-5B-720P" # Referenced from https://github.com/Wan-Video/Wan2.2 + main_print(f"[INIT] Loading wan23.Yume from {ckpt_dir} ...") wan_i2v = wan23.Yume( config=cfg, checkpoint_dir=ckpt_dir, device_id=device, - ) - transformer = wan_i2v.model + ) + main_print("[INIT] wan23.Yume loaded") + transformer = wan_i2v.model transformer = transformer.eval().requires_grad_(False) main_print( f" Total Sample parameters = {sum(p.numel() for p in transformer.parameters() if p.requires_grad) / 1e6} M" ) - main_print( - f"--> Initializing FSDP with sharding strategy: {args.fsdp_sharding_startegy}" - ) + main_print(f"[INIT] Initializing FSDP with sharding strategy: {args.fsdp_sharding_startegy} ...") fsdp_kwargs, no_split_modules = get_dit_fsdp_kwargs( transformer, args.fsdp_sharding_startegy, @@ -1170,6 +1183,7 @@ def main(args): ) if args.resume_from_checkpoint: + main_print(f"[INIT] Resuming from checkpoint: {args.resume_from_checkpoint} ...") ( transformer, init_steps, @@ -1177,6 +1191,7 @@ def main(args): transformer, args.resume_from_checkpoint, ) + main_print(f"[INIT] Checkpoint resumed (init_steps={init_steps})") from safetensors import safe_open @@ 
-1238,12 +1253,14 @@ def main(args): # transformer.load_state_dict(merged_weights, strict=False) + main_print("[INIT] Casting transformer to bfloat16 and wrapping with FSDP ...") transformer = transformer.to(torch.bfloat16) transformer = FSDP( transformer, **fsdp_kwargs, use_orig_params=True, ) + main_print("[INIT] FSDP wrap done") @@ -1261,38 +1278,52 @@ def main(args): #init t5, clip and vae vae = wan_i2v.vae + main_print("[INIT] dist.barrier ...") dist.barrier() - + main_print("[INIT] barrier passed") + wan_i2v.device = device + main_print("[INIT] Loading denoiser ...") denoiser = load_denoiser() - - print("jpg_dir", args.jpg_dir) + main_print("[INIT] Denoiser loaded") + + main_print(f"[DATA] jpg_dir={args.jpg_dir} video_root_dir={args.video_root_dir} T2V={args.T2V}") image_sample = False dataset_ddp = None dataset_length = None if args.jpg_dir != None and not args.T2V: - dataset_ddp, dataset_length = create_scaled_videos(args.jpg_dir, - total_frames=33, - H1=704, - W1=1280) + main_print(f"[DATA] Building image dataset from {args.jpg_dir} ...") + dataset_ddp, dataset_length = create_scaled_videos(args.jpg_dir, + total_frames=33, + H1=args.height, + W1=args.width) + main_print(f"[DATA] Image dataset ready: {dataset_length} samples") image_sample = True elif not args.T2V: + main_print(f"[DATA] Building video dataset from {args.video_root_dir} ...") dataset_ddp, dataset_length = mp4_data(args.video_root_dir) + main_print(f"[DATA] Video dataset ready: {dataset_length} samples") image_sample = False - - print(dataset_ddp,"dataset_ddpdataset_ddpdataset_ddp") + else: + main_print("[DATA] T2V mode — no dataset loaded") step_times = deque(maxlen=100) - #image_sample = True - # If you want to load a model using multiple GPUs, please refer to the `Multiple GPUs` section. 
- path = '/mnt/petrelfs/maoxiaofeng/Yume_v2_release/InternVL3-2B-Instruct' - camption_model = AutoModel.from_pretrained( - path, - torch_dtype=torch.bfloat16, - low_cpu_mem_usage=True, - use_flash_attn=True, - trust_remote_code=True).eval().to(device) + _local = os.path.abspath(args.internvl_path) + path = _local if os.path.isdir(_local) else "OpenGVLab/InternVL3-2B-Instruct" + main_print(f"[INIT] Loading InternVL caption model from {path} ...") + # FSDP may leave an active DeviceContext("meta") TorchFunctionMode on the stack. + # torch.set_default_device("cpu") only sets a C++ variable and is overridden by + # the higher-priority Python TorchFunctionMode. Pushing a DeviceContext("cpu") + # via the context manager sits on top of any lingering meta context and wins. + with torch.device("cpu"): + camption_model = AutoModel.from_pretrained( + path, + torch_dtype=torch.bfloat16, + low_cpu_mem_usage=False, + use_flash_attn=False, + trust_remote_code=True).eval().to(device) tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False) + main_print("[INIT] InternVL caption model loaded") if args.prompt!=None: prompt1 = args.prompt @@ -1304,7 +1335,9 @@ def main(args): else: date_len = int(dataset_length)//world_size + 1 + main_print(f"[LOOP] Starting inference loop: {date_len-1} step(s), world_size={world_size}") for step in range(1, date_len): + main_print(f"[LOOP] Step {step}/{date_len-1} starting ...") start_time = time.time() torch.cuda.empty_cache() torch.cuda.empty_cache() @@ -1335,6 +1368,7 @@ def main(args): step_time = time.time() - start_time step_times.append(step_time) avg_step_time = sum(step_times) / len(step_times) + main_print(f"[LOOP] Step {step}/{date_len-1} done in {step_time:.1f}s (avg {avg_step_time:.1f}s)") torch.cuda.empty_cache() @@ -1396,6 +1430,9 @@ def main(args): default=None, ) parser.add_argument("--num_frames", type=int, default=163) + parser.add_argument("--height", type=int, default=704) + 
parser.add_argument("--width", type=int, default=1280) + parser.add_argument("--fps", type=int, default=16) parser.add_argument( "--logging_dir", type=str, @@ -1571,6 +1608,12 @@ def main(args): type=str, default=None, ) + parser.add_argument( + "--internvl_path", + type=str, + default="./InternVL3-2B-Instruct", + help="Path to InternVL3-2B-Instruct model dir or HuggingFace repo ID.", + ) args = parser.parse_args() main(args) diff --git a/fastvideo/sample/sample_tts.py b/fastvideo/sample/sample_tts.py index 6037aab..2c82df1 100644 --- a/fastvideo/sample/sample_tts.py +++ b/fastvideo/sample/sample_tts.py @@ -902,6 +902,7 @@ def sample_one( video_output_dir, videoid_str+"_"+str(caption_ori)+"_"+str(step_sample)+"_"+str(repeat_num)+".mp4", ) + print(filename) export_to_video(video[0] , filename, fps=16) if step_sample + 1 < sample_num: diff --git a/fastvideo/utils/__pycache__/checkpoint.cpython-312.pyc b/fastvideo/utils/__pycache__/checkpoint.cpython-312.pyc index 3bca54b..33cf5fa 100644 Binary files a/fastvideo/utils/__pycache__/checkpoint.cpython-312.pyc and b/fastvideo/utils/__pycache__/checkpoint.cpython-312.pyc differ diff --git a/fastvideo/utils/__pycache__/communications.cpython-312.pyc b/fastvideo/utils/__pycache__/communications.cpython-312.pyc new file mode 100644 index 0000000..367b12c Binary files /dev/null and b/fastvideo/utils/__pycache__/communications.cpython-312.pyc differ diff --git a/fastvideo/utils/__pycache__/dataset_utils.cpython-312.pyc b/fastvideo/utils/__pycache__/dataset_utils.cpython-312.pyc new file mode 100644 index 0000000..e788319 Binary files /dev/null and b/fastvideo/utils/__pycache__/dataset_utils.cpython-312.pyc differ diff --git a/fastvideo/utils/__pycache__/fsdp_util.cpython-312.pyc b/fastvideo/utils/__pycache__/fsdp_util.cpython-312.pyc new file mode 100644 index 0000000..fd09562 Binary files /dev/null and b/fastvideo/utils/__pycache__/fsdp_util.cpython-312.pyc differ diff --git a/fastvideo/utils/__pycache__/load.cpython-312.pyc 
b/fastvideo/utils/__pycache__/load.cpython-312.pyc new file mode 100644 index 0000000..a0d67c7 Binary files /dev/null and b/fastvideo/utils/__pycache__/load.cpython-312.pyc differ diff --git a/fastvideo/utils/__pycache__/logging_.cpython-312.pyc b/fastvideo/utils/__pycache__/logging_.cpython-312.pyc index 9bac0b4..cdb2d05 100644 Binary files a/fastvideo/utils/__pycache__/logging_.cpython-312.pyc and b/fastvideo/utils/__pycache__/logging_.cpython-312.pyc differ diff --git a/fastvideo/utils/__pycache__/parallel_states.cpython-312.pyc b/fastvideo/utils/__pycache__/parallel_states.cpython-312.pyc new file mode 100644 index 0000000..4ec8dbb Binary files /dev/null and b/fastvideo/utils/__pycache__/parallel_states.cpython-312.pyc differ diff --git a/requirements.txt b/requirements.txt index 808bee9..83181d6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,172 +1,172 @@ -absl-py==2.3.0 -accelerate==1.0.1 -aiofiles==23.2.1 -aiohappyeyeballs==2.6.1 -aiohttp==3.12.13 -aiosignal==1.3.2 -albucore==0.0.19 -albumentations==1.4.20 -annotated-types==0.7.0 -antlr4-python3-runtime==4.9.3 -anyio==4.9.0 -async-timeout==5.0.1 -attrs==25.3.0 -av==13.1.0 -beautifulsoup4==4.12.3 -bitsandbytes==0.42.0 -blessed==1.21.0 -certifi==2025.6.15 -charset-normalizer==3.4.2 -click==8.2.1 -codespell==2.3.0 -contourpy==1.3.2 -cycler==0.12.1 -dashscope==1.23.5 -decorator==4.4.2 -decord==0.6.0 -diffusers==0.32.0 -docker-pycreds==0.4.0 -easydict==1.13 -einops==0.8.0 -eval_type_backport==0.2.2 -exceptiongroup==1.3.0 -fastapi==0.115.3 -ffmpeg==1.4 -ffmpy==0.6.0 -filelock==3.13.1 -flash-attn==2.7.0.post2 -fonttools==4.58.4 -frozenlist==1.7.0 -fsspec==2024.6.1 -ftfy==6.3.0 -future==1.0.0 -fvcore==0.1.5.post20221221 -gdown==5.2.0 -gitdb==4.0.12 -GitPython==3.1.44 -gpustat==1.1.1 -gradio==5.3.0 -gradio_client==1.4.2 -grpcio==1.73.0 -h11==0.16.0 -h5py==3.12.1 -hf-xet==1.1.5 -hjson==3.1.0 -httpcore==1.0.9 -httpx==0.28.1 -huggingface-hub==0.26.1 -idna==3.6 -imageio==2.36.0 -imageio-ffmpeg==0.5.1 
-importlib_metadata==8.7.0 -inquirerpy==0.3.4 -iopath==0.1.10 -isort==5.13.2 -Jinja2==3.1.4 -joblib==1.5.1 -kiwisolver==1.4.8 -liger_kernel==0.4.1 -lightning-utilities==0.14.3 -loguru==0.7.3 -Markdown==3.8 -markdown-it-py==3.0.0 -MarkupSafe==2.1.5 -matplotlib==3.9.2 -mdurl==0.1.2 -moviepy==1.0.3 -mpmath==1.3.0 -msgpack==1.1.1 -multidict==6.5.0 -mypy==1.11.1 -mypy_extensions==1.1.0 -networkx==3.3 -ninja==1.11.1.4 -numpy==1.26.3 -omegaconf==2.3.0 -opencv-python==4.10.0.84 -opencv-python-headless==4.10.0.84 -orjson==3.10.18 -packaging==25.0 -pandas==2.2.3 -parameterized==0.9.0 -peft==0.13.2 -pfzy==0.3.4 -pillow==10.2.0 -platformdirs==4.3.8 -polib==1.2.0 -portalocker==3.2.0 -proglog==0.1.12 -prompt_toolkit==3.0.51 -propcache==0.3.2 -protobuf==5.28.3 -psutil==7.0.0 -py-cpuinfo==9.0.0 -pydantic==2.9.2 -pydantic_core==2.23.4 -pydub==0.25.1 -Pygments==2.19.1 -pyparsing==3.2.3 -PySocks==1.7.1 -python-dateutil==2.9.0.post0 -python-multipart==0.0.20 -pytorch-lightning==2.4.0 -pytorchvideo==0.1.5 -pytz==2025.2 -PyYAML==6.0.1 -regex==2024.9.11 -requests==2.31.0 -rich==14.0.0 -ruff==0.6.5 -safetensors==0.5.3 -scikit-learn==1.5.2 -scikit-video==1.1.11 -scipy==1.14.1 -semantic-version==2.10.0 -sentencepiece==0.2.0 -sentry-sdk==2.30.0 -setproctitle==1.3.6 -shellingham==1.5.4 -six==1.16.0 -smmap==5.0.2 -sniffio==1.3.1 -soupsieve==2.7 -sphinx-lint==1.0.0 -starlette==0.41.3 -stringzilla==3.12.5 -sympy==1.13.1 -tabulate==0.9.0 -tensorboard==2.18.0 -tensorboard-data-server==0.7.2 -termcolor==3.1.0 -test_tube==0.7.5 -threadpoolctl==3.6.0 -timm==1.0.11 -tokenizers==0.20.1 -toml==0.10.2 -tomli==2.0.2 -tomlkit==0.12.0 -torch==2.5.0+cu121 -torchdiffeq==0.2.4 -torchmetrics==1.5.1 -torchvision==0.20.0+cu121 -tqdm==4.66.5 -transformers==4.46.1 -triton==3.1.0 -typer==0.16.0 -types-PyYAML==6.0.12.20250516 -types-requests==2.32.4.20250611 -types-setuptools==80.9.0.20250529 -typing_extensions==4.12.2 -tzdata==2025.2 -urllib3==2.2.0 -uvicorn==0.32.0 -wandb==0.18.5 -watch==0.2.7 -wcwidth==0.2.13 
-websocket-client==1.8.0 -websockets==12.0 -Werkzeug==3.1.3 -yacs==0.1.8 -yapf==0.32.0 -yarl==1.20.1 -zipp==3.23.0 \ No newline at end of file +absl-py>=2.3.0 +accelerate>=1.0.1 +aiofiles>=23.2.1 +aiohappyeyeballs>=2.6.1 +aiohttp>=3.12.13 +aiosignal>=1.3.2 +albucore>=0.0.19 +albumentations>=1.4.20 +annotated-types>=0.7.0 +antlr4-python3-runtime>=4.9.3 +anyio>=4.9.0 +async-timeout>=5.0.1 +attrs>=25.3.0 +av>=13.1.0 +beautifulsoup4>=4.12.3 +bitsandbytes>=0.42.0 +blessed>=1.21.0 +certifi>=2025.6.15 +charset-normalizer>=3.4.2 +click>=8.2.1 +codespell>=2.3.0 +contourpy>=1.3.2 +cycler>=0.12.1 +dashscope>=1.23.5 +decorator>=4.4.2 +decord>=0.6.0 +diffusers>=0.32.0 +docker-pycreds>=0.4.0 +easydict>=1.13 +einops>=0.8.0 +eval_type_backport>=0.2.2 +exceptiongroup>=1.3.0 +fastapi>=0.115.3 +ffmpeg>=1.4 +ffmpy>=0.6.0 +filelock>=3.13.1 +fonttools>=4.58.4 +frozenlist>=1.7.0 +fsspec>=2024.6.1 +ftfy>=6.3.0 +future>=1.0.0 +fvcore>=0.1.5.post20221221 +gdown>=5.2.0 +gitdb>=4.0.12 +GitPython>=3.1.44 +gpustat>=1.1.1 +gradio>=5.3.0 +gradio_client>=1.4.2 +grpcio>=1.73.0 +h11>=0.16.0 +h5py>=3.12.1 +hf-xet>=1.1.5 +hjson>=3.1.0 +httpcore>=1.0.9 +httpx>=0.28.1 +huggingface-hub>=0.26.1 +idna>=3.6 +imageio>=2.36.0 +imageio-ffmpeg>=0.5.1 +importlib_metadata>=8.7.0 +inquirerpy>=0.3.4 +iopath>=0.1.10 +isort>=5.13.2 +Jinja2>=3.1.4 +joblib>=1.5.1 +kiwisolver>=1.4.8 +liger_kernel>=0.4.1 +lightning-utilities>=0.14.3 +loguru>=0.7.3 +Markdown>=3.8 +markdown-it-py>=3.0.0 +MarkupSafe>=2.1.5 +matplotlib>=3.9.2 +mdurl>=0.1.2 +moviepy>=1.0.3 +mpmath>=1.3.0 +msgpack>=1.1.1 +multidict>=6.5.0 +mypy>=1.11.1 +mypy_extensions>=1.1.0 +networkx>=3.3 +ninja>=1.11.1.4 +numpy>=1.26.3 +omegaconf>=2.3.0 +opencv-python>=4.10.0.84 +opencv-python-headless>=4.10.0.84 +orjson>=3.10.18 +packaging>=25.0 +pandas>=2.2.3 +parameterized>=0.9.0 +peft>=0.13.2 +pfzy>=0.3.4 +pillow>=10.2.0 +platformdirs>=4.3.8 +polib>=1.2.0 +portalocker>=3.2.0 +proglog>=0.1.12 +prompt_toolkit>=3.0.51 +propcache>=0.3.2 +protobuf>=5.28.3 +psutil>=7.0.0 
+py-cpuinfo>=9.0.0 +pydantic>=2.9.2 +pydantic_core>=2.23.4 +pydub>=0.25.1 +Pygments>=2.19.1 +pyparsing>=3.2.3 +PySocks>=1.7.1 +python-dateutil>=2.9.0.post0 +python-multipart>=0.0.20 +pytorch-lightning>=2.4.0 +pytorchvideo>=0.1.5 +pytz>=2025.2 +PyYAML>=6.0.1 +regex>=2024.9.11 +requests>=2.31.0 +rich>=14.0.0 +ruff>=0.6.5 +safetensors>=0.5.3 +scikit-learn>=1.5.2 +scikit-video>=1.1.11 +scipy>=1.14.1 +semantic-version>=2.10.0 +sentencepiece>=0.2.0 +sentry-sdk>=2.30.0 +setproctitle>=1.3.6 +shellingham>=1.5.4 +six>=1.16.0 +smmap>=5.0.2 +sniffio>=1.3.1 +soupsieve>=2.7 +sphinx-lint>=1.0.0 +starlette>=0.41.3 +stringzilla>=3.12.5 +sympy>=1.13.1 +tabulate>=0.9.0 +tensorboard>=2.18.0 +tensorboard-data-server>=0.7.2 +termcolor>=3.1.0 +test_tube>=0.7.5 +threadpoolctl>=3.6.0 +timm>=1.0.11 +tokenizers>=0.20.1 +toml>=0.10.2 +tomli>=2.0.2 +tomlkit>=0.12.0 +#torch>=2.5.0+cu121 +torchdiffeq>=0.2.4 +torchmetrics>=1.5.1 +#torchvision>=0.20.0+cu121 +tqdm>=4.66.5 +transformers>=4.46.1,<5.0 +triton-windows>=3.1.0 +typer>=0.16.0 +types-PyYAML>=6.0.12.20250516 +types-requests>=2.32.4.20250611 +types-setuptools>=80.9.0.20250529 +typing_extensions>=4.12.2 +tzdata>=2025.2 +urllib3>=2.2.0 +uvicorn>=0.32.0 +wandb>=0.18.5 +watch>=0.2.7 +wcwidth>=0.2.13 +websocket-client>=1.8.0 +websockets>=12.0 +Werkzeug>=3.1.3 +yacs>=0.1.8 +yapf>=0.32.0 +yarl>=1.20.1 +zipp>=3.23.0 +flash-attn>=2.7.0.post2 \ No newline at end of file diff --git a/scripts/finetune/finetune.bat b/scripts/finetune/finetune.bat new file mode 100644 index 0000000..9c4c7d3 --- /dev/null +++ b/scripts/finetune/finetune.bat @@ -0,0 +1,30 @@ +@echo off +setlocal + +set TOKENIZERS_PARALLELISM=false +set USE_LIBUV=0 + +cd /d "%~dp0..\.." 
+ +torchrun --nproc_per_node 8 --master_port 29607 ^ + fastvideo/distill_model.py ^ + --seed 42 ^ + --gradient_checkpointing ^ + --train_batch_size=1 ^ + --dataloader_num_workers 4 ^ + --gradient_accumulation_steps=1 ^ + --max_train_steps=600000 ^ + --learning_rate=1e-5 ^ + --discriminator_learning_rate=1e-5 ^ + --mixed_precision="bf16" ^ + --checkpointing_steps=25 ^ + --validation_steps 24 ^ + --allow_tf32 ^ + --MVDT ^ + --Distil ^ + --t5_cpu ^ + --root_dir="./mp4_frame" ^ + --full_mp4="./Sekai/" ^ + --output_dir="./outputs" + +exit /b %ERRORLEVEL% diff --git a/scripts/inference/sample.bat b/scripts/inference/sample.bat new file mode 100644 index 0000000..5739059 --- /dev/null +++ b/scripts/inference/sample.bat @@ -0,0 +1,67 @@ +@echo off +setlocal enabledelayedexpansion + +set TOKENIZERS_PARALLELISM=false +set TF_ENABLE_ONEDNN_OPTS=0 +set LOCAL_RANK=0 +set RANK=0 +set WORLD_SIZE=1 +set MASTER_ADDR=127.0.0.1 +set MASTER_PORT=29500 + +set EXAMPLE_DIR=C:\workspace\world\Infinite-World\assets\example_case + +cd /d "%~dp0..\.." + +:: Write default caption file once +echo A first-person view exploring an interactive game scene. > temp_caption_default.txt + +for /f %%T in ('powershell -NoProfile -Command "Get-Date -Format ''yyyyMMdd_HHmmss''"') do set RUN_TS=%%T +for /f %%t in ('powershell -NoProfile -Command "[int64](Get-Date).Ticks"') do set OVERALL_START=%%t +set CASE_NUM=0 + +goto :main + +:: ------------------------------------------------------- +:: Subroutine: reads RC_DIR, RC_NAME, RC_CAPTION from env +:: ------------------------------------------------------- +:run_case + set /a CASE_NUM+=1 + echo. + echo === !CASE_NUM!: !RC_NAME! === + echo output: ./outputs/%RUN_TS%/!RC_NAME! 
+ for /f %%t in ('powershell -NoProfile -Command "[int64](Get-Date).Ticks"') do set CASE_START=%%t + + python fastvideo/sample/sample.py ^ + --seed 42 ^ + --gradient_checkpointing ^ + --train_batch_size=1 ^ + --max_sample_steps=600000 ^ + --mixed_precision="bf16" ^ + --allow_tf32 ^ + --video_output_dir="./outputs/%RUN_TS%/!RC_NAME!" ^ + --jpg_dir="!RC_DIR!" ^ + --caption_path="!RC_CAPTION!" ^ + --test_data_dir="./val" ^ + --num_euler_timesteps 50 ^ + --rand_num_img 0.6 ^ + --t5_cpu + + powershell -NoProfile -Command "$e=([int64](Get-Date).Ticks-!CASE_START!)/1e7; $fps=[math]::Round(163/$e,2); Write-Host (' time='+[math]::Round($e,1)+'s inference='+$fps+'fps video=16fps')" + exit /b 0 + +:main + +for /d %%D in (%EXAMPLE_DIR%\*) do ( + set RC_NAME=%%~nxD + set RC_DIR=%%D + set RC_CAPTION=temp_caption_default.txt + if exist "%%D\prompt.txt" set RC_CAPTION=%%D\prompt.txt + call :run_case +) + +echo. +powershell -NoProfile -Command "$e=([int64](Get-Date).Ticks-%OVERALL_START%)/1e7; Write-Host ('Total: !CASE_NUM! cases')" + +del temp_caption_default.txt 2>nul +exit /b 0 diff --git a/scripts/inference/sample.sh b/scripts/inference/sample.sh index 06f4468..b6c4ca2 100644 --- a/scripts/inference/sample.sh +++ b/scripts/inference/sample.sh @@ -3,6 +3,7 @@ # DATA_DIR=./data # IP=[MASTER NODE IP] export TOKENIZERS_PARALLELISM=false +export USE_LIBUV=0 torchrun --nproc_per_node 1 --master_port 29709 \ fastvideo/sample/sample.py \ diff --git a/scripts/inference/sample_5b.bat b/scripts/inference/sample_5b.bat new file mode 100644 index 0000000..e7084d0 --- /dev/null +++ b/scripts/inference/sample_5b.bat @@ -0,0 +1,81 @@ +@echo off +setlocal enabledelayedexpansion + +set TOKENIZERS_PARALLELISM=false +set TF_ENABLE_ONEDNN_OPTS=0 +set LOCAL_RANK=0 +set RANK=0 +set WORLD_SIZE=1 +set MASTER_ADDR=127.0.0.1 +set MASTER_PORT=29500 + +set EXAMPLE_DIR=C:\workspace\world\Infinite-World\assets\example_case + +cd /d "%~dp0..\.." 
+ +for /f "tokens=*" %%T in ('powershell -NoProfile -Command "Get-Date -Format ''yyyyMMdd_HHmmss''" 2^>nul') do if not "%%T"=="" set RUN_TS=%%T +for /f %%t in ('powershell -NoProfile -Command "[int64](Get-Date).Ticks"') do set OVERALL_START=%%t +echo Output: %CD%\outputs\%RUN_TS% +set CASE_NUM=0 + +:: Count total cases +set TOTAL_CASES=0 +for /d %%D in (%EXAMPLE_DIR%\*) do set /a TOTAL_CASES+=1 +echo Found %TOTAL_CASES% cases output: ./outputs/%RUN_TS% + +goto :main + +:: ------------------------------------------------------- +:: Subroutine: reads RC_DIR, RC_NAME, RC_CAPTION from env +:: ------------------------------------------------------- +:run_case + set /a CASE_NUM+=1 + echo. + echo === !CASE_NUM!/%TOTAL_CASES%: !RC_NAME! === + echo output: ./outputs/%RUN_TS%/!RC_NAME! + echo img: !RC_DIR! + echo prompt: !RC_CAPTION! + type "!RC_CAPTION!" + echo. + for /f %%t in ('powershell -NoProfile -Command "[int64](Get-Date).Ticks"') do set CASE_START=%%t + + python fastvideo/sample/sample_5b.py ^ + --seed 43 ^ + --gradient_checkpointing ^ + --train_batch_size=1 ^ + --max_sample_steps=1 ^ + --mixed_precision="bf16" ^ + --allow_tf32 ^ + --t5_cpu ^ + --video_output_dir="./outputs/%RUN_TS%/!RC_NAME!" ^ + --jpg_dir="!RC_DIR!" ^ + --caption_path="!RC_CAPTION!" ^ + --test_data_dir="./val" ^ + --num_euler_timesteps 8 ^ + --rand_num_img 0.6 ^ + --internvl_path "./InternVL3-2B-Instruct" + + powershell -NoProfile -Command "$e=([int64](Get-Date).Ticks-!CASE_START!)/1e7; $tot=([int64](Get-Date).Ticks-%OVERALL_START%)/1e7; $fps=[math]::Round(163/$e,2); $rem=(%TOTAL_CASES%-!CASE_NUM!)*($tot/!CASE_NUM!); Write-Host (' time='+[math]::Round($e,1)+'s fps='+$fps+' ETA='+[math]::Round($rem,0)+'s')" + exit /b 0 + +:main + +for /d %%D in (%EXAMPLE_DIR%\*) do ( + set RC_NAME=%%~nxD + set RC_DIR=%%D + echo A %%~nxD scene. 
> temp_caption_%%~nxD.txt + set RC_CAPTION=temp_caption_%%~nxD.txt + if exist "%%D\prompt.txt" set RC_CAPTION=%%D\prompt.txt + if exist "%%D\prompt.json" ( + powershell -NoProfile -Command "(Get-Content '%%D\prompt.json' -Raw | ConvertFrom-Json).prompt | Set-Content 'temp_prompt_%%~nxD.txt' -Encoding UTF8" + set RC_CAPTION=temp_prompt_%%~nxD.txt + ) + call :run_case +) + +echo. +powershell -NoProfile -Command "$e=([int64](Get-Date).Ticks-%OVERALL_START%)/1e7; Write-Host ('Total: %TOTAL_CASES% cases in '+[math]::Round($e,1)+'s avg='+[math]::Round($e/%TOTAL_CASES%,1)+'s/case')" + +del temp_caption_*.txt 2>nul +del temp_prompt_*.txt 2>nul +exit /b 0 diff --git a/scripts/inference/sample_5b.sh b/scripts/inference/sample_5b.sh index b0bc4b2..3777baf 100755 --- a/scripts/inference/sample_5b.sh +++ b/scripts/inference/sample_5b.sh @@ -3,6 +3,7 @@ # DATA_DIR=./data # IP=[MASTER NODE IP] export TOKENIZERS_PARALLELISM=false +export USE_LIBUV=0 torchrun --nproc_per_node 2 --master_port 29717 \ fastvideo/sample/sample_5b.py \ diff --git a/scripts/inference/sample_5b_vbench.bat b/scripts/inference/sample_5b_vbench.bat new file mode 100644 index 0000000..ad72d7e --- /dev/null +++ b/scripts/inference/sample_5b_vbench.bat @@ -0,0 +1,25 @@ +@echo off +setlocal enabledelayedexpansion + +set PY_SCRIPT=%~dp0vbench_runner.py + +cd /d "%~dp0..\.." 
+ +:: Default paths relative to this repo — override by passing them as arguments: +:: sample_5b_vbench.bat [VBENCH_JSON] [VBENCH_CROP] [NUM_FRAMES] +if not "%~1"=="" (set VBENCH_JSON=%~1) else (set VBENCH_JSON=%CD%\..\VBench\vbench2_beta_i2v\vbench2_beta_i2v\data\i2v-bench-info.json) +if not "%~2"=="" (set VBENCH_CROP=%~2) else (set VBENCH_CROP=%CD%\..\VBench\vbench2_beta_i2v\vbench2_beta_i2v\data\crop\1-1) +if not "%~3"=="" (set NUM_FRAMES=%~3) else (set NUM_FRAMES=161) + +echo Output: %CD%\outputs\vbench +echo VBench: %VBENCH_JSON% +echo Crop: %VBENCH_CROP% +echo Frames: %NUM_FRAMES% + +python "%PY_SCRIPT%" ^ + --vbench-json "%VBENCH_JSON%" ^ + --vbench-crop "%VBENCH_CROP%" ^ + --work-dir "%CD%" ^ + --num-frames %NUM_FRAMES% + +exit /b %ERRORLEVEL% diff --git a/scripts/inference/sample_image.bat b/scripts/inference/sample_image.bat new file mode 100644 index 0000000..be64dc6 --- /dev/null +++ b/scripts/inference/sample_image.bat @@ -0,0 +1,29 @@ +@echo off +setlocal + +set TOKENIZERS_PARALLELISM=false +set TF_ENABLE_ONEDNN_OPTS=0 +set LOCAL_RANK=0 +set RANK=0 +set WORLD_SIZE=1 +set MASTER_ADDR=127.0.0.1 +set MASTER_PORT=29500 + +cd /d "%~dp0..\.." 
+ +python fastvideo/sample/sample.py ^ + --seed 42 ^ + --gradient_checkpointing ^ + --train_batch_size=1 ^ + --max_sample_steps=600000 ^ + --mixed_precision="bf16" ^ + --allow_tf32 ^ + --video_output_dir="./outputs" ^ + --jpg_dir="./jpg/" ^ + --caption_path="./caption.txt" ^ + --test_data_dir="./val" ^ + --num_euler_timesteps 50 ^ + --rand_num_img 0.6 ^ + --t5_cpu + +exit /b %ERRORLEVEL% diff --git a/scripts/inference/sample_image.sh b/scripts/inference/sample_image.sh index e2f3e16..779b58c 100644 --- a/scripts/inference/sample_image.sh +++ b/scripts/inference/sample_image.sh @@ -3,6 +3,7 @@ # DATA_DIR=./data # IP=[MASTER NODE IP] export TOKENIZERS_PARALLELISM=false +export USE_LIBUV=0 torchrun --nproc_per_node 1 --master_port 29707 \ fastvideo/sample/sample.py \ diff --git a/scripts/inference/sample_tts.bat b/scripts/inference/sample_tts.bat new file mode 100644 index 0000000..cd162a8 --- /dev/null +++ b/scripts/inference/sample_tts.bat @@ -0,0 +1,29 @@ +@echo off +setlocal + +set TOKENIZERS_PARALLELISM=false +set TF_ENABLE_ONEDNN_OPTS=0 +set LOCAL_RANK=0 +set RANK=0 +set WORLD_SIZE=1 +set MASTER_ADDR=127.0.0.1 +set MASTER_PORT=29500 + +cd /d "%~dp0..\.." 
+ +python fastvideo/sample/sample_tts.py ^ + --seed 42 ^ + --gradient_checkpointing ^ + --train_batch_size=1 ^ + --max_sample_steps=600000 ^ + --mixed_precision="bf16" ^ + --allow_tf32 ^ + --video_output_dir="./outputs" ^ + --jpg_dir="./jpg/" ^ + --caption_path="./caption.txt" ^ + --test_data_dir="./val" ^ + --num_euler_timesteps 50 ^ + --rand_num_img 0.6 ^ + --t5_cpu + +exit /b %ERRORLEVEL% diff --git a/scripts/inference/sample_tts.sh b/scripts/inference/sample_tts.sh index 37a4251..d1807fe 100644 --- a/scripts/inference/sample_tts.sh +++ b/scripts/inference/sample_tts.sh @@ -3,6 +3,7 @@ # DATA_DIR=./data # IP=[MASTER NODE IP] export TOKENIZERS_PARALLELISM=false +export USE_LIBUV=0 torchrun --nproc_per_node 1 --master_port 29708 \ fastvideo/sample/sample_tts.py \ diff --git a/scripts/inference/vbench_runner.py b/scripts/inference/vbench_runner.py new file mode 100644 index 0000000..8b5d102 --- /dev/null +++ b/scripts/inference/vbench_runner.py @@ -0,0 +1,280 @@ +""" +vbench_runner.py -- Python replacement for vbench_runner.ps1 +Calls sample_5b.py once per prompt with NUM_SAMPLES caption lines and a +fresh random seed, so the model loads once per prompt instead of once per sample. 
+""" +import argparse +import csv +import json +import os +import random +import re +import shutil +import subprocess +import sys +import threading +import time +from pathlib import Path + +ALLOWED_TYPES = ['indoor', 'scenery'] +NUM_SAMPLES = 5 +NUM_FRAMES = 161 + + +# ---------- helpers ---------- + +def _safe(s, maxlen=80): + return re.sub(r'[<>:"/\\|?*]', '_', s)[:maxlen] + +def _poll_vram(stop_event, readings): + while not stop_event.is_set(): + try: + out = subprocess.check_output( + ['nvidia-smi', '--query-gpu=memory.used', '--format=csv,noheader,nounits'], + stderr=subprocess.DEVNULL, text=True + ).strip().splitlines()[0].strip() + if out.isdigit(): + readings.append(int(out)) + except Exception: + pass + time.sleep(5) + +def _ram_gb(): + try: + import psutil + vm = psutil.virtual_memory() + return round(vm.used / (1024 ** 3), 2) + except Exception: + return '' + +def _vram_peak_gb(readings): + if not readings: + return '' + return round(max(readings) / 1024.0, 2) + + +# ---------- main ---------- + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument('--vbench-json', required=True) + ap.add_argument('--vbench-crop', required=True) + ap.add_argument('--work-dir', required=True) + ap.add_argument('--num-frames', type=int, default=NUM_FRAMES) + args = ap.parse_args() + + work_dir = Path(args.work_dir) + out_base = work_dir / 'outputs' / 'vbench' / 'videos' + stats_path = work_dir / 'outputs' / 'vbench' / 'stats.csv' + + if not Path(args.vbench_json).exists(): + sys.exit(f'[vbench] ERROR: JSON not found: {args.vbench_json}') + if not Path(args.vbench_crop).exists(): + sys.exit(f'[vbench] ERROR: crop dir not found: {args.vbench_crop}') + + with open(args.vbench_json, encoding='utf-8') as f: + entries = json.load(f) + + seen, prompts = set(), [] + for e in entries: + fn = e.get('file_name', '') + if fn in seen: + continue + if e.get('type') not in ALLOWED_TYPES: + continue + seen.add(fn) + cap = e.get('caption') or Path(fn).stem + 
prompts.append({'file': fn, 'caption': cap, 'type': e['type']}) + + total = len(prompts) * NUM_SAMPLES + print(f'\n=== VBench prompt list ({len(prompts)} prompts, types: {", ".join(ALLOWED_TYPES)}) ===') + for i, p in enumerate(prompts): + img_ok = ' ' if (Path(args.vbench_crop) / p['file']).exists() else 'MISSING' + print(f' [{i+1:2}/{len(prompts)}] ({p["type"]:<8}) {img_ok} {p["caption"]}') + print(f'=== {len(prompts)} prompts x {NUM_SAMPLES} samples = {total} runs ===\n') + + out_base.mkdir(parents=True, exist_ok=True) + stats_path.parent.mkdir(parents=True, exist_ok=True) + + stats_is_new = not stats_path.exists() + stats_f = open(stats_path, 'a', newline='', encoding='utf-8') + writer = csv.writer(stats_f) + if stats_is_new: + writer.writerow(['task_idx', 'prompt', 'type', 'sample_idx', 'seed', + 'duration_s', 'gen_fps', 'vram_gb', 'ram_gb', 'out_path', 'status']) + stats_f.flush() + + done = generated = errors = skipped = 0 + t_start = time.time() + + for ti, p in enumerate(prompts): + img_src = Path(args.vbench_crop) / p['file'] + if not img_src.exists(): + print(f'[vbench] skip prompt {ti+1}: image not found - {img_src}') + done += NUM_SAMPLES + continue + + safe_cap = _safe(p['caption']) + safe_cap_u = safe_cap.replace(' ', '_') # sample_5b replaces spaces → underscores + + def _cap_match(name): + return safe_cap in name or safe_cap_u in name + + # samples already completed (renamed with _s{i}_seed{n} suffix) + already = sorted([f for f in out_base.glob('*.mp4') + if _cap_match(f.name) and '_seed' in f.name]) + n_already = len(already) + + # orphans: files from an interrupted run that were never renamed + orphans = sorted([f for f in out_base.glob('*.mp4') + if _cap_match(f.name) and '_seed' not in f.name]) + + # recover orphans with new random seeds + for i, src in enumerate(orphans): + si_abs = n_already + i + seed = random.randint(0, 2**31 - 1) + dst = src.with_stem(f'{src.stem}_s{si_abs}_seed{seed}') + print(f' [recover] {src.name} -> {dst.name}') + 
src.rename(dst) + writer.writerow([ti, p['caption'], p['type'], si_abs, seed, + '', '', '', '', str(dst), 'recovered']) + stats_f.flush() + already.append(dst) + n_already = len(already) + + n_needed = NUM_SAMPLES - n_already + + if n_needed <= 0: + print(f'[vbench] skip prompt {ti+1}: all {NUM_SAMPLES} samples already done') + for i, f in enumerate(already): + writer.writerow([ti, p['caption'], p['type'], i, '', '', '', '', '', str(f), 'skipped']) + stats_f.flush() + skipped += NUM_SAMPLES + done += NUM_SAMPLES + continue + + if n_already > 0: + print(f'[vbench] prompt {ti+1}: {n_already} done, running {n_needed} remaining') + + short_cap = p['caption'][:60] + + # prepare temp dir with exactly this image (shared across samples) + tmp_dir = work_dir / '_vbench_tmp' + tmp_dir.mkdir(exist_ok=True) + for f in tmp_dir.iterdir(): + f.unlink(missing_ok=True) + shutil.copy(img_src, tmp_dir) + + cap_file = work_dir / '_vbench_caption.txt' + cap_file.write_text(p['caption'], encoding='utf-8') + + for si in range(n_needed): + si_abs = n_already + si + seed = random.randint(0, 2**31 - 1) + + pct = round(100 * done / total) if total else 0 + eta = '' + if done > 0: + elapsed = time.time() - t_start + rem = int(elapsed / done * (total - done)) + eta = f' ETA {rem//3600:02d}h{(rem%3600)//60:02d}m{rem%60:02d}s' + print(f'[vbench] [{done+1}/{total} {pct}%{eta}] ' + f'prompt {ti+1}/{len(prompts)} ({p["type"]}) ' + f'sample {si_abs+1}/{NUM_SAMPLES} seed {seed} : {short_cap}') + + vram_readings = [] + stop_evt = threading.Event() + vram_thread = threading.Thread(target=_poll_vram, args=(stop_evt, vram_readings), daemon=True) + vram_thread.start() + + t0 = time.time() + new_mp4 = None + dur = 0.0 + fps = 0.0 + try: + pre_existing = set(out_base.glob('*.mp4')) + + import socket as _socket + with _socket.socket() as _s: + _s.bind(('127.0.0.1', 0)) + _free_port = str(_s.getsockname()[1]) + _pythonpath = str(work_dir) + os.pathsep + os.environ.get('PYTHONPATH', '') + env = {**os.environ, 
+ 'PYTHONPATH': _pythonpath, + 'TOKENIZERS_PARALLELISM': 'false', + 'TF_ENABLE_ONEDNN_OPTS': '0', + 'LOCAL_RANK': '0', 'RANK': '0', 'WORLD_SIZE': '1', + 'MASTER_ADDR': '127.0.0.1', 'MASTER_PORT': _free_port} + subprocess.run([ + sys.executable, 'fastvideo/sample/sample_5b.py', + '--seed', str(seed), + '--gradient_checkpointing', + '--train_batch_size=1', + '--max_sample_steps=1', + '--mixed_precision=bf16', + '--allow_tf32', + '--t5_cpu', + f'--video_output_dir={out_base}', + f'--jpg_dir={tmp_dir}', + f'--caption_path={cap_file}', + '--test_data_dir=./val', + '--num_euler_timesteps', '5', + '--rand_num_img', '0.6', + '--internvl_path', './InternVL3-2B-Instruct', + '--height', '704', + '--width', '1216', + '--num_frames', str(args.num_frames), + '--fps', '24', + ], cwd=str(work_dir), env=env) + + dur = round(time.time() - t0, 2) + fps = round(args.num_frames / dur, 2) + + all_now = set(out_base.glob('*.mp4')) + raw_mp4s = sorted( + [f for f in (all_now - pre_existing) + if _cap_match(f.name) and '_seed' not in f.name], + key=lambda f: f.name + ) + print(f' [detect] {len(raw_mp4s)} new mp4s (pre={len(pre_existing)}, now={len(all_now)})') + + if raw_mp4s: + src = raw_mp4s[0] + dst = src.with_stem(f'{src.stem}_s{si_abs}_seed{seed}') + src.rename(dst) + new_mp4 = dst + else: + print(f' WARNING: expected 1 mp4, got 0') + + except Exception as exc: + print(f' EXCEPTION: {exc}', file=sys.stderr) + finally: + stop_evt.set() + vram_thread.join(timeout=10) + vram = _vram_peak_gb(vram_readings) + ram = _ram_gb() + + if new_mp4: + print(f' done in {dur}s ({fps} gen-fps) VRAM {vram}GB RAM {ram}GB') + writer.writerow([ti, p['caption'], p['type'], si_abs, seed, + dur, fps, vram, ram, str(new_mp4), 'ok']) + generated += 1 + else: + writer.writerow([ti, p['caption'], p['type'], si_abs, seed, + '', '', '', '', '', 'error']) + errors += 1 + print(f'[vbench] ERROR prompt {ti+1} sample {si_abs} "{short_cap}" — continuing') + stats_f.flush() + done += 1 + + shutil.rmtree(tmp_dir, 
ignore_errors=True) + cap_file.unlink(missing_ok=True) + + stats_f.close() + elapsed_m = round((time.time() - t_start) / 60, 1) + print(f'\n[vbench] done - generated={generated} skipped={skipped} errors={errors} elapsed={elapsed_m}m') + print(f'[vbench] stats -> {stats_path}') + + +if __name__ == '__main__': + main() diff --git a/wan23/__pycache__/__init__.cpython-312.pyc b/wan23/__pycache__/__init__.cpython-312.pyc index 6f8d53d..d8b037b 100644 Binary files a/wan23/__pycache__/__init__.cpython-312.pyc and b/wan23/__pycache__/__init__.cpython-312.pyc differ diff --git a/wan23/__pycache__/image2video.cpython-312.pyc b/wan23/__pycache__/image2video.cpython-312.pyc index 322330c..af3d049 100644 Binary files a/wan23/__pycache__/image2video.cpython-312.pyc and b/wan23/__pycache__/image2video.cpython-312.pyc differ diff --git a/wan23/__pycache__/text2video.cpython-312.pyc b/wan23/__pycache__/text2video.cpython-312.pyc index 97d448c..0790da1 100644 Binary files a/wan23/__pycache__/text2video.cpython-312.pyc and b/wan23/__pycache__/text2video.cpython-312.pyc differ diff --git a/wan23/__pycache__/textimage2video.cpython-312.pyc b/wan23/__pycache__/textimage2video.cpython-312.pyc index 85d34b7..66bb647 100644 Binary files a/wan23/__pycache__/textimage2video.cpython-312.pyc and b/wan23/__pycache__/textimage2video.cpython-312.pyc differ diff --git a/wan23/configs/__pycache__/__init__.cpython-312.pyc b/wan23/configs/__pycache__/__init__.cpython-312.pyc index 6567fc0..ab7d856 100644 Binary files a/wan23/configs/__pycache__/__init__.cpython-312.pyc and b/wan23/configs/__pycache__/__init__.cpython-312.pyc differ diff --git a/wan23/configs/__pycache__/shared_config.cpython-312.pyc b/wan23/configs/__pycache__/shared_config.cpython-312.pyc index bad711b..602d543 100644 Binary files a/wan23/configs/__pycache__/shared_config.cpython-312.pyc and b/wan23/configs/__pycache__/shared_config.cpython-312.pyc differ diff --git a/wan23/configs/__pycache__/wan_i2v_A14B.cpython-312.pyc 
b/wan23/configs/__pycache__/wan_i2v_A14B.cpython-312.pyc index 56450e4..658607a 100644 Binary files a/wan23/configs/__pycache__/wan_i2v_A14B.cpython-312.pyc and b/wan23/configs/__pycache__/wan_i2v_A14B.cpython-312.pyc differ diff --git a/wan23/configs/__pycache__/wan_t2v_A14B.cpython-312.pyc b/wan23/configs/__pycache__/wan_t2v_A14B.cpython-312.pyc index 68973be..c46e66b 100644 Binary files a/wan23/configs/__pycache__/wan_t2v_A14B.cpython-312.pyc and b/wan23/configs/__pycache__/wan_t2v_A14B.cpython-312.pyc differ diff --git a/wan23/configs/__pycache__/wan_ti2v_5B.cpython-312.pyc b/wan23/configs/__pycache__/wan_ti2v_5B.cpython-312.pyc index 6df028d..fb267b6 100644 Binary files a/wan23/configs/__pycache__/wan_ti2v_5B.cpython-312.pyc and b/wan23/configs/__pycache__/wan_ti2v_5B.cpython-312.pyc differ diff --git a/wan23/distributed/__pycache__/__init__.cpython-312.pyc b/wan23/distributed/__pycache__/__init__.cpython-312.pyc index a5a5c0e..f27ab25 100644 Binary files a/wan23/distributed/__pycache__/__init__.cpython-312.pyc and b/wan23/distributed/__pycache__/__init__.cpython-312.pyc differ diff --git a/wan23/distributed/__pycache__/fsdp.cpython-312.pyc b/wan23/distributed/__pycache__/fsdp.cpython-312.pyc index 8efbfd8..cdf2ea4 100644 Binary files a/wan23/distributed/__pycache__/fsdp.cpython-312.pyc and b/wan23/distributed/__pycache__/fsdp.cpython-312.pyc differ diff --git a/wan23/distributed/__pycache__/sequence_parallel.cpython-312.pyc b/wan23/distributed/__pycache__/sequence_parallel.cpython-312.pyc index 8d313e7..920cfae 100644 Binary files a/wan23/distributed/__pycache__/sequence_parallel.cpython-312.pyc and b/wan23/distributed/__pycache__/sequence_parallel.cpython-312.pyc differ diff --git a/wan23/distributed/__pycache__/ulysses.cpython-312.pyc b/wan23/distributed/__pycache__/ulysses.cpython-312.pyc index 6f754c9..8051887 100644 Binary files a/wan23/distributed/__pycache__/ulysses.cpython-312.pyc and b/wan23/distributed/__pycache__/ulysses.cpython-312.pyc differ diff 
--git a/wan23/distributed/__pycache__/util.cpython-312.pyc b/wan23/distributed/__pycache__/util.cpython-312.pyc index 4f321cf..3e5c246 100644 Binary files a/wan23/distributed/__pycache__/util.cpython-312.pyc and b/wan23/distributed/__pycache__/util.cpython-312.pyc differ diff --git a/wan23/modules/__pycache__/__init__.cpython-312.pyc b/wan23/modules/__pycache__/__init__.cpython-312.pyc index 154b5b8..32aa80c 100644 Binary files a/wan23/modules/__pycache__/__init__.cpython-312.pyc and b/wan23/modules/__pycache__/__init__.cpython-312.pyc differ diff --git a/wan23/modules/__pycache__/attention.cpython-312.pyc b/wan23/modules/__pycache__/attention.cpython-312.pyc index 26519f9..72cd32e 100644 Binary files a/wan23/modules/__pycache__/attention.cpython-312.pyc and b/wan23/modules/__pycache__/attention.cpython-312.pyc differ diff --git a/wan23/modules/__pycache__/model.cpython-312.pyc b/wan23/modules/__pycache__/model.cpython-312.pyc index 95319b8..b4588f9 100644 Binary files a/wan23/modules/__pycache__/model.cpython-312.pyc and b/wan23/modules/__pycache__/model.cpython-312.pyc differ diff --git a/wan23/modules/__pycache__/t5.cpython-312.pyc b/wan23/modules/__pycache__/t5.cpython-312.pyc index 43c5ef2..6f6b62d 100644 Binary files a/wan23/modules/__pycache__/t5.cpython-312.pyc and b/wan23/modules/__pycache__/t5.cpython-312.pyc differ diff --git a/wan23/modules/__pycache__/tokenizers.cpython-312.pyc b/wan23/modules/__pycache__/tokenizers.cpython-312.pyc index 91cb805..c6f06dd 100644 Binary files a/wan23/modules/__pycache__/tokenizers.cpython-312.pyc and b/wan23/modules/__pycache__/tokenizers.cpython-312.pyc differ diff --git a/wan23/modules/__pycache__/vae2_1.cpython-312.pyc b/wan23/modules/__pycache__/vae2_1.cpython-312.pyc index 39b2088..20f3748 100644 Binary files a/wan23/modules/__pycache__/vae2_1.cpython-312.pyc and b/wan23/modules/__pycache__/vae2_1.cpython-312.pyc differ diff --git a/wan23/modules/__pycache__/vae2_2.cpython-312.pyc 
b/wan23/modules/__pycache__/vae2_2.cpython-312.pyc index c2e308b..64c4caf 100644 Binary files a/wan23/modules/__pycache__/vae2_2.cpython-312.pyc and b/wan23/modules/__pycache__/vae2_2.cpython-312.pyc differ diff --git a/wan23/modules/t5.py b/wan23/modules/t5.py index 8010896..475e0d7 100644 --- a/wan23/modules/t5.py +++ b/wan23/modules/t5.py @@ -25,22 +25,9 @@ def fp16_clamp(x): def init_weights(m): - if isinstance(m, T5LayerNorm): - nn.init.ones_(m.weight) - elif isinstance(m, T5Model): - nn.init.normal_(m.token_embedding.weight, std=1.0) - elif isinstance(m, T5FeedForward): - nn.init.normal_(m.gate[0].weight, std=m.dim**-0.5) - nn.init.normal_(m.fc1.weight, std=m.dim**-0.5) - nn.init.normal_(m.fc2.weight, std=m.dim_ffn**-0.5) - elif isinstance(m, T5Attention): - nn.init.normal_(m.q.weight, std=(m.dim * m.dim_attn)**-0.5) - nn.init.normal_(m.k.weight, std=m.dim**-0.5) - nn.init.normal_(m.v.weight, std=m.dim**-0.5) - nn.init.normal_(m.o.weight, std=(m.num_heads * m.dim_attn)**-0.5) - elif isinstance(m, T5RelativeEmbedding): - nn.init.normal_( - m.embedding.weight, std=(2 * m.num_buckets * m.num_heads)**-0.5) + # Skipped: weights are overwritten by checkpoint load immediately after + # model construction. Running init on UMT5-XXL wastes ~10s per subprocess. + pass class GELU(nn.Module): diff --git a/wan23/textimage2video.py b/wan23/textimage2video.py index 53efcb7..254b53d 100644 --- a/wan23/textimage2video.py +++ b/wan23/textimage2video.py @@ -1,5 +1,6 @@ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved. import gc +import json import logging import math import os @@ -153,9 +154,16 @@ def __init__( torch.zeros(1, 1, self.model.dim, device=self.model.device) ) - self.model = WanModel.from_pretrained(checkpoint_dir) - state_dict = load_file(checkpoint_dir+"/diffusion_pytorch_model.safetensors") - self.model.load_state_dict(state_dict) + # Use load_file (mmap, low RAM) instead of from_pretrained (reads full file into RAM). 
+ # strict=False: sideblock/mask_token/patch_embedding_* are YUME-only additions not + # present in the base checkpoint; they stay freshly initialized (same as original flow). + # Load directly to the target device to avoid holding two copies in RAM. + # load_file with device= loads each tensor directly to CUDA, skipping the + # intermediate CPU buffer that causes OOM on machines with limited RAM. + from safetensors.torch import load_file as _load_file + _ckpt = os.path.join(checkpoint_dir, "diffusion_pytorch_model.safetensors") + state_dict = _load_file(_ckpt, device=str(self.device)) + self.model.load_state_dict(state_dict, strict=False) self.sp_size = 1 diff --git a/wan23/utils/__pycache__/__init__.cpython-312.pyc b/wan23/utils/__pycache__/__init__.cpython-312.pyc deleted file mode 100644 index 072aa21..0000000 Binary files a/wan23/utils/__pycache__/__init__.cpython-312.pyc and /dev/null differ diff --git a/wan23/utils/__pycache__/fm_solvers.cpython-312.pyc b/wan23/utils/__pycache__/fm_solvers.cpython-312.pyc deleted file mode 100644 index 593a987..0000000 Binary files a/wan23/utils/__pycache__/fm_solvers.cpython-312.pyc and /dev/null differ diff --git a/wan23/utils/__pycache__/fm_solvers_unipc.cpython-312.pyc b/wan23/utils/__pycache__/fm_solvers_unipc.cpython-312.pyc deleted file mode 100644 index 2a7317e..0000000 Binary files a/wan23/utils/__pycache__/fm_solvers_unipc.cpython-312.pyc and /dev/null differ diff --git a/wan23/utils/__pycache__/utils.cpython-312.pyc b/wan23/utils/__pycache__/utils.cpython-312.pyc deleted file mode 100644 index 10ae6f6..0000000 Binary files a/wan23/utils/__pycache__/utils.cpython-312.pyc and /dev/null differ