diff --git a/.gitignore b/.gitignore index 2641667..187e562 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,16 @@ *.mp4 +**/__pycache__/ +**/*.pyc +**/*.pyo + +InternVL3-2B-Instruct/ +Yume-5B-720P/ +_vbench_caption.txt +_vbench_tmp/ +outputs/ +requirements-cog.txt +temp_caption_3th person.txt +temp_caption_bus.txt +temp_caption_default.txt +temp_caption_gc.txt +temp_caption_kitchen.txt diff --git a/download_model.py b/download_model.py new file mode 100644 index 0000000..cb3e402 --- /dev/null +++ b/download_model.py @@ -0,0 +1,14 @@ +"""Download Yume model weights from HuggingFace.""" + +from huggingface_hub import snapshot_download + +REPOS = [ + "stdstu123/Yume-5B-720P", + "OpenGVLab/InternVL3-2B-Instruct", +] + +for repo_id in REPOS: + local_dir = f"./{repo_id.split('/')[-1]}" + print(f"Downloading {repo_id} -> {local_dir}") + snapshot_download(repo_id=repo_id, local_dir=local_dir) + print(f"Done: {local_dir}") diff --git a/fastvideo/__pycache__/__init__.cpython-312.pyc b/fastvideo/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000..4f89924 Binary files /dev/null and b/fastvideo/__pycache__/__init__.cpython-312.pyc differ diff --git a/fastvideo/distill/__pycache__/__init__.cpython-312.pyc b/fastvideo/distill/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000..28b5a5b Binary files /dev/null and b/fastvideo/distill/__pycache__/__init__.cpython-312.pyc differ diff --git a/fastvideo/distill/__pycache__/solver.cpython-312.pyc b/fastvideo/distill/__pycache__/solver.cpython-312.pyc new file mode 100644 index 0000000..b084a04 Binary files /dev/null and b/fastvideo/distill/__pycache__/solver.cpython-312.pyc differ diff --git a/fastvideo/models/__pycache__/flash_attn_no_pad.cpython-312.pyc b/fastvideo/models/__pycache__/flash_attn_no_pad.cpython-312.pyc new file mode 100644 index 0000000..5c28256 Binary files /dev/null and b/fastvideo/models/__pycache__/flash_attn_no_pad.cpython-312.pyc differ diff --git 
a/fastvideo/models/hunyuan/__pycache__/__init__.cpython-312.pyc b/fastvideo/models/hunyuan/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000..13622f5 Binary files /dev/null and b/fastvideo/models/hunyuan/__pycache__/__init__.cpython-312.pyc differ diff --git a/fastvideo/models/hunyuan/__pycache__/constants.cpython-312.pyc b/fastvideo/models/hunyuan/__pycache__/constants.cpython-312.pyc new file mode 100644 index 0000000..5cb432d Binary files /dev/null and b/fastvideo/models/hunyuan/__pycache__/constants.cpython-312.pyc differ diff --git a/fastvideo/models/hunyuan/text_encoder/__pycache__/__init__.cpython-312.pyc b/fastvideo/models/hunyuan/text_encoder/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000..1b034e8 Binary files /dev/null and b/fastvideo/models/hunyuan/text_encoder/__pycache__/__init__.cpython-312.pyc differ diff --git a/fastvideo/models/hunyuan/vae/__pycache__/__init__.cpython-312.pyc b/fastvideo/models/hunyuan/vae/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000..f02fad1 Binary files /dev/null and b/fastvideo/models/hunyuan/vae/__pycache__/__init__.cpython-312.pyc differ diff --git a/fastvideo/models/hunyuan/vae/__pycache__/autoencoder_kl_causal_3d.cpython-312.pyc b/fastvideo/models/hunyuan/vae/__pycache__/autoencoder_kl_causal_3d.cpython-312.pyc new file mode 100644 index 0000000..86cddc5 Binary files /dev/null and b/fastvideo/models/hunyuan/vae/__pycache__/autoencoder_kl_causal_3d.cpython-312.pyc differ diff --git a/fastvideo/models/hunyuan/vae/__pycache__/unet_causal_3d_blocks.cpython-312.pyc b/fastvideo/models/hunyuan/vae/__pycache__/unet_causal_3d_blocks.cpython-312.pyc new file mode 100644 index 0000000..89ee2a6 Binary files /dev/null and b/fastvideo/models/hunyuan/vae/__pycache__/unet_causal_3d_blocks.cpython-312.pyc differ diff --git a/fastvideo/models/hunyuan/vae/__pycache__/vae.cpython-312.pyc b/fastvideo/models/hunyuan/vae/__pycache__/vae.cpython-312.pyc new file mode 100644 
index 0000000..7a3478b Binary files /dev/null and b/fastvideo/models/hunyuan/vae/__pycache__/vae.cpython-312.pyc differ diff --git a/fastvideo/models/hunyuan_hf/__pycache__/modeling_hunyuan.cpython-312.pyc b/fastvideo/models/hunyuan_hf/__pycache__/modeling_hunyuan.cpython-312.pyc new file mode 100644 index 0000000..1e42ded Binary files /dev/null and b/fastvideo/models/hunyuan_hf/__pycache__/modeling_hunyuan.cpython-312.pyc differ diff --git a/fastvideo/models/mochi_hf/__pycache__/mochi_latents_utils.cpython-312.pyc b/fastvideo/models/mochi_hf/__pycache__/mochi_latents_utils.cpython-312.pyc new file mode 100644 index 0000000..2a7da29 Binary files /dev/null and b/fastvideo/models/mochi_hf/__pycache__/mochi_latents_utils.cpython-312.pyc differ diff --git a/fastvideo/models/mochi_hf/__pycache__/modeling_mochi.cpython-312.pyc b/fastvideo/models/mochi_hf/__pycache__/modeling_mochi.cpython-312.pyc new file mode 100644 index 0000000..642abad Binary files /dev/null and b/fastvideo/models/mochi_hf/__pycache__/modeling_mochi.cpython-312.pyc differ diff --git a/fastvideo/models/mochi_hf/__pycache__/norm.cpython-312.pyc b/fastvideo/models/mochi_hf/__pycache__/norm.cpython-312.pyc new file mode 100644 index 0000000..da49503 Binary files /dev/null and b/fastvideo/models/mochi_hf/__pycache__/norm.cpython-312.pyc differ diff --git a/fastvideo/models/mochi_hf/__pycache__/pipeline_mochi.cpython-312.pyc b/fastvideo/models/mochi_hf/__pycache__/pipeline_mochi.cpython-312.pyc new file mode 100644 index 0000000..7a5b8ed Binary files /dev/null and b/fastvideo/models/mochi_hf/__pycache__/pipeline_mochi.cpython-312.pyc differ diff --git a/fastvideo/models/mochi_hf/modeling_mochi.py b/fastvideo/models/mochi_hf/modeling_mochi.py index 330f31a..5fd498e 100644 --- a/fastvideo/models/mochi_hf/modeling_mochi.py +++ b/fastvideo/models/mochi_hf/modeling_mochi.py @@ -28,7 +28,14 @@ from diffusers.utils import (USE_PEFT_BACKEND, is_torch_version, logging, scale_lora_layers, unscale_lora_layers) from 
diffusers.utils.torch_utils import maybe_allow_in_graph -from liger_kernel.ops.swiglu import LigerSiLUMulFunction +try: + from liger_kernel.ops.swiglu import LigerSiLUMulFunction +except ModuleNotFoundError: + import torch.nn.functional as _F + class LigerSiLUMulFunction: + @staticmethod + def apply(gate, hidden_states): + return _F.silu(gate) * hidden_states from fastvideo.models.flash_attn_no_pad import flash_attn_no_pad from fastvideo.models.mochi_hf.norm import (MochiLayerNormContinuous, diff --git a/fastvideo/sample/sample.py b/fastvideo/sample/sample.py index 1d0f66a..5860462 100644 --- a/fastvideo/sample/sample.py +++ b/fastvideo/sample/sample.py @@ -912,7 +912,8 @@ def main(args): local_rank = int(os.environ["LOCAL_RANK"]) rank = int(os.environ["RANK"]) world_size = int(os.environ["WORLD_SIZE"]) - dist.init_process_group("nccl", rank=rank, world_size=world_size) + backend = "gloo" if os.name == "nt" else "nccl" + dist.init_process_group(backend, rank=rank, world_size=world_size) # Set independent cache directories for each rank os.environ["TRITON_CACHE_DIR"] = f"/tmp/triton_cache_{rank}" diff --git a/fastvideo/sample/sample_5b.py b/fastvideo/sample/sample_5b.py index cdd1e4e..8f1067a 100644 --- a/fastvideo/sample/sample_5b.py +++ b/fastvideo/sample/sample_5b.py @@ -3,6 +3,7 @@ import argparse import math import os +import re import sys import torchvision import time @@ -867,7 +868,7 @@ def sample_one( # Generate diverse output videos from identical input conditions - max_area = 704 * 1280 + max_area = args.height * args.width # pixel_values_vid = torch.nn.functional.interpolate(pixel_values_vid, size=(544, 960), mode='bilinear', align_corners=False) repeat_nums = 1 @@ -889,14 +890,16 @@ def sample_one( frame = model_input.shape[1] + main_print(f"[SAMPLE] VAE encoding input ({model_input.shape}) ...") model_input = torch.cat([wan_i2v.vae.encode([model_input.to(device)[:,:-32].to(device)])[0], \ - 
wan_i2v.vae.encode([model_input.to(device)[:,-32:].to(device)])[0]],dim=1) + wan_i2v.vae.encode([model_input.to(device)[:,-32:].to(device)])[0]],dim=1) + main_print(f"[SAMPLE] VAE encode done -> latent shape {model_input.shape}") latents = model_input img = model_input[:,:-latent_frame_zero] - + main_print(f"[SAMPLE] wan_i2v.generate (i2v, frame_num={frame}) ...") with torch.no_grad(): arg_c, arg_null, noise, mask2, img = wan_i2v.generate( caption[0], @@ -904,14 +907,17 @@ def sample_one( max_area=max_area, latent_frame_zero=latent_frame_zero, img=img) + main_print("[SAMPLE] wan_i2v.generate done") else: frame = 32 + main_print(f"[SAMPLE] wan_i2v.generate (t2v, frame_num={frame}) ...") with torch.no_grad(): arg_c, arg_null, noise = wan_i2v.generate( caption[0], frame_num=32, max_area=max_area, latent_frame_zero=latent_frame_zero,) + main_print("[SAMPLE] wan_i2v.generate done") @@ -949,15 +955,15 @@ def sample_one( import time start_time = time.time() - + main_print(f"[SAMPLE] Denoising step_sample={step_sample}/{sample_num-1} steps={sample_step} ...") + if not t2v or step_sample > 0: latent = torch.cat([img[0][:, :-latent_frame_zero, :, :], latent[:, -latent_frame_zero:, :, :]], dim=1) #(1. 
- mask2[0]) * img[0] + mask2[0] * latent - print(latent.shape, "nbxkasbcna090-") with torch.no_grad(): with torch.autocast("cuda", dtype=torch.bfloat16): - for i in range(sample_step): + for i in tqdm(range(sample_step), desc="Sampling", unit="step"): latent_model_input = [latent.squeeze(0)] if not t2v or step_sample>0: @@ -981,7 +987,6 @@ def sample_one( # ]) # timestep = temp_ts.unsqueeze(0) - print(latent_model_input[0].shape,"0-2=ffje0r=----------a") noise_pred_cond = transformer(latent_model_input, t=timestep, **arg_c)[0] if i+1 == sample_step: @@ -1010,7 +1015,6 @@ def sample_one( # timestep = torch.stack(timestep) # temp_ts = timestep.flatten() # timestep = temp_ts#.unsqueeze(0) - print(latent_model_input[0].shape,"0-2=ffje0r=----------a") noise_pred_cond = transformer(latent_model_input, t=timestep, flag=False, **arg_c)[0] # # UniPC @@ -1048,11 +1052,12 @@ def sample_one( else: model_input = latent + main_print(f"[SAMPLE] VAE decoding latents ...") + torch.cuda.empty_cache() with torch.autocast("cuda", dtype=torch.bfloat16): - video_cat = scale(vae, model_input[:,-latent_frame_zero:,:,:]) + video_cat = scale(vae, model_input[:,-latent_frame_zero:,:,:]) video = video_cat[:,-frame_zero:] video_all.append(video) - if step_sample > 0: #if video.shape[1] < frame_zero: # video = torch.cat([video[:,0].unsqueeze(1).repeat(1,frame_zero-video.shape[1],1,1),video],dim=1) @@ -1073,11 +1078,16 @@ def sample_one( else: videoid_str = str(videoid) + os.makedirs(video_output_dir, exist_ok=True) + caption_safe = re.sub(r'[\\/:*?"<>|→←↑↓·\s]+', '_', str(caption_ori))[:60] filename = os.path.join( - video_output_dir, - videoid_str+"_"+str(caption_ori)+"_"+str(repeat_num)+"_"+str(rank)+"_"+str(step_sample)+".mp4", - ) - export_to_video(video[0] , filename, fps=16) + video_output_dir, + f"{videoid_str}_{caption_safe}_{repeat_num}_{rank}_{step_sample}.mp4", + ) + main_print(f"[SAMPLE] VAE decode done -> video shape {video[0].shape if hasattr(video[0], 'shape') else len(video[0])} 
frames") + main_print(f"[SAVE] Output path: {filename}") + export_to_video(video[0], filename, fps=args.fps) + main_print(f"[SAVE] Saved: {filename}") if step_sample + 1 < sample_num: @@ -1124,7 +1134,10 @@ def main(args): local_rank = int(os.environ["LOCAL_RANK"]) rank = int(os.environ["RANK"]) world_size = int(os.environ["WORLD_SIZE"]) - dist.init_process_group("nccl", rank=rank, world_size=world_size) + print(f"[rank {rank}] dist init (backend={'gloo' if sys.platform == 'win32' else 'nccl'}) ...") + backend = "gloo" if sys.platform == "win32" else "nccl" + dist.init_process_group(backend, rank=rank, world_size=world_size) + print(f"[rank {rank}] dist init done") # Set independent cache directories for each rank os.environ["TRITON_CACHE_DIR"] = f"/tmp/triton_cache_{rank}" @@ -1146,20 +1159,20 @@ def main(args): ckpt_dir = "./Yume-5B-720P" # Referenced from https://github.com/Wan-Video/Wan2.2 + main_print(f"[INIT] Loading wan23.Yume from {ckpt_dir} ...") wan_i2v = wan23.Yume( config=cfg, checkpoint_dir=ckpt_dir, device_id=device, - ) - transformer = wan_i2v.model + ) + main_print("[INIT] wan23.Yume loaded") + transformer = wan_i2v.model transformer = transformer.eval().requires_grad_(False) main_print( f" Total Sample parameters = {sum(p.numel() for p in transformer.parameters() if p.requires_grad) / 1e6} M" ) - main_print( - f"--> Initializing FSDP with sharding strategy: {args.fsdp_sharding_startegy}" - ) + main_print(f"[INIT] Initializing FSDP with sharding strategy: {args.fsdp_sharding_startegy} ...") fsdp_kwargs, no_split_modules = get_dit_fsdp_kwargs( transformer, args.fsdp_sharding_startegy, @@ -1170,6 +1183,7 @@ def main(args): ) if args.resume_from_checkpoint: + main_print(f"[INIT] Resuming from checkpoint: {args.resume_from_checkpoint} ...") ( transformer, init_steps, @@ -1177,6 +1191,7 @@ def main(args): transformer, args.resume_from_checkpoint, ) + main_print(f"[INIT] Checkpoint resumed (init_steps={init_steps})") from safetensors import safe_open @@ 
-1238,12 +1253,14 @@ def main(args): # transformer.load_state_dict(merged_weights, strict=False) + main_print("[INIT] Casting transformer to bfloat16 and wrapping with FSDP ...") transformer = transformer.to(torch.bfloat16) transformer = FSDP( transformer, **fsdp_kwargs, use_orig_params=True, ) + main_print("[INIT] FSDP wrap done") @@ -1261,38 +1278,52 @@ def main(args): #init t5, clip and vae vae = wan_i2v.vae + main_print("[INIT] dist.barrier ...") dist.barrier() - + main_print("[INIT] barrier passed") + wan_i2v.device = device + main_print("[INIT] Loading denoiser ...") denoiser = load_denoiser() - - print("jpg_dir", args.jpg_dir) + main_print("[INIT] Denoiser loaded") + + main_print(f"[DATA] jpg_dir={args.jpg_dir} video_root_dir={args.video_root_dir} T2V={args.T2V}") image_sample = False dataset_ddp = None dataset_length = None if args.jpg_dir != None and not args.T2V: - dataset_ddp, dataset_length = create_scaled_videos(args.jpg_dir, - total_frames=33, - H1=704, - W1=1280) + main_print(f"[DATA] Building image dataset from {args.jpg_dir} ...") + dataset_ddp, dataset_length = create_scaled_videos(args.jpg_dir, + total_frames=33, + H1=args.height, + W1=args.width) + main_print(f"[DATA] Image dataset ready: {dataset_length} samples") image_sample = True elif not args.T2V: + main_print(f"[DATA] Building video dataset from {args.video_root_dir} ...") dataset_ddp, dataset_length = mp4_data(args.video_root_dir) + main_print(f"[DATA] Video dataset ready: {dataset_length} samples") image_sample = False - - print(dataset_ddp,"dataset_ddpdataset_ddpdataset_ddp") + else: + main_print("[DATA] T2V mode — no dataset loaded") step_times = deque(maxlen=100) - #image_sample = True - # If you want to load a model using multiple GPUs, please refer to the `Multiple GPUs` section. 
- path = '/mnt/petrelfs/maoxiaofeng/Yume_v2_release/InternVL3-2B-Instruct' - camption_model = AutoModel.from_pretrained( - path, - torch_dtype=torch.bfloat16, - low_cpu_mem_usage=True, - use_flash_attn=True, - trust_remote_code=True).eval().to(device) + _local = os.path.abspath(args.internvl_path) + path = _local if os.path.isdir(_local) else "OpenGVLab/InternVL3-2B-Instruct" + main_print(f"[INIT] Loading InternVL caption model from {path} ...") + # FSDP may leave an active DeviceContext("meta") TorchFunctionMode on the stack. + # torch.set_default_device("cpu") only sets a C++ variable and is overridden by + # the higher-priority Python TorchFunctionMode. Pushing a DeviceContext("cpu") + # via the context manager sits on top of any lingering meta context and wins. + with torch.device("cpu"): + camption_model = AutoModel.from_pretrained( + path, + torch_dtype=torch.bfloat16, + low_cpu_mem_usage=False, + use_flash_attn=False, + trust_remote_code=True).eval().to(device) tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False) + main_print("[INIT] InternVL caption model loaded") if args.prompt!=None: prompt1 = args.prompt @@ -1304,7 +1335,9 @@ def main(args): else: date_len = int(dataset_length)//world_size + 1 + main_print(f"[LOOP] Starting inference loop: {date_len-1} step(s), world_size={world_size}") for step in range(1, date_len): + main_print(f"[LOOP] Step {step}/{date_len-1} starting ...") start_time = time.time() torch.cuda.empty_cache() torch.cuda.empty_cache() @@ -1335,6 +1368,7 @@ def main(args): step_time = time.time() - start_time step_times.append(step_time) avg_step_time = sum(step_times) / len(step_times) + main_print(f"[LOOP] Step {step}/{date_len-1} done in {step_time:.1f}s (avg {avg_step_time:.1f}s)") torch.cuda.empty_cache() @@ -1396,6 +1430,9 @@ def main(args): default=None, ) parser.add_argument("--num_frames", type=int, default=163) + parser.add_argument("--height", type=int, default=704) + 
parser.add_argument("--width", type=int, default=1280) + parser.add_argument("--fps", type=int, default=16) parser.add_argument( "--logging_dir", type=str, @@ -1571,6 +1608,12 @@ def main(args): type=str, default=None, ) + parser.add_argument( + "--internvl_path", + type=str, + default="./InternVL3-2B-Instruct", + help="Path to InternVL3-2B-Instruct model dir or HuggingFace repo ID.", + ) args = parser.parse_args() main(args) diff --git a/fastvideo/sample/sample_tts.py b/fastvideo/sample/sample_tts.py index 6037aab..2c82df1 100644 --- a/fastvideo/sample/sample_tts.py +++ b/fastvideo/sample/sample_tts.py @@ -902,6 +902,7 @@ def sample_one( video_output_dir, videoid_str+"_"+str(caption_ori)+"_"+str(step_sample)+"_"+str(repeat_num)+".mp4", ) + print(filename) export_to_video(video[0] , filename, fps=16) if step_sample + 1 < sample_num: diff --git a/fastvideo/utils/__pycache__/checkpoint.cpython-312.pyc b/fastvideo/utils/__pycache__/checkpoint.cpython-312.pyc index 3bca54b..33cf5fa 100644 Binary files a/fastvideo/utils/__pycache__/checkpoint.cpython-312.pyc and b/fastvideo/utils/__pycache__/checkpoint.cpython-312.pyc differ diff --git a/fastvideo/utils/__pycache__/communications.cpython-312.pyc b/fastvideo/utils/__pycache__/communications.cpython-312.pyc new file mode 100644 index 0000000..367b12c Binary files /dev/null and b/fastvideo/utils/__pycache__/communications.cpython-312.pyc differ diff --git a/fastvideo/utils/__pycache__/dataset_utils.cpython-312.pyc b/fastvideo/utils/__pycache__/dataset_utils.cpython-312.pyc new file mode 100644 index 0000000..e788319 Binary files /dev/null and b/fastvideo/utils/__pycache__/dataset_utils.cpython-312.pyc differ diff --git a/fastvideo/utils/__pycache__/fsdp_util.cpython-312.pyc b/fastvideo/utils/__pycache__/fsdp_util.cpython-312.pyc new file mode 100644 index 0000000..fd09562 Binary files /dev/null and b/fastvideo/utils/__pycache__/fsdp_util.cpython-312.pyc differ diff --git a/fastvideo/utils/__pycache__/load.cpython-312.pyc 
b/fastvideo/utils/__pycache__/load.cpython-312.pyc new file mode 100644 index 0000000..a0d67c7 Binary files /dev/null and b/fastvideo/utils/__pycache__/load.cpython-312.pyc differ diff --git a/fastvideo/utils/__pycache__/logging_.cpython-312.pyc b/fastvideo/utils/__pycache__/logging_.cpython-312.pyc index 9bac0b4..cdb2d05 100644 Binary files a/fastvideo/utils/__pycache__/logging_.cpython-312.pyc and b/fastvideo/utils/__pycache__/logging_.cpython-312.pyc differ diff --git a/fastvideo/utils/__pycache__/parallel_states.cpython-312.pyc b/fastvideo/utils/__pycache__/parallel_states.cpython-312.pyc new file mode 100644 index 0000000..4ec8dbb Binary files /dev/null and b/fastvideo/utils/__pycache__/parallel_states.cpython-312.pyc differ diff --git a/requirements.txt b/requirements.txt index 808bee9..83181d6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,172 +1,172 @@ -absl-py==2.3.0 -accelerate==1.0.1 -aiofiles==23.2.1 -aiohappyeyeballs==2.6.1 -aiohttp==3.12.13 -aiosignal==1.3.2 -albucore==0.0.19 -albumentations==1.4.20 -annotated-types==0.7.0 -antlr4-python3-runtime==4.9.3 -anyio==4.9.0 -async-timeout==5.0.1 -attrs==25.3.0 -av==13.1.0 -beautifulsoup4==4.12.3 -bitsandbytes==0.42.0 -blessed==1.21.0 -certifi==2025.6.15 -charset-normalizer==3.4.2 -click==8.2.1 -codespell==2.3.0 -contourpy==1.3.2 -cycler==0.12.1 -dashscope==1.23.5 -decorator==4.4.2 -decord==0.6.0 -diffusers==0.32.0 -docker-pycreds==0.4.0 -easydict==1.13 -einops==0.8.0 -eval_type_backport==0.2.2 -exceptiongroup==1.3.0 -fastapi==0.115.3 -ffmpeg==1.4 -ffmpy==0.6.0 -filelock==3.13.1 -flash-attn==2.7.0.post2 -fonttools==4.58.4 -frozenlist==1.7.0 -fsspec==2024.6.1 -ftfy==6.3.0 -future==1.0.0 -fvcore==0.1.5.post20221221 -gdown==5.2.0 -gitdb==4.0.12 -GitPython==3.1.44 -gpustat==1.1.1 -gradio==5.3.0 -gradio_client==1.4.2 -grpcio==1.73.0 -h11==0.16.0 -h5py==3.12.1 -hf-xet==1.1.5 -hjson==3.1.0 -httpcore==1.0.9 -httpx==0.28.1 -huggingface-hub==0.26.1 -idna==3.6 -imageio==2.36.0 -imageio-ffmpeg==0.5.1 
-importlib_metadata==8.7.0 -inquirerpy==0.3.4 -iopath==0.1.10 -isort==5.13.2 -Jinja2==3.1.4 -joblib==1.5.1 -kiwisolver==1.4.8 -liger_kernel==0.4.1 -lightning-utilities==0.14.3 -loguru==0.7.3 -Markdown==3.8 -markdown-it-py==3.0.0 -MarkupSafe==2.1.5 -matplotlib==3.9.2 -mdurl==0.1.2 -moviepy==1.0.3 -mpmath==1.3.0 -msgpack==1.1.1 -multidict==6.5.0 -mypy==1.11.1 -mypy_extensions==1.1.0 -networkx==3.3 -ninja==1.11.1.4 -numpy==1.26.3 -omegaconf==2.3.0 -opencv-python==4.10.0.84 -opencv-python-headless==4.10.0.84 -orjson==3.10.18 -packaging==25.0 -pandas==2.2.3 -parameterized==0.9.0 -peft==0.13.2 -pfzy==0.3.4 -pillow==10.2.0 -platformdirs==4.3.8 -polib==1.2.0 -portalocker==3.2.0 -proglog==0.1.12 -prompt_toolkit==3.0.51 -propcache==0.3.2 -protobuf==5.28.3 -psutil==7.0.0 -py-cpuinfo==9.0.0 -pydantic==2.9.2 -pydantic_core==2.23.4 -pydub==0.25.1 -Pygments==2.19.1 -pyparsing==3.2.3 -PySocks==1.7.1 -python-dateutil==2.9.0.post0 -python-multipart==0.0.20 -pytorch-lightning==2.4.0 -pytorchvideo==0.1.5 -pytz==2025.2 -PyYAML==6.0.1 -regex==2024.9.11 -requests==2.31.0 -rich==14.0.0 -ruff==0.6.5 -safetensors==0.5.3 -scikit-learn==1.5.2 -scikit-video==1.1.11 -scipy==1.14.1 -semantic-version==2.10.0 -sentencepiece==0.2.0 -sentry-sdk==2.30.0 -setproctitle==1.3.6 -shellingham==1.5.4 -six==1.16.0 -smmap==5.0.2 -sniffio==1.3.1 -soupsieve==2.7 -sphinx-lint==1.0.0 -starlette==0.41.3 -stringzilla==3.12.5 -sympy==1.13.1 -tabulate==0.9.0 -tensorboard==2.18.0 -tensorboard-data-server==0.7.2 -termcolor==3.1.0 -test_tube==0.7.5 -threadpoolctl==3.6.0 -timm==1.0.11 -tokenizers==0.20.1 -toml==0.10.2 -tomli==2.0.2 -tomlkit==0.12.0 -torch==2.5.0+cu121 -torchdiffeq==0.2.4 -torchmetrics==1.5.1 -torchvision==0.20.0+cu121 -tqdm==4.66.5 -transformers==4.46.1 -triton==3.1.0 -typer==0.16.0 -types-PyYAML==6.0.12.20250516 -types-requests==2.32.4.20250611 -types-setuptools==80.9.0.20250529 -typing_extensions==4.12.2 -tzdata==2025.2 -urllib3==2.2.0 -uvicorn==0.32.0 -wandb==0.18.5 -watch==0.2.7 -wcwidth==0.2.13 
-websocket-client==1.8.0 -websockets==12.0 -Werkzeug==3.1.3 -yacs==0.1.8 -yapf==0.32.0 -yarl==1.20.1 -zipp==3.23.0 \ No newline at end of file +absl-py>=2.3.0 +accelerate>=1.0.1 +aiofiles>=23.2.1 +aiohappyeyeballs>=2.6.1 +aiohttp>=3.12.13 +aiosignal>=1.3.2 +albucore>=0.0.19 +albumentations>=1.4.20 +annotated-types>=0.7.0 +antlr4-python3-runtime>=4.9.3 +anyio>=4.9.0 +async-timeout>=5.0.1 +attrs>=25.3.0 +av>=13.1.0 +beautifulsoup4>=4.12.3 +bitsandbytes>=0.42.0 +blessed>=1.21.0 +certifi>=2025.6.15 +charset-normalizer>=3.4.2 +click>=8.2.1 +codespell>=2.3.0 +contourpy>=1.3.2 +cycler>=0.12.1 +dashscope>=1.23.5 +decorator>=4.4.2 +decord>=0.6.0 +diffusers>=0.32.0 +docker-pycreds>=0.4.0 +easydict>=1.13 +einops>=0.8.0 +eval_type_backport>=0.2.2 +exceptiongroup>=1.3.0 +fastapi>=0.115.3 +ffmpeg>=1.4 +ffmpy>=0.6.0 +filelock>=3.13.1 +fonttools>=4.58.4 +frozenlist>=1.7.0 +fsspec>=2024.6.1 +ftfy>=6.3.0 +future>=1.0.0 +fvcore>=0.1.5.post20221221 +gdown>=5.2.0 +gitdb>=4.0.12 +GitPython>=3.1.44 +gpustat>=1.1.1 +gradio>=5.3.0 +gradio_client>=1.4.2 +grpcio>=1.73.0 +h11>=0.16.0 +h5py>=3.12.1 +hf-xet>=1.1.5 +hjson>=3.1.0 +httpcore>=1.0.9 +httpx>=0.28.1 +huggingface-hub>=0.26.1 +idna>=3.6 +imageio>=2.36.0 +imageio-ffmpeg>=0.5.1 +importlib_metadata>=8.7.0 +inquirerpy>=0.3.4 +iopath>=0.1.10 +isort>=5.13.2 +Jinja2>=3.1.4 +joblib>=1.5.1 +kiwisolver>=1.4.8 +liger_kernel>=0.4.1 +lightning-utilities>=0.14.3 +loguru>=0.7.3 +Markdown>=3.8 +markdown-it-py>=3.0.0 +MarkupSafe>=2.1.5 +matplotlib>=3.9.2 +mdurl>=0.1.2 +moviepy>=1.0.3 +mpmath>=1.3.0 +msgpack>=1.1.1 +multidict>=6.5.0 +mypy>=1.11.1 +mypy_extensions>=1.1.0 +networkx>=3.3 +ninja>=1.11.1.4 +numpy>=1.26.3 +omegaconf>=2.3.0 +opencv-python>=4.10.0.84 +opencv-python-headless>=4.10.0.84 +orjson>=3.10.18 +packaging>=25.0 +pandas>=2.2.3 +parameterized>=0.9.0 +peft>=0.13.2 +pfzy>=0.3.4 +pillow>=10.2.0 +platformdirs>=4.3.8 +polib>=1.2.0 +portalocker>=3.2.0 +proglog>=0.1.12 +prompt_toolkit>=3.0.51 +propcache>=0.3.2 +protobuf>=5.28.3 +psutil>=7.0.0 
+py-cpuinfo>=9.0.0 +pydantic>=2.9.2 +pydantic_core>=2.23.4 +pydub>=0.25.1 +Pygments>=2.19.1 +pyparsing>=3.2.3 +PySocks>=1.7.1 +python-dateutil>=2.9.0.post0 +python-multipart>=0.0.20 +pytorch-lightning>=2.4.0 +pytorchvideo>=0.1.5 +pytz>=2025.2 +PyYAML>=6.0.1 +regex>=2024.9.11 +requests>=2.31.0 +rich>=14.0.0 +ruff>=0.6.5 +safetensors>=0.5.3 +scikit-learn>=1.5.2 +scikit-video>=1.1.11 +scipy>=1.14.1 +semantic-version>=2.10.0 +sentencepiece>=0.2.0 +sentry-sdk>=2.30.0 +setproctitle>=1.3.6 +shellingham>=1.5.4 +six>=1.16.0 +smmap>=5.0.2 +sniffio>=1.3.1 +soupsieve>=2.7 +sphinx-lint>=1.0.0 +starlette>=0.41.3 +stringzilla>=3.12.5 +sympy>=1.13.1 +tabulate>=0.9.0 +tensorboard>=2.18.0 +tensorboard-data-server>=0.7.2 +termcolor>=3.1.0 +test_tube>=0.7.5 +threadpoolctl>=3.6.0 +timm>=1.0.11 +tokenizers>=0.20.1 +toml>=0.10.2 +tomli>=2.0.2 +tomlkit>=0.12.0 +#torch>=2.5.0+cu121 +torchdiffeq>=0.2.4 +torchmetrics>=1.5.1 +#torchvision>=0.20.0+cu121 +tqdm>=4.66.5 +transformers>=4.46.1,<5.0 +triton-windows>=3.1.0 +typer>=0.16.0 +types-PyYAML>=6.0.12.20250516 +types-requests>=2.32.4.20250611 +types-setuptools>=80.9.0.20250529 +typing_extensions>=4.12.2 +tzdata>=2025.2 +urllib3>=2.2.0 +uvicorn>=0.32.0 +wandb>=0.18.5 +watch>=0.2.7 +wcwidth>=0.2.13 +websocket-client>=1.8.0 +websockets>=12.0 +Werkzeug>=3.1.3 +yacs>=0.1.8 +yapf>=0.32.0 +yarl>=1.20.1 +zipp>=3.23.0 +flash-attn>=2.7.0.post2 \ No newline at end of file diff --git a/scripts/finetune/finetune.bat b/scripts/finetune/finetune.bat new file mode 100644 index 0000000..9c4c7d3 --- /dev/null +++ b/scripts/finetune/finetune.bat @@ -0,0 +1,30 @@ +@echo off +setlocal + +set TOKENIZERS_PARALLELISM=false +set USE_LIBUV=0 + +cd /d "%~dp0..\.." 
+ +torchrun --nproc_per_node 8 --master_port 29607 ^ + fastvideo/distill_model.py ^ + --seed 42 ^ + --gradient_checkpointing ^ + --train_batch_size=1 ^ + --dataloader_num_workers 4 ^ + --gradient_accumulation_steps=1 ^ + --max_train_steps=600000 ^ + --learning_rate=1e-5 ^ + --discriminator_learning_rate=1e-5 ^ + --mixed_precision="bf16" ^ + --checkpointing_steps=25 ^ + --validation_steps 24 ^ + --allow_tf32 ^ + --MVDT ^ + --Distil ^ + --t5_cpu ^ + --root_dir="./mp4_frame" ^ + --full_mp4="./Sekai/" ^ + --output_dir="./outputs" + +exit /b %ERRORLEVEL% diff --git a/scripts/inference/sample.bat b/scripts/inference/sample.bat new file mode 100644 index 0000000..5739059 --- /dev/null +++ b/scripts/inference/sample.bat @@ -0,0 +1,67 @@ +@echo off +setlocal enabledelayedexpansion + +set TOKENIZERS_PARALLELISM=false +set TF_ENABLE_ONEDNN_OPTS=0 +set LOCAL_RANK=0 +set RANK=0 +set WORLD_SIZE=1 +set MASTER_ADDR=127.0.0.1 +set MASTER_PORT=29500 + +set EXAMPLE_DIR=C:\workspace\world\Infinite-World\assets\example_case + +cd /d "%~dp0..\.." + +:: Write default caption file once +echo A first-person view exploring an interactive game scene. > temp_caption_default.txt + +for /f %%T in ('powershell -NoProfile -Command "Get-Date -Format ''yyyyMMdd_HHmmss''"') do set RUN_TS=%%T +for /f %%t in ('powershell -NoProfile -Command "[int64](Get-Date).Ticks"') do set OVERALL_START=%%t +set CASE_NUM=0 + +goto :main + +:: ------------------------------------------------------- +:: Subroutine: reads RC_DIR, RC_NAME, RC_CAPTION from env +:: ------------------------------------------------------- +:run_case + set /a CASE_NUM+=1 + echo. + echo === !CASE_NUM!: !RC_NAME! === + echo output: ./outputs/%RUN_TS%/!RC_NAME! 
+ for /f %%t in ('powershell -NoProfile -Command "[int64](Get-Date).Ticks"') do set CASE_START=%%t + + python fastvideo/sample/sample.py ^ + --seed 42 ^ + --gradient_checkpointing ^ + --train_batch_size=1 ^ + --max_sample_steps=600000 ^ + --mixed_precision="bf16" ^ + --allow_tf32 ^ + --video_output_dir="./outputs/%RUN_TS%/!RC_NAME!" ^ + --jpg_dir="!RC_DIR!" ^ + --caption_path="!RC_CAPTION!" ^ + --test_data_dir="./val" ^ + --num_euler_timesteps 50 ^ + --rand_num_img 0.6 ^ + --t5_cpu + + powershell -NoProfile -Command "$e=([int64](Get-Date).Ticks-!CASE_START!)/1e7; $fps=[math]::Round(163/$e,2); Write-Host (' time='+[math]::Round($e,1)+'s inference='+$fps+'fps video=16fps')" + exit /b 0 + +:main + +for /d %%D in (%EXAMPLE_DIR%\*) do ( + set RC_NAME=%%~nxD + set RC_DIR=%%D + set RC_CAPTION=temp_caption_default.txt + if exist "%%D\prompt.txt" set RC_CAPTION=%%D\prompt.txt + call :run_case +) + +echo. +powershell -NoProfile -Command "$e=([int64](Get-Date).Ticks-%OVERALL_START%)/1e7; Write-Host ('Total: !CASE_NUM! cases')" + +del temp_caption_default.txt 2>nul +exit /b 0 diff --git a/scripts/inference/sample.sh b/scripts/inference/sample.sh index 06f4468..b6c4ca2 100644 --- a/scripts/inference/sample.sh +++ b/scripts/inference/sample.sh @@ -3,6 +3,7 @@ # DATA_DIR=./data # IP=[MASTER NODE IP] export TOKENIZERS_PARALLELISM=false +export USE_LIBUV=0 torchrun --nproc_per_node 1 --master_port 29709 \ fastvideo/sample/sample.py \ diff --git a/scripts/inference/sample_5b.bat b/scripts/inference/sample_5b.bat new file mode 100644 index 0000000..e7084d0 --- /dev/null +++ b/scripts/inference/sample_5b.bat @@ -0,0 +1,81 @@ +@echo off +setlocal enabledelayedexpansion + +set TOKENIZERS_PARALLELISM=false +set TF_ENABLE_ONEDNN_OPTS=0 +set LOCAL_RANK=0 +set RANK=0 +set WORLD_SIZE=1 +set MASTER_ADDR=127.0.0.1 +set MASTER_PORT=29500 + +set EXAMPLE_DIR=C:\workspace\world\Infinite-World\assets\example_case + +cd /d "%~dp0..\.." 
+ +for /f "tokens=*" %%T in ('powershell -NoProfile -Command "Get-Date -Format ''yyyyMMdd_HHmmss''" 2^>nul') do if not "%%T"=="" set RUN_TS=%%T +for /f %%t in ('powershell -NoProfile -Command "[int64](Get-Date).Ticks"') do set OVERALL_START=%%t +echo Output: %CD%\outputs\%RUN_TS% +set CASE_NUM=0 + +:: Count total cases +set TOTAL_CASES=0 +for /d %%D in (%EXAMPLE_DIR%\*) do set /a TOTAL_CASES+=1 +echo Found %TOTAL_CASES% cases output: ./outputs/%RUN_TS% + +goto :main + +:: ------------------------------------------------------- +:: Subroutine: reads RC_DIR, RC_NAME, RC_CAPTION from env +:: ------------------------------------------------------- +:run_case + set /a CASE_NUM+=1 + echo. + echo === !CASE_NUM!/%TOTAL_CASES%: !RC_NAME! === + echo output: ./outputs/%RUN_TS%/!RC_NAME! + echo img: !RC_DIR! + echo prompt: !RC_CAPTION! + type "!RC_CAPTION!" + echo. + for /f %%t in ('powershell -NoProfile -Command "[int64](Get-Date).Ticks"') do set CASE_START=%%t + + python fastvideo/sample/sample_5b.py ^ + --seed 43 ^ + --gradient_checkpointing ^ + --train_batch_size=1 ^ + --max_sample_steps=1 ^ + --mixed_precision="bf16" ^ + --allow_tf32 ^ + --t5_cpu ^ + --video_output_dir="./outputs/%RUN_TS%/!RC_NAME!" ^ + --jpg_dir="!RC_DIR!" ^ + --caption_path="!RC_CAPTION!" ^ + --test_data_dir="./val" ^ + --num_euler_timesteps 8 ^ + --rand_num_img 0.6 ^ + --internvl_path "./InternVL3-2B-Instruct" + + powershell -NoProfile -Command "$e=([int64](Get-Date).Ticks-!CASE_START!)/1e7; $tot=([int64](Get-Date).Ticks-%OVERALL_START%)/1e7; $fps=[math]::Round(163/$e,2); $rem=(%TOTAL_CASES%-!CASE_NUM!)*($tot/!CASE_NUM!); Write-Host (' time='+[math]::Round($e,1)+'s fps='+$fps+' ETA='+[math]::Round($rem,0)+'s')" + exit /b 0 + +:main + +for /d %%D in (%EXAMPLE_DIR%\*) do ( + set RC_NAME=%%~nxD + set RC_DIR=%%D + echo A %%~nxD scene. 
> temp_caption_%%~nxD.txt + set RC_CAPTION=temp_caption_%%~nxD.txt + if exist "%%D\prompt.txt" set RC_CAPTION=%%D\prompt.txt + if exist "%%D\prompt.json" ( + powershell -NoProfile -Command "(Get-Content '%%D\prompt.json' -Raw | ConvertFrom-Json).prompt | Set-Content 'temp_prompt_%%~nxD.txt' -Encoding UTF8" + set RC_CAPTION=temp_prompt_%%~nxD.txt + ) + call :run_case +) + +echo. +powershell -NoProfile -Command "$e=([int64](Get-Date).Ticks-%OVERALL_START%)/1e7; Write-Host ('Total: %TOTAL_CASES% cases in '+[math]::Round($e,1)+'s avg='+[math]::Round($e/%TOTAL_CASES%,1)+'s/case')" + +del temp_caption_*.txt 2>nul +del temp_prompt_*.txt 2>nul +exit /b 0 diff --git a/scripts/inference/sample_5b.sh b/scripts/inference/sample_5b.sh index b0bc4b2..3777baf 100755 --- a/scripts/inference/sample_5b.sh +++ b/scripts/inference/sample_5b.sh @@ -3,6 +3,7 @@ # DATA_DIR=./data # IP=[MASTER NODE IP] export TOKENIZERS_PARALLELISM=false +export USE_LIBUV=0 torchrun --nproc_per_node 2 --master_port 29717 \ fastvideo/sample/sample_5b.py \ diff --git a/scripts/inference/sample_5b_vbench.bat b/scripts/inference/sample_5b_vbench.bat new file mode 100644 index 0000000..ad72d7e --- /dev/null +++ b/scripts/inference/sample_5b_vbench.bat @@ -0,0 +1,25 @@ +@echo off +setlocal enabledelayedexpansion + +set PY_SCRIPT=%~dp0vbench_runner.py + +cd /d "%~dp0..\.." 
+ +:: Default paths relative to this repo — override by passing them as arguments: +:: sample_5b_vbench.bat [VBENCH_JSON] [VBENCH_CROP] [NUM_FRAMES] +if not "%~1"=="" (set VBENCH_JSON=%~1) else (set VBENCH_JSON=%CD%\..\VBench\vbench2_beta_i2v\vbench2_beta_i2v\data\i2v-bench-info.json) +if not "%~2"=="" (set VBENCH_CROP=%~2) else (set VBENCH_CROP=%CD%\..\VBench\vbench2_beta_i2v\vbench2_beta_i2v\data\crop\1-1) +if not "%~3"=="" (set NUM_FRAMES=%~3) else (set NUM_FRAMES=161) + +echo Output: %CD%\outputs\vbench +echo VBench: %VBENCH_JSON% +echo Crop: %VBENCH_CROP% +echo Frames: %NUM_FRAMES% + +python "%PY_SCRIPT%" ^ + --vbench-json "%VBENCH_JSON%" ^ + --vbench-crop "%VBENCH_CROP%" ^ + --work-dir "%CD%" ^ + --num-frames %NUM_FRAMES% + +exit /b %ERRORLEVEL% diff --git a/scripts/inference/sample_image.bat b/scripts/inference/sample_image.bat new file mode 100644 index 0000000..be64dc6 --- /dev/null +++ b/scripts/inference/sample_image.bat @@ -0,0 +1,29 @@ +@echo off +setlocal + +set TOKENIZERS_PARALLELISM=false +set TF_ENABLE_ONEDNN_OPTS=0 +set LOCAL_RANK=0 +set RANK=0 +set WORLD_SIZE=1 +set MASTER_ADDR=127.0.0.1 +set MASTER_PORT=29500 + +cd /d "%~dp0..\.." 
+ +python fastvideo/sample/sample.py ^ + --seed 42 ^ + --gradient_checkpointing ^ + --train_batch_size=1 ^ + --max_sample_steps=600000 ^ + --mixed_precision="bf16" ^ + --allow_tf32 ^ + --video_output_dir="./outputs" ^ + --jpg_dir="./jpg/" ^ + --caption_path="./caption.txt" ^ + --test_data_dir="./val" ^ + --num_euler_timesteps 50 ^ + --rand_num_img 0.6 ^ + --t5_cpu + +exit /b %ERRORLEVEL% diff --git a/scripts/inference/sample_image.sh b/scripts/inference/sample_image.sh index e2f3e16..779b58c 100644 --- a/scripts/inference/sample_image.sh +++ b/scripts/inference/sample_image.sh @@ -3,6 +3,7 @@ # DATA_DIR=./data # IP=[MASTER NODE IP] export TOKENIZERS_PARALLELISM=false +export USE_LIBUV=0 torchrun --nproc_per_node 1 --master_port 29707 \ fastvideo/sample/sample.py \ diff --git a/scripts/inference/sample_tts.bat b/scripts/inference/sample_tts.bat new file mode 100644 index 0000000..cd162a8 --- /dev/null +++ b/scripts/inference/sample_tts.bat @@ -0,0 +1,29 @@ +@echo off +setlocal + +set TOKENIZERS_PARALLELISM=false +set TF_ENABLE_ONEDNN_OPTS=0 +set LOCAL_RANK=0 +set RANK=0 +set WORLD_SIZE=1 +set MASTER_ADDR=127.0.0.1 +set MASTER_PORT=29500 + +cd /d "%~dp0..\.." 
+ +python fastvideo/sample/sample_tts.py ^ + --seed 42 ^ + --gradient_checkpointing ^ + --train_batch_size=1 ^ + --max_sample_steps=600000 ^ + --mixed_precision="bf16" ^ + --allow_tf32 ^ + --video_output_dir="./outputs" ^ + --jpg_dir="./jpg/" ^ + --caption_path="./caption.txt" ^ + --test_data_dir="./val" ^ + --num_euler_timesteps 50 ^ + --rand_num_img 0.6 ^ + --t5_cpu + +exit /b %ERRORLEVEL% diff --git a/scripts/inference/sample_tts.sh b/scripts/inference/sample_tts.sh index 37a4251..d1807fe 100644 --- a/scripts/inference/sample_tts.sh +++ b/scripts/inference/sample_tts.sh @@ -3,6 +3,7 @@ # DATA_DIR=./data # IP=[MASTER NODE IP] export TOKENIZERS_PARALLELISM=false +export USE_LIBUV=0 torchrun --nproc_per_node 1 --master_port 29708 \ fastvideo/sample/sample_tts.py \ diff --git a/scripts/inference/vbench_runner.py b/scripts/inference/vbench_runner.py new file mode 100644 index 0000000..8b5d102 --- /dev/null +++ b/scripts/inference/vbench_runner.py @@ -0,0 +1,280 @@ +""" +vbench_runner.py -- Python replacement for vbench_runner.ps1 +Calls sample_5b.py once per prompt with NUM_SAMPLES caption lines and a +fresh random seed, so the model loads once per prompt instead of once per sample. 
+""" +import argparse +import csv +import json +import os +import random +import re +import shutil +import subprocess +import sys +import threading +import time +from pathlib import Path + +ALLOWED_TYPES = ['indoor', 'scenery'] +NUM_SAMPLES = 5 +NUM_FRAMES = 161 + + +# ---------- helpers ---------- + +def _safe(s, maxlen=80): + return re.sub(r'[<>:"/\\|?*]', '_', s)[:maxlen] + +def _poll_vram(stop_event, readings): + while not stop_event.is_set(): + try: + out = subprocess.check_output( + ['nvidia-smi', '--query-gpu=memory.used', '--format=csv,noheader,nounits'], + stderr=subprocess.DEVNULL, text=True + ).strip().splitlines()[0].strip() + if out.isdigit(): + readings.append(int(out)) + except Exception: + pass + time.sleep(5) + +def _ram_gb(): + try: + import psutil + vm = psutil.virtual_memory() + return round(vm.used / (1024 ** 3), 2) + except Exception: + return '' + +def _vram_peak_gb(readings): + if not readings: + return '' + return round(max(readings) / 1024.0, 2) + + +# ---------- main ---------- + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument('--vbench-json', required=True) + ap.add_argument('--vbench-crop', required=True) + ap.add_argument('--work-dir', required=True) + ap.add_argument('--num-frames', type=int, default=NUM_FRAMES) + args = ap.parse_args() + + work_dir = Path(args.work_dir) + out_base = work_dir / 'outputs' / 'vbench' / 'videos' + stats_path = work_dir / 'outputs' / 'vbench' / 'stats.csv' + + if not Path(args.vbench_json).exists(): + sys.exit(f'[vbench] ERROR: JSON not found: {args.vbench_json}') + if not Path(args.vbench_crop).exists(): + sys.exit(f'[vbench] ERROR: crop dir not found: {args.vbench_crop}') + + with open(args.vbench_json, encoding='utf-8') as f: + entries = json.load(f) + + seen, prompts = set(), [] + for e in entries: + fn = e.get('file_name', '') + if fn in seen: + continue + if e.get('type') not in ALLOWED_TYPES: + continue + seen.add(fn) + cap = e.get('caption') or Path(fn).stem + 
prompts.append({'file': fn, 'caption': cap, 'type': e['type']}) + + total = len(prompts) * NUM_SAMPLES + print(f'\n=== VBench prompt list ({len(prompts)} prompts, types: {", ".join(ALLOWED_TYPES)}) ===') + for i, p in enumerate(prompts): + img_ok = ' ' if (Path(args.vbench_crop) / p['file']).exists() else 'MISSING' + print(f' [{i+1:2}/{len(prompts)}] ({p["type"]:<8}) {img_ok} {p["caption"]}') + print(f'=== {len(prompts)} prompts x {NUM_SAMPLES} samples = {total} runs ===\n') + + out_base.mkdir(parents=True, exist_ok=True) + stats_path.parent.mkdir(parents=True, exist_ok=True) + + stats_is_new = not stats_path.exists() + stats_f = open(stats_path, 'a', newline='', encoding='utf-8') + writer = csv.writer(stats_f) + if stats_is_new: + writer.writerow(['task_idx', 'prompt', 'type', 'sample_idx', 'seed', + 'duration_s', 'gen_fps', 'vram_gb', 'ram_gb', 'out_path', 'status']) + stats_f.flush() + + done = generated = errors = skipped = 0 + t_start = time.time() + + for ti, p in enumerate(prompts): + img_src = Path(args.vbench_crop) / p['file'] + if not img_src.exists(): + print(f'[vbench] skip prompt {ti+1}: image not found - {img_src}') + done += NUM_SAMPLES + continue + + safe_cap = _safe(p['caption']) + safe_cap_u = safe_cap.replace(' ', '_') # sample_5b replaces spaces → underscores + + def _cap_match(name): + return safe_cap in name or safe_cap_u in name + + # samples already completed (renamed with _s{i}_seed{n} suffix) + already = sorted([f for f in out_base.glob('*.mp4') + if _cap_match(f.name) and '_seed' in f.name]) + n_already = len(already) + + # orphans: files from an interrupted run that were never renamed + orphans = sorted([f for f in out_base.glob('*.mp4') + if _cap_match(f.name) and '_seed' not in f.name]) + + # recover orphans with new random seeds + for i, src in enumerate(orphans): + si_abs = n_already + i + seed = random.randint(0, 2**31 - 1) + dst = src.with_stem(f'{src.stem}_s{si_abs}_seed{seed}') + print(f' [recover] {src.name} -> {dst.name}') + 
src.rename(dst) + writer.writerow([ti, p['caption'], p['type'], si_abs, seed, + '', '', '', '', str(dst), 'recovered']) + stats_f.flush() + already.append(dst) + n_already = len(already) + + n_needed = NUM_SAMPLES - n_already + + if n_needed <= 0: + print(f'[vbench] skip prompt {ti+1}: all {NUM_SAMPLES} samples already done') + for i, f in enumerate(already): + writer.writerow([ti, p['caption'], p['type'], i, '', '', '', '', '', str(f), 'skipped']) + stats_f.flush() + skipped += NUM_SAMPLES + done += NUM_SAMPLES + continue + + if n_already > 0: + print(f'[vbench] prompt {ti+1}: {n_already} done, running {n_needed} remaining') + + short_cap = p['caption'][:60] + + # prepare temp dir with exactly this image (shared across samples) + tmp_dir = work_dir / '_vbench_tmp' + tmp_dir.mkdir(exist_ok=True) + for f in tmp_dir.iterdir(): + f.unlink(missing_ok=True) + shutil.copy(img_src, tmp_dir) + + cap_file = work_dir / '_vbench_caption.txt' + cap_file.write_text(p['caption'], encoding='utf-8') + + for si in range(n_needed): + si_abs = n_already + si + seed = random.randint(0, 2**31 - 1) + + pct = round(100 * done / total) if total else 0 + eta = '' + if done > 0: + elapsed = time.time() - t_start + rem = int(elapsed / done * (total - done)) + eta = f' ETA {rem//3600:02d}h{(rem%3600)//60:02d}m{rem%60:02d}s' + print(f'[vbench] [{done+1}/{total} {pct}%{eta}] ' + f'prompt {ti+1}/{len(prompts)} ({p["type"]}) ' + f'sample {si_abs+1}/{NUM_SAMPLES} seed {seed} : {short_cap}') + + vram_readings = [] + stop_evt = threading.Event() + vram_thread = threading.Thread(target=_poll_vram, args=(stop_evt, vram_readings), daemon=True) + vram_thread.start() + + t0 = time.time() + new_mp4 = None + dur = 0.0 + fps = 0.0 + try: + pre_existing = set(out_base.glob('*.mp4')) + + import socket as _socket + with _socket.socket() as _s: + _s.bind(('127.0.0.1', 0)) + _free_port = str(_s.getsockname()[1]) + _pythonpath = str(work_dir) + os.pathsep + os.environ.get('PYTHONPATH', '') + env = {**os.environ, 
+ 'PYTHONPATH': _pythonpath, + 'TOKENIZERS_PARALLELISM': 'false', + 'TF_ENABLE_ONEDNN_OPTS': '0', + 'LOCAL_RANK': '0', 'RANK': '0', 'WORLD_SIZE': '1', + 'MASTER_ADDR': '127.0.0.1', 'MASTER_PORT': _free_port} + subprocess.run([ + sys.executable, 'fastvideo/sample/sample_5b.py', + '--seed', str(seed), + '--gradient_checkpointing', + '--train_batch_size=1', + '--max_sample_steps=1', + '--mixed_precision=bf16', + '--allow_tf32', + '--t5_cpu', + f'--video_output_dir={out_base}', + f'--jpg_dir={tmp_dir}', + f'--caption_path={cap_file}', + '--test_data_dir=./val', + '--num_euler_timesteps', '5', + '--rand_num_img', '0.6', + '--internvl_path', './InternVL3-2B-Instruct', + '--height', '704', + '--width', '1216', + '--num_frames', str(args.num_frames), + '--fps', '24', + ], cwd=str(work_dir), env=env) + + dur = round(time.time() - t0, 2) + fps = round(args.num_frames / dur, 2) + + all_now = set(out_base.glob('*.mp4')) + raw_mp4s = sorted( + [f for f in (all_now - pre_existing) + if _cap_match(f.name) and '_seed' not in f.name], + key=lambda f: f.name + ) + print(f' [detect] {len(raw_mp4s)} new mp4s (pre={len(pre_existing)}, now={len(all_now)})') + + if raw_mp4s: + src = raw_mp4s[0] + dst = src.with_stem(f'{src.stem}_s{si_abs}_seed{seed}') + src.rename(dst) + new_mp4 = dst + else: + print(f' WARNING: expected 1 mp4, got 0') + + except Exception as exc: + print(f' EXCEPTION: {exc}', file=sys.stderr) + finally: + stop_evt.set() + vram_thread.join(timeout=10) + vram = _vram_peak_gb(vram_readings) + ram = _ram_gb() + + if new_mp4: + print(f' done in {dur}s ({fps} gen-fps) VRAM {vram}GB RAM {ram}GB') + writer.writerow([ti, p['caption'], p['type'], si_abs, seed, + dur, fps, vram, ram, str(new_mp4), 'ok']) + generated += 1 + else: + writer.writerow([ti, p['caption'], p['type'], si_abs, seed, + '', '', '', '', '', 'error']) + errors += 1 + print(f'[vbench] ERROR prompt {ti+1} sample {si_abs} "{short_cap}" — continuing') + stats_f.flush() + done += 1 + + shutil.rmtree(tmp_dir, 
ignore_errors=True) + cap_file.unlink(missing_ok=True) + + stats_f.close() + elapsed_m = round((time.time() - t_start) / 60, 1) + print(f'\n[vbench] done - generated={generated} skipped={skipped} errors={errors} elapsed={elapsed_m}m') + print(f'[vbench] stats -> {stats_path}') + + +if __name__ == '__main__': + main() diff --git a/wan23/__pycache__/__init__.cpython-312.pyc b/wan23/__pycache__/__init__.cpython-312.pyc index 6f8d53d..d8b037b 100644 Binary files a/wan23/__pycache__/__init__.cpython-312.pyc and b/wan23/__pycache__/__init__.cpython-312.pyc differ diff --git a/wan23/__pycache__/image2video.cpython-312.pyc b/wan23/__pycache__/image2video.cpython-312.pyc index 322330c..af3d049 100644 Binary files a/wan23/__pycache__/image2video.cpython-312.pyc and b/wan23/__pycache__/image2video.cpython-312.pyc differ diff --git a/wan23/__pycache__/text2video.cpython-312.pyc b/wan23/__pycache__/text2video.cpython-312.pyc index 97d448c..0790da1 100644 Binary files a/wan23/__pycache__/text2video.cpython-312.pyc and b/wan23/__pycache__/text2video.cpython-312.pyc differ diff --git a/wan23/__pycache__/textimage2video.cpython-312.pyc b/wan23/__pycache__/textimage2video.cpython-312.pyc index 85d34b7..66bb647 100644 Binary files a/wan23/__pycache__/textimage2video.cpython-312.pyc and b/wan23/__pycache__/textimage2video.cpython-312.pyc differ diff --git a/wan23/configs/__pycache__/__init__.cpython-312.pyc b/wan23/configs/__pycache__/__init__.cpython-312.pyc index 6567fc0..ab7d856 100644 Binary files a/wan23/configs/__pycache__/__init__.cpython-312.pyc and b/wan23/configs/__pycache__/__init__.cpython-312.pyc differ diff --git a/wan23/configs/__pycache__/shared_config.cpython-312.pyc b/wan23/configs/__pycache__/shared_config.cpython-312.pyc index bad711b..602d543 100644 Binary files a/wan23/configs/__pycache__/shared_config.cpython-312.pyc and b/wan23/configs/__pycache__/shared_config.cpython-312.pyc differ diff --git a/wan23/configs/__pycache__/wan_i2v_A14B.cpython-312.pyc 
b/wan23/configs/__pycache__/wan_i2v_A14B.cpython-312.pyc index 56450e4..658607a 100644 Binary files a/wan23/configs/__pycache__/wan_i2v_A14B.cpython-312.pyc and b/wan23/configs/__pycache__/wan_i2v_A14B.cpython-312.pyc differ diff --git a/wan23/configs/__pycache__/wan_t2v_A14B.cpython-312.pyc b/wan23/configs/__pycache__/wan_t2v_A14B.cpython-312.pyc index 68973be..c46e66b 100644 Binary files a/wan23/configs/__pycache__/wan_t2v_A14B.cpython-312.pyc and b/wan23/configs/__pycache__/wan_t2v_A14B.cpython-312.pyc differ diff --git a/wan23/configs/__pycache__/wan_ti2v_5B.cpython-312.pyc b/wan23/configs/__pycache__/wan_ti2v_5B.cpython-312.pyc index 6df028d..fb267b6 100644 Binary files a/wan23/configs/__pycache__/wan_ti2v_5B.cpython-312.pyc and b/wan23/configs/__pycache__/wan_ti2v_5B.cpython-312.pyc differ diff --git a/wan23/distributed/__pycache__/__init__.cpython-312.pyc b/wan23/distributed/__pycache__/__init__.cpython-312.pyc index a5a5c0e..f27ab25 100644 Binary files a/wan23/distributed/__pycache__/__init__.cpython-312.pyc and b/wan23/distributed/__pycache__/__init__.cpython-312.pyc differ diff --git a/wan23/distributed/__pycache__/fsdp.cpython-312.pyc b/wan23/distributed/__pycache__/fsdp.cpython-312.pyc index 8efbfd8..cdf2ea4 100644 Binary files a/wan23/distributed/__pycache__/fsdp.cpython-312.pyc and b/wan23/distributed/__pycache__/fsdp.cpython-312.pyc differ diff --git a/wan23/distributed/__pycache__/sequence_parallel.cpython-312.pyc b/wan23/distributed/__pycache__/sequence_parallel.cpython-312.pyc index 8d313e7..920cfae 100644 Binary files a/wan23/distributed/__pycache__/sequence_parallel.cpython-312.pyc and b/wan23/distributed/__pycache__/sequence_parallel.cpython-312.pyc differ diff --git a/wan23/distributed/__pycache__/ulysses.cpython-312.pyc b/wan23/distributed/__pycache__/ulysses.cpython-312.pyc index 6f754c9..8051887 100644 Binary files a/wan23/distributed/__pycache__/ulysses.cpython-312.pyc and b/wan23/distributed/__pycache__/ulysses.cpython-312.pyc differ diff 
--git a/wan23/distributed/__pycache__/util.cpython-312.pyc b/wan23/distributed/__pycache__/util.cpython-312.pyc index 4f321cf..3e5c246 100644 Binary files a/wan23/distributed/__pycache__/util.cpython-312.pyc and b/wan23/distributed/__pycache__/util.cpython-312.pyc differ diff --git a/wan23/modules/__pycache__/__init__.cpython-312.pyc b/wan23/modules/__pycache__/__init__.cpython-312.pyc index 154b5b8..32aa80c 100644 Binary files a/wan23/modules/__pycache__/__init__.cpython-312.pyc and b/wan23/modules/__pycache__/__init__.cpython-312.pyc differ diff --git a/wan23/modules/__pycache__/attention.cpython-312.pyc b/wan23/modules/__pycache__/attention.cpython-312.pyc index 26519f9..72cd32e 100644 Binary files a/wan23/modules/__pycache__/attention.cpython-312.pyc and b/wan23/modules/__pycache__/attention.cpython-312.pyc differ diff --git a/wan23/modules/__pycache__/model.cpython-312.pyc b/wan23/modules/__pycache__/model.cpython-312.pyc index 95319b8..b4588f9 100644 Binary files a/wan23/modules/__pycache__/model.cpython-312.pyc and b/wan23/modules/__pycache__/model.cpython-312.pyc differ diff --git a/wan23/modules/__pycache__/t5.cpython-312.pyc b/wan23/modules/__pycache__/t5.cpython-312.pyc index 43c5ef2..6f6b62d 100644 Binary files a/wan23/modules/__pycache__/t5.cpython-312.pyc and b/wan23/modules/__pycache__/t5.cpython-312.pyc differ diff --git a/wan23/modules/__pycache__/tokenizers.cpython-312.pyc b/wan23/modules/__pycache__/tokenizers.cpython-312.pyc index 91cb805..c6f06dd 100644 Binary files a/wan23/modules/__pycache__/tokenizers.cpython-312.pyc and b/wan23/modules/__pycache__/tokenizers.cpython-312.pyc differ diff --git a/wan23/modules/__pycache__/vae2_1.cpython-312.pyc b/wan23/modules/__pycache__/vae2_1.cpython-312.pyc index 39b2088..20f3748 100644 Binary files a/wan23/modules/__pycache__/vae2_1.cpython-312.pyc and b/wan23/modules/__pycache__/vae2_1.cpython-312.pyc differ diff --git a/wan23/modules/__pycache__/vae2_2.cpython-312.pyc 
b/wan23/modules/__pycache__/vae2_2.cpython-312.pyc index c2e308b..64c4caf 100644 Binary files a/wan23/modules/__pycache__/vae2_2.cpython-312.pyc and b/wan23/modules/__pycache__/vae2_2.cpython-312.pyc differ diff --git a/wan23/modules/t5.py b/wan23/modules/t5.py index 8010896..475e0d7 100644 --- a/wan23/modules/t5.py +++ b/wan23/modules/t5.py @@ -25,22 +25,9 @@ def fp16_clamp(x): def init_weights(m): - if isinstance(m, T5LayerNorm): - nn.init.ones_(m.weight) - elif isinstance(m, T5Model): - nn.init.normal_(m.token_embedding.weight, std=1.0) - elif isinstance(m, T5FeedForward): - nn.init.normal_(m.gate[0].weight, std=m.dim**-0.5) - nn.init.normal_(m.fc1.weight, std=m.dim**-0.5) - nn.init.normal_(m.fc2.weight, std=m.dim_ffn**-0.5) - elif isinstance(m, T5Attention): - nn.init.normal_(m.q.weight, std=(m.dim * m.dim_attn)**-0.5) - nn.init.normal_(m.k.weight, std=m.dim**-0.5) - nn.init.normal_(m.v.weight, std=m.dim**-0.5) - nn.init.normal_(m.o.weight, std=(m.num_heads * m.dim_attn)**-0.5) - elif isinstance(m, T5RelativeEmbedding): - nn.init.normal_( - m.embedding.weight, std=(2 * m.num_buckets * m.num_heads)**-0.5) + # Skipped: weights are overwritten by checkpoint load immediately after + # model construction. Running init on UMT5-XXL wastes ~10s per subprocess. + pass class GELU(nn.Module): diff --git a/wan23/textimage2video.py b/wan23/textimage2video.py index 53efcb7..254b53d 100644 --- a/wan23/textimage2video.py +++ b/wan23/textimage2video.py @@ -1,5 +1,6 @@ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved. import gc +import json import logging import math import os @@ -153,9 +154,16 @@ def __init__( torch.zeros(1, 1, self.model.dim, device=self.model.device) ) - self.model = WanModel.from_pretrained(checkpoint_dir) - state_dict = load_file(checkpoint_dir+"/diffusion_pytorch_model.safetensors") - self.model.load_state_dict(state_dict) + # Use load_file (mmap, low RAM) instead of from_pretrained (reads full file into RAM). 
+ # strict=False: sideblock/mask_token/patch_embedding_* are YUME-only additions not + # present in the base checkpoint; they stay freshly initialized (same as original flow). + # Load directly to the target device to avoid holding two copies in RAM. + # load_file with device= loads each tensor directly to CUDA, skipping the + # intermediate CPU buffer that causes OOM on machines with limited RAM. + from safetensors.torch import load_file as _load_file + _ckpt = os.path.join(checkpoint_dir, "diffusion_pytorch_model.safetensors") + state_dict = _load_file(_ckpt, device=str(self.device)) + self.model.load_state_dict(state_dict, strict=False) self.sp_size = 1 diff --git a/wan23/utils/__pycache__/__init__.cpython-312.pyc b/wan23/utils/__pycache__/__init__.cpython-312.pyc deleted file mode 100644 index 072aa21..0000000 Binary files a/wan23/utils/__pycache__/__init__.cpython-312.pyc and /dev/null differ diff --git a/wan23/utils/__pycache__/fm_solvers.cpython-312.pyc b/wan23/utils/__pycache__/fm_solvers.cpython-312.pyc deleted file mode 100644 index 593a987..0000000 Binary files a/wan23/utils/__pycache__/fm_solvers.cpython-312.pyc and /dev/null differ diff --git a/wan23/utils/__pycache__/fm_solvers_unipc.cpython-312.pyc b/wan23/utils/__pycache__/fm_solvers_unipc.cpython-312.pyc deleted file mode 100644 index 2a7317e..0000000 Binary files a/wan23/utils/__pycache__/fm_solvers_unipc.cpython-312.pyc and /dev/null differ diff --git a/wan23/utils/__pycache__/utils.cpython-312.pyc b/wan23/utils/__pycache__/utils.cpython-312.pyc deleted file mode 100644 index 10ae6f6..0000000 Binary files a/wan23/utils/__pycache__/utils.cpython-312.pyc and /dev/null differ