Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions examples/main_example.toml
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,10 @@ activation_checkpointing = true
# Use reentrant activation checkpointing method (set this in addition to `activation_checkpointing`). Might be required for some models
# when using pipeline parallelism (pipeline_stages>1). Otherwise recommended to not use it.
#reentrant_activation_checkpointing = true
# Only upload to Hugging Face starting from this epoch (optional, defaults to 0)
# upload_hf_from_epoch = 2
# Hugging Face repository to upload saved adapters to (optional; upload is skipped if unset)
# huggingface_repo = 'lyogavin/godmodeai_game_ui_gray_out_kontext'

# Controls how Deepspeed decides how to divide layers across GPUs. Probably don't change this.
partition_method = 'parameters'
Expand Down
18 changes: 9 additions & 9 deletions utils/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -741,17 +741,17 @@ def fn(example):
try:
if image_file.suffix in VIDEO_EXTENSIONS:
# 100% accurate frame count, but much slower.
# frames = 0
# for frame in imageio.v3.imiter(image_file):
# frames += 1
# height, width = frame.shape[:2]
frames = 0
for frame in imageio.v3.imiter(image_file):
frames += 1
height, width = frame.shape[:2]
# TODO: this is an estimate of frame count. What happens if variable frame rate? Is
# it still close enough?
meta = imageio.v3.immeta(filepath_or_file)
first_frame = next(imageio.v3.imiter(filepath_or_file))
height, width = first_frame.shape[:2]
# meta = imageio.v3.immeta(filepath_or_file)
# first_frame = next(imageio.v3.imiter(filepath_or_file))
# height, width = first_frame.shape[:2]
assert self.framerate is not None, "Need model framerate but don't have it. This shouldn't happen. Is the framerate attribute on the model set?"
frames = int(self.framerate * meta['duration'])
# frames = int(self.framerate * meta['duration'])
else:
pil_img = Image.open(filepath_or_file)
width, height = pil_img.size
Expand Down Expand Up @@ -1419,4 +1419,4 @@ def _zero_first():
print(f'Dataset length: {len(train_data)}')

for item in train_data:
pass
pass
50 changes: 43 additions & 7 deletions utils/saver.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import torch
from deepspeed import comm as dist
from deepspeed.utils.logging import logger
from huggingface_hub import HfApi

from utils.common import is_main_process

Expand Down Expand Up @@ -55,7 +56,7 @@ def __init__(self, args, config, is_adapter, save_root, model, train_dataloader,
self.model_engine = model_engine
self.pipeline_model = pipeline_model

def save_adapter(self, name):
def save_adapter(self, name, epoch=None):
dp_id = self.model_engine.grid.get_data_parallel_rank()
stage_id = self.model_engine.grid.get_pipe_parallel_rank()
save_dir = self.save_root / name
Expand Down Expand Up @@ -83,8 +84,43 @@ def save_adapter(self, name):
self.model.save_adapter(save_dir, state_dict)
shutil.copy(self.args.config, save_dir)
shutil.rmtree(tmp_dir)

def save_full_model(self, name):

# Upload to Hugging Face if configured and epoch condition is met
if 'huggingface_repo' in self.config and self.config['huggingface_repo']:
upload_hf_from_epoch = self.config.get('upload_hf_from_epoch', 0)
should_upload = epoch is None or epoch >= upload_hf_from_epoch

if should_upload:
try:
if is_main_process():
print(f"Uploading adapter '{name}' to Hugging Face repo: {self.config['huggingface_repo']}")

api = HfApi()
api.create_repo(
repo_id=self.config['huggingface_repo'],
private=True,
exist_ok=True,
repo_type="model"
)
api.upload_folder(
folder_path=str(save_dir),
repo_id=self.config['huggingface_repo'],
path_in_repo=f"epoch{epoch}", # to be consistent
repo_type="model"
)

if is_main_process():
print(f"Successfully uploaded adapter '{name}' to Hugging Face")

except Exception as e:
if is_main_process():
logger.error(f"Failed to upload adapter to Hugging Face: {str(e)}")
print(f"Warning: Failed to upload to Hugging Face: {str(e)}")
else:
if is_main_process():
print(f"Skipping Hugging Face upload for epoch {epoch} (upload_hf_from_epoch={upload_hf_from_epoch})")

def save_full_model(self, name, epoch=None, max_shard_size='5GB'):
dp_id = self.model_engine.grid.get_data_parallel_rank()
stage_id = self.model_engine.grid.get_pipe_parallel_rank()
save_dir = self.save_root / name
Expand All @@ -107,13 +143,13 @@ def save_full_model(self, name):
shutil.copy(self.args.config, save_dir)
shutil.rmtree(tmp_dir)

def save_model(self, name):
def save_model(self, name, epoch=None):
if is_main_process():
print(f'Saving model to directory {name}')
if self.is_adapter:
self.save_adapter(name)
self.save_adapter(name, epoch)
else:
self.save_full_model(name)
self.save_full_model(name, epoch)

def save_checkpoint(self, step, examples):
self.model_engine.save_checkpoint(
Expand All @@ -134,7 +170,7 @@ def process_epoch(self, epoch, step, examples):
self.save_checkpoint(step, examples)
checkpointed = True
if 'save_every_n_epochs' in self.config and epoch % self.config['save_every_n_epochs'] == 0:
self.save_model(f'epoch{epoch}')
self.save_model(f'epoch{epoch}', epoch)
saved = True
epoch = self.train_dataloader.epoch
if epoch > self.config['epochs']:
Expand Down