Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions examples/main_example.toml
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,10 @@ activation_checkpointing = true
# Use reentrant activation checkpointing method (set this in addition to `activation_checkpointing`). Might be required for some models
# when using pipeline parallelism (pipeline_stages>1). Otherwise recommended to not use it.
#reentrant_activation_checkpointing = true
# Only upload to Hugging Face starting from this epoch (optional, defaults to 0)
# upload_hf_from_epoch = 2
# Hugging Face repository to upload saved adapters to (optional; upload is skipped if unset)
# huggingface_repo = 'lyogavin/godmodeai_game_ui_gray_out_kontext'

# Controls how Deepspeed decides how to divide layers across GPUs. Probably don't change this.
partition_method = 'parameters'
Expand Down
18 changes: 9 additions & 9 deletions utils/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -741,17 +741,17 @@ def fn(example):
try:
if image_file.suffix in VIDEO_EXTENSIONS:
# 100% accurate frame count, but much slower.
# frames = 0
# for frame in imageio.v3.imiter(image_file):
# frames += 1
# height, width = frame.shape[:2]
frames = 0
for frame in imageio.v3.imiter(image_file):
frames += 1
height, width = frame.shape[:2]
# TODO: this is an estimate of frame count. What happens if variable frame rate? Is
# it still close enough?
meta = imageio.v3.immeta(filepath_or_file)
first_frame = next(imageio.v3.imiter(filepath_or_file))
height, width = first_frame.shape[:2]
# meta = imageio.v3.immeta(filepath_or_file)
# first_frame = next(imageio.v3.imiter(filepath_or_file))
# height, width = first_frame.shape[:2]
assert self.framerate is not None, "Need model framerate but don't have it. This shouldn't happen. Is the framerate attribute on the model set?"
frames = int(self.framerate * meta['duration'])
# frames = int(self.framerate * meta['duration'])
else:
pil_img = Image.open(filepath_or_file)
width, height = pil_img.size
Expand Down Expand Up @@ -1419,4 +1419,4 @@ def _zero_first():
print(f'Dataset length: {len(train_data)}')

for item in train_data:
pass
pass
50 changes: 43 additions & 7 deletions utils/saver.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import torch
from deepspeed import comm as dist
from deepspeed.utils.logging import logger
from huggingface_hub import HfApi

from utils.common import is_main_process

Expand Down Expand Up @@ -55,7 +56,7 @@ def __init__(self, args, config, is_adapter, save_root, model, train_dataloader,
self.model_engine = model_engine
self.pipeline_model = pipeline_model

def save_adapter(self, name):
def save_adapter(self, name, epoch=None):
dp_id = self.model_engine.grid.get_data_parallel_rank()
stage_id = self.model_engine.grid.get_pipe_parallel_rank()
save_dir = self.save_root / name
Expand Down Expand Up @@ -83,8 +84,43 @@ def save_adapter(self, name):
self.model.save_adapter(save_dir, state_dict)
shutil.copy(self.args.config, save_dir)
shutil.rmtree(tmp_dir)

def save_full_model(self, name):

# Upload to Hugging Face if configured and epoch condition is met
if 'huggingface_repo' in self.config and self.config['huggingface_repo']:
upload_hf_from_epoch = self.config.get('upload_hf_from_epoch', 0)
should_upload = epoch is None or epoch >= upload_hf_from_epoch

if should_upload:
try:
if is_main_process():
print(f"Uploading adapter '{name}' to Hugging Face repo: {self.config['huggingface_repo']}")

api = HfApi()
api.create_repo(
repo_id=self.config['huggingface_repo'],
private=True,
exist_ok=True,
repo_type="model"
)
api.upload_folder(
folder_path=str(save_dir),
repo_id=self.config['huggingface_repo'],
path_in_repo=f"epoch{epoch}", # to be consistent
repo_type="model"
)

if is_main_process():
print(f"Successfully uploaded adapter '{name}' to Hugging Face")

except Exception as e:
if is_main_process():
logger.error(f"Failed to upload adapter to Hugging Face: {str(e)}")
print(f"Warning: Failed to upload to Hugging Face: {str(e)}")
else:
if is_main_process():
print(f"Skipping Hugging Face upload for epoch {epoch} (upload_hf_from_epoch={upload_hf_from_epoch})")

def save_full_model(self, name, epoch=None, max_shard_size='5GB'):
dp_id = self.model_engine.grid.get_data_parallel_rank()
stage_id = self.model_engine.grid.get_pipe_parallel_rank()
save_dir = self.save_root / name
Expand All @@ -107,13 +143,13 @@ def save_full_model(self, name):
shutil.copy(self.args.config, save_dir)
shutil.rmtree(tmp_dir)

def save_model(self, name):
def save_model(self, name, epoch=None):
if is_main_process():
print(f'Saving model to directory {name}')
if self.is_adapter:
self.save_adapter(name)
self.save_adapter(name, epoch)
else:
self.save_full_model(name)
self.save_full_model(name, epoch)

def save_checkpoint(self, step, examples):
self.model_engine.save_checkpoint(
Expand All @@ -134,7 +170,7 @@ def process_epoch(self, epoch, step, examples):
self.save_checkpoint(step, examples)
checkpointed = True
if 'save_every_n_epochs' in self.config and epoch % self.config['save_every_n_epochs'] == 0:
self.save_model(f'epoch{epoch}')
self.save_model(f'epoch{epoch}', epoch)
saved = True
epoch = self.train_dataloader.epoch
if epoch > self.config['epochs']:
Expand Down