diff --git a/src/diffusers/modular_pipelines/anima/__init__.py b/src/diffusers/modular_pipelines/anima/__init__.py
index 4772d906e03b..1cbb2d741bfb 100644
--- a/src/diffusers/modular_pipelines/anima/__init__.py
+++ b/src/diffusers/modular_pipelines/anima/__init__.py
@@ -21,7 +21,7 @@
 
     _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
 else:
-    _import_structure["modular_blocks_anima"] = ["AnimaAutoBlocks"]
+    _import_structure["modular_blocks_anima"] = ["AnimaAutoBlocks", "AnimaImg2ImgAutoBlocks"]
     _import_structure["modular_pipeline"] = ["AnimaModularPipeline"]
 
 if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
@@ -31,7 +31,7 @@
     except OptionalDependencyNotAvailable:
         from ...utils.dummy_torch_and_transformers_objects import *  # noqa F403
     else:
-        from .modular_blocks_anima import AnimaAutoBlocks
+        from .modular_blocks_anima import AnimaAutoBlocks, AnimaImg2ImgAutoBlocks
         from .modular_pipeline import AnimaModularPipeline
 else:
     import sys
diff --git a/src/diffusers/modular_pipelines/anima/before_denoise.py b/src/diffusers/modular_pipelines/anima/before_denoise.py
index 25f38cd0cb65..c6376f295d71 100644
--- a/src/diffusers/modular_pipelines/anima/before_denoise.py
+++ b/src/diffusers/modular_pipelines/anima/before_denoise.py
@@ -414,3 +414,181 @@ def __call__(self, components: AnimaModularPipeline, state: PipelineState) -> Pi
 
         self.set_block_state(state, block_state)
         return components, state
+
+
+class AnimaImg2ImgSetTimestepsStep(ModularPipelineBlocks):
+    """Sets the scheduler's sigma schedule and keeps only the tail selected by `strength` (image-to-image)."""
+
+    model_name = "anima"
+
+    @property
+    def expected_components(self) -> list[ComponentSpec]:
+        from ...schedulers import FlowMatchEulerDiscreteScheduler
+
+        return [ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler)]
+
+    @property
+    def description(self) -> str:
+        return "Step that sets the scheduler's timesteps for image-to-image inference"
+
+    @property
+    def inputs(self) -> list[InputParam]:
+        # NOTE(review): `timesteps`, `num_images_per_prompt` and `batch_size` are declared here but `__call__`
+        # never reads them; confirm whether they can be dropped.
+        return [
+            InputParam("num_inference_steps", default=50),
+            InputParam("timesteps"),
+            InputParam("sigmas"),
+            InputParam("strength", default=0.8),
+            InputParam("num_images_per_prompt", default=1),
+            InputParam(
+                "batch_size",
+                required=True,
+                type_hint=int,
+                description="Number of prompts",
+            ),
+        ]
+
+    @property
+    def intermediate_outputs(self) -> list[OutputParam]:
+        return [
+            OutputParam("timesteps", type_hint=torch.Tensor, description="The timesteps to use for inference"),
+            OutputParam(
+                "num_inference_steps",
+                type_hint=int,
+                description="The number of denoising steps to perform at inference time",
+            ),
+        ]
+
+    @staticmethod
+    def get_timesteps(scheduler, num_inference_steps, strength, device):
+        """
+        Drop the first `(1 - strength)` fraction of the scheduler's timesteps.
+
+        Returns the remaining timesteps and their count. When the scheduler exposes `set_begin_index`, it is
+        pointed at the first kept step. `device` is accepted for signature compatibility and is not used.
+        """
+        # Number of steps that are actually run; `min` clamps `strength` values above 1.
+        init_timestep = min(num_inference_steps * strength, num_inference_steps)
+
+        t_start = int(max(num_inference_steps - init_timestep, 0))
+        timesteps = scheduler.timesteps[t_start * scheduler.order :]
+        if hasattr(scheduler, "set_begin_index"):
+            scheduler.set_begin_index(t_start * scheduler.order)
+
+        return timesteps, num_inference_steps - t_start
+
+    @torch.no_grad()
+    def __call__(self, components: AnimaModularPipeline, state: PipelineState) -> PipelineState:
+        block_state = self.get_block_state(state)
+        device = components._execution_device
+
+        # A negative strength would slice past the end of the schedule; reject it before touching the scheduler.
+        if block_state.strength < 0:
+            raise ValueError(f"`strength` must be non-negative but is {block_state.strength}")
+
+        # Default schedule: linearly spaced sigmas from 1.0 down to 1 / num_inference_steps.
+        sigmas = (
+            np.linspace(1.0, 1 / block_state.num_inference_steps, block_state.num_inference_steps)
+            if block_state.sigmas is None
+            else block_state.sigmas
+        )
+        timesteps, num_inference_steps = retrieve_timesteps(
+            components.scheduler,
+            device=device,
+            sigmas=sigmas,
+        )
+
+        timesteps, num_inference_steps = self.get_timesteps(
+            components.scheduler, num_inference_steps, block_state.strength, device
+        )
+        # A very small `strength` truncates the whole schedule; the following img2img step indexes
+        # `timesteps[:1]`, so fail here with an actionable message instead.
+        if num_inference_steps < 1:
+            raise ValueError(
+                f"After adjusting the num_inference_steps by strength parameter: {block_state.strength}, the number "
+                f"of pipeline steps is {num_inference_steps} which is < 1 and not appropriate for this pipeline."
+            )
+        block_state.timesteps = timesteps
+        block_state.num_inference_steps = num_inference_steps
+
+        self.set_block_state(state, block_state)
+        return components, state
+
+
+class AnimaImg2ImgPrepareLatentsStep(ModularPipelineBlocks):
+    """Noises the image latents via `scheduler.scale_noise` at the first remaining timestep."""
+
+    model_name = "anima"
+
+    @property
+    def description(self) -> str:
+        return "Step that adds noise to image latents for image-to-image in Anima."
+
+    @property
+    def expected_components(self) -> list[ComponentSpec]:
+        from ...schedulers import FlowMatchEulerDiscreteScheduler
+
+        return [ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler)]
+
+    @property
+    def inputs(self) -> list[InputParam]:
+        return [
+            InputParam(
+                name="latents",
+                required=True,
+                type_hint=torch.Tensor,
+                description="The initial random noise, generated in prepare latent step.",
+            ),
+            InputParam(
+                name="image_latents",
+                required=True,
+                type_hint=torch.Tensor,
+                description="The image latents to use for the denoising process. Generated in vae encoder step.",
+            ),
+            InputParam(
+                name="timesteps",
+                required=True,
+                type_hint=torch.Tensor,
+                description="The timesteps to use for the denoising process.",
+            ),
+        ]
+
+    @property
+    def intermediate_outputs(self) -> list[OutputParam]:
+        return [
+            OutputParam(
+                name="initial_noise",
+                type_hint=torch.Tensor,
+                description="The initial random noise.",
+            ),
+        ]
+
+    @staticmethod
+    def check_inputs(image_latents, latents):
+        """Raise `ValueError` when the two tensors disagree on batch size (dimension 0)."""
+        if image_latents.shape[0] != latents.shape[0]:
+            raise ValueError(
+                "`image_latents` must have same batch size as `latents`, "
+                f"but got {image_latents.shape[0]} and {latents.shape[0]}"
+            )
+
+    @torch.no_grad()
+    def __call__(self, components: AnimaModularPipeline, state: PipelineState) -> PipelineState:
+        block_state = self.get_block_state(state)
+
+        self.check_inputs(image_latents=block_state.image_latents, latents=block_state.latents)
+
+        # One copy of the first remaining timestep per batch element.
+        latent_timestep = block_state.timesteps[:1].repeat(block_state.latents.shape[0])
+
+        # Keep a reference to the pure noise before `latents` is overwritten below.
+        block_state.initial_noise = block_state.latents
+
+        block_state.latents = components.scheduler.scale_noise(
+            block_state.image_latents, latent_timestep, block_state.latents
+        )
+
+        self.set_block_state(state, block_state)
+
+        return components, state
diff --git a/src/diffusers/modular_pipelines/anima/encoders.py b/src/diffusers/modular_pipelines/anima/encoders.py
index bdeecd28737b..f29ce8971254 100644
--- 
a/src/diffusers/modular_pipelines/anima/encoders.py
+++ b/src/diffusers/modular_pipelines/anima/encoders.py
@@ -251,3 +251,72 @@ def __call__(self, components: AnimaModularPipeline, state: PipelineState) -> Pi
 
         self.set_block_state(state, block_state)
         return components, state
+
+
+class AnimaVaeEncoderStep(ModularPipelineBlocks):
+    """Encodes the preprocessed input image with the VAE and normalizes the result into `image_latents`."""
+
+    model_name = "anima"
+
+    @property
+    def description(self) -> str:
+        return "VAE Encoder step that converts image into latent representations."
+
+    @property
+    def expected_components(self) -> list[ComponentSpec]:
+        from ...models import AutoencoderKLQwenImage
+
+        return [ComponentSpec("vae", AutoencoderKLQwenImage)]
+
+    @property
+    def inputs(self) -> list[InputParam]:
+        return [InputParam("processed_image"), InputParam("generator")]
+
+    @property
+    def intermediate_outputs(self) -> list[OutputParam]:
+        return [
+            OutputParam(
+                "image_latents",
+                type_hint=torch.Tensor,
+                description="The latents representing the reference image",
+            )
+        ]
+
+    @torch.no_grad()
+    def __call__(self, components: AnimaModularPipeline, state: PipelineState) -> PipelineState:
+        block_state = self.get_block_state(state)
+
+        # No image supplied: publish `None` and skip encoding.
+        if block_state.processed_image is None:
+            block_state.image_latents = None
+            self.set_block_state(state, block_state)
+            return components, state
+
+        # NOTE(review): confirm `retrieve_latents` is exported by `utils.torch_utils`; other pipelines define it
+        # next to the code that uses it.
+        from ...utils.torch_utils import retrieve_latents
+
+        vae = components.vae
+        image = block_state.processed_image.to(device=components._execution_device, dtype=vae.dtype)
+        # `AutoencoderKLQwenImage.encode` takes (batch, channels, frames, height, width); a still image is a
+        # single frame, so add the frame axis before encoding.
+        if image.ndim == 4:
+            image = image.unsqueeze(2)
+
+        latents = retrieve_latents(vae.encode(image), generator=block_state.generator, sample_mode="sample")
+
+        # Per-channel statistics reshaped to broadcast over every non-channel axis of `latents`.
+        stats_shape = (1, vae.config.z_dim) + (1,) * (latents.ndim - 2)
+        latents_mean = torch.tensor(vae.config.latents_mean).view(stats_shape).to(latents.device, latents.dtype)
+        inv_std = 1.0 / torch.tensor(vae.config.latents_std).view(stats_shape).to(latents.device, latents.dtype)
+        image_latents = (latents - latents_mean) * inv_std
+
+        # Preserve the (batch, channels, 1, height, width) output layout if the encoder returned 4D latents.
+        if image_latents.ndim == 4:
+            image_latents = image_latents.unsqueeze(2)
+
+        block_state.image_latents = image_latents
+
+        self.set_block_state(state, block_state)
+
+        return components, state
diff --git a/src/diffusers/modular_pipelines/anima/inputs.py b/src/diffusers/modular_pipelines/anima/inputs.py
new file mode 100644
index 000000000000..4ce37448d135
--- /dev/null
+++ b/src/diffusers/modular_pipelines/anima/inputs.py
@@ -0,0 +1,81 @@
+# Copyright 2026 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+
+from ...configuration_utils import FrozenDict
+from ...image_processor import VaeImageProcessor
+from ..modular_pipeline import ModularPipelineBlocks, PipelineState
+from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam
+from .modular_pipeline import AnimaModularPipeline
+
+
+class AnimaProcessImagesInputStep(ModularPipelineBlocks):
+    """Validates the requested size and preprocesses the input image into `processed_image`."""
+
+    model_name = "anima"
+
+    @property
+    def description(self) -> str:
+        return "Image Preprocess step for Anima."
+
+    @property
+    def expected_components(self) -> list[ComponentSpec]:
+        return [
+            ComponentSpec(
+                "image_processor",
+                VaeImageProcessor,
+                config=FrozenDict({"vae_scale_factor": 8, "vae_latent_channels": 16}),
+                default_creation_method="from_config",
+            ),
+        ]
+
+    @property
+    def inputs(self) -> list[InputParam]:
+        return [InputParam("image"), InputParam("height"), InputParam("width")]
+
+    @property
+    def intermediate_outputs(self) -> list[OutputParam]:
+        return [OutputParam(name="processed_image")]
+
+    @staticmethod
+    def check_inputs(height, width, vae_scale_factor):
+        """Raise `ValueError` if a provided `height`/`width` is not a multiple of `vae_scale_factor * 2`."""
+        divisor = vae_scale_factor * 2
+        if height is not None and height % divisor != 0:
+            raise ValueError(f"Height must be divisible by {divisor} but is {height}")
+
+        if width is not None and width % divisor != 0:
+            raise ValueError(f"Width must be divisible by {divisor} but is {width}")
+
+    @torch.no_grad()
+    def __call__(self, components: AnimaModularPipeline, state: PipelineState):
+        block_state = self.get_block_state(state)
+
+        if block_state.image is None:
+            raise ValueError("`image` cannot be None")
+
+        self.check_inputs(
+            height=block_state.height, width=block_state.width, vae_scale_factor=components.vae_scale_factor
+        )
+        # Unset (or zero) sizes fall back to the pipeline defaults.
+        height = block_state.height or components.default_height
+        width = block_state.width or components.default_width
+
+        block_state.processed_image = components.image_processor.preprocess(
+            image=block_state.image, height=height, width=width
+        )
+
+        self.set_block_state(state, block_state)
+        return components, state
diff --git a/src/diffusers/modular_pipelines/anima/modular_blocks_anima.py b/src/diffusers/modular_pipelines/anima/modular_blocks_anima.py
index fc71b87f62d8..ddbebe78b03c 100644
--- a/src/diffusers/modular_pipelines/anima/modular_blocks_anima.py
+++ b/src/diffusers/modular_pipelines/anima/modular_blocks_anima.py
@@ -14,15 +14,18 @@
 
 from ..modular_pipeline import SequentialPipelineBlocks
 from ..modular_pipeline_utils import OutputParam
+from .inputs 
import AnimaProcessImagesInputStep
 from .before_denoise import (
+    AnimaImg2ImgPrepareLatentsStep,
+    AnimaImg2ImgSetTimestepsStep,
     AnimaPrepareLatentsStep,
     AnimaSetTimestepsStep,
     AnimaTextConditioningStep,
     AnimaTextInputStep,
 )
 from .decoders import AnimaProcessImagesOutputStep, AnimaVaeDecoderStep
 from .denoise import AnimaDenoiseStep
-from .encoders import AnimaTextEncoderStep
+from .encoders import AnimaTextEncoderStep, AnimaVaeEncoderStep
 
 
 # auto_docstring
@@ -181,3 +184,58 @@ def description(self) -> str:
     @property
     def outputs(self):
         return [OutputParam.template("images")]
+
+
+class AnimaImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
+    """
+    Denoise block that takes encoded Anima text inputs and image latents and runs the denoising process for img2img.
+    """
+
+    block_classes = [
+        AnimaTextConditioningStep,
+        AnimaTextInputStep,
+        AnimaImg2ImgSetTimestepsStep,
+        AnimaPrepareLatentsStep,
+        AnimaImg2ImgPrepareLatentsStep,
+        AnimaDenoiseStep,
+    ]
+    block_names = [
+        "text_conditioning",
+        "input",
+        "set_timesteps",
+        "prepare_latents",
+        "prepare_img2img_latents",
+        "denoise",
+    ]
+
+    @property
+    def description(self) -> str:
+        return "Denoise block that takes encoded Anima text inputs and image latents and runs the denoising process."
+
+    @property
+    def outputs(self):
+        return [OutputParam.template("latents")]
+
+
+class AnimaImg2ImgAutoBlocks(SequentialPipelineBlocks):
+    """
+    Auto Modular pipeline for image-to-image generation using Anima.
+    """
+
+    block_classes = [
+        AnimaProcessImagesInputStep,
+        AnimaVaeEncoderStep,
+        AnimaTextEncoderStep,
+        AnimaImg2ImgCoreDenoiseStep,
+        AnimaDecodeStep,
+    ]
+    block_names = ["image_input", "vae_encoder", "text_encoder", "denoise", "decode"]
+    _workflow_map = {"img2img": {"prompt": True, "image": True, "strength": True}}
+
+    @property
+    def description(self) -> str:
+        return "Auto Modular pipeline for image-to-image generation using Anima."
+
+    @property
+    def outputs(self):
+        return [OutputParam.template("images")]