Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/diffusers/modular_pipelines/anima/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

_dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
else:
_import_structure["modular_blocks_anima"] = ["AnimaAutoBlocks"]
_import_structure["modular_blocks_anima"] = ["AnimaAutoBlocks", "AnimaImg2ImgAutoBlocks"]
_import_structure["modular_pipeline"] = ["AnimaModularPipeline"]

if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
Expand All @@ -31,7 +31,7 @@
except OptionalDependencyNotAvailable:
from ...utils.dummy_torch_and_transformers_objects import * # noqa F403
else:
from .modular_blocks_anima import AnimaAutoBlocks
from .modular_blocks_anima import AnimaAutoBlocks, AnimaImg2ImgAutoBlocks
from .modular_pipeline import AnimaModularPipeline
else:
import sys
Expand Down
147 changes: 147 additions & 0 deletions src/diffusers/modular_pipelines/anima/before_denoise.py
Original file line number Diff line number Diff line change
Expand Up @@ -414,3 +414,150 @@ def __call__(self, components: AnimaModularPipeline, state: PipelineState) -> Pi

self.set_block_state(state, block_state)
return components, state


class AnimaImg2ImgSetTimestepsStep(ModularPipelineBlocks):
    """Install the sigma schedule on the scheduler and shorten it according to `strength`.

    The schedule is first set on the scheduler through `retrieve_timesteps`; `get_timesteps` then drops the
    leading part of `scheduler.timesteps` so that fewer steps are run when `strength` is below 1.
    """

    model_name = "anima"

    @property
    def expected_components(self) -> list[ComponentSpec]:
        from ...schedulers import FlowMatchEulerDiscreteScheduler

        return [ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler)]

    @property
    def description(self) -> str:
        return "Step that sets the scheduler's timesteps for image-to-image inference"

    @property
    def inputs(self) -> list[InputParam]:
        # NOTE(review): `timesteps`, `num_images_per_prompt` and `batch_size` are declared but never read by
        # `__call__`. They are kept so the declared interface of this block stays unchanged.
        return [
            InputParam("num_inference_steps", default=50),
            InputParam("timesteps"),
            InputParam("sigmas"),
            InputParam("strength", default=0.8),
            InputParam("num_images_per_prompt", default=1),
            InputParam(
                "batch_size",
                required=True,
                type_hint=int,
                description="Number of prompts",
            ),
        ]

    @property
    def intermediate_outputs(self) -> list[OutputParam]:
        return [
            OutputParam("timesteps", type_hint=torch.Tensor, description="The timesteps to use for inference"),
            OutputParam(
                "num_inference_steps",
                type_hint=int,
                description="The number of denoising steps to perform at inference time",
            ),
        ]

    @staticmethod
    def get_timesteps(scheduler, num_inference_steps, strength, device):
        """Return the trailing part of `scheduler.timesteps` selected by `strength`.

        Args:
            scheduler: Scheduler whose `timesteps` have already been set; `order` is read from it.
            num_inference_steps: Length of the full schedule.
            strength: Fraction of the schedule to keep; values above 1 are clamped by the `min` below.
            device: Unused; kept so the signature stays unchanged.

        Returns:
            A `(timesteps, remaining_steps)` tuple, where `remaining_steps` is
            `num_inference_steps - t_start`.
        """
        init_timestep = min(num_inference_steps * strength, num_inference_steps)

        # Number of leading steps that are skipped.
        t_start = int(max(num_inference_steps - init_timestep, 0))
        timesteps = scheduler.timesteps[t_start * scheduler.order :]
        if hasattr(scheduler, "set_begin_index"):
            scheduler.set_begin_index(t_start * scheduler.order)

        return timesteps, num_inference_steps - t_start

    @torch.no_grad()
    def __call__(self, components: AnimaModularPipeline, state: PipelineState) -> PipelineState:
        """Set the scheduler's schedule and write the trimmed `timesteps` / `num_inference_steps` to the state.

        Returns the `(components, state)` pair.

        Raises:
            ValueError: If fewer than one denoising step remains after trimming by `strength`.
        """
        block_state = self.get_block_state(state)
        device = components._execution_device

        # Use the caller's sigmas when given, otherwise a linear ramp from 1.0 down to 1 / num_inference_steps.
        sigmas = block_state.sigmas
        if sigmas is None:
            sigmas = np.linspace(1.0, 1 / block_state.num_inference_steps, block_state.num_inference_steps)

        # Only the step count is needed here; the timesteps are re-read from the scheduler in `get_timesteps`.
        _, num_inference_steps = retrieve_timesteps(
            components.scheduler,
            device=device,
            sigmas=sigmas,
        )

        timesteps, num_inference_steps = self.get_timesteps(
            components.scheduler, num_inference_steps, block_state.strength, device
        )

        # A zero, negative or very small `strength` trims away every step. Fail here with a clear message
        # instead of handing an empty schedule to the following blocks.
        if num_inference_steps < 1:
            raise ValueError(
                f"After adjusting the num_inference_steps by strength parameter: {block_state.strength}, the number "
                f"of pipeline steps is {num_inference_steps} which is < 1 and not appropriate for this pipeline."
            )

        block_state.timesteps = timesteps
        block_state.num_inference_steps = num_inference_steps

        self.set_block_state(state, block_state)
        return components, state


class AnimaImg2ImgPrepareLatentsStep(ModularPipelineBlocks):
    """Combine image latents and noise through `scheduler.scale_noise` at the first timestep.

    The incoming `latents` are preserved as `initial_noise`; `latents` is then overwritten with the value
    returned by the scheduler.
    """

    model_name = "anima"

    @property
    def description(self) -> str:
        return "Step that adds noise to image latents for image-to-image in Anima."

    @property
    def expected_components(self) -> list[ComponentSpec]:
        from ...schedulers import FlowMatchEulerDiscreteScheduler

        scheduler_spec = ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler)
        return [scheduler_spec]

    @property
    def inputs(self) -> list[InputParam]:
        noise_param = InputParam(
            name="latents",
            required=True,
            type_hint=torch.Tensor,
            description="The initial random noise, generated in prepare latent step.",
        )
        image_latents_param = InputParam(
            name="image_latents",
            required=True,
            type_hint=torch.Tensor,
            description="The image latents to use for the denoising process. Generated in vae encoder step.",
        )
        timesteps_param = InputParam(
            name="timesteps",
            required=True,
            type_hint=torch.Tensor,
            description="The timesteps to use for the denoising process.",
        )
        return [noise_param, image_latents_param, timesteps_param]

    @property
    def intermediate_outputs(self) -> list[OutputParam]:
        initial_noise_param = OutputParam(
            name="initial_noise",
            type_hint=torch.Tensor,
            description="The initial random noise.",
        )
        return [initial_noise_param]

    @staticmethod
    def check_inputs(image_latents, latents):
        """Raise `ValueError` when the two tensors differ in their leading (batch) dimension."""
        image_batch = image_latents.shape[0]
        noise_batch = latents.shape[0]
        if image_batch == noise_batch:
            return
        raise ValueError(
            f"`image_latents` must have same batch size as `latents`, but got {image_batch} and {noise_batch}"
        )

    @torch.no_grad()
    def __call__(self, components: AnimaModularPipeline, state: PipelineState) -> PipelineState:
        """Validate batch sizes, then store `initial_noise` and the scheduler-scaled `latents` on the state."""
        block_state = self.get_block_state(state)
        noise = block_state.latents

        self.check_inputs(image_latents=block_state.image_latents, latents=noise)

        # Repeat the first scheduled timestep once per batch entry.
        first_timestep = block_state.timesteps[:1].repeat(noise.shape[0])

        # Keep the untouched noise before `latents` is replaced below.
        block_state.initial_noise = noise
        block_state.latents = components.scheduler.scale_noise(block_state.image_latents, first_timestep, noise)

        self.set_block_state(state, block_state)
        return components, state
59 changes: 59 additions & 0 deletions src/diffusers/modular_pipelines/anima/encoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,3 +251,62 @@ def __call__(self, components: AnimaModularPipeline, state: PipelineState) -> Pi

self.set_block_state(state, block_state)
return components, state

class AnimaVaeEncoderStep(ModularPipelineBlocks):
    """Encode `processed_image` with the VAE and store the normalised result as `image_latents`."""

    model_name = "anima"

    @property
    def description(self) -> str:
        return "VAE Encoder step that converts image into latent representations."

    @property
    def expected_components(self) -> list[ComponentSpec]:
        from ...models import AutoencoderKLQwenImage

        vae_spec = ComponentSpec("vae", AutoencoderKLQwenImage)
        return [vae_spec]

    @property
    def inputs(self) -> list[InputParam]:
        return [InputParam("processed_image"), InputParam("generator")]

    @property
    def intermediate_outputs(self) -> list[OutputParam]:
        image_latents_param = OutputParam(
            "image_latents",
            type_hint=torch.Tensor,
            description="The latents representing the reference image",
        )
        return [image_latents_param]

    @torch.no_grad()
    def __call__(self, components: AnimaModularPipeline, state: PipelineState) -> PipelineState:
        """Write `image_latents` to the state; it is `None` when no `processed_image` was provided."""
        block_state = self.get_block_state(state)
        pixels = block_state.processed_image

        # Nothing to encode: record the absence and leave early.
        if pixels is None:
            block_state.image_latents = None
            self.set_block_state(state, block_state)
            return components, state

        pixels = pixels.to(device=components._execution_device, dtype=components.vae.dtype)

        from ...utils.torch_utils import retrieve_latents

        encoded = retrieve_latents(components.vae.encode(pixels), generator=block_state.generator, sample_mode="sample")

        # NOTE(review): the (1, z_dim, 1, 1) view below assumes `encoded` is 4-D; confirm that the VAE's
        # `encode` accepts and returns 4-D tensors for this model.
        stats_shape = (1, components.vae.config.z_dim, 1, 1)
        mean = torch.tensor(components.vae.config.latents_mean).view(stats_shape).to(encoded.device, encoded.dtype)
        # Reciprocal of the configured std, taken after the cast to the latents' device and dtype.
        inv_std = 1.0 / torch.tensor(components.vae.config.latents_std).view(stats_shape).to(
            encoded.device, encoded.dtype
        )

        normalised = (encoded - mean) * inv_std

        # Insert a singleton axis at position 2 before storing.
        block_state.image_latents = normalised.unsqueeze(2)

        self.set_block_state(state, block_state)
        return components, state
76 changes: 76 additions & 0 deletions src/diffusers/modular_pipelines/anima/inputs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# Copyright 2026 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch

from ...configuration_utils import FrozenDict
from ...image_processor import VaeImageProcessor
from ..modular_pipeline import ModularPipelineBlocks, PipelineState
from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam
from .modular_pipeline import AnimaModularPipeline


class AnimaProcessImagesInputStep(ModularPipelineBlocks):
    """Run the input `image` through the image processor and store it as `processed_image`."""

    model_name = "anima"

    @property
    def description(self) -> str:
        return "Image Preprocess step for Anima."

    @property
    def expected_components(self) -> list[ComponentSpec]:
        processor_config = FrozenDict({"vae_scale_factor": 8, "vae_latent_channels": 16})
        processor_spec = ComponentSpec(
            "image_processor",
            VaeImageProcessor,
            config=processor_config,
            default_creation_method="from_config",
        )
        return [processor_spec]

    @property
    def inputs(self) -> list[InputParam]:
        return [InputParam(field) for field in ("image", "height", "width")]

    @property
    def intermediate_outputs(self) -> list[OutputParam]:
        return [OutputParam(name="processed_image")]

    @staticmethod
    def check_inputs(height, width, vae_scale_factor):
        """Raise `ValueError` if a given `height` or `width` is not a multiple of `vae_scale_factor * 2`.

        A value of `None` is skipped; `height` is checked before `width`.
        """
        divisor = vae_scale_factor * 2
        for label, value in (("Height", height), ("Width", width)):
            if value is None:
                continue
            if value % divisor != 0:
                raise ValueError(f"{label} must be divisible by {divisor} but is {value}")

    @torch.no_grad()
    def __call__(self, components: AnimaModularPipeline, state: PipelineState):
        """Preprocess `image` at the requested size, falling back to the pipeline's default dimensions."""
        block_state = self.get_block_state(state)

        source_image = block_state.image
        if source_image is None:
            raise ValueError("`image` cannot be None")

        self.check_inputs(
            height=block_state.height,
            width=block_state.width,
            vae_scale_factor=components.vae_scale_factor,
        )

        # A falsy height or width falls back to the pipeline default.
        target_height = block_state.height or components.default_height
        target_width = block_state.width or components.default_width

        block_state.processed_image = components.image_processor.preprocess(
            image=source_image, height=target_height, width=target_width
        )

        self.set_block_state(state, block_state)
        return components, state
55 changes: 54 additions & 1 deletion src/diffusers/modular_pipelines/anima/modular_blocks_anima.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,18 @@

from ..modular_pipeline import SequentialPipelineBlocks
from ..modular_pipeline_utils import OutputParam
from .inputs import AnimaProcessImagesInputStep
from .encoders import AnimaTextEncoderStep, AnimaVaeEncoderStep
from .before_denoise import (
AnimaPrepareLatentsStep,
AnimaSetTimestepsStep,
AnimaImg2ImgSetTimestepsStep,
AnimaImg2ImgPrepareLatentsStep,
AnimaTextConditioningStep,
AnimaTextInputStep,
)
from .decoders import AnimaProcessImagesOutputStep, AnimaVaeDecoderStep
from .denoise import AnimaDenoiseStep
from .encoders import AnimaTextEncoderStep


# auto_docstring
Expand Down Expand Up @@ -78,6 +81,8 @@ class AnimaCoreDenoiseStep(SequentialPipelineBlocks):
AnimaTextInputStep,
AnimaPrepareLatentsStep,
AnimaSetTimestepsStep,
AnimaImg2ImgSetTimestepsStep,
AnimaImg2ImgPrepareLatentsStep,
AnimaDenoiseStep,
]
block_names = ["text_conditioning", "input", "prepare_latents", "set_timesteps", "denoise"]
Expand Down Expand Up @@ -181,3 +186,51 @@ def description(self) -> str:
@property
def outputs(self):
return [OutputParam.template("images")]


class AnimaImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
    """
    Sequential img2img denoise block for Anima: consumes encoded text inputs and image latents and runs the
    denoising process.
    """

    # Sub-blocks in execution order; each entry pairs with the name at the same index in `block_names`.
    block_classes = [
        AnimaTextConditioningStep,
        AnimaTextInputStep,
        AnimaImg2ImgSetTimestepsStep,
        AnimaPrepareLatentsStep,
        AnimaImg2ImgPrepareLatentsStep,
        AnimaDenoiseStep,
    ]
    block_names = [
        "text_conditioning",
        "input",
        "set_timesteps",
        "prepare_latents",
        "prepare_img2img_latents",
        "denoise",
    ]

    @property
    def description(self) -> str:
        """One-sentence summary of this block."""
        return "Denoise block that takes encoded Anima text inputs and image latents and runs the denoising process."

    @property
    def outputs(self):
        """Declared outputs of this block: the `latents` template."""
        latents_output = OutputParam.template("latents")
        return [latents_output]


class AnimaImg2ImgAutoBlocks(SequentialPipelineBlocks):
    """
    Auto modular pipeline for Anima image-to-image generation: image preprocessing, VAE encoding, text
    encoding, img2img denoising and decoding, in that order.
    """

    # Sub-blocks in execution order; each entry pairs with the name at the same index in `block_names`.
    block_classes = [
        AnimaProcessImagesInputStep,
        AnimaVaeEncoderStep,
        AnimaTextEncoderStep,
        AnimaImg2ImgCoreDenoiseStep,
        AnimaDecodeStep,
    ]
    block_names = [
        "image_input",
        "vae_encoder",
        "text_encoder",
        "denoise",
        "decode",
    ]
    _workflow_map = {
        "img2img": {"prompt": True, "image": True, "strength": True},
    }

    @property
    def description(self) -> str:
        """One-sentence summary of this pipeline."""
        return "Auto Modular pipeline for image-to-image generation using Anima."

    @property
    def outputs(self):
        """Declared outputs of this pipeline: the `images` template."""
        images_output = OutputParam.template("images")
        return [images_output]
Loading