diff --git a/docs/source/en/api/pipelines/animatediff.md b/docs/source/en/api/pipelines/animatediff.md index f0188f3c36fb..51d674a68b83 100644 --- a/docs/source/en/api/pipelines/animatediff.md +++ b/docs/source/en/api/pipelines/animatediff.md @@ -172,7 +172,7 @@ Here are some sample outputs: raccoon playing a guitar
- racoon playing a guitar + raccoon playing a guitar a panda, playing a guitar, sitting in a pink boat, in the ocean, mountains in background, realistic, high quality @@ -491,7 +491,7 @@ Here are some sample outputs: raccoon playing a guitar
racoon playing a guitar diff --git a/docs/source/en/api/pipelines/bria_fibo.md b/docs/source/en/api/pipelines/bria_fibo.md index 96c6b0317e1b..52e463500847 100644 --- a/docs/source/en/api/pipelines/bria_fibo.md +++ b/docs/source/en/api/pipelines/bria_fibo.md @@ -16,7 +16,7 @@ Text-to-image models have mastered imagination - but not control. FIBO changes t FIBO is trained on structured JSON captions up to 1,000+ words and designed to understand and control different visual parameters such as lighting, composition, color, and camera settings, enabling precise and reproducible outputs. -With only 8 billion parameters, FIBO provides a new level of image quality, prompt adherence and proffesional control. +With only 8 billion parameters, FIBO provides a new level of image quality, prompt adherence and professional control. FIBO is trained exclusively on a structured prompt and will not work with freeform text prompts. you can use the [FIBO-VLM-prompt-to-JSON](https://huggingface.co/briaai/FIBO-VLM-prompt-to-JSON) model or the [FIBO-gemini-prompt-to-JSON](https://huggingface.co/briaai/FIBO-gemini-prompt-to-JSON) to convert your freeform text prompt to a structured JSON prompt. diff --git a/docs/source/en/api/pipelines/kandinsky5_video.md b/docs/source/en/api/pipelines/kandinsky5_video.md index 733e2481732a..ea561e8c887e 100644 --- a/docs/source/en/api/pipelines/kandinsky5_video.md +++ b/docs/source/en/api/pipelines/kandinsky5_video.md @@ -54,7 +54,7 @@ Kandinsky 5.0 T2V Lite: ### Basic Text-to-Video Generation #### Pro -**⚠️ Warning!** all Pro models should be infered with pipeline.enable_model_cpu_offload() +**⚠️ Warning!** all Pro models should be inferred with pipeline.enable_model_cpu_offload() ```python import torch from diffusers import Kandinsky5T2VPipeline @@ -65,7 +65,7 @@ model_id = "kandinskylab/Kandinsky-5.0-T2V-Pro-sft-5s-Diffusers" pipe = Kandinsky5T2VPipeline.from_pretrained(model_id, torch_dtype=torch.bfloat16) pipe = pipe.to("cuda") -pipeline.transformer.set_attention_backend("flex") # <--- Set attention bakend to Flex +pipeline.transformer.set_attention_backend("flex") # <--- Set attention backend to Flex pipeline.enable_model_cpu_offload() # <--- Enable cpu offloading for single GPU inference pipeline.transformer.compile(mode="max-autotune-no-cudagraphs", dynamic=True) # <--- Compile with max-autotune-no-cudagraphs @@ -126,7 +126,7 @@ pipe = pipe.to("cuda") pipe.transformer.set_attention_backend( "flex" -) # <--- Set attention bakend to Flex +) # <--- Set attention backend to Flex pipe.transformer.compile( mode="max-autotune-no-cudagraphs", dynamic=True @@ -149,7 +149,7 @@ export_to_video(output, "output.mp4", fps=24, quality=9) ``` ### Diffusion Distilled model -**⚠️ Warning!** all nocfg and diffusion distilled models should be infered wothout CFG (```guidance_scale=1.0```): +**⚠️ Warning!** all nocfg and diffusion distilled models should be inferred without CFG (```guidance_scale=1.0```): ```python model_id = "kandinskylab/Kandinsky-5.0-T2V-Lite-distilled16steps-5s-Diffusers" @@ -167,7 +167,7 @@ export_to_video(output, "output.mp4", fps=24, quality=9) ### Basic Image-to-Video Generation -**⚠️ Warning!** all Pro models should be infered with pipeline.enable_model_cpu_offload() +**⚠️ Warning!** all Pro models should be inferred with pipeline.enable_model_cpu_offload() ```python import torch from diffusers import Kandinsky5T2VPipeline @@ -178,7 +178,7 @@ model_id = "kandinskylab/Kandinsky-5.0-I2V-Pro-sft-5s-Diffusers" pipe = Kandinsky5T2VPipeline.from_pretrained(model_id, torch_dtype=torch.bfloat16) pipe = pipe.to("cuda") -pipeline.transformer.set_attention_backend("flex") # <--- Set attention bakend to Flex +pipeline.transformer.set_attention_backend("flex") # <--- Set attention backend to Flex pipeline.enable_model_cpu_offload() # <--- Enable cpu offloading for single GPU inference pipeline.transformer.compile(mode="max-autotune-no-cudagraphs", dynamic=True) # <--- Compile with max-autotune-no-cudagraphs diff --git a/docs/source/en/api/pipelines/stable_diffusion/sdxl_turbo.md b/docs/source/en/api/pipelines/stable_diffusion/sdxl_turbo.md index fb4f7dbbc18c..e5cbeaa8bf7c 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/sdxl_turbo.md +++ b/docs/source/en/api/pipelines/stable_diffusion/sdxl_turbo.md @@ -75,14 +75,14 @@ import torch pipeline_text2image = AutoPipelineForText2Image.from_pretrained("stabilityai/sdxl-turbo", torch_dtype=torch.float16, variant="fp16") pipeline_text2image = pipeline_text2image.to("cuda") -prompt = "A cinematic shot of a baby racoon wearing an intricate italian priest robe." +prompt = "A cinematic shot of a baby raccoon wearing an intricate italian priest robe." image = pipeline_text2image(prompt=prompt, guidance_scale=0.0, num_inference_steps=1).images[0] image ```
- generated image of a racoon in a robe + generated image of a raccoon in a robe
## Image-to-image diff --git a/docs/source/en/api/pipelines/wan.md b/docs/source/en/api/pipelines/wan.md index d5fdbbfe0f95..2c3b00aef690 100644 --- a/docs/source/en/api/pipelines/wan.md +++ b/docs/source/en/api/pipelines/wan.md @@ -531,7 +531,7 @@ export_to_video(output, "animated_advanced.mp4", fps=30) - Try lower `shift` values (`2.0` to `5.0`) for lower resolution videos and higher `shift` values (`7.0` to `12.0`) for higher resolution images. -- Wan 2.1 and 2.2 support using [LightX2V LoRAs](https://huggingface.co/Kijai/WanVideo_comfy/tree/main/Lightx2v) to speed up inference. Using them on Wan 2.2 is slightly more involed. Refer to [this code snippet](https://github.com/huggingface/diffusers/pull/12040#issuecomment-3144185272) to learn more. +- Wan 2.1 and 2.2 support using [LightX2V LoRAs](https://huggingface.co/Kijai/WanVideo_comfy/tree/main/Lightx2v) to speed up inference. Using them on Wan 2.2 is slightly more involved. Refer to [this code snippet](https://github.com/huggingface/diffusers/pull/12040#issuecomment-3144185272) to learn more. - Wan 2.2 has two denoisers. By default, LoRAs are only loaded into the first denoiser. One can set `load_into_transformer_2=True` to load LoRAs into the second denoiser. Refer to [this](https://github.com/huggingface/diffusers/pull/12074#issue-3292620048) and [this](https://github.com/huggingface/diffusers/pull/12074#issuecomment-3155896144) examples to learn more. diff --git a/src/diffusers/pipelines/bria/pipeline_bria.py b/src/diffusers/pipelines/bria/pipeline_bria.py index 9b80278af21e..2dff6a9189c7 100644 --- a/src/diffusers/pipelines/bria/pipeline_bria.py +++ b/src/diffusers/pipelines/bria/pipeline_bria.py @@ -604,7 +604,7 @@ def __call__( prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0) # 5. Prepare latent variables - num_channels_latents = self.transformer.config.in_channels // 4 # due to patch=2, we devide by 4 + num_channels_latents = self.transformer.config.in_channels // 4 # due to patch=2, we divide by 4 latents, latent_image_ids = self.prepare_latents( batch_size * num_images_per_prompt, num_channels_latents, diff --git a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_onnx_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_onnx_stable_diffusion_inpaint_legacy.py index f526dc419cea..8d4b88bb2dba 100644 --- a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_onnx_stable_diffusion_inpaint_legacy.py +++ b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_onnx_stable_diffusion_inpaint_legacy.py @@ -320,10 +320,10 @@ def __call__( Args: prompt (`str` or `list[str]`): The prompt or prompts to guide the image generation. - image (`nd.ndarray` or `PIL.Image.Image`): + image (`np.ndarray` or `PIL.Image.Image`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. This is the image whose masked region will be inpainted. - mask_image (`nd.ndarray` or `PIL.Image.Image`): + mask_image (`np.ndarray` or `PIL.Image.Image`): `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py index f74bf1e14900..9892199355e3 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py @@ -150,7 +150,7 @@ def prepare_mask_and_masked_image(image, mask, height, width): ValueError: ``torch.Tensor`` images should be in the ``[-1, 1]`` range. ValueError: ``torch.Tensor`` mask should be in the ``[0, 1]`` range. ValueError: ``mask`` and ``image`` should have the same spatial dimensions. TypeError: ``mask`` is a ``torch.Tensor`` but ``image`` is not - (ot the other way around). + (or the other way around). Returns: tuple[torch.Tensor]: The pair (mask, image) as ``torch.Tensor`` with 4 diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py index 796ab94b33a6..4e7d6edb1e6f 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py @@ -148,7 +148,7 @@ def prepare_mask_and_masked_image(image, mask, height, width): ValueError: ``torch.Tensor`` images should be in the ``[-1, 1]`` range. ValueError: ``torch.Tensor`` mask should be in the ``[0, 1]`` range. ValueError: ``mask`` and ``image`` should have the same spatial dimensions. TypeError: ``mask`` is a ``torch.Tensor`` but ``image`` is not - (ot the other way around). + (or the other way around). Returns: tuple[torch.Tensor]: The pair (mask, image) as ``torch.Tensor`` with 4 diff --git a/src/diffusers/pipelines/visualcloze/pipeline_visualcloze_combined.py b/src/diffusers/pipelines/visualcloze/pipeline_visualcloze_combined.py index f640fddc2bc5..ec7a85f46673 100644 --- a/src/diffusers/pipelines/visualcloze/pipeline_visualcloze_combined.py +++ b/src/diffusers/pipelines/visualcloze/pipeline_visualcloze_combined.py @@ -223,7 +223,7 @@ def check_inputs( f"got {type(task_prompt)} and {type(content_prompt)}" ) if len(content_prompt) != len(task_prompt): - raise ValueError("`task_prompt` and `content_prompt` must have the same length whe they are lists.") + raise ValueError("`task_prompt` and `content_prompt` must have the same length when they are lists.") for sample in image: if not isinstance(sample, list) or not isinstance(sample[0], list): diff --git a/src/diffusers/pipelines/visualcloze/pipeline_visualcloze_generation.py b/src/diffusers/pipelines/visualcloze/pipeline_visualcloze_generation.py index dd5d0603d6d0..bd84adcf6425 100644 --- a/src/diffusers/pipelines/visualcloze/pipeline_visualcloze_generation.py +++ b/src/diffusers/pipelines/visualcloze/pipeline_visualcloze_generation.py @@ -443,7 +443,7 @@ def check_inputs( f"got {type(task_prompt)} and {type(content_prompt)}" ) if len(content_prompt) != len(task_prompt): - raise ValueError("`task_prompt` and `content_prompt` must have the same length whe they are lists.") + raise ValueError("`task_prompt` and `content_prompt` must have the same length when they are lists.") for sample in image: if not isinstance(sample, list) or not isinstance(sample[0], list): diff --git a/src/diffusers/quantizers/gguf/utils.py b/src/diffusers/quantizers/gguf/utils.py index c7d9ec89bee6..0d29d52222b8 100644 --- a/src/diffusers/quantizers/gguf/utils.py +++ b/src/diffusers/quantizers/gguf/utils.py @@ -518,7 +518,7 @@ def dequantize_gguf_tensor(tensor): block_size, type_size = GGML_QUANT_SIZES[quant_type] - # Conver to plain tensor to avoid unnecessary __torch_function__ overhead. + # Convert to plain tensor to avoid unnecessary __torch_function__ overhead. tensor = tensor.as_tensor() tensor = tensor.view(torch.uint8)