diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff.py index 83023a8c74d9..1f3e6553e753 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff.py @@ -256,7 +256,11 @@ def encode_prompt( # representations. The `last_hidden_states` that we typically use for # obtaining the final prompt representations passes through the LayerNorm # layer. - prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + # CLIPTextModel was flattened in transformers>=5.6 (no longer wrapped in .text_model). + text_model = ( + self.text_encoder.text_model if hasattr(self.text_encoder, "text_model") else self.text_encoder + ) + prompt_embeds = text_model.final_layer_norm(prompt_embeds) if self.text_encoder is not None: prompt_embeds_dtype = self.text_encoder.dtype diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py index be1d6d72a009..e2b0273bb36e 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py @@ -301,7 +301,11 @@ def encode_prompt( # representations. The `last_hidden_states` that we typically use for # obtaining the final prompt representations passes through the LayerNorm # layer. - prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + # CLIPTextModel was flattened in transformers>=5.6 (no longer wrapped in .text_model). + text_model = ( + self.text_encoder.text_model if hasattr(self.text_encoder, "text_model") else self.text_encoder + ) + prompt_embeds = text_model.final_layer_norm(prompt_embeds) if self.text_encoder is not None: prompt_embeds_dtype = self.text_encoder.dtype diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py index 9c65999e3a17..0a0858239e38 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py @@ -310,7 +310,11 @@ def encode_prompt( # representations. The `last_hidden_states` that we typically use for # obtaining the final prompt representations passes through the LayerNorm # layer. - prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + # CLIPTextModel was flattened in transformers>=5.6 (no longer wrapped in .text_model). + text_model = ( + self.text_encoder.text_model if hasattr(self.text_encoder, "text_model") else self.text_encoder + ) + prompt_embeds = text_model.final_layer_norm(prompt_embeds) if self.text_encoder is not None: prompt_embeds_dtype = self.text_encoder.dtype diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py index 08c1190d9b6d..9fba7536ee2a 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py @@ -358,7 +358,11 @@ def encode_prompt( # representations. The `last_hidden_states` that we typically use for # obtaining the final prompt representations passes through the LayerNorm # layer. - prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + # CLIPTextModel was flattened in transformers>=5.6 (no longer wrapped in .text_model). + text_model = ( + self.text_encoder.text_model if hasattr(self.text_encoder, "text_model") else self.text_encoder + ) + prompt_embeds = text_model.final_layer_norm(prompt_embeds) if self.text_encoder is not None: prompt_embeds_dtype = self.text_encoder.dtype diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py index e383e9c631d0..8761f2158761 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py @@ -389,7 +389,11 @@ def encode_prompt( # representations. The `last_hidden_states` that we typically use for # obtaining the final prompt representations passes through the LayerNorm # layer. - prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + # CLIPTextModel was flattened in transformers>=5.6 (no longer wrapped in .text_model). + text_model = ( + self.text_encoder.text_model if hasattr(self.text_encoder, "text_model") else self.text_encoder + ) + prompt_embeds = text_model.final_layer_norm(prompt_embeds) if self.text_encoder is not None: prompt_embeds_dtype = self.text_encoder.dtype diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py index fb3dc94d6b56..a61d7de7a6a9 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py @@ -400,7 +400,11 @@ def encode_prompt( # representations. The `last_hidden_states` that we typically use for # obtaining the final prompt representations passes through the LayerNorm # layer. - prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + # CLIPTextModel was flattened in transformers>=5.6 (no longer wrapped in .text_model). + text_model = ( + self.text_encoder.text_model if hasattr(self.text_encoder, "text_model") else self.text_encoder + ) + prompt_embeds = text_model.final_layer_norm(prompt_embeds) if self.text_encoder is not None: prompt_embeds_dtype = self.text_encoder.dtype diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py index f0cfabc66f25..b180fa4d3f2d 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py @@ -378,7 +378,11 @@ def encode_prompt( # representations. The `last_hidden_states` that we typically use for # obtaining the final prompt representations passes through the LayerNorm # layer. - prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + # CLIPTextModel was flattened in transformers>=5.6 (no longer wrapped in .text_model). + text_model = ( + self.text_encoder.text_model if hasattr(self.text_encoder, "text_model") else self.text_encoder + ) + prompt_embeds = text_model.final_layer_norm(prompt_embeds) if self.text_encoder is not None: prompt_embeds_dtype = self.text_encoder.dtype diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py index d34278d0086b..8819cd521235 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py @@ -384,7 +384,11 @@ def encode_prompt( # representations. The `last_hidden_states` that we typically use for # obtaining the final prompt representations passes through the LayerNorm # layer. - prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + # CLIPTextModel was flattened in transformers>=5.6 (no longer wrapped in .text_model). + text_model = ( + self.text_encoder.text_model if hasattr(self.text_encoder, "text_model") else self.text_encoder + ) + prompt_embeds = text_model.final_layer_norm(prompt_embeds) if self.text_encoder is not None: prompt_embeds_dtype = self.text_encoder.dtype diff --git a/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py b/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py index 1094ecf09a01..815672af7b99 100644 --- a/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +++ b/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py @@ -429,7 +429,11 @@ def encode_prompt( # representations. The `last_hidden_states` that we typically use for # obtaining the final prompt representations passes through the LayerNorm # layer. - prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + # CLIPTextModel was flattened in transformers>=5.6 (no longer wrapped in .text_model). + text_model = ( + self.text_encoder.text_model if hasattr(self.text_encoder, "text_model") else self.text_encoder + ) + prompt_embeds = text_model.final_layer_norm(prompt_embeds) if self.text_encoder is not None: prompt_embeds_dtype = self.text_encoder.dtype diff --git a/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py index f3c35e7c8213..0f2d22345d86 100644 --- a/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +++ b/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py @@ -457,7 +457,11 @@ def encode_prompt( # representations. The `last_hidden_states` that we typically use for # obtaining the final prompt representations passes through the LayerNorm # layer. - prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + # CLIPTextModel was flattened in transformers>=5.6 (no longer wrapped in .text_model). + text_model = ( + self.text_encoder.text_model if hasattr(self.text_encoder, "text_model") else self.text_encoder + ) + prompt_embeds = text_model.final_layer_norm(prompt_embeds) if self.text_encoder is not None: prompt_embeds_dtype = self.text_encoder.dtype diff --git a/src/diffusers/pipelines/deprecated/controlnet_xs/pipeline_controlnet_xs.py b/src/diffusers/pipelines/deprecated/controlnet_xs/pipeline_controlnet_xs.py index d3fe2488a922..0cc1b10fb682 100644 --- a/src/diffusers/pipelines/deprecated/controlnet_xs/pipeline_controlnet_xs.py +++ b/src/diffusers/pipelines/deprecated/controlnet_xs/pipeline_controlnet_xs.py @@ -334,7 +334,11 @@ def encode_prompt( # representations. The `last_hidden_states` that we typically use for # obtaining the final prompt representations passes through the LayerNorm # layer. - prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + # CLIPTextModel was flattened in transformers>=5.6 (no longer wrapped in .text_model). + text_model = ( + self.text_encoder.text_model if hasattr(self.text_encoder, "text_model") else self.text_encoder + ) + prompt_embeds = text_model.final_layer_norm(prompt_embeds) if self.text_encoder is not None: prompt_embeds_dtype = self.text_encoder.dtype diff --git a/src/diffusers/pipelines/deprecated/i2vgen_xl/pipeline_i2vgen_xl.py b/src/diffusers/pipelines/deprecated/i2vgen_xl/pipeline_i2vgen_xl.py index 7712743e6bdd..b73693626c09 100644 --- a/src/diffusers/pipelines/deprecated/i2vgen_xl/pipeline_i2vgen_xl.py +++ b/src/diffusers/pipelines/deprecated/i2vgen_xl/pipeline_i2vgen_xl.py @@ -245,7 +245,11 @@ def encode_prompt( # representations. The `last_hidden_states` that we typically use for # obtaining the final prompt representations passes through the LayerNorm # layer. - prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + # CLIPTextModel was flattened in transformers>=5.6 (no longer wrapped in .text_model). + text_model = ( + self.text_encoder.text_model if hasattr(self.text_encoder, "text_model") else self.text_encoder + ) + prompt_embeds = text_model.final_layer_norm(prompt_embeds) if self.text_encoder is not None: prompt_embeds_dtype = self.text_encoder.dtype @@ -315,7 +319,11 @@ def encode_prompt( # representations. The `last_hidden_states` that we typically use for # obtaining the final prompt representations passes through the LayerNorm # layer. - negative_prompt_embeds = self.text_encoder.text_model.final_layer_norm(negative_prompt_embeds) + # CLIPTextModel was flattened in transformers>=5.6 (no longer wrapped in .text_model). + text_model = ( + self.text_encoder.text_model if hasattr(self.text_encoder, "text_model") else self.text_encoder + ) + negative_prompt_embeds = text_model.final_layer_norm(negative_prompt_embeds) if self.do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method diff --git a/src/diffusers/pipelines/deprecated/pia/pipeline_pia.py b/src/diffusers/pipelines/deprecated/pia/pipeline_pia.py index 93366d10eb9e..f78a056a02a2 100644 --- a/src/diffusers/pipelines/deprecated/pia/pipeline_pia.py +++ b/src/diffusers/pipelines/deprecated/pia/pipeline_pia.py @@ -318,7 +318,11 @@ def encode_prompt( # representations. The `last_hidden_states` that we typically use for # obtaining the final prompt representations passes through the LayerNorm # layer. - prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + # CLIPTextModel was flattened in transformers>=5.6 (no longer wrapped in .text_model). + text_model = ( + self.text_encoder.text_model if hasattr(self.text_encoder, "text_model") else self.text_encoder + ) + prompt_embeds = text_model.final_layer_norm(prompt_embeds) if self.text_encoder is not None: prompt_embeds_dtype = self.text_encoder.dtype diff --git a/src/diffusers/pipelines/deprecated/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py b/src/diffusers/pipelines/deprecated/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py index 20240d07dfa5..70edb60c8fe6 100644 --- a/src/diffusers/pipelines/deprecated/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +++ b/src/diffusers/pipelines/deprecated/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py @@ -398,7 +398,11 @@ def encode_prompt( # representations. The `last_hidden_states` that we typically use for # obtaining the final prompt representations passes through the LayerNorm # layer. - prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + # CLIPTextModel was flattened in transformers>=5.6 (no longer wrapped in .text_model). + text_model = ( + self.text_encoder.text_model if hasattr(self.text_encoder, "text_model") else self.text_encoder + ) + prompt_embeds = text_model.final_layer_norm(prompt_embeds) if self.text_encoder is not None: prompt_embeds_dtype = self.text_encoder.dtype diff --git a/src/diffusers/pipelines/deprecated/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py b/src/diffusers/pipelines/deprecated/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py index ee8675678f2d..251da20a903c 100644 --- a/src/diffusers/pipelines/deprecated/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +++ b/src/diffusers/pipelines/deprecated/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py @@ -524,7 +524,11 @@ def encode_prompt( # representations. The `last_hidden_states` that we typically use for # obtaining the final prompt representations passes through the LayerNorm # layer. - prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + # CLIPTextModel was flattened in transformers>=5.6 (no longer wrapped in .text_model). + text_model = ( + self.text_encoder.text_model if hasattr(self.text_encoder, "text_model") else self.text_encoder + ) + prompt_embeds = text_model.final_layer_norm(prompt_embeds) if self.text_encoder is not None: prompt_embeds_dtype = self.text_encoder.dtype diff --git a/src/diffusers/pipelines/deprecated/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py b/src/diffusers/pipelines/deprecated/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py index 38f5af842e1b..c97be4f8f077 100644 --- a/src/diffusers/pipelines/deprecated/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +++ b/src/diffusers/pipelines/deprecated/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py @@ -322,7 +322,11 @@ def encode_prompt( # representations. The `last_hidden_states` that we typically use for # obtaining the final prompt representations passes through the LayerNorm # layer. - prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + # CLIPTextModel was flattened in transformers>=5.6 (no longer wrapped in .text_model). + text_model = ( + self.text_encoder.text_model if hasattr(self.text_encoder, "text_model") else self.text_encoder + ) + prompt_embeds = text_model.final_layer_norm(prompt_embeds) if self.text_encoder is not None: prompt_embeds_dtype = self.text_encoder.dtype diff --git a/src/diffusers/pipelines/deprecated/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py b/src/diffusers/pipelines/deprecated/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py index d72d12a64945..f7452d2cfc54 100644 --- a/src/diffusers/pipelines/deprecated/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +++ b/src/diffusers/pipelines/deprecated/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py @@ -353,7 +353,11 @@ def encode_prompt( # representations. The `last_hidden_states` that we typically use for # obtaining the final prompt representations passes through the LayerNorm # layer. - prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + # CLIPTextModel was flattened in transformers>=5.6 (no longer wrapped in .text_model). + text_model = ( + self.text_encoder.text_model if hasattr(self.text_encoder, "text_model") else self.text_encoder + ) + prompt_embeds = text_model.final_layer_norm(prompt_embeds) if self.text_encoder is not None: prompt_embeds_dtype = self.text_encoder.dtype diff --git a/src/diffusers/pipelines/deprecated/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py b/src/diffusers/pipelines/deprecated/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py index 70a16f5d522f..5fe51c4677fb 100644 --- a/src/diffusers/pipelines/deprecated/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +++ b/src/diffusers/pipelines/deprecated/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py @@ -414,7 +414,11 @@ def encode_prompt( # representations. The `last_hidden_states` that we typically use for # obtaining the final prompt representations passes through the LayerNorm # layer. - prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + # CLIPTextModel was flattened in transformers>=5.6 (no longer wrapped in .text_model). + text_model = ( + self.text_encoder.text_model if hasattr(self.text_encoder, "text_model") else self.text_encoder + ) + prompt_embeds = text_model.final_layer_norm(prompt_embeds) if self.text_encoder is not None: prompt_embeds_dtype = self.text_encoder.dtype diff --git a/src/diffusers/pipelines/deprecated/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py b/src/diffusers/pipelines/deprecated/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py index 481c9c93ddde..6b884f00c3f5 100644 --- a/src/diffusers/pipelines/deprecated/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +++ b/src/diffusers/pipelines/deprecated/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py @@ -385,7 +385,11 @@ def encode_prompt( # representations. The `last_hidden_states` that we typically use for # obtaining the final prompt representations passes through the LayerNorm # layer. - prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + # CLIPTextModel was flattened in transformers>=5.6 (no longer wrapped in .text_model). + text_model = ( + self.text_encoder.text_model if hasattr(self.text_encoder, "text_model") else self.text_encoder + ) + prompt_embeds = text_model.final_layer_norm(prompt_embeds) if self.text_encoder is not None: prompt_embeds_dtype = self.text_encoder.dtype diff --git a/src/diffusers/pipelines/deprecated/stable_diffusion_sag/pipeline_stable_diffusion_sag.py b/src/diffusers/pipelines/deprecated/stable_diffusion_sag/pipeline_stable_diffusion_sag.py index 678ef74f387c..2111469180b5 100644 --- a/src/diffusers/pipelines/deprecated/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +++ b/src/diffusers/pipelines/deprecated/stable_diffusion_sag/pipeline_stable_diffusion_sag.py @@ -313,7 +313,11 @@ def encode_prompt( # representations. The `last_hidden_states` that we typically use for # obtaining the final prompt representations passes through the LayerNorm # layer. - prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + # CLIPTextModel was flattened in transformers>=5.6 (no longer wrapped in .text_model). + text_model = ( + self.text_encoder.text_model if hasattr(self.text_encoder, "text_model") else self.text_encoder + ) + prompt_embeds = text_model.final_layer_norm(prompt_embeds) if self.text_encoder is not None: prompt_embeds_dtype = self.text_encoder.dtype diff --git a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py index a4fef21ab82b..d7265de6a22d 100644 --- a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +++ b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py @@ -390,7 +390,11 @@ def encode_prompt( # representations. The `last_hidden_states` that we typically use for # obtaining the final prompt representations passes through the LayerNorm # layer. - prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + # CLIPTextModel was flattened in transformers>=5.6 (no longer wrapped in .text_model). + text_model = ( + self.text_encoder.text_model if hasattr(self.text_encoder, "text_model") else self.text_encoder + ) + prompt_embeds = text_model.final_layer_norm(prompt_embeds) if self.text_encoder is not None: prompt_embeds_dtype = self.text_encoder.dtype diff --git a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py index 650695b604c1..77ff58827495 100644 --- a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +++ b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py @@ -361,7 +361,11 @@ def encode_prompt( # representations. The `last_hidden_states` that we typically use for # obtaining the final prompt representations passes through the LayerNorm # layer. - prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + # CLIPTextModel was flattened in transformers>=5.6 (no longer wrapped in .text_model). + text_model = ( + self.text_encoder.text_model if hasattr(self.text_encoder, "text_model") else self.text_encoder + ) + prompt_embeds = text_model.final_layer_norm(prompt_embeds) if self.text_encoder is not None: prompt_embeds_dtype = self.text_encoder.dtype diff --git a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py index 851820c00aed..6743ac4e40da 100644 --- a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +++ b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py @@ -294,7 +294,11 @@ def encode_prompt( # representations. The `last_hidden_states` that we typically use for # obtaining the final prompt representations passes through the LayerNorm # layer. - prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + # CLIPTextModel was flattened in transformers>=5.6 (no longer wrapped in .text_model). + text_model = ( + self.text_encoder.text_model if hasattr(self.text_encoder, "text_model") else self.text_encoder + ) + prompt_embeds = text_model.final_layer_norm(prompt_embeds) if self.text_encoder is not None: prompt_embeds_dtype = self.text_encoder.dtype diff --git a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py index ea81be87a0f4..c791abaa2f4d 100644 --- a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +++ b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py @@ -291,7 +291,11 @@ def encode_prompt( # representations. The `last_hidden_states` that we typically use for # obtaining the final prompt representations passes through the LayerNorm # layer. - prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + # CLIPTextModel was flattened in transformers>=5.6 (no longer wrapped in .text_model). + text_model = ( + self.text_encoder.text_model if hasattr(self.text_encoder, "text_model") else self.text_encoder + ) + prompt_embeds = text_model.final_layer_norm(prompt_embeds) if self.text_encoder is not None: prompt_embeds_dtype = self.text_encoder.dtype diff --git a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py index f88c6d8fbc30..3f90c01cffef 100644 --- a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +++ b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py @@ -509,7 +509,11 @@ def encode_prompt( # representations. The `last_hidden_states` that we typically use for # obtaining the final prompt representations passes through the LayerNorm # layer. - prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + # CLIPTextModel was flattened in transformers>=5.6 (no longer wrapped in .text_model). + text_model = ( + self.text_encoder.text_model if hasattr(self.text_encoder, "text_model") else self.text_encoder + ) + prompt_embeds = text_model.final_layer_norm(prompt_embeds) if self.text_encoder is not None: prompt_embeds_dtype = self.text_encoder.dtype diff --git a/src/diffusers/pipelines/deprecated/text_to_video_synthesis/pipeline_text_to_video_synth.py b/src/diffusers/pipelines/deprecated/text_to_video_synthesis/pipeline_text_to_video_synth.py index 33d1c378fcc0..f4c78bc47bfa 100644 --- a/src/diffusers/pipelines/deprecated/text_to_video_synthesis/pipeline_text_to_video_synth.py +++ b/src/diffusers/pipelines/deprecated/text_to_video_synthesis/pipeline_text_to_video_synth.py @@ -261,7 +261,11 @@ def encode_prompt( # representations. The `last_hidden_states` that we typically use for # obtaining the final prompt representations passes through the LayerNorm # layer. - prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + # CLIPTextModel was flattened in transformers>=5.6 (no longer wrapped in .text_model). + text_model = ( + self.text_encoder.text_model if hasattr(self.text_encoder, "text_model") else self.text_encoder + ) + prompt_embeds = text_model.final_layer_norm(prompt_embeds) if self.text_encoder is not None: prompt_embeds_dtype = self.text_encoder.dtype diff --git a/src/diffusers/pipelines/deprecated/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py b/src/diffusers/pipelines/deprecated/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py index b135d128b269..2f8d5b11b63f 100644 --- a/src/diffusers/pipelines/deprecated/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +++ b/src/diffusers/pipelines/deprecated/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py @@ -296,7 +296,11 @@ def encode_prompt( # representations. The `last_hidden_states` that we typically use for # obtaining the final prompt representations passes through the LayerNorm # layer. - prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + # CLIPTextModel was flattened in transformers>=5.6 (no longer wrapped in .text_model). + text_model = ( + self.text_encoder.text_model if hasattr(self.text_encoder, "text_model") else self.text_encoder + ) + prompt_embeds = text_model.final_layer_norm(prompt_embeds) if self.text_encoder is not None: prompt_embeds_dtype = self.text_encoder.dtype diff --git a/src/diffusers/pipelines/deprecated/text_to_video_synthesis/pipeline_text_to_video_zero.py b/src/diffusers/pipelines/deprecated/text_to_video_synthesis/pipeline_text_to_video_zero.py index 6ea24ae2c817..2a56fd4fed9a 100644 --- a/src/diffusers/pipelines/deprecated/text_to_video_synthesis/pipeline_text_to_video_zero.py +++ b/src/diffusers/pipelines/deprecated/text_to_video_synthesis/pipeline_text_to_video_zero.py @@ -918,7 +918,11 @@ def encode_prompt( # representations. The `last_hidden_states` that we typically use for # obtaining the final prompt representations passes through the LayerNorm # layer. - prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + # CLIPTextModel was flattened in transformers>=5.6 (no longer wrapped in .text_model). + text_model = ( + self.text_encoder.text_model if hasattr(self.text_encoder, "text_model") else self.text_encoder + ) + prompt_embeds = text_model.final_layer_norm(prompt_embeds) if self.text_encoder is not None: prompt_embeds_dtype = self.text_encoder.dtype diff --git a/src/diffusers/pipelines/deprecated/unidiffuser/pipeline_unidiffuser.py b/src/diffusers/pipelines/deprecated/unidiffuser/pipeline_unidiffuser.py index 7e55075cc209..6478a4141c7d 100644 --- a/src/diffusers/pipelines/deprecated/unidiffuser/pipeline_unidiffuser.py +++ b/src/diffusers/pipelines/deprecated/unidiffuser/pipeline_unidiffuser.py @@ -523,7 +523,11 @@ def encode_prompt( # representations. The `last_hidden_states` that we typically use for # obtaining the final prompt representations passes through the LayerNorm # layer. - prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + # CLIPTextModel was flattened in transformers>=5.6 (no longer wrapped in .text_model). + text_model = ( + self.text_encoder.text_model if hasattr(self.text_encoder, "text_model") else self.text_encoder + ) + prompt_embeds = text_model.final_layer_norm(prompt_embeds) if self.text_encoder is not None: prompt_embeds_dtype = self.text_encoder.dtype diff --git a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py index 424a2c46e06b..596032aa2984 100644 --- a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +++ b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py @@ -343,7 +343,11 @@ def encode_prompt( # representations. The `last_hidden_states` that we typically use for # obtaining the final prompt representations passes through the LayerNorm # layer. - prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + # CLIPTextModel was flattened in transformers>=5.6 (no longer wrapped in .text_model). + text_model = ( + self.text_encoder.text_model if hasattr(self.text_encoder, "text_model") else self.text_encoder + ) + prompt_embeds = text_model.final_layer_norm(prompt_embeds) if self.text_encoder is not None: prompt_embeds_dtype = self.text_encoder.dtype diff --git a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py index 60f59ec7f9d3..99d929304002 100644 --- a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +++ b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py @@ -328,7 +328,11 @@ def encode_prompt( # representations. The `last_hidden_states` that we typically use for # obtaining the final prompt representations passes through the LayerNorm # layer. - prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + # CLIPTextModel was flattened in transformers>=5.6 (no longer wrapped in .text_model). + text_model = ( + self.text_encoder.text_model if hasattr(self.text_encoder, "text_model") else self.text_encoder + ) + prompt_embeds = text_model.final_layer_norm(prompt_embeds) if self.text_encoder is not None: prompt_embeds_dtype = self.text_encoder.dtype diff --git a/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py b/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py index 864f9feeb5aa..bf01319baa91 100644 --- a/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +++ b/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py @@ -680,7 +680,11 @@ def encode_prompt( # representations. The `last_hidden_states` that we typically use for # obtaining the final prompt representations passes through the LayerNorm # layer. - editing_prompt_embeds = self.text_encoder.text_model.final_layer_norm(editing_prompt_embeds) + # CLIPTextModel was flattened in transformers>=5.6 (no longer wrapped in .text_model). + text_model = ( + self.text_encoder.text_model if hasattr(self.text_encoder, "text_model") else self.text_encoder + ) + editing_prompt_embeds = text_model.final_layer_norm(editing_prompt_embeds) editing_prompt_embeds = editing_prompt_embeds.to(dtype=negative_prompt_embeds.dtype, device=device) diff --git a/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py b/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py index 807c42d21bb4..cfdfb3a2d500 100644 --- a/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +++ b/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py @@ -377,7 +377,11 @@ def encode_prompt( # representations. The `last_hidden_states` that we typically use for # obtaining the final prompt representations passes through the LayerNorm # layer. - prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + # CLIPTextModel was flattened in transformers>=5.6 (no longer wrapped in .text_model). + text_model = ( + self.text_encoder.text_model if hasattr(self.text_encoder, "text_model") else self.text_encoder + ) + prompt_embeds = text_model.final_layer_norm(prompt_embeds) if self.text_encoder is not None: prompt_embeds_dtype = self.text_encoder.dtype diff --git a/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py b/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py index ebc2e882868c..452b6327affa 100644 --- a/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py +++ b/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py @@ -353,7 +353,11 @@ def encode_prompt( # representations. The `last_hidden_states` that we typically use for # obtaining the final prompt representations passes through the LayerNorm # layer. - prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + # CLIPTextModel was flattened in transformers>=5.6 (no longer wrapped in .text_model). + text_model = ( + self.text_encoder.text_model if hasattr(self.text_encoder, "text_model") else self.text_encoder + ) + prompt_embeds = text_model.final_layer_norm(prompt_embeds) if self.text_encoder is not None: prompt_embeds_dtype = self.text_encoder.dtype diff --git a/src/diffusers/pipelines/pag/pipeline_pag_sd.py b/src/diffusers/pipelines/pag/pipeline_pag_sd.py index 26ea717556c5..46b820ee9261 100644 --- a/src/diffusers/pipelines/pag/pipeline_pag_sd.py +++ b/src/diffusers/pipelines/pag/pipeline_pag_sd.py @@ -406,7 +406,11 @@ def encode_prompt( # representations. The `last_hidden_states` that we typically use for # obtaining the final prompt representations passes through the LayerNorm # layer. - prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + # CLIPTextModel was flattened in transformers>=5.6 (no longer wrapped in .text_model). + text_model = ( + self.text_encoder.text_model if hasattr(self.text_encoder, "text_model") else self.text_encoder + ) + prompt_embeds = text_model.final_layer_norm(prompt_embeds) if self.text_encoder is not None: prompt_embeds_dtype = self.text_encoder.dtype diff --git a/src/diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py b/src/diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py index c15865fdd11b..08e31516029e 100644 --- a/src/diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +++ b/src/diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py @@ -267,7 +267,11 @@ def encode_prompt( # representations. The `last_hidden_states` that we typically use for # obtaining the final prompt representations passes through the LayerNorm # layer. - prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + # CLIPTextModel was flattened in transformers>=5.6 (no longer wrapped in .text_model). + text_model = ( + self.text_encoder.text_model if hasattr(self.text_encoder, "text_model") else self.text_encoder + ) + prompt_embeds = text_model.final_layer_norm(prompt_embeds) if self.text_encoder is not None: prompt_embeds_dtype = self.text_encoder.dtype diff --git a/src/diffusers/pipelines/pag/pipeline_pag_sd_img2img.py b/src/diffusers/pipelines/pag/pipeline_pag_sd_img2img.py index 822483eca995..2cf416475514 100644 --- a/src/diffusers/pipelines/pag/pipeline_pag_sd_img2img.py +++ b/src/diffusers/pipelines/pag/pipeline_pag_sd_img2img.py @@ -401,7 +401,11 @@ def encode_prompt( # representations. The `last_hidden_states` that we typically use for # obtaining the final prompt representations passes through the LayerNorm # layer. - prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + # CLIPTextModel was flattened in transformers>=5.6 (no longer wrapped in .text_model). + text_model = ( + self.text_encoder.text_model if hasattr(self.text_encoder, "text_model") else self.text_encoder + ) + prompt_embeds = text_model.final_layer_norm(prompt_embeds) if self.text_encoder is not None: prompt_embeds_dtype = self.text_encoder.dtype diff --git a/src/diffusers/pipelines/pag/pipeline_pag_sd_inpaint.py b/src/diffusers/pipelines/pag/pipeline_pag_sd_inpaint.py index a61b8ec14f08..0e1c3259dda6 100644 --- a/src/diffusers/pipelines/pag/pipeline_pag_sd_inpaint.py +++ b/src/diffusers/pipelines/pag/pipeline_pag_sd_inpaint.py @@ -436,7 +436,11 @@ def encode_prompt( # representations. The `last_hidden_states` that we typically use for # obtaining the final prompt representations passes through the LayerNorm # layer. - prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + # CLIPTextModel was flattened in transformers>=5.6 (no longer wrapped in .text_model). + text_model = ( + self.text_encoder.text_model if hasattr(self.text_encoder, "text_model") else self.text_encoder + ) + prompt_embeds = text_model.final_layer_norm(prompt_embeds) if self.text_encoder is not None: prompt_embeds_dtype = self.text_encoder.dtype diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index 42d62f53a20a..02c580d53ed3 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -434,7 +434,11 @@ def encode_prompt( # representations. The `last_hidden_states` that we typically use for # obtaining the final prompt representations passes through the LayerNorm # layer. - prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + # CLIPTextModel was flattened in transformers>=5.6 (no longer wrapped in .text_model). + text_model = ( + self.text_encoder.text_model if hasattr(self.text_encoder, "text_model") else self.text_encoder + ) + prompt_embeds = text_model.final_layer_norm(prompt_embeds) if self.text_encoder is not None: prompt_embeds_dtype = self.text_encoder.dtype diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py index 3ec64c30763f..58934dc8446a 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py @@ -306,7 +306,11 @@ def encode_prompt( # representations. The `last_hidden_states` that we typically use for # obtaining the final prompt representations passes through the LayerNorm # layer. - prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + # CLIPTextModel was flattened in transformers>=5.6 (no longer wrapped in .text_model). + text_model = ( + self.text_encoder.text_model if hasattr(self.text_encoder, "text_model") else self.text_encoder + ) + prompt_embeds = text_model.final_layer_norm(prompt_embeds) if self.text_encoder is not None: prompt_embeds_dtype = self.text_encoder.dtype diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index abcd06a2bb3b..029e2fd90dc1 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -460,7 +460,11 @@ def encode_prompt( # representations. The `last_hidden_states` that we typically use for # obtaining the final prompt representations passes through the LayerNorm # layer. - prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + # CLIPTextModel was flattened in transformers>=5.6 (no longer wrapped in .text_model). + text_model = ( + self.text_encoder.text_model if hasattr(self.text_encoder, "text_model") else self.text_encoder + ) + prompt_embeds = text_model.final_layer_norm(prompt_embeds) if self.text_encoder is not None: prompt_embeds_dtype = self.text_encoder.dtype diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index 8cc0c2bbea70..b56d87bbc2e3 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -414,7 +414,11 @@ def encode_prompt( # representations. The `last_hidden_states` that we typically use for # obtaining the final prompt representations passes through the LayerNorm # layer. - prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + # CLIPTextModel was flattened in transformers>=5.6 (no longer wrapped in .text_model). + text_model = ( + self.text_encoder.text_model if hasattr(self.text_encoder, "text_model") else self.text_encoder + ) + prompt_embeds = text_model.final_layer_norm(prompt_embeds) if self.text_encoder is not None: prompt_embeds_dtype = self.text_encoder.dtype diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py index 4dcc7fcc5718..5d3a136db907 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py @@ -319,7 +319,11 @@ def encode_prompt( # representations. The `last_hidden_states` that we typically use for # obtaining the final prompt representations passes through the LayerNorm # layer. - prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + # CLIPTextModel was flattened in transformers>=5.6 (no longer wrapped in .text_model). + text_model = ( + self.text_encoder.text_model if hasattr(self.text_encoder, "text_model") else self.text_encoder + ) + prompt_embeds = text_model.final_layer_norm(prompt_embeds) if self.text_encoder is not None: prompt_embeds_dtype = self.text_encoder.dtype diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py index 7015e9727ea5..28c24e53063d 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py @@ -399,7 +399,11 @@ def encode_prompt( # representations. The `last_hidden_states` that we typically use for # obtaining the final prompt representations passes through the LayerNorm # layer. - prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + # CLIPTextModel was flattened in transformers>=5.6 (no longer wrapped in .text_model). + text_model = ( + self.text_encoder.text_model if hasattr(self.text_encoder, "text_model") else self.text_encoder + ) + prompt_embeds = text_model.final_layer_norm(prompt_embeds) if self.text_encoder is not None: prompt_embeds_dtype = self.text_encoder.dtype diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py index bb96e5db0295..0af17d11cc42 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py @@ -361,7 +361,11 @@ def encode_prompt( # representations. The `last_hidden_states` that we typically use for # obtaining the final prompt representations passes through the LayerNorm # layer. - prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + # CLIPTextModel was flattened in transformers>=5.6 (no longer wrapped in .text_model). + text_model = ( + self.text_encoder.text_model if hasattr(self.text_encoder, "text_model") else self.text_encoder + ) + prompt_embeds = text_model.final_layer_norm(prompt_embeds) if self.text_encoder is not None: prompt_embeds_dtype = self.text_encoder.dtype diff --git a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py index ffb877cfd0f6..096c41bfa72d 100644 --- a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +++ b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py @@ -413,7 +413,11 @@ def encode_prompt( # representations. The `last_hidden_states` that we typically use for # obtaining the final prompt representations passes through the LayerNorm # layer. - prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + # CLIPTextModel was flattened in transformers>=5.6 (no longer wrapped in .text_model). + text_model = ( + self.text_encoder.text_model if hasattr(self.text_encoder, "text_model") else self.text_encoder + ) + prompt_embeds = text_model.final_layer_norm(prompt_embeds) if self.text_encoder is not None: prompt_embeds_dtype = self.text_encoder.dtype diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py index c9d9525b2e45..b3b4080eba0b 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py @@ -185,6 +185,22 @@ def test_stable_diffusion_ddim(self): assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + def test_stable_diffusion_clip_skip(self): + # `clip_skip` re-applies the text encoder's final_layer_norm by hand, so it must run on + # transformers>=5.6 where `CLIPTextModel` no longer exposes a `.text_model` wrapper. + device = "cpu" # ensure determinism for the device-dependent torch.Generator + + components = self.get_dummy_components() + sd_pipe = StableDiffusionPipeline(**components) + sd_pipe = sd_pipe.to(torch_device) + sd_pipe.set_progress_bar_config(disable=None) + + output_default = sd_pipe(**self.get_dummy_inputs(device)).images + output_clip_skip = sd_pipe(**self.get_dummy_inputs(device), clip_skip=1).images + + assert output_clip_skip.shape == (1, 64, 64, 3) + assert not np.allclose(output_default, output_clip_skip, atol=1e-3), "clip_skip should change the output" + def test_stable_diffusion_lcm(self): device = "cpu" # ensure determinism for the device-dependent torch.Generator