From 33c5d125cb1d25fdcfc04ce450f6d51777bcfa3a Mon Sep 17 00:00:00 2001
From: Sayak Paul
Date: Thu, 11 Apr 2024 09:07:38 +0530
Subject: [PATCH] [Core] fix img2img pipeline for Playground (#7627)

* playground vae encoding should use std and mean of the vae.

* style.

* fix-copies.
---
 .../controlnet/pipeline_controlnet_sd_xl_img2img.py | 13 ++++++++++++-
 .../pipeline_stable_diffusion_xl_img2img.py         | 13 ++++++++++++-
 2 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py
index 5938d99b734e..d32e7d81649d 100644
--- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py
+++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py
@@ -898,6 +898,12 @@ def prepare_latents(
                 f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
             )
 
+        latents_mean = latents_std = None
+        if hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None:
+            latents_mean = torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1)
+        if hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None:
+            latents_std = torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1)
+
         # Offload text encoder if `enable_model_cpu_offload` was enabled
         if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
             self.text_encoder_2.to("cpu")
@@ -935,7 +941,12 @@ def prepare_latents(
                 self.vae.to(dtype)
 
             init_latents = init_latents.to(dtype)
-            init_latents = self.vae.config.scaling_factor * init_latents
+            if latents_mean is not None and latents_std is not None:
+                latents_mean = latents_mean.to(device=self.device, dtype=dtype)
+                latents_std = latents_std.to(device=self.device, dtype=dtype)
+                init_latents = (init_latents - latents_mean) * self.vae.config.scaling_factor / latents_std
+            else:
+                init_latents = self.vae.config.scaling_factor * init_latents
 
         if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
             # expand init_latents for batch_size
diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py
index 8bcfcfbfe57a..b72b19d5c1ef 100644
--- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py
@@ -665,6 +665,12 @@ def prepare_latents(
                 f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
             )
 
+        latents_mean = latents_std = None
+        if hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None:
+            latents_mean = torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1)
+        if hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None:
+            latents_std = torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1)
+
         # Offload text encoder if `enable_model_cpu_offload` was enabled
         if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
             self.text_encoder_2.to("cpu")
@@ -702,7 +708,12 @@ def prepare_latents(
                 self.vae.to(dtype)
 
             init_latents = init_latents.to(dtype)
-            init_latents = self.vae.config.scaling_factor * init_latents
+            if latents_mean is not None and latents_std is not None:
+                latents_mean = latents_mean.to(device=self.device, dtype=dtype)
+                latents_std = latents_std.to(device=self.device, dtype=dtype)
+                init_latents = (init_latents - latents_mean) * self.vae.config.scaling_factor / latents_std
+            else:
+                init_latents = self.vae.config.scaling_factor * init_latents
 
         if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
             # expand init_latents for batch_size