From 306deff2ebf2f76d045e340cd418bcedc530e8f5 Mon Sep 17 00:00:00 2001 From: chaowenguo Date: Mon, 23 Dec 2024 00:21:46 -0800 Subject: [PATCH 01/10] Update pipeline_stable_diffusion_instruct_pix2pix.py --- .../pipeline_stable_diffusion_instruct_pix2pix.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py index fd89b195c778..1db5c647aeda 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py @@ -25,12 +25,18 @@ from ...loaders import IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers -from ...utils import PIL_INTERPOLATION, deprecate, logging +from ...utils import PIL_INTERPOLATION, deprecate, logging, is_torch_xla_available from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin from . import StableDiffusionPipelineOutput from .safety_checker import StableDiffusionSafetyChecker +if is_torch_xla_available(): + import torch_xla.core.xla_model as xm + + XLA_AVAILABLE = True +else: + XLA_AVAILABLE = False logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -79,6 +85,7 @@ class StableDiffusionInstructPix2PixPipeline( TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin, IPAdapterMixin, + FromSingleFileMixin, ): r""" Pipeline for pixel-level image editing by following text instructions (based on Stable Diffusion). @@ -456,6 +463,9 @@ def __call__( if callback is not None and i % callback_steps == 0: step_idx = i // getattr(self.scheduler, "order", 1) callback(step_idx, t, latents) + + if XLA_AVAILABLE: + xm.mark_step() if not output_type == "latent": image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] From 76021b8309c5ca2cbd783da0d280cc3091160111 Mon Sep 17 00:00:00 2001 From: chaowenguo Date: Mon, 23 Dec 2024 00:33:24 -0800 Subject: [PATCH 02/10] Update pipeline_stable_diffusion_instruct_pix2pix.py --- .../pipeline_stable_diffusion_instruct_pix2pix.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py index 1db5c647aeda..5aabb0a10961 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py @@ -22,7 +22,7 @@ from ...callbacks import MultiPipelineCallbacks, PipelineCallback from ...image_processor import PipelineImageInput, VaeImageProcessor -from ...loaders import IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import FromSingleFileMixin, IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import PIL_INTERPOLATION, deprecate, logging, is_torch_xla_available From a797301f9ad06271289e6d96ce87af96dd11c154 Mon Sep 17 00:00:00 2001 From: hlky Date: Mon, 23 Dec 2024 09:29:07 +0000 Subject: [PATCH 03/10] instruct-pix2pix single file --- src/diffusers/loaders/single_file_utils.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py index 6de9f0e9e638..132f44524a44 100644 --- a/src/diffusers/loaders/single_file_utils.py +++ b/src/diffusers/loaders/single_file_utils.py @@ -109,6 +109,7 @@ "autoencoder-dc-sana": "encoder.project_in.conv.bias", "mochi-1-preview": ["model.diffusion_model.blocks.0.attn.qkv_x.weight", "blocks.0.attn.qkv_x.weight"], "hunyuan-video": "txt_in.individual_token_refiner.blocks.0.adaLN_modulation.1.bias", + "instruct-pix2pix": "model.diffusion_model.input_blocks.0.0.weight", } DIFFUSERS_DEFAULT_PIPELINE_PATHS = { @@ -164,6 +165,7 @@ "autoencoder-dc-f32c32-sana": {"pretrained_model_name_or_path": "mit-han-lab/dc-ae-f32c32-sana-1.0-diffusers"}, "mochi-1-preview": {"pretrained_model_name_or_path": "genmo/mochi-1-preview"}, "hunyuan-video": {"pretrained_model_name_or_path": "hunyuanvideo-community/HunyuanVideo"}, + "instruct-pix2pix": {"pretrained_model_name_or_path": "timbrooks/instruct-pix2pix"}, } # Use to configure model sample size when original config is provided @@ -629,6 +631,12 @@ def infer_diffusers_model_type(checkpoint): elif CHECKPOINT_KEY_NAMES["hunyuan-video"] in checkpoint: model_type = "hunyuan-video" + elif ( + CHECKPOINT_KEY_NAMES["instruct-pix2pix"] in checkpoint + and checkpoint[CHECKPOINT_KEY_NAMES["instruct-pix2pix"]].shape[1] == 8 + ): + model_type = "instruct-pix2pix" + else: model_type = "v1" From 078fbdc9df40a75ca847fa181b6f35d8ce1bfc2f Mon Sep 17 00:00:00 2001 From: hlky Date: Mon, 23 Dec 2024 09:29:11 +0000 Subject: [PATCH 04/10] make --- .../pipeline_stable_diffusion_instruct_pix2pix.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py index 5aabb0a10961..af40fe14f8ab 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py @@ -25,12 +25,13 @@ from ...loaders import FromSingleFileMixin, IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers -from ...utils import PIL_INTERPOLATION, deprecate, logging, is_torch_xla_available +from ...utils import PIL_INTERPOLATION, deprecate, is_torch_xla_available, logging from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin from . import StableDiffusionPipelineOutput from .safety_checker import StableDiffusionSafetyChecker + if is_torch_xla_available(): import torch_xla.core.xla_model as xm @@ -463,7 +464,7 @@ def __call__( if callback is not None and i % callback_steps == 0: step_idx = i // getattr(self.scheduler, "order", 1) callback(step_idx, t, latents) - + if XLA_AVAILABLE: xm.mark_step() From fcc80edfcad075fd00f506440a02de98ddf221e1 Mon Sep 17 00:00:00 2001 From: chaowenguo Date: Tue, 24 Dec 2024 22:43:02 -0800 Subject: [PATCH 05/10] Update pipeline_text_to_video_zero.py --- .../text_to_video_synthesis/pipeline_text_to_video_zero.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py index c95c7f1b9625..0b92c3f6369e 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py @@ -11,7 +11,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...image_processor import VaeImageProcessor -from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import FromSingleFileMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import KarrasDiffusionSchedulers @@ -282,7 +282,7 @@ def create_motion_field_and_warp_latents(motion_field_strength_x, motion_field_s class TextToVideoZeroPipeline( - DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin + DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin, FromSingleFileMixin ): r""" Pipeline for zero-shot text-to-video generation using Stable Diffusion. From 19f48c55a7ed0fcf778bdc38a50919e49d0b1b2d Mon Sep 17 00:00:00 2001 From: chaowenguo Date: Tue, 24 Dec 2024 22:43:33 -0800 Subject: [PATCH 06/10] Update pipeline_text_to_video_zero.py --- .../text_to_video_synthesis/pipeline_text_to_video_zero.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py index 0b92c3f6369e..d06c902973e0 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py @@ -20,6 +20,12 @@ from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin from ..stable_diffusion import StableDiffusionSafetyChecker +if is_torch_xla_available(): + import torch_xla.core.xla_model as xm + + XLA_AVAILABLE = True +else: + XLA_AVAILABLE = False logger = logging.get_logger(__name__) # pylint: disable=invalid-name From 6a6050479a73bfa77f9b674de6ceb87b6f47bd81 Mon Sep 17 00:00:00 2001 From: chaowenguo Date: Tue, 24 Dec 2024 22:44:03 -0800 Subject: [PATCH 07/10] Update pipeline_text_to_video_zero.py --- .../text_to_video_synthesis/pipeline_text_to_video_zero.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py index d06c902973e0..ddb60e05a545 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py @@ -15,7 +15,7 @@ from ...models import AutoencoderKL, UNet2DConditionModel from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import KarrasDiffusionSchedulers -from ...utils import USE_PEFT_BACKEND, BaseOutput, logging, scale_lora_layers, unscale_lora_layers +from ...utils import USE_PEFT_BACKEND, is_torch_xla_available, BaseOutput, logging, scale_lora_layers, unscale_lora_layers from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin from ..stable_diffusion import StableDiffusionSafetyChecker From fe378b3b37a08d943b5cf5cbe07aad62e915037b Mon Sep 17 00:00:00 2001 From: chaowenguo Date: Tue, 24 Dec 2024 22:45:48 -0800 Subject: [PATCH 08/10] Update pipeline_text_to_video_zero.py --- .../text_to_video_synthesis/pipeline_text_to_video_zero.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py index ddb60e05a545..81473f274d07 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py @@ -446,6 +446,10 @@ def backward_loop( if callback is not None and i % callback_steps == 0: step_idx = i // getattr(self.scheduler, "order", 1) callback(step_idx, t, latents) + + if XLA_AVAILABLE: + xm.mark_step() + return latents.clone().detach() # Copied from diffusers.pipelines.stable_diffusion_k_diffusion.pipeline_stable_diffusion_k_diffusion.StableDiffusionKDiffusionPipeline.check_inputs From f518f5ff46599f303c2d195551e2a5b24d0309b5 Mon Sep 17 00:00:00 2001 From: hlky Date: Wed, 25 Dec 2024 11:58:41 +0000 Subject: [PATCH 09/10] make style --- .../pipeline_text_to_video_zero.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py index 81473f274d07..e7aa488d125d 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py @@ -15,7 +15,14 @@ from ...models import AutoencoderKL, UNet2DConditionModel from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import KarrasDiffusionSchedulers -from ...utils import USE_PEFT_BACKEND, is_torch_xla_available, BaseOutput, logging, scale_lora_layers, unscale_lora_layers +from ...utils import ( + USE_PEFT_BACKEND, + is_torch_xla_available, + BaseOutput, + logging, + scale_lora_layers, + unscale_lora_layers, +) from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin from ..stable_diffusion import StableDiffusionSafetyChecker @@ -288,7 +295,11 @@ def create_motion_field_and_warp_latents(motion_field_strength_x, motion_field_s class TextToVideoZeroPipeline( - DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin, FromSingleFileMixin + DiffusionPipeline, + StableDiffusionMixin, + TextualInversionLoaderMixin, + StableDiffusionLoraLoaderMixin, + FromSingleFileMixin, ): r""" Pipeline for zero-shot text-to-video generation using Stable Diffusion. @@ -446,10 +457,10 @@ def backward_loop( if callback is not None and i % callback_steps == 0: step_idx = i // getattr(self.scheduler, "order", 1) callback(step_idx, t, latents) - + if XLA_AVAILABLE: xm.mark_step() - + return latents.clone().detach() # Copied from diffusers.pipelines.stable_diffusion_k_diffusion.pipeline_stable_diffusion_k_diffusion.StableDiffusionKDiffusionPipeline.check_inputs From 2e56f7d3b2140644ba7143233831964902160ec8 Mon Sep 17 00:00:00 2001 From: hlky Date: Wed, 25 Dec 2024 12:02:29 +0000 Subject: [PATCH 10/10] make style --- .../text_to_video_synthesis/pipeline_text_to_video_zero.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py index e7aa488d125d..f7f5d86a0888 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py @@ -17,8 +17,8 @@ from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( USE_PEFT_BACKEND, - is_torch_xla_available, BaseOutput, + is_torch_xla_available, logging, scale_lora_layers, unscale_lora_layers, @@ -27,6 +27,7 @@ from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin from ..stable_diffusion import StableDiffusionSafetyChecker + if is_torch_xla_available(): import torch_xla.core.xla_model as xm