interactdiffusion
/

diffusers-v1-2

Diffusers

Safetensors

Model card Files Files and versions Community

jiuntian committed on Mar 13, 2024

Commit

09e5125

1 Parent(s): 6fce1d6

update pipeline

Browse files

Files changed (2) hide show

README.md +32 -0
pipeline_stable_diffusion_interactdiffusion.py +122 -14

README.md CHANGED Viewed

@@ -1,3 +1,35 @@
 ---
 license: bsd
 ---

 ---
 license: bsd
 ---
+# InteractDiffusion Diffusers Implementation
+## How to Use
+```python
+from diffusers import DiffusionPipeline
+import torch
+pipeline = DiffusionPipeline.from_pretrained(
+    "interactdiffusion/diffusers-v1-2",
+    trust_remote_code=True,
+    variant="fp16", torch_dtype=torch.float16
+)
+pipeline = pipeline.to("cuda")
+images = pipeline(
+    prompt="a person is feeding a cat",
+    interactdiffusion_subject_phrases=["person"],
+    interactdiffusion_object_phrases=["cat"],
+    interactdiffusion_action_phrases=["feeding"],
+    interactdiffusion_subject_boxes=[[0.0332, 0.1660, 0.3359, 0.7305]],
+    interactdiffusion_object_boxes=[[0.2891, 0.4766, 0.6680, 0.7930]],
+    interactdiffusion_scheduled_sampling_beta=1,
+    output_type="pil",
+    num_inference_steps=50,
+    ).images
+images[0].save('out.jpg')
+```
+For more information, please check the project homepage:

pipeline_stable_diffusion_interactdiffusion.py CHANGED Viewed

@@ -26,6 +26,7 @@ from diffusers.image_processor import VaeImageProcessor
 from diffusers.loaders import LoraLoaderMixin, TextualInversionLoaderMixin
 from diffusers.models import AutoencoderKL, UNet2DConditionModel
 from diffusers.models.attention import GatedSelfAttentionDense
 from diffusers.models.embeddings import get_fourier_embeds_from_boundingbox
 from diffusers.models.lora import adjust_lora_scale_text_encoder
 from diffusers.schedulers import KarrasDiffusionSchedulers
@@ -38,7 +39,7 @@ from diffusers.utils import (
     unscale_lora_layers,
 )
 from diffusers.utils.torch_utils import randn_tensor
-from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
 from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
 from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
@@ -46,7 +47,7 @@ from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionS
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
-class StableDiffusionInteractDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
     r"""
     Pipeline for text-to-image generation using Stable Diffusion with Interaction-to-Image Generation (InteractDiffusion).
@@ -105,17 +106,6 @@ class StableDiffusionInteractDiffusionPipeline(DiffusionPipeline, StableDiffusio
                 "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
                 " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
             )
-        # # load position_net
-        # positive_len = 768
-        # if isinstance(unet.config.cross_attention_dim, int):
-        #     positive_len = unet.config.cross_attention_dim
-        # elif isinstance(unet.config.cross_attention_dim, tuple) or isinstance(unet.config.cross_attention_dim, list):
-        #     positive_len = unet.config.cross_attention_dim[0]
-        # self.position_net = InteractDiffusionInteractionProjection(
-        #     in_dim=positive_len, out_dim=unet.config.cross_attention_dim
-        # )
         self.register_modules(
             vae=vae,
@@ -130,6 +120,125 @@ class StableDiffusionInteractDiffusionPipeline(DiffusionPipeline, StableDiffusio
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True)
         self.register_to_config(requires_safety_checker=requires_safety_checker)
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
     def _encode_prompt(
         self,
@@ -464,7 +573,6 @@ class StableDiffusionInteractDiffusionPipeline(DiffusionPipeline, StableDiffusio
                 module.enabled = enabled
     @torch.no_grad()
-    @replace_example_docstring(EXAMPLE_DOC_STRING)
     def __call__(
         self,
         prompt: Union[str, List[str]] = None,

 from diffusers.loaders import LoraLoaderMixin, TextualInversionLoaderMixin
 from diffusers.models import AutoencoderKL, UNet2DConditionModel
 from diffusers.models.attention import GatedSelfAttentionDense
+from diffusers.models.attention_processor import FusedAttnProcessor2_0
 from diffusers.models.embeddings import get_fourier_embeds_from_boundingbox
 from diffusers.models.lora import adjust_lora_scale_text_encoder
 from diffusers.schedulers import KarrasDiffusionSchedulers
     unscale_lora_layers,
 )
 from diffusers.utils.torch_utils import randn_tensor
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline
 from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
 from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+class StableDiffusionInteractDiffusionPipeline(DiffusionPipeline):
     r"""
     Pipeline for text-to-image generation using Stable Diffusion with Interaction-to-Image Generation (InteractDiffusion).
                 "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
                 " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
             )
         self.register_modules(
             vae=vae,
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True)
         self.register_to_config(requires_safety_checker=requires_safety_checker)
+    ### Backward compatibility with diffusers versions before 0.27.0, in which this class cannot inherit from the StableDiffusionMixin class
+    def enable_vae_slicing(self):
+        r"""
+        Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
+        compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
+        """
+        self.vae.enable_slicing()
+    def disable_vae_slicing(self):
+        r"""
+        Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
+        computing decoding in one step.
+        """
+        self.vae.disable_slicing()
+    def enable_vae_tiling(self):
+        r"""
+        Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
+        compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
+        processing larger images.
+        """
+        self.vae.enable_tiling()
+    def disable_vae_tiling(self):
+        r"""
+        Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
+        computing decoding in one step.
+        """
+        self.vae.disable_tiling()
+    def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
+        r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
+        The suffixes after the scaling factors represent the stages where they are being applied.
+        Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
+        that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
+        Args:
+            s1 (`float`):
+                Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
+                mitigate "oversmoothing effect" in the enhanced denoising process.
+            s2 (`float`):
+                Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
+                mitigate "oversmoothing effect" in the enhanced denoising process.
+            b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
+            b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
+        """
+        if not hasattr(self, "unet"):
+            raise ValueError("The pipeline must have `unet` for using FreeU.")
+        self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
+    def disable_freeu(self):
+        """Disables the FreeU mechanism if enabled."""
+        self.unet.disable_freeu()
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.fuse_qkv_projections
+    def fuse_qkv_projections(self, unet: bool = True, vae: bool = True):
+        """
+        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query,
+        key, value) are fused. For cross-attention modules, key and value projection matrices are fused.
+        <Tip warning={true}>
+        This API is 🧪 experimental.
+        </Tip>
+        Args:
+            unet (`bool`, defaults to `True`): To apply fusion on the UNet.
+            vae (`bool`, defaults to `True`): To apply fusion on the VAE.
+        """
+        self.fusing_unet = False
+        self.fusing_vae = False
+        if unet:
+            self.fusing_unet = True
+            self.unet.fuse_qkv_projections()
+            self.unet.set_attn_processor(FusedAttnProcessor2_0())
+        if vae:
+            if not isinstance(self.vae, AutoencoderKL):
+                raise ValueError("`fuse_qkv_projections()` is only supported for the VAE of type `AutoencoderKL`.")
+            self.fusing_vae = True
+            self.vae.fuse_qkv_projections()
+            self.vae.set_attn_processor(FusedAttnProcessor2_0())
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.unfuse_qkv_projections
+    def unfuse_qkv_projections(self, unet: bool = True, vae: bool = True):
+        """Disable QKV projection fusion if enabled.
+        <Tip warning={true}>
+        This API is 🧪 experimental.
+        </Tip>
+        Args:
+            unet (`bool`, defaults to `True`): To undo fusion on the UNet.
+            vae (`bool`, defaults to `True`): To undo fusion on the VAE.
+        """
+        if unet:
+            if not self.fusing_unet:
+                logger.warning("The UNet was not initially fused for QKV projections. Doing nothing.")
+            else:
+                self.unet.unfuse_qkv_projections()
+                self.fusing_unet = False
+        if vae:
+            if not self.fusing_vae:
+                logger.warning("The VAE was not initially fused for QKV projections. Doing nothing.")
+            else:
+                self.vae.unfuse_qkv_projections()
+                self.fusing_vae = False
+    ### end of the section
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
     def _encode_prompt(
         self,
                 module.enabled = enabled
     @torch.no_grad()
     def __call__(
         self,
         prompt: Union[str, List[str]] = None,