adaface-neurips committed · Commit bef4321
1 Parent(s): b527a08

Rename functions and variables

- lib/pipline_ConsistentID.py +22 -23
- models/insightface +1 -0
lib/pipline_ConsistentID.py
CHANGED
@@ -80,9 +80,9 @@ class ConsistentIDPipeline(StableDiffusionPipeline):
         self.id_image_processor = CLIPImageProcessor()
         self.crop_size = 512
 
-        # …
-        self.…
-        self.…
+        # face_app: FaceAnalysis object
+        self.face_app = FaceAnalysis(name="buffalo_l", root='models/insightface', providers=['CPUExecutionProvider'])
+        self.face_app.prepare(ctx_id=0, det_size=(640, 640))
 
         if not os.path.exists(consistentID_weight_path):
             ### Download pretrained models
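For reference, a minimal sketch of how a FaceAnalysis object configured like the one above is typically used (standard insightface API; the image path is a placeholder, not a file from this repo):

import numpy as np
from PIL import Image
from insightface.app import FaceAnalysis

face_app = FaceAnalysis(name="buffalo_l", root='models/insightface', providers=['CPUExecutionProvider'])
face_app.prepare(ctx_id=0, det_size=(640, 640))

# get() runs detection + recognition and returns an empty list if no face is found.
img = np.array(Image.open("subject.jpg").convert("RGB"))
faces = face_app.get(img)
if faces:
    id_embed = faces[0].normed_embedding  # 512-dim ArcFace identity embedding (numpy array)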
@@ -172,8 +172,8 @@ class ConsistentIDPipeline(StableDiffusionPipeline):
     # parsed_image_parts2 is a batched tensor of parsed_image_parts with bs=1. It only contains the facial areas of one input image.
     # clip_encoder maps image parts to image-space diffusion prompts.
     # Then the facial class token embeddings are replaced with the fused (multi_facial_embeds, prompt_embeds[class_tokens_mask]).
-    def …
-    …
+    def extract_local_facial_embeds(self, prompt_embeds, uncond_prompt_embeds, parsed_image_parts2,
+                                    facial_token_masks, valid_facial_token_idx_mask, calc_uncond=True):
 
         hidden_states = []
         uncond_hidden_states = []
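The comment above describes replacing the facial class-token embeddings inside the prompt embeddings. A minimal sketch of that masked-replacement pattern in PyTorch (shapes and mask positions are illustrative assumptions, not the repo's actual values):

import torch

prompt_embeds = torch.randn(1, 77, 768)                  # text prompt embeddings
class_tokens_mask = torch.zeros(1, 77, dtype=torch.bool)
class_tokens_mask[0, 5:9] = True                         # assume 4 facial class tokens at positions 5..8
fused_facial_embeds = torch.randn(4, 768)                # stand-in for the fused facial embeddings
prompt_embeds[class_tokens_mask] = fused_facial_embeds   # in-place replacement at the masked positions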
@@ -200,13 +200,13 @@ class ConsistentIDPipeline(StableDiffusionPipeline):
 
     @torch.inference_mode()
     # Extract OpenCLIP embeddings from the input image and map them to face prompt embeddings.
-    def …
+    def extract_global_id_embeds(self, face_image_obj, s_scale=1.0, shortcut=False):
+        clip_image_ts = self.clip_preprocessor(images=face_image_obj, return_tensors="pt").pixel_values
+        clip_image_ts = clip_image_ts.to(self.device, dtype=self.torch_dtype)
+        clip_image_embeds = self.clip_encoder(clip_image_ts, output_hidden_states=True).hidden_states[-2]
+        uncond_clip_image_embeds = self.clip_encoder(torch.zeros_like(clip_image_ts), output_hidden_states=True).hidden_states[-2]
 
-        …
-        clip_image = clip_image.to(self.device, dtype=self.torch_dtype)
-        clip_image_embeds = self.clip_encoder(clip_image, output_hidden_states=True).hidden_states[-2]
-        uncond_clip_image_embeds = self.clip_encoder(torch.zeros_like(clip_image), output_hidden_states=True).hidden_states[-2]
-        …
+        faceid_embeds = self.extract_faceid(face_image_obj)
         faceid_embeds = faceid_embeds.to(self.device, dtype=self.torch_dtype)
         # image_proj_model maps 1280-dim OpenCLIP embeddings to 768-dim face prompt embeddings.
         # clip_image_embeds are used as queries to transform faceid_embeds.
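extract_global_id_embeds() reads the penultimate hidden layer of the CLIP vision encoder via hidden_states[-2]. A self-contained sketch of that pattern with Hugging Face transformers; the checkpoint here is only a stand-in (per the comment above, the pipeline's clip_encoder is a 1280-dim OpenCLIP ViT), and the blank image is a placeholder:

import torch
from PIL import Image
from transformers import CLIPImageProcessor, CLIPVisionModel

processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14")
encoder = CLIPVisionModel.from_pretrained("openai/clip-vit-large-patch14")

image = Image.new("RGB", (512, 512))  # placeholder input
pixel_values = processor(images=image, return_tensors="pt").pixel_values

with torch.inference_mode():
    out = encoder(pixel_values, output_hidden_states=True)
penultimate = out.hidden_states[-2]   # [1, num_patches + 1, hidden_dim], the layer selected above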
@@ -222,9 +222,9 @@ class ConsistentIDPipeline(StableDiffusionPipeline):
             attn_processor.scale = scale
 
     @torch.inference_mode()
-    def extract_faceid(self, …
-        faceid_image = np.array(…
-        faces = self.…
+    def extract_faceid(self, face_image_obj):
+        faceid_image = np.array(face_image_obj)
+        faces = self.face_app.get(faceid_image)
         if faces==[]:
             faceid_embeds = torch.zeros_like(torch.empty((1, 512)))
         else:
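The else branch lies outside this hunk. One plausible completion, shown only as a hedged sketch (the helper name is hypothetical, not from the repo): wrap the first detected face's 512-dim embedding into a [1, 512] tensor, and fall back to zeros when detection fails.

import torch

def faceid_embeds_from_faces(faces):  # hypothetical helper, for illustration only
    if faces == []:
        return torch.zeros((1, 512))  # same shape as torch.zeros_like(torch.empty((1, 512)))
    return torch.from_numpy(faces[0].normed_embedding).unsqueeze(0)  # [1, 512]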
@@ -377,8 +377,7 @@ class ConsistentIDPipeline(StableDiffusionPipeline):
         self.vae = None
 
     # input_subj_image_obj: an Image object.
-    def …
-        faceid_embeds = self.extract_faceid(face_image=input_subj_image_obj)
+    def extract_double_id_prompt_embeds(self, prompt, negative_prompt, input_subj_image_obj, device, calc_uncond=True):
         face_caption = "The person has one nose, two eyes, two ears, and a mouth."
         key_parsing_mask_dict, vis_parsing_anno_color = self.extract_facemask(input_subj_image_obj)
 
@@ -403,9 +402,9 @@ class ConsistentIDPipeline(StableDiffusionPipeline):
 
         # 5. Prepare the input ID images
         # global_id_embeds: [1, 4, 768]
-        # …
+        # extract_global_id_embeds() extracts OpenCLIP embeddings from the input image and maps them to global face prompt embeddings.
         global_id_embeds, uncond_global_id_embeds = \
-            self.…
+            self.extract_global_id_embeds(face_image_obj=input_subj_image_obj, s_scale=1.0, shortcut=False)
 
         # parsed_image_parts: [5, 3, 224, 224]. 5 parts, each part is a 3-channel 224x224 image (resized by CLIP Preprocessor).
         parsed_image_parts, facial_masks, key_masked_raw_images_dict = \
@@ -423,13 +422,13 @@ class ConsistentIDPipeline(StableDiffusionPipeline):
         # text_local_id_embeds: [1, 77, 768]
         # text_local_id_embeds only differs from text_global_id_embeds on 4 tokens, and is identical
         # to text_global_id_embeds on the remaining 73 tokens.
-        # …
+        # extract_local_facial_embeds() maps parsed_image_parts2 to multi_facial_embeds, and then replaces the class tokens in prompt_embeds
         # with the fused (id_embeds, prompt_embeds[class_tokens_mask]) whose indices are specified by class_tokens_mask.
         # parsed_image_parts2: [1, 5, 3, 224, 224]
         text_local_id_embeds, uncond_text_local_id_embeds = \
-            self.…
-            …
-            …
+            self.extract_local_facial_embeds(text_embeds, uncond_text_embeds, \
+                                             parsed_image_parts2, facial_token_mask, facial_token_idx_mask,
+                                             calc_uncond=calc_uncond)
 
         # text_global_id_embeds, text_local_global_id_embeds: [1, 81, 768]
         text_global_id_embeds = torch.cat([text_embeds, global_id_embeds], dim=1)
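A quick shape check for the concatenation above, with sizes taken from the comments in the diff:

import torch

text_embeds      = torch.randn(1, 77, 768)   # text prompt embeddings
global_id_embeds = torch.randn(1, 4, 768)    # global face prompt embeddings
text_global_id_embeds = torch.cat([text_embeds, global_id_embeds], dim=1)
assert text_global_id_embeds.shape == (1, 81, 768)  # 77 text tokens + 4 ID tokens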
@@ -508,7 +507,7 @@ class ConsistentIDPipeline(StableDiffusionPipeline):
 
         # 3. Encode input prompt
         coarse_prompt_embeds, fine_prompt_embeds = \
-            self.…
+            self.extract_double_id_prompt_embeds(prompt, negative_prompt, input_subj_image_objs[0], device)
 
         # 7. Prepare timesteps
         self.scheduler.set_timesteps(num_inference_steps, device=device)
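After set_timesteps(), a denoising loop consumes the prompt embeddings. The sketch below shows only the generic diffusers pattern, not this pipeline's exact loop; the model ID, scheduler choice, and tensor sizes are placeholders:

import torch
from diffusers import DDIMScheduler, UNet2DConditionModel

unet = UNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="unet")
scheduler = DDIMScheduler.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="scheduler")

scheduler.set_timesteps(50)
latents = torch.randn(1, 4, 64, 64) * scheduler.init_noise_sigma
prompt_embeds = torch.randn(1, 81, 768)  # e.g. the fused text + ID embeddings above

with torch.inference_mode():
    for t in scheduler.timesteps:
        latent_in = scheduler.scale_model_input(latents, t)
        noise_pred = unet(latent_in, t, encoder_hidden_states=prompt_embeds).sample
        latents = scheduler.step(noise_pred, t, latents).prev_sample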
models/insightface
ADDED
@@ -0,0 +1 @@
+/home/lish/adaprompt/models/insightface