Commit

Merge pull request chenfei-wu#363 from Wang-Xiaodong1899/main
fix SAM and InfinityOutPainting
chenfei-wu authored Apr 19, 2023
2 parents ee529b0 + 0dae1c9 commit 233ea1b
Showing 2 changed files with 12 additions and 10 deletions.
README.md: 6 changes (4 additions, 2 deletions)
@@ -14,6 +14,8 @@ See our paper: Visual ChatGPT: Talking, Drawing and Editing with Visual Foundation Models

## Updates:
- Now Visual ChatGPT supports [GroundingDINO](https://github.com/IDEA-Research/GroundingDINO) and [segment-anything](https://github.com/facebookresearch/segment-anything)! Thanks **@jordddan** for his efforts. For the image editing case, `GroundingDINO` is first used to locate bounding boxes guided by the given text, then `segment-anything` is used to generate the related mask, and finally Stable Diffusion inpainting is used to edit the image based on the mask.
+ - Firstly, run `python visual_chatgpt.py --load "Text2Box_cuda:0,Segmenting_cuda:0,Inpainting_cuda:0,ImageCaptioning_cuda:0"`
+ - Then, say `find xxx in the image` or `segment xxx in the image`, where `xxx` is an object. Visual ChatGPT will return the detection or segmentation result!


- Now Visual ChatGPT can support Chinese! Thanks to **@Wang-Xiaodong1899** for his efforts.
@@ -23,9 +25,9 @@ See our paper: Visual ChatGPT: Talking, Drawing and Editing with Visual Foundation Models
- A template can **invoke multiple foundation models** or even **establish a new ChatGPT session**
- To define a **template**, simply add a class with the attribute `template_model = True`
- Thanks to **@ShengmingYin** and **@thebestannie** for providing a template example in the `InfinityOutPainting` class (see the following gif)
- Firstly, run `python visual_chatgpt.py --load "ImageCaptioning_cuda:0,ImageEditing_cuda:1,VisualQuestionAnswering_cuda:2"`
- Firstly, run `python visual_chatgpt.py --load "Inpainting_cuda:0,ImageCaptioning_cuda:0,VisualQuestionAnswering_cuda:0"`
- Secondly, say `extend the image to 2048x1024` to Visual ChatGPT!
- - By simply creating an `InfinityOutPainting` template, Visual ChatGPT can seamlessly extend images to any size through collaboration with existing `ImageCaptioning`, `ImageEditing`, and `VisualQuestionAnswering` foundation models, **without the need for additional training**.
+ - By simply creating an `InfinityOutPainting` template, Visual ChatGPT can seamlessly extend images to any size through collaboration with existing `ImageCaptioning`, `Inpainting`, and `VisualQuestionAnswering` foundation models, **without the need for additional training**.
- **Visual ChatGPT needs the effort of the community! We crave your contribution to add new and interesting features!**
<img src="./assets/demo_inf.gif" width="750">

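To make the README's editing flow concrete, here is a minimal, hedged sketch of the pipeline it describes (text query, then a GroundingDINO box, then a segment-anything mask, then Stable Diffusion inpainting). It is not the repository's code: `detect_box` and `box_to_mask` are hypothetical placeholders for the Text2Box and Segmenting tools loaded by the `--load` command above; only the `diffusers` call mirrors the `Inpainting` class changed below.

```python
# Hypothetical sketch of the text-guided editing flow, not the repo's classes.
import torch
from PIL import Image
from diffusers import StableDiffusionInpaintPipeline


def detect_box(image: Image.Image, text: str) -> tuple:
    """Placeholder for GroundingDINO (Text2Box): return an (x0, y0, x1, y1) box for `text`."""
    raise NotImplementedError


def box_to_mask(image: Image.Image, box: tuple) -> Image.Image:
    """Placeholder for segment-anything (Segmenting): return an L-mode mask, 255 = region to edit."""
    raise NotImplementedError


def edit_with_text(image: Image.Image, find_text: str, replace_text: str) -> Image.Image:
    box = detect_box(image, find_text)          # 1. locate the object from text
    mask = box_to_mask(image, box)              # 2. turn the box into a pixel mask
    pipe = StableDiffusionInpaintPipeline.from_pretrained(
        "runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16).to("cuda")
    out = pipe(prompt=replace_text,             # 3. repaint only the masked region
               image=image.resize((512, 512)),
               mask_image=mask.resize((512, 512))).images[0]
    return out.resize(image.size)
```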
visual_chatgpt.py: 16 changes (8 additions, 8 deletions)
@@ -1058,17 +1058,17 @@ def __init__(self, device):

self.inpaint = StableDiffusionInpaintPipeline.from_pretrained(
"runwayml/stable-diffusion-inpainting", revision=self.revision, torch_dtype=self.torch_dtype).to(device)
- def __call__(self, prompt, original_image, mask_image):
- update_image = self.inpaint(prompt=prompt, image=original_image.resize((512, 512)),
- mask_image=mask_image.resize((512, 512))).images[0]
+ def __call__(self, prompt, image, mask_image, height=512, width=512, num_inference_steps=50):
+ update_image = self.inpaint(prompt=prompt, image=image.resize((width, height)),
+ mask_image=mask_image.resize((width, height)), height=height, width=width, num_inference_steps=num_inference_steps).images[0]
return update_image

class InfinityOutPainting:
template_model = True # Add this line to show this is a template model.
- def __init__(self, ImageCaptioning, ImageEditing, VisualQuestionAnswering):
+ def __init__(self, ImageCaptioning, Inpainting, VisualQuestionAnswering):
self.llm = OpenAI(temperature=0)
self.ImageCaption = ImageCaptioning
- self.ImageEditing = ImageEditing
+ self.inpaint = Inpainting
self.ImageVQA = VisualQuestionAnswering
self.a_prompt = 'best quality, extremely detailed'
self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \
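The hunk above changes the `Inpainting` wrapper's contract: `__call__` now takes `image=` instead of `original_image=`, accepts the output size and step count, and still returns a plain `PIL.Image` because it unwraps `.images[0]` internally. That is why the call-site fixes further down drop both the old keyword and the extra indexing. A hedged usage sketch, assuming the class can be imported from `visual_chatgpt.py` as-is and a CUDA device is available:

```python
# Hedged usage sketch for the updated Inpainting wrapper; file names are assumptions.
from PIL import Image
from visual_chatgpt import Inpainting  # assumes this module is importable unchanged

inpaint_tool = Inpainting(device="cuda:0")

canvas = Image.open("room.png").convert("RGB")   # hypothetical input image
mask = Image.new("L", canvas.size, 0)            # 0 = keep original pixels
mask.paste(255, (256, 256, 512, 512))            # 255 = region to repaint

# height/width go straight to the diffusers pipeline, so keep them multiples of 8.
result = inpaint_tool(prompt="a wooden bookshelf, best quality, extremely detailed",
                      image=canvas,
                      mask_image=mask,
                      height=512, width=512,
                      num_inference_steps=50)
# The wrapper already returns a PIL.Image, so no .images[0] is needed by the caller.
result.save("room_inpainted.png")
```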
@@ -1135,9 +1135,9 @@ def dowhile(self, original_img, tosize, expand_ratio, imagine, usr_prompt):
temp_canvas.paste(old_img, (x, y))
temp_mask.paste(0, (x, y, x + old_img.width, y + old_img.height))
resized_temp_canvas, resized_temp_mask = self.resize_image(temp_canvas), self.resize_image(temp_mask)
- image = self.ImageEditing.inpaint(prompt=prompt, image=resized_temp_canvas, mask_image=resized_temp_mask,
+ image = self.inpaint(prompt=prompt, image=resized_temp_canvas, mask_image=resized_temp_mask,
height=resized_temp_canvas.height, width=resized_temp_canvas.width,
- num_inference_steps=50).images[0].resize(
+ num_inference_steps=50).resize(
(temp_canvas.width, temp_canvas.height), Image.ANTIALIAS)
image = blend_gt2pt(old_img, image)
old_img = image
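For readers following the `dowhile` hunk above: each iteration pastes the current image onto a larger canvas, masks out the pasted region, inpaints the border, blends, and repeats until the target size is reached. A simplified, hedged sketch of a single step is below; it substitutes a plain paste for the repo's `blend_gt2pt` and invents the `expand_pixels` border width.

```python
# Simplified sketch of one outpainting step; not the repo's exact dowhile body.
from PIL import Image


def expand_once(old_img: Image.Image, inpaint_tool, prompt: str,
                expand_pixels: int = 128) -> Image.Image:
    new_w, new_h = old_img.width + 2 * expand_pixels, old_img.height + 2 * expand_pixels
    canvas = Image.new("RGB", (new_w, new_h))
    mask = Image.new("L", (new_w, new_h), 255)                      # 255 = repaint
    x, y = expand_pixels, expand_pixels
    canvas.paste(old_img, (x, y))
    mask.paste(0, (x, y, x + old_img.width, y + old_img.height))    # 0 = keep the original
    work_w, work_h = (new_w // 8) * 8, (new_h // 8) * 8             # pipeline wants multiples of 8
    out = inpaint_tool(prompt=prompt,
                       image=canvas.resize((work_w, work_h)),
                       mask_image=mask.resize((work_w, work_h)),
                       height=work_h, width=work_w,
                       num_inference_steps=50)
    out = out.resize((new_w, new_h))
    out.paste(old_img, (x, y))   # crude stand-in for blend_gt2pt: keep known pixels intact
    return out
```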
@@ -1238,7 +1238,7 @@ def inference_replace_sam(self,inputs):
mask = self.pad_edge(mask,padding=20) #numpy
mask_image = Image.fromarray(mask)

- updated_image = self.inpaint(prompt=replace_with_txt, original_image=image_pil,
+ updated_image = self.inpaint(prompt=replace_with_txt, image=image_pil,
mask_image=mask_image)
updated_image_path = get_new_image_name(image_path, func_name="replace-something")
updated_image = updated_image.resize(image_pil.size)
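The last hunk applies the same contract fix to `inference_replace_sam`: the padded SAM mask is converted to a `PIL` image and passed to the wrapper with the new `image=` keyword, and because the wrapper already returns a `PIL.Image`, the caller only resizes it back to the input size. A hedged sketch of that call path, with a hypothetical numpy mask standing in for the SAM output:

```python
# Hedged sketch of the corrected replace-something call path; the mask below is a
# hypothetical stand-in for the padded SAM segmentation, and Inpainting is assumed
# importable as in the sketches above.
import numpy as np
from PIL import Image
from visual_chatgpt import Inpainting

inpaint_tool = Inpainting(device="cuda:0")

image_pil = Image.open("photo.png").convert("RGB")        # hypothetical input
mask = np.zeros((image_pil.height, image_pil.width), dtype=np.uint8)
mask[100:300, 150:350] = 255                               # pretend SAM marked this region

mask_image = Image.fromarray(mask)
updated_image = inpaint_tool(prompt="a red sports car",    # replace_with_txt in the repo
                             image=image_pil,              # new keyword (was original_image=)
                             mask_image=mask_image)
updated_image = updated_image.resize(image_pil.size)       # wrapper output defaults to 512x512
updated_image.save("photo_replaced.png")
```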
