diff --git a/README.md b/README.md
index 3b9206c3..a45fe825 100644
--- a/README.md
+++ b/README.md
@@ -14,6 +14,8 @@ See our paper: [Visual ChatGPT: Talking, Drawing and Editing with V
 ## Updates:
 - Now Visual ChatGPT supports [GroundingDINO](https://github.com/IDEA-Research/GroundingDINO) and [segment-anything](https://github.com/facebookresearch/segment-anything)! Thanks **@jordddan** for his efforts. For the image editing case, `GroundingDINO` is first used to locate bounding boxes guided by given text, then `segment-anything` is used to generate the related mask, and finally stable diffusion inpainting is used to edit image based on the mask.
+  - Firstly, run `python visual_chatgpt.py --load "Text2Box_cuda:0,Segmenting_cuda:0,Inpainting_cuda:0,ImageCaptioning_cuda:0"`
+  - Then, say `find xxx in the image` or `segment xxx in the image`, where `xxx` is an object. Visual ChatGPT will return the detection or segmentation result!
 
 - Now Visual ChatGPT can support Chinese! Thanks to **@Wang-Xiaodong1899** for his efforts.
 
@@ -23,9 +25,9 @@ See our paper: [Visual ChatGPT: Talking, Drawing and Editing with V
   - A template can **invoke multiple foundation models** or even **establish a new ChatGPT session**
   - To define a **template**, simply adding a class with attributes `template_model = True`
   - Thanks to **@ShengmingYin** and **@thebestannie** for providing a template example in `InfinityOutPainting` class (see the following gif)
-    - Firstly, run `python visual_chatgpt.py --load "ImageCaptioning_cuda:0,ImageEditing_cuda:1,VisualQuestionAnswering_cuda:2"`
+    - Firstly, run `python visual_chatgpt.py --load "Inpainting_cuda:0,ImageCaptioning_cuda:0,VisualQuestionAnswering_cuda:0"`
     - Secondly, say `extend the image to 2048x1024` to Visual ChatGPT!
-    - By simply creating an `InfinityOutPainting` template, Visual ChatGPT can seamlessly extend images to any size through collaboration with existing `ImageCaptioning`, `ImageEditing`, and `VisualQuestionAnswering` foundation models, **without the need for additional training**.
+    - By simply creating an `InfinityOutPainting` template, Visual ChatGPT can seamlessly extend images to any size through collaboration with existing `ImageCaptioning`, `Inpainting`, and `VisualQuestionAnswering` foundation models, **without the need for additional training**.
 
 - **Visual ChatGPT needs the effort of the community! We crave your contribution to add new and interesting features!**

diff --git a/visual_chatgpt.py b/visual_chatgpt.py
index 595fea41..df83858d 100644
--- a/visual_chatgpt.py
+++ b/visual_chatgpt.py
@@ -1058,17 +1058,17 @@ def __init__(self, device):
         self.inpaint = StableDiffusionInpaintPipeline.from_pretrained(
             "runwayml/stable-diffusion-inpainting", revision=self.revision, torch_dtype=self.torch_dtype).to(device)
 
-    def __call__(self, prompt, original_image, mask_image):
-        update_image = self.inpaint(prompt=prompt, image=original_image.resize((512, 512)),
-                                    mask_image=mask_image.resize((512, 512))).images[0]
+    def __call__(self, prompt, image, mask_image, height=512, width=512, num_inference_steps=50):
+        update_image = self.inpaint(prompt=prompt, image=image.resize((width, height)),
+                                    mask_image=mask_image.resize((width, height)), height=height, width=width, num_inference_steps=num_inference_steps).images[0]
         return update_image
 
 
 class InfinityOutPainting:
     template_model = True # Add this line to show this is a template model.
-    def __init__(self, ImageCaptioning, ImageEditing, VisualQuestionAnswering):
+    def __init__(self, ImageCaptioning, Inpainting, VisualQuestionAnswering):
         self.llm = OpenAI(temperature=0)
         self.ImageCaption = ImageCaptioning
-        self.ImageEditing = ImageEditing
+        self.inpaint = Inpainting
         self.ImageVQA = VisualQuestionAnswering
         self.a_prompt = 'best quality, extremely detailed'
         self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \
@@ -1135,9 +1135,9 @@ def dowhile(self, original_img, tosize, expand_ratio, imagine, usr_prompt):
             temp_canvas.paste(old_img, (x, y))
             temp_mask.paste(0, (x, y, x + old_img.width, y + old_img.height))
             resized_temp_canvas, resized_temp_mask = self.resize_image(temp_canvas), self.resize_image(temp_mask)
-            image = self.ImageEditing.inpaint(prompt=prompt, image=resized_temp_canvas, mask_image=resized_temp_mask,
+            image = self.inpaint(prompt=prompt, image=resized_temp_canvas, mask_image=resized_temp_mask,
                                               height=resized_temp_canvas.height, width=resized_temp_canvas.width,
-                                              num_inference_steps=50).images[0].resize(
+                                              num_inference_steps=50).resize(
                 (temp_canvas.width, temp_canvas.height), Image.ANTIALIAS)
             image = blend_gt2pt(old_img, image)
             old_img = image
@@ -1238,7 +1238,7 @@ def inference_replace_sam(self,inputs):
             mask = self.pad_edge(mask,padding=20) #numpy
             mask_image = Image.fromarray(mask)
-            updated_image = self.inpaint(prompt=replace_with_txt, original_image=image_pil,
+            updated_image = self.inpaint(prompt=replace_with_txt, image=image_pil,
                                          mask_image=mask_image)
             updated_image_path = get_new_image_name(image_path, func_name="replace-something")
             updated_image = updated_image.resize(image_pil.size)
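
For context, here is a minimal sketch of how the refactored `Inpainting.__call__` from the hunks above might be driven on its own. Only the `Inpainting` class, its constructor, and the new call signature come from this diff; the import path, file names, and prompt are placeholder assumptions.

```python
from PIL import Image

# Minimal sketch, assuming visual_chatgpt.py is on the import path; file
# names and prompt are placeholders, not part of the patch.
from visual_chatgpt import Inpainting

inpaint = Inpainting(device="cuda:0")

image = Image.open("room.png").convert("RGB")      # image to edit
mask = Image.open("room_mask.png").convert("L")    # white = region to repaint

# The tool resizes both inputs to (width, height) internally, so the caller
# only picks the output resolution (multiples of 8 for Stable Diffusion) and
# the number of denoising steps; the old `original_image` keyword is now `image`.
result = inpaint(prompt="a wooden bookshelf",
                 image=image, mask_image=mask,
                 height=512, width=512, num_inference_steps=50)
result.save("room_inpainted.png")
```

Unifying the keyword names (`image`, `mask_image`) is what lets both `InfinityOutPainting` and `inference_replace_sam` call the same shared instance instead of each holding their own pipeline.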
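A similarly hedged sketch of the `template_model = True` convention described in the README update: a template class is constructed from tools that are already loaded, the same pattern `InfinityOutPainting` follows above with the shared `Inpainting` instance. The class name `MaskedRedraw` and its `run` method are made up for illustration and are not part of this patch.

```python
class MaskedRedraw:
    template_model = True  # Marks this class as a template built from existing tools.

    def __init__(self, Inpainting):
        # visual_chatgpt.py instantiates template classes with the already
        # loaded tools named in their __init__ signature, just as
        # InfinityOutPainting receives ImageCaptioning, Inpainting and
        # VisualQuestionAnswering above, so the diffusion pipeline is shared
        # rather than loaded twice.
        self.inpaint = Inpainting

    def run(self, image, mask, prompt):
        # Delegate to the unified Inpainting.__call__ introduced in this diff;
        # image and mask are PIL images, white mask pixels are repainted.
        return self.inpaint(prompt=prompt, image=image, mask_image=mask)
```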