From faaa22e77952de16c1b94ee6b781d789f7c27663 Mon Sep 17 00:00:00 2001 From: SkalskiP Date: Fri, 1 Dec 2023 14:07:58 +0100 Subject: [PATCH 1/2] Update README and refine segmentation functionality Updated the README to fix a typographical error and added another task to the upcoming roadmap. Made adjustments to the segmentation method in 'sam.py' to handle mask-guided segmentation and improved the sorting logic in 'mask.py'. --- README.md | 5 ++-- maestro/markers/sam.py | 43 ++++++++++++++++++++++++++++------ maestro/postprocessing/mask.py | 5 ++-- 3 files changed, 40 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 54ae80e..dda763e 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ Multimodal-Maestro gives you more control over large multimodal models to get the outputs you want. With more effective prompting tactics, you can get multimodal models to do tasks you didn't know (or think!) were possible. Curious how it works? Try our -HF [space](https://huggingface.co/spaces/Roboflow/SoM)! +[HF space](https://huggingface.co/spaces/Roboflow/SoM)! 🚧 The project is still under construction, and the API is prone to change. @@ -109,10 +109,9 @@ Find dog. ## 🚧 roadmap - [ ] Rewriting the `maestro` API. +- [ ] Update [HF space](https://huggingface.co/spaces/Roboflow/SoM). - [ ] Documentation page. - [ ] Add GroundingDINO prompting strategy. -- [ ] Segment Anything guided marks generation. -- [ ] Non-Max Suppression marks refinement. - [ ] CovVLM demo. - [ ] Qwen-VL demo. diff --git a/maestro/markers/sam.py b/maestro/markers/sam.py index 69183f8..99a0cfb 100644 --- a/maestro/markers/sam.py +++ b/maestro/markers/sam.py @@ -3,7 +3,7 @@ import supervision as sv from PIL import Image from transformers import pipeline, SamModel, SamProcessor, SamImageProcessor -from typing import Union +from typing import Optional from maestro.postprocessing.mask import masks_to_marks @@ -15,30 +15,59 @@ class SegmentAnythingMarkGenerator: Parameters: device (str): The device to run the model on (e.g., 'cpu', 'cuda'). model_name (str): The name of the model to be loaded. Defaults to - 'facebook/sam-vit-huge'. + 'facebook/sam-vit-huge'. """ def __init__(self, device: str = 'cpu', model_name: str = "facebook/sam-vit-huge"): self.model = SamModel.from_pretrained(model_name).to(device) self.processor = SamProcessor.from_pretrained(model_name) self.image_processor = SamImageProcessor.from_pretrained(model_name) + self.device = device self.pipeline = pipeline( task="mask-generation", model=self.model, image_processor=self.image_processor, - device=device) + device=self.device) - def generate(self, image: np.ndarray) -> sv.Detections: + def generate( + self, + image: np.ndarray, + mask: Optional[np.ndarray] = None + ) -> sv.Detections: """ Generate image segmentation marks. Parameters: image (np.ndarray): The image to be marked in BGR format. + mask: (Optional[np.ndarray]): The mask to be used as a guide for + segmentation. Returns: sv.Detections: An object containing the segmentation masks and their corresponding bounding box coordinates. """ image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB)) - outputs = self.pipeline(image, points_per_batch=64) - masks = np.array(outputs['masks']) - return masks_to_marks(masks=masks) + if mask is None: + outputs = self.pipeline(image, points_per_batch=64) + masks = np.array(outputs['masks']) + return masks_to_marks(masks=masks) + else: + inputs = self.processor(image, return_tensors="pt").to(self.device) + image_embeddings = self.model.get_image_embeddings(inputs.pixel_values) + masks = [] + for polygon in sv.mask_to_polygons(mask.astype(bool)): + indexes = np.random.choice(a=polygon.shape[0], size=5, replace=True) + input_points = polygon[indexes] + inputs = self.processor( + images=image, + input_points=[[input_points]], + return_tensors="pt" + ).to(self.device) + outputs = self.model(image_embeddings=image_embeddings, **inputs) + mask = self.processor.image_processor.post_process_masks( + masks=outputs.pred_masks.cpu().detach(), + original_sizes=inputs["original_sizes"].cpu().detach(), + reshaped_input_sizes=inputs["reshaped_input_sizes"].cpu().detach() + )[0][0][0].numpy() + masks.append(mask) + masks = np.array(masks) + return masks_to_marks(masks=masks) diff --git a/maestro/postprocessing/mask.py b/maestro/postprocessing/mask.py index 7f09b40..b028495 100644 --- a/maestro/postprocessing/mask.py +++ b/maestro/postprocessing/mask.py @@ -78,9 +78,8 @@ def mask_non_max_suppression( overlapping_masks = iou_matrix[sorted_idx[i]] > iou_threshold overlapping_masks[sorted_idx[i]] = False - keep_mask[sorted_idx] = np.logical_and( - keep_mask[sorted_idx], - ~overlapping_masks) + overlapping_indices = np.where(overlapping_masks)[0] + keep_mask[sorted_idx[overlapping_indices]] = False return masks[keep_mask] From 392f4242defb42c6e492a051a23b17fbb430068f Mon Sep 17 00:00:00 2001 From: SkalskiP Date: Fri, 1 Dec 2023 14:44:09 +0100 Subject: [PATCH 2/2] Bump version and add ability to guide segmentation with mask. --- maestro/markers/sam.py | 1 + pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/maestro/markers/sam.py b/maestro/markers/sam.py index 99a0cfb..17a2566 100644 --- a/maestro/markers/sam.py +++ b/maestro/markers/sam.py @@ -62,6 +62,7 @@ def generate( input_points=[[input_points]], return_tensors="pt" ).to(self.device) + del inputs["pixel_values"] outputs = self.model(image_embeddings=image_embeddings, **inputs) mask = self.processor.image_processor.post_process_masks( masks=outputs.pred_masks.cpu().detach(), diff --git a/pyproject.toml b/pyproject.toml index 5104d3c..26a2983 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "maestro" -version = "0.1.0" +version = "0.1.1rc1" description = "Visual Prompting for Large Multimodal Models (LMMs)" authors = ["Piotr Skalski "] maintainers = ["Piotr Skalski "]