From 23fbef9c5411e98c991dce90d4132827205c618d Mon Sep 17 00:00:00 2001 From: danqiao <1192787158@qq.com> Date: Mon, 17 Apr 2023 06:26:05 +0000 Subject: [PATCH] support GroundingDINO and segment-anything --- extensions/grounding_config.py | 43 ---------------------------------- visual_chatgpt.py | 8 ++++--- 2 files changed, 5 insertions(+), 46 deletions(-) delete mode 100644 extensions/grounding_config.py diff --git a/extensions/grounding_config.py b/extensions/grounding_config.py deleted file mode 100644 index 9158d5f6..00000000 --- a/extensions/grounding_config.py +++ /dev/null @@ -1,43 +0,0 @@ -batch_size = 1 -modelname = "groundingdino" -backbone = "swin_T_224_1k" -position_embedding = "sine" -pe_temperatureH = 20 -pe_temperatureW = 20 -return_interm_indices = [1, 2, 3] -backbone_freeze_keywords = None -enc_layers = 6 -dec_layers = 6 -pre_norm = False -dim_feedforward = 2048 -hidden_dim = 256 -dropout = 0.0 -nheads = 8 -num_queries = 900 -query_dim = 4 -num_patterns = 0 -num_feature_levels = 4 -enc_n_points = 4 -dec_n_points = 4 -two_stage_type = "standard" -two_stage_bbox_embed_share = False -two_stage_class_embed_share = False -transformer_activation = "relu" -dec_pred_bbox_embed_share = True -dn_box_noise_scale = 1.0 -dn_label_noise_ratio = 0.5 -dn_label_coef = 1.0 -dn_bbox_coef = 1.0 -embed_init_tgt = True -dn_labelbook_size = 2000 -max_text_len = 256 -text_encoder_type = "bert-base-uncased" -use_text_enhancer = True -use_fusion_layer = True -use_checkpoint = True -use_transformer_ckpt = True -use_text_cross_attention = True -text_dropout = 0.0 -fusion_dropout = 0.0 -fusion_droppath = 0.1 -sub_sentence_present = True diff --git a/visual_chatgpt.py b/visual_chatgpt.py index d93755bf..82ef4a65 100644 --- a/visual_chatgpt.py +++ b/visual_chatgpt.py @@ -11,7 +11,7 @@ import numpy as np import argparse import inspect - +import tempfile from transformers import CLIPSegProcessor, CLIPSegForImageSegmentation from transformers import pipeline, BlipProcessor, BlipForConditionalGeneration, BlipForQuestionAnswering from transformers import AutoImageProcessor, UperNetForSemanticSegmentation @@ -911,17 +911,19 @@ def __init__(self, device): self.device = device self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32 self.model_checkpoint_path = os.path.join("checkpoints","groundingdino") + self.model_config_path = os.path.join("checkpoints","grounding_config.py") self.download_parameters() self.box_threshold = 0.3 self.text_threshold = 0.25 - self.model_config_path = "extensions/grounding_config.py" self.grounding = (self.load_model()).to(self.device) def download_parameters(self): url = "https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth" if not os.path.exists(self.model_checkpoint_path): wget.download(url,out=self.model_checkpoint_path) - + config_url = "https://raw.githubusercontent.com/IDEA-Research/GroundingDINO/main/groundingdino/config/GroundingDINO_SwinT_OGC.py" + if not os.path.exists(self.model_config_path): + wget.download(config_url,out=self.model_config_path) def load_image(self,image_path): # load image image_pil = Image.open(image_path).convert("RGB") # load image