From 23fbef9c5411e98c991dce90d4132827205c618d Mon Sep 17 00:00:00 2001
From: danqiao <1192787158@qq.com>
Date: Mon, 17 Apr 2023 06:26:05 +0000
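Subject: [PATCH] support GroundingDINO and segment-anything

Remove the bundled extensions/grounding_config.py. The GroundingDINO
model config is now fetched from the upstream GroundingDINO repository
into checkpoints/grounding_config.py by download_parameters() when the
file is not already present, next to the model checkpoint.

model_config_path is now assigned before download_parameters() is
called, since the download step reads it. Also add `import tempfile`
to visual_chatgpt.py.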
---
 extensions/grounding_config.py | 43 ----------------------------------
 visual_chatgpt.py              |  8 ++++---
 2 files changed, 5 insertions(+), 46 deletions(-)
 delete mode 100644 extensions/grounding_config.py

diff --git a/extensions/grounding_config.py b/extensions/grounding_config.py
deleted file mode 100644
index 9158d5f6..00000000
--- a/extensions/grounding_config.py
+++ /dev/null
@@ -1,43 +0,0 @@
-batch_size = 1
-modelname = "groundingdino"
-backbone = "swin_T_224_1k"
-position_embedding = "sine"
-pe_temperatureH = 20
-pe_temperatureW = 20
-return_interm_indices = [1, 2, 3]
-backbone_freeze_keywords = None
-enc_layers = 6
-dec_layers = 6
-pre_norm = False
-dim_feedforward = 2048
-hidden_dim = 256
-dropout = 0.0
-nheads = 8
-num_queries = 900
-query_dim = 4
-num_patterns = 0
-num_feature_levels = 4
-enc_n_points = 4
-dec_n_points = 4
-two_stage_type = "standard"
-two_stage_bbox_embed_share = False
-two_stage_class_embed_share = False
-transformer_activation = "relu"
-dec_pred_bbox_embed_share = True
-dn_box_noise_scale = 1.0
-dn_label_noise_ratio = 0.5
-dn_label_coef = 1.0
-dn_bbox_coef = 1.0
-embed_init_tgt = True
-dn_labelbook_size = 2000
-max_text_len = 256
-text_encoder_type = "bert-base-uncased"
-use_text_enhancer = True
-use_fusion_layer = True
-use_checkpoint = True
-use_transformer_ckpt = True
-use_text_cross_attention = True
-text_dropout = 0.0
-fusion_dropout = 0.0
-fusion_droppath = 0.1
-sub_sentence_present = True
diff --git a/visual_chatgpt.py b/visual_chatgpt.py
index d93755bf..82ef4a65 100644
--- a/visual_chatgpt.py
+++ b/visual_chatgpt.py
@@ -11,7 +11,7 @@
 import numpy as np
 import argparse
 import inspect
-
+import tempfile
 from transformers import CLIPSegProcessor, CLIPSegForImageSegmentation
 from transformers import pipeline, BlipProcessor, BlipForConditionalGeneration, BlipForQuestionAnswering
 from transformers import AutoImageProcessor, UperNetForSemanticSegmentation
@@ -911,17 +911,19 @@ def __init__(self, device):
         self.device = device
         self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
         self.model_checkpoint_path = os.path.join("checkpoints","groundingdino")
+        self.model_config_path = os.path.join("checkpoints","grounding_config.py")
         self.download_parameters()
         self.box_threshold = 0.3
         self.text_threshold = 0.25
-        self.model_config_path = "extensions/grounding_config.py"
         self.grounding = (self.load_model()).to(self.device)
 
     def download_parameters(self):
         url = "https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth"
         if not os.path.exists(self.model_checkpoint_path):
             wget.download(url,out=self.model_checkpoint_path)
-
+        config_url = "https://raw.githubusercontent.com/IDEA-Research/GroundingDINO/main/groundingdino/config/GroundingDINO_SwinT_OGC.py"
+        if not os.path.exists(self.model_config_path):
+            wget.download(config_url,out=self.model_config_path)
     def load_image(self,image_path):
          # load image
         image_pil = Image.open(image_path).convert("RGB")  # load image