From af680d6776529ba4753cb850b0b82b5ea2eabbd7 Mon Sep 17 00:00:00 2001
From: huanghaian <huanghaian@sensetime.com>
Date: Wed, 10 Jan 2024 15:43:17 +0800
Subject: [PATCH] add file

---
 .../mm_gdino_clip/script/grit_vg_to_rec.py    |  22 ++++
 ..._vg_to_rec.py => merge_flickrvg_to_rec.py} |   6 +-
 .../script/merge_gqavg_to_rec.py              | 112 ++++++++++++++++++
 3 files changed, 138 insertions(+), 2 deletions(-)
 create mode 100644 projects/mm_gdino_clip/script/grit_vg_to_rec.py
 rename projects/mm_gdino_clip/script/{merge_vg_to_rec.py => merge_flickrvg_to_rec.py} (94%)
 create mode 100644 projects/mm_gdino_clip/script/merge_gqavg_to_rec.py

diff --git a/projects/mm_gdino_clip/script/grit_vg_to_rec.py b/projects/mm_gdino_clip/script/grit_vg_to_rec.py
new file mode 100644
index 00000000000..f8183e3ad29
--- /dev/null
+++ b/projects/mm_gdino_clip/script/grit_vg_to_rec.py
@@ -0,0 +1,22 @@
+import json
+import jsonlines
+
+root_path = '/mnt/workspace/zhaoxiangyu/code_new/grounding_mm_mine/grit_try/'
+grit_path = root_path + 'grit_ref_all_after_filter.jsonl'
+
+# the input is read as JSON Lines: one image record per line
+with open(grit_path, 'r') as f:
+    rec_data_list = [json.loads(line) for line in f]
+
+# Rewrite each record's 'referring' list into the REC layout
+# {'instances': [{'exp': ..., 'bbox': ...}, ...]} with one entry per phrase.
+# Every phrase gets its own dict so that no phrase overwrites another.
+for data in rec_data_list:
+    data['referring'] = {
+        'instances': [{'exp': ref['phrase'], 'bbox': ref['bbox']} for ref in data['referring']]
+    }
+
+out_path = root_path + 'grit_ref_all_after_filter_rec.json'
+with jsonlines.open(out_path, mode='w') as writer:
+    writer.write_all(rec_data_list)
+print(f'save to {out_path}')
diff --git a/projects/mm_gdino_clip/script/merge_vg_to_rec.py b/projects/mm_gdino_clip/script/merge_flickrvg_to_rec.py
similarity index 94%
rename from projects/mm_gdino_clip/script/merge_vg_to_rec.py
rename to projects/mm_gdino_clip/script/merge_flickrvg_to_rec.py
index 872cfdfedd4..5fd72f0f591 100644
--- a/projects/mm_gdino_clip/script/merge_vg_to_rec.py
+++ b/projects/mm_gdino_clip/script/merge_flickrvg_to_rec.py
@@ -20,6 +20,7 @@
     anno = vg_data['grounding']
     regions = anno['regions']
 
+    # keep only captions that have a single phrase
     if len(regions) > 1:
         continue
 
@@ -43,6 +44,7 @@
         if not isinstance(rec_bbox[0], list):
             rec_bbox = [rec_bbox]
         rec_bbox = set([sum(r) for r in rec_bbox])
+        # strict matching
         if rec_bbox == bbox:
             if isinstance(ins['exp'], list):
                 is_same = False
@@ -60,8 +62,8 @@
             break
     num += 1
 
-print(num)
-print(in_num)
+print(num)  # 17233
+print(in_num)  # 17111
 
 out_path = root_path + 'flickr30k_separateGT_train_mergevg_rec.json'
 with jsonlines.open(out_path, mode='w') as writer:
diff --git a/projects/mm_gdino_clip/script/merge_gqavg_to_rec.py b/projects/mm_gdino_clip/script/merge_gqavg_to_rec.py
new file mode 100644
index 00000000000..297ba3d3c0a
--- /dev/null
+++ b/projects/mm_gdino_clip/script/merge_gqavg_to_rec.py
@@ -0,0 +1,112 @@
+import json
+import jsonlines
+import re
+import tqdm
+
+root_path = '/home/PJLAB/huanghaian/dataset/gqa/'
+rec_path = root_path + 'gqa_rec.json'
+vg_path = root_path + 'final_mixed_train_no_coco_vg.json'
+# both inputs are parsed line by line, i.e. one JSON object per line
+with open(rec_path, 'r') as f:
+    rec_data_list = [json.loads(line) for line in f]
+# filenames in record order, used further down to locate an image's record
+rec_data_list_name = [data['filename'] for data in rec_data_list]
+# grounding annotations that will be merged into the records loaded above
+with open(vg_path, 'r') as f:
+    vg_data_list = [json.loads(line) for line in f]
+
+
+def split_sentence(sentence):
+    """Split text after each '?' or '.', keeping the mark; an unterminated tail is dropped."""
+    # the capture group makes re.split return text and delimiters alternately
+    pieces = re.split(r'([?.])', sentence)
+    return [text.strip() + mark for text, mark in zip(pieces[::2], pieces[1::2])]
+
+
+num = 0
+in_num = 0
+new_results = []
+# Collect (caption sentence, region) pairs from the grounding annotations.
+for vg_data in tqdm.tqdm(vg_data_list):
+    filename = vg_data['filename']
+    anno = vg_data['grounding']
+    regions = anno['regions']
+    all_phrase = [r['phrase'] for r in regions]
+    caption = anno['caption']
+    # split the caption into sentences at each delimiter
+    caption_list = split_sentence(caption)
+
+    for caption in caption_list:
+        if caption.endswith('?'):  # questions are dropped
+            continue
+        count = 0
+        for i, p in enumerate(all_phrase):
+            # a list-valued phrase stops the scan; NOTE(review): later phrases go unchecked, confirm intended
+            if isinstance(p, list):
+                break
+            # count the phrases contained in this sentence; more than one match rejects it
+            if p in caption:
+                index = i
+                count += 1
+        if count > 1 or count == 0:
+            continue
+        num += 1
+
+        # keep only sentences that contain exactly one phrase
+        data = regions[index]
+        new_results.append({'bbox': data['bbox'], 'exp': caption, 'filename': filename, 'height': vg_data['height'],
+                            'width': vg_data['width']})
+
+print(num)  # 989203
+print(len(new_results), new_results[0])
+
+# Merge stage: attach each collected caption to the REC record of its image.
+new_image = 0
+# filename -> position of its first record; replaces repeated O(n) list scans
+name_to_index = {}
+for i, name in enumerate(rec_data_list_name):
+    name_to_index.setdefault(name, i)
+
+for new in tqdm.tqdm(new_results):
+    filename = new.pop('filename')
+    width = new.pop('width')
+    height = new.pop('height')
+    bbox = new['bbox']
+    caption = new['exp']
+    if not isinstance(bbox[0], list):
+        bbox = [bbox]
+    # a box is identified by the sum of its coordinates
+    new_bbox = {sum(r) for r in bbox}
+
+    if filename not in name_to_index:
+        # unseen image: create a record holding this single instance
+        new_image += 1
+        name_to_index[filename] = len(rec_data_list)
+        rec_data_list_name.append(filename)
+        rec_data_list.append({'filename': filename, 'width': width, 'height': height,
+                              'referring': {'instances': [{'bbox': new['bbox'], 'exp': new['exp']}]}})
+        continue
+
+    rec_data = rec_data_list[name_to_index[filename]]
+    for ins in rec_data.get('referring', {}).get('instances', []):
+        rec_bbox = ins['bbox']
+        if not isinstance(rec_bbox[0], list):
+            rec_bbox = [rec_bbox]
+        # very strict matching: only an instance with the same box set is extended
+        if {sum(r) for r in rec_bbox} != new_bbox:
+            continue
+        known = ins['exp'] if isinstance(ins['exp'], list) else [ins['exp']]
+        if all(exp.lower() != caption.lower() for exp in known):  # new expression only
+            in_num += 1
+            known.append(caption)
+            ins['exp'] = known
+        break
+
+print(in_num)  # 47266
+print(new_image)  # 12052
+
+out_path = root_path + 'gqa_mergevg_rec.json'
+# the output is written as JSON Lines, one image record per line
+with jsonlines.open(out_path, mode='w') as writer:
+    writer.write_all(rec_data_list)
+print(f'save to {out_path}')