From af680d6776529ba4753cb850b0b82b5ea2eabbd7 Mon Sep 17 00:00:00 2001
From: huanghaian
Date: Wed, 10 Jan 2024 15:43:17 +0800
Subject: [PATCH] add file

---
Review notes (free text, ignored when the patch is applied):
 * grit_vg_to_rec.py: `referring.instances` is now a list holding one
   {'exp', 'bbox'} dict per phrase. Previously a single dict was reused for
   every phrase, so only the last phrase survived and `instances` was a bare
   dict, unlike the list layout merge_gqavg_to_rec.py reads and writes.
 * merge_gqavg_to_rec.py: filename lookups go through a dict instead of a
   list that was rebuilt for every new image; output is unchanged.
 * Code comments are written in English.
 * NOTE(review): the blank and indented context lines of the
   merge_flickrvg_to_rec.py hunks were restored from the hunk line counts;
   confirm them against merge_vg_to_rec.py before applying. The blob ids on
   the `index` lines predate the edits listed above.

 .../mm_gdino_clip/script/grit_vg_to_rec.py    |  22 ++++
 ..._vg_to_rec.py => merge_flickrvg_to_rec.py} |   6 +-
 .../script/merge_gqavg_to_rec.py              | 123 ++++++++++++++++++++
 3 files changed, 149 insertions(+), 2 deletions(-)
 create mode 100644 projects/mm_gdino_clip/script/grit_vg_to_rec.py
 rename projects/mm_gdino_clip/script/{merge_vg_to_rec.py => merge_flickrvg_to_rec.py} (94%)
 create mode 100644 projects/mm_gdino_clip/script/merge_gqavg_to_rec.py

diff --git a/projects/mm_gdino_clip/script/grit_vg_to_rec.py b/projects/mm_gdino_clip/script/grit_vg_to_rec.py
new file mode 100644
index 00000000000..f8183e3ad29
--- /dev/null
+++ b/projects/mm_gdino_clip/script/grit_vg_to_rec.py
@@ -0,0 +1,22 @@
+import json
+import jsonlines
+
+root_path = '/mnt/workspace/zhaoxiangyu/code_new/grounding_mm_mine/grit_try/'
+grit_path = root_path + 'grit_ref_all_after_filter.jsonl'
+
+with open(grit_path, 'r') as f:
+    rec_data_list = [json.loads(line) for line in f]
+
+# Rewrite each record's `referring` list into the REC layout that the
+# merge scripts use: {'referring': {'instances': [{'exp', 'bbox'}, ...]}}.
+for data in rec_data_list:
+    # Build one fresh dict per phrase; sharing a single dict would
+    # keep only the last phrase of the record.
+    instances = [{'exp': ref['phrase'], 'bbox': ref['bbox']}
+                 for ref in data['referring']]
+    data['referring'] = {'instances': instances}
+
+out_path = root_path + 'grit_ref_all_after_filter_rec.json'
+with jsonlines.open(out_path, mode='w') as writer:
+    writer.write_all(rec_data_list)
+print(f'save to {out_path}')
diff --git a/projects/mm_gdino_clip/script/merge_vg_to_rec.py b/projects/mm_gdino_clip/script/merge_flickrvg_to_rec.py
similarity index 94%
rename from projects/mm_gdino_clip/script/merge_vg_to_rec.py
rename to projects/mm_gdino_clip/script/merge_flickrvg_to_rec.py
index 872cfdfedd4..5fd72f0f591 100644
--- a/projects/mm_gdino_clip/script/merge_vg_to_rec.py
+++ b/projects/mm_gdino_clip/script/merge_flickrvg_to_rec.py
@@ -20,6 +20,7 @@
     anno = vg_data['grounding']
     regions = anno['regions']
 
+    # Only captions that carry a single phrase are merged
     if len(regions) > 1:
         continue
 
@@ -43,6 +44,7 @@
         if not isinstance(rec_bbox[0], list):
             rec_bbox = [rec_bbox]
         rec_bbox = set([sum(r) for r in rec_bbox])
+        # Strict match
         if rec_bbox == bbox:
             if isinstance(ins['exp'], list):
                 is_same = False
@@ -60,8 +62,8 @@
             break
     num += 1
 
-print(num)
-print(in_num)
+print(num)  # 17233
+print(in_num)  # 17111
 
 out_path = root_path + 'flickr30k_separateGT_train_mergevg_rec.json'
 with jsonlines.open(out_path, mode='w') as writer:
diff --git a/projects/mm_gdino_clip/script/merge_gqavg_to_rec.py b/projects/mm_gdino_clip/script/merge_gqavg_to_rec.py
new file mode 100644
index 00000000000..297ba3d3c0a
--- /dev/null
+++ b/projects/mm_gdino_clip/script/merge_gqavg_to_rec.py
@@ -0,0 +1,123 @@
+import json
+import jsonlines
+import re
+import tqdm
+
+root_path = '/home/PJLAB/huanghaian/dataset/gqa/'
+rec_path = root_path + 'gqa_rec.json'
+vg_path = root_path + 'final_mixed_train_no_coco_vg.json'
+
+with open(rec_path, 'r') as f:
+    rec_data_list = [json.loads(line) for line in f]
+
+# Map filename -> position in rec_data_list so membership tests and lookups
+# are O(1). setdefault keeps the first occurrence, matching list.index().
+rec_index_by_name = {}
+for i, data in enumerate(rec_data_list):
+    rec_index_by_name.setdefault(data['filename'], i)
+
+with open(vg_path, 'r') as f:
+    vg_data_list = [json.loads(line) for line in f]
+
+
+def split_sentence(sentence):
+    """Split ``sentence`` into pieces that each end with '?' or '.'.
+
+    Text that follows the last '?' or '.' has no delimiter to pair with and
+    is dropped.
+    """
+    pattern = r'([?.])'  # capturing group: re.split keeps each delimiter
+    sentences = re.split(pattern, sentence)
+    sentences = [s.strip() + p for s, p in zip(sentences[0::2], sentences[1::2])]
+    return sentences
+
+
+num = 0
+in_num = 0
+new_results = []
+
+for vg_data in tqdm.tqdm(vg_data_list):
+    filename = vg_data['filename']
+    anno = vg_data['grounding']
+    regions = anno['regions']
+    all_phrase = [r['phrase'] for r in regions]
+    caption = anno['caption']
+    # Split the caption into sentences at '?' / '.'
+    caption_list = split_sentence(caption)
+
+    for caption in caption_list:
+        if caption.endswith('?'):  # questions are discarded
+            continue
+        count = 0
+        for i, p in enumerate(all_phrase):
+            # A list-valued phrase stops the scan of the remaining phrases
+            if isinstance(p, list):
+                break
+            # Count how many phrases occur in this sentence
+            if p in caption:
+                index = i
+                count += 1
+        # Keep the sentence only when exactly one phrase occurs in it
+        if count != 1:
+            continue
+        num += 1
+
+        # The single matching region provides the bbox for this sentence
+        data = regions[index]
+        new_results.append({'bbox': data['bbox'], 'exp': caption, 'filename': filename, 'height': vg_data['height'],
+                            'width': vg_data['width']})
+
+print(num)  # 989203
+print(len(new_results), new_results[0])
+
+new_image = 0
+for new in tqdm.tqdm(new_results):
+    filename = new.pop('filename')
+    width = new.pop('width')
+    height = new.pop('height')
+    bbox = new['bbox']
+    caption = new['exp']
+    if not isinstance(bbox[0], list):
+        bbox = [bbox]
+    # Fingerprint of the box(es): the set of per-box element sums
+    new_bbox = set([sum(r) for r in bbox])
+
+    index = rec_index_by_name.get(filename)
+    if index is None:
+        # Image not in the REC list yet: append it and register its position
+        new_image += 1
+        rec_index_by_name[filename] = len(rec_data_list)
+        rec_data_list.append({'filename': filename, 'width': width, 'height': height,
+                              'referring': {'instances': [{'bbox': new['bbox'], 'exp': new['exp']}]}})
+    else:
+        rec_data = rec_data_list[index]
+        anno = rec_data.get('referring', {})
+        for ins in anno.get('instances', []):
+            rec_bbox = ins['bbox']
+            if not isinstance(rec_bbox[0], list):
+                rec_bbox = [rec_bbox]
+            rec_bbox = set([sum(r) for r in rec_bbox])
+            # Very strict matching so that no wrong pairing slips in
+            if rec_bbox == new_bbox:
+                if isinstance(ins['exp'], list):
+                    is_same = False
+                    for exp in ins['exp']:
+                        if exp.lower() == caption.lower():
+                            is_same = True
+                            break
+                    if not is_same:
+                        in_num += 1
+                        ins['exp'].append(caption)
+                else:
+                    if ins['exp'].lower() != caption.lower():
+                        in_num += 1
+                        ins['exp'] = [ins['exp'], caption]
+                break
+
+print(in_num)  # 47266
+print(new_image)  # 12052
+
+out_path = root_path + 'gqa_mergevg_rec.json'
+with jsonlines.open(out_path, mode='w') as writer:
+    writer.write_all(rec_data_list)
+print(f'save to {out_path}')