Skip to content

Commit

Permalink
add file
Browse files Browse the repository at this point in the history
  • Loading branch information
hhaAndroid committed Jan 10, 2024
1 parent 9a226a6 commit af680d6
Show file tree
Hide file tree
Showing 3 changed files with 138 additions and 2 deletions.
22 changes: 22 additions & 0 deletions projects/mm_gdino_clip/script/grit_vg_to_rec.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import json
import jsonlines

root_path = '/mnt/workspace/zhaoxiangyu/code_new/grounding_mm_mine/grit_try/'
grit_path = root_path + 'grit_ref_all_after_filter.jsonl'
out_path = root_path + 'grit_ref_all_after_filter_rec.json'


def to_rec_instances(referring):
    """Convert GRIT ``referring`` entries into REC ``instances``.

    Args:
        referring: iterable of dicts, each providing ``'phrase'`` and
            ``'bbox'``.

    Returns:
        A list with one ``{'exp': phrase, 'bbox': bbox}`` dict per entry,
        in the original order. An empty input yields an empty list.
    """
    # One fresh dict per entry: reusing a single dict would keep only the
    # last referring expression, and downstream REC scripts iterate
    # ``instances`` as a list of dicts.
    return [{'exp': ref['phrase'], 'bbox': ref['bbox']} for ref in referring]


def main():
    """Rewrite the GRIT jsonl annotations into the REC layout."""
    with open(grit_path, 'r') as f:
        rec_data_list = [json.loads(line) for line in f]

    for data in rec_data_list:
        data['referring'] = {'instances': to_rec_instances(data['referring'])}

    # The output is written as JSON lines (one record per line) even though
    # the file name ends with ``.json``.
    with jsonlines.open(out_path, mode='w') as writer:
        writer.write_all(rec_data_list)
    print(f'save to {out_path}')


if __name__ == '__main__':
    main()
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
anno = vg_data['grounding']
regions = anno['regions']

# Each caption should have only one phrase; captions with several are skipped
if len(regions) > 1:
continue

Expand All @@ -43,6 +44,7 @@
if not isinstance(rec_bbox[0], list):
rec_bbox = [rec_bbox]
rec_bbox = set([sum(r) for r in rec_bbox])
# Strict match
if rec_bbox == bbox:
if isinstance(ins['exp'], list):
is_same = False
Expand All @@ -60,8 +62,8 @@
break
num += 1

print(num)
print(in_num)
print(num) # 17233
print(in_num) # 17111

out_path = root_path + 'flickr30k_separateGT_train_mergevg_rec.json'
with jsonlines.open(out_path, mode='w') as writer:
Expand Down
112 changes: 112 additions & 0 deletions projects/mm_gdino_clip/script/merge_gqavg_to_rec.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
import json
import jsonlines
import re
import tqdm

# Input locations for the GQA REC annotations and the mixed VG annotations.
root_path = '/home/PJLAB/huanghaian/dataset/gqa/'
rec_path = root_path + 'gqa_rec.json'
vg_path = root_path + 'final_mixed_train_no_coco_vg.json'

# Both inputs are parsed line by line: every line is decoded as one JSON
# document.
with open(rec_path, 'r') as rec_file:
    rec_data_list = list(map(json.loads, rec_file))

# Filenames of the REC records, in the same order as ``rec_data_list``.
rec_data_list_name = [record['filename'] for record in rec_data_list]

with open(vg_path, 'r') as vg_file:
    vg_data_list = list(map(json.loads, vg_file))


def split_sentence(sentence):
    """Split *sentence* into pieces that each end with ``?`` or ``.``.

    Every piece is the stripped text in front of a terminator followed by
    that terminator. Text after the last terminator is not returned, so a
    string without any ``?`` or ``.`` gives an empty list.
    """
    # The capturing group keeps the terminators, so the split result
    # alternates: text, terminator, text, terminator, ..., trailing text.
    pieces = re.split(r'([?.])', sentence)
    merged = []
    for pos in range(1, len(pieces), 2):
        merged.append(pieces[pos - 1].strip() + pieces[pos])
    return merged


# Collect every caption sentence that mentions exactly one region phrase,
# together with that region's bbox and the image metadata.
num = 0
in_num = 0
new_results = []

for vg_data in tqdm.tqdm(vg_data_list):
    filename = vg_data['filename']
    anno = vg_data['grounding']
    regions = anno['regions']
    all_phrase = [r['phrase'] for r in regions]
    caption = anno['caption']
    # Cut the caption into sentences at the '?' / '.' terminators.
    caption_list = split_sentence(caption)

    for caption in caption_list:
        if caption.endswith('?'):  # questions are dropped
            continue
        count = 0
        for i, p in enumerate(all_phrase):
            # A list-valued phrase stops the scan of the remaining phrases.
            # NOTE(review): matches counted before the break still stand, so
            # the sentence can be kept without checking later phrases --
            # confirm this is intended.
            if isinstance(p, list):
                break
            # Count how many phrases occur in this sentence.
            if p in caption:
                index = i
                count += 1
        # Keep only sentences that contain exactly one phrase.
        if count != 1:
            continue
        num += 1

        data = regions[index]
        new_results.append({'bbox': data['bbox'], 'exp': caption, 'filename': filename,
                            'height': vg_data['height'], 'width': vg_data['width']})

print(num)  # 989203
# Guard the sample print: indexing [0] on an empty result would raise
# IndexError and abort the script.
if new_results:
    print(len(new_results), new_results[0])
else:
    print(len(new_results))

# Merge the collected expressions into the REC annotations and save them.
new_image = 0

# Position of the first record for each filename. This replaces the linear
# ``in`` / ``.index`` scans and the full rebuild of the name list after
# every appended image (quadratic) with constant-time lookups, while
# resolving duplicates to the first record exactly as ``list.index`` did.
name_to_index = {}
for position, record in enumerate(rec_data_list):
    name_to_index.setdefault(record['filename'], position)

for new in tqdm.tqdm(new_results):
    filename = new.pop('filename')
    width = new.pop('width')
    height = new.pop('height')
    bbox = new['bbox']
    caption = new['exp']
    if not isinstance(bbox[0], list):
        bbox = [bbox]
    # Boxes are compared through the set of their coordinate sums.
    new_bbox = {sum(r) for r in bbox}

    index = name_to_index.get(filename)
    if index is None:
        # Unknown image: add a fresh record holding this single instance.
        new_image += 1
        name_to_index[filename] = len(rec_data_list)
        rec_data_list.append({'filename': filename, 'width': width, 'height': height,
                              'referring': {'instances': [{'bbox': new['bbox'], 'exp': new['exp']}]}})
        # Keep the module-level name list aligned with rec_data_list.
        rec_data_list_name.append(filename)
        continue

    rec_data = rec_data_list[index]
    instances = rec_data.get('referring', {}).get('instances', [])
    for ins in instances:
        rec_bbox = ins['bbox']
        if not isinstance(rec_bbox[0], list):
            rec_bbox = [rec_bbox]
        rec_bbox = {sum(r) for r in rec_bbox}
        # Very strict matching: only an instance whose box signature is
        # equal receives the new expression.
        if rec_bbox != new_bbox:
            continue
        known = ins['exp']
        if isinstance(known, list):
            # Append unless the expression is already present
            # (case-insensitive comparison).
            if not any(exp.lower() == caption.lower() for exp in known):
                in_num += 1
                known.append(caption)
        elif known.lower() != caption.lower():
            in_num += 1
            ins['exp'] = [known, caption]
        # Only the first instance with a matching box is updated.
        break

print(in_num)  # 47266
print(new_image)  # 12052

# Written as JSON lines (one record per line) despite the .json suffix.
out_path = root_path + 'gqa_mergevg_rec.json'
with jsonlines.open(out_path, mode='w') as writer:
    writer.write_all(rec_data_list)
print(f'save to {out_path}')

0 comments on commit af680d6

Please sign in to comment.