-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconvert_dataset.py
executable file
·565 lines (530 loc) · 24.7 KB
/
convert_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
#!/usr/bin/env python
# Convert dataset DICOM and XML to images and masks directory
import os
import random
import sys
import time
import re
from pathlib import Path
import ast
import cv2
from pydicom import dcmread
import xmltodict
import numpy as np
from PIL import Image
import json
import yaml
from csv import DictReader
import shutil
import pandas as pd
import argparse
from utils import yolo_to_coco
# --- How to Use --- #
# The folder in which this script is located must contain:
# "datasets/INbreast Release 1.0", "datasets/CBIS-DDSM", "datasets/all-mias"
# After running the script, images/, labels/, and dataset.yaml are created for
# the YOLO format, plus json files for COCO.
# ** Limitations **
# - Mask mode not supported for MIAS dataset, yet
# - Mask mode currently does not support benign/malignant mode (low/high risk)
# Features:
# - YOLO and COCO style dataset output for INBreast, CBIS-DDSM, and MIAS dataset
# - Mass low and high based on Bi-Rads/Malignant or benign

# --- Parameters --- #
# Change as necessary
# Datasets to convert. Available options: 'inbreast', 'cbis-ddsm', 'mias'
chosen_datasets = ['inbreast', 'cbis-ddsm', 'mias']
# Classes chosen for segmentation. Available options: 'mass', 'calcification'
chosen_classes = ['mass']
# Output style (recommended: yolo). Available modes: yolo, mask.
# NOTE: yolo mode also generates annotations json in COCO style.
output_choice = 'yolo'
# When True, 'mass' is split into 'mass_low' and 'mass_high' by Bi-Rads score
low_high_mode = True
# Train / Validation / Test split
split_mode = True
split_ratio = [0.8, 0.1, 0.1]
split_not_in_mask_mode = True  # Must be set to True for to_3d_nrrd.py to work properly

# --- Input paths --- #
# INBreast dataset
inbreast_path = os.path.join('datasets', 'INbreast Release 1.0')
inbreast_xml_dir = os.path.join(inbreast_path, 'AllXML')
inbreast_dcm_dir = os.path.join(inbreast_path, 'AllDICOMs')
inbreast_csv = os.path.join(inbreast_path, 'INbreast.csv')
# CBIS-DDSM dataset
cbis_path = os.path.join('datasets', 'CBIS-DDSM')
cbis_jpeg = os.path.join(cbis_path, 'jpeg')
cbis_csv = os.path.join(cbis_path, 'csv')
# MIAS dataset
mias_path = os.path.join('datasets', 'all-mias')
mias_info = os.path.join(mias_path, 'Info.txt')
# MIAS abnormality codes accepted per class
mias_chosen = {'mass': ['CIRC', 'SPIC', 'MISC']}
# Output paths
image_out_dir = 'images'   # Converted images
mask_out_dir = 'masks'     # Mask images (mask mode)
yaml_out = 'dataset.yaml'  # YOLO dataset description
txt_out_dir = 'labels'     # YOLO label .txt files
split_dirs = ['train', 'val', 'test']
# --- End of Input paths --- #
# Remove boxes whose relative width or height is below this value
bbox_length_threshold = 0.005
# --- End of Parameters --- #

# Parse command line arguments (-m/--mode overrides the output_choice default)
parser = argparse.ArgumentParser()
parser.add_argument('-m', '--mode', type=str, choices=['yolo', 'mask', 'coco'],
                    help="Output dataset style", required=False, default=output_choice)
args = parser.parse_args()
output_choice = args.mode.lower()

# Skip not-implemented features
if output_choice == 'mask':
    # Faulty masks are made from stale names, so images/ must not pre-exist
    if os.path.exists(image_out_dir):
        print("Directory 'images' must be removed before running this program in mask mode!")
        choice = input("Should images/ directory be removed? (Y/n)").lower()
        if choice == 'y':
            shutil.rmtree(image_out_dir)
        else:
            raise Exception("Cannot continue with images/ existing!")
    # MIAS dataset does not currently support mask output
    if 'mias' in chosen_datasets:
        print("[!] MIAS dataset mask output is not implemented yet, skipping the dataset...", file=sys.stderr)
        chosen_datasets.remove('mias')
    # Mask mode train/val/test split is not desired, therefore disabled unless chosen otherwise
    if split_not_in_mask_mode:
        split_mode = False

# Lazy code: COCO mode is deprecated, YOLO mode also produces COCO output
if output_choice == 'coco':
    output_choice = 'yolo'

# Overall counter used as an image ID
image_id = 0
def normalize(pixel_array):
    """Linearly rescale *pixel_array* into the 0..1 range.

    Returns a float array. A constant input (max == min) is mapped to all
    zeros; the original divided by zero in that case and produced NaNs.
    """
    lo = pixel_array.min()
    pixel_range = pixel_array.max() - lo
    if pixel_range == 0:
        # Avoid division by zero on flat images
        return np.zeros_like(pixel_array, dtype=float)
    return (pixel_array - lo) / pixel_range
def normalize_with_threshold(pixel_array, threshold=(1000, 2000)):
    """Clip *pixel_array* to the (low, high) *threshold* window, then
    normalize the result to the 0..1 range.

    Unlike the original in-place masking, this does not mutate the caller's
    array: ``np.clip`` returns a new array.
    """
    clipped = np.clip(pixel_array, threshold[0], threshold[1])
    return normalize(clipped)
# all_classes lists every class name; class IDs are positions in this list
all_classes = []
if 'mass' in chosen_classes:
    if low_high_mode:
        all_classes.extend(['mass_low', 'mass_high'])
    else:
        all_classes.append('mass')
if 'calcification' in chosen_classes:
    all_classes.append('calcification')

# ** Deprecated code **
# json_data is a deprecated variable kept only so later references don't fail.
json_data = None  # Prevent undefined error

# Create output json and/or dirs
if output_choice == 'coco':
    # Prevent unnecessary folders being created
    mask_out_dir = None
    txt_out_dir = None
    json_data = {
        'categories': [],
        'annotations': [],
        'images': []
    }
    # Prepare JSON categories; id is the class's position in all_classes
    for cls_id, cls in enumerate(all_classes):
        json_data['categories'].append({
            'id': cls_id,
            'name': cls
        })
if output_choice == 'yolo':
    mask_out_dir = None
for directory in [image_out_dir, mask_out_dir, txt_out_dir]:
    if directory and not os.path.isdir(directory):
        os.mkdir(directory)

# Prevent undefined name later
txt_lines = []
# --- INBreast --- #
inbreast_classes = set()
inbreast_xmls = [str(x) for x in Path(inbreast_xml_dir).glob('*.xml')]  # XML annotation paths
if 'inbreast' in chosen_datasets:
    # Read CSV for malignant/benign: map file name -> numeric Bi-Rads score
    inbreast_csv_data = pd.read_csv(inbreast_csv, sep=';')
    file_score_pairs = {}
    for filename, score in zip(inbreast_csv_data['File Name'], inbreast_csv_data['Bi-Rads']):
        filename = str(filename)
        # str() guards against non-string CSV cells; strip letters: 4a/4b/4c => 4
        score = int(re.sub(r'\D', '', str(score)))
        file_score_pairs[filename] = score
    for inbreast_xml in inbreast_xmls:
        image_id += 1
        # Read XML annotation into a nested dict
        with open(inbreast_xml) as f:
            xml_dict = xmltodict.parse(f.read())
        entries = xml_dict['plist']['dict']['array']['dict']['array']['dict']
        if not isinstance(entries, list):
            entries = [entries]
        # rois maps class name -> list of polygon arrays
        rois = {cls: [] for cls in chosen_classes}
        for entry in entries:
            class_name = entry['string'][1]
            if class_name:
                class_name = class_name.lower()
                inbreast_classes.add(class_name)
            if class_name in chosen_classes:
                # literal_eval converts the "(x, y)" strings to real tuples
                roi = entry['array'][1]['string']
                if isinstance(roi, list):
                    roi = [ast.literal_eval(point) for point in roi]
                elif isinstance(roi, str):
                    roi = ast.literal_eval(roi)
                if len(roi) > 2:
                    # int32 array required by the later cv2.fillPoly
                    rois[class_name].append(np.array(roi, dtype=np.int32))
        if any(rois.values()):
            # Find the DICOM whose file name starts with the XML's stem.
            # re.escape + raw string fix the unescaped pattern of the original.
            dcm_prefix = Path(inbreast_xml).stem
            patient_dir = None
            for filename in os.listdir(inbreast_dcm_dir):
                if re.match(re.escape(dcm_prefix) + r'.*\.dcm', filename):
                    patient_dir = os.path.join(inbreast_dcm_dir, filename)
                    break
            if not patient_dir:
                raise Exception('The following DICOM file was not found in the directory: ' + dcm_prefix)
            # Identify low or high risk from the Bi-Rads score
            cls_suffix = ''
            if low_high_mode:
                if dcm_prefix in file_score_pairs:
                    score = file_score_pairs[dcm_prefix]
                    if score <= 3:
                        cls_suffix = '_low'
                    elif score > 3:
                        cls_suffix = '_high'
            # Read pixels from DICOM, convert to the 0-255 range for JPEG
            pixel_array = dcmread(patient_dir).pixel_array
            pixel_array = np.uint8(normalize_with_threshold(pixel_array) * 255)
            image = Image.fromarray(pixel_array)
            jpeg_path = os.path.join(image_out_dir, dcm_prefix + '.jpg')
            if not os.path.exists(jpeg_path):
                image.save(jpeg_path, format='JPEG')
            if output_choice == 'mask':
                # Mask mode: burn each polygon into a single-channel mask.
                # Pixel value encodes the class as 255 - class_id.
                mask = np.zeros(pixel_array.shape, dtype=np.uint8)
                for cls in chosen_classes:
                    mask = cv2.fillPoly(mask, rois[cls], 255 - all_classes.index(cls + cls_suffix))
                Image.fromarray(mask).save(os.path.join(mask_out_dir, dcm_prefix + '.jpg'), format='JPEG')
            if output_choice == 'coco' or output_choice == 'yolo':
                if output_choice == 'coco':
                    # Get image creation date for the COCO record
                    image_date = time.gmtime(os.path.getctime(jpeg_path))
                    image_date = '{}/{}/{} {}:{}:{}'.format(image_date.tm_year, image_date.tm_mon, image_date.tm_mday,
                                                            image_date.tm_hour, image_date.tm_min, image_date.tm_sec)
                    json_data['images'].append({
                        'id': image_id,
                        'file_name': jpeg_path,
                        'width': image.width,
                        'height': image.height,
                        'date_captured': image_date
                    })
                # Add annotations in YOLO or COCO style
                txt_lines = []  # For YOLO
                for cls in rois.keys():
                    # Low/high suffix is currently only applied to mass
                    cls_suffix = ''
                    if low_high_mode and cls == 'mass':
                        if dcm_prefix in file_score_pairs:
                            score = file_score_pairs[dcm_prefix]
                            if score <= 3:
                                cls_suffix = '_low'
                            elif score > 3:
                                cls_suffix = '_high'
                    class_id = all_classes.index(cls + cls_suffix)
                    # Go through each ROI in the image
                    for roi in rois[cls]:
                        if output_choice == 'coco':
                            # COCO mode
                            # Bug (known): polygons with disconnected parts are not handled.
                            roi = roi.astype(int)
                            x_s, y_s = roi[:, 0], roi[:, 1]
                            bbox = np.array([x_s.min(), y_s.min(), x_s.max(), y_s.max()]).tolist()
                            # str(roi) fixes the original str+ndarray TypeError in the raise
                            if any(x > 1 for x in bbox):
                                raise Exception('Bbox calculated relative coord larger than 1 for ROI: ' + str(roi))
                            json_data['annotations'].append({
                                'image_id': image_id,
                                'category_id': class_id,
                                'segmentation': [roi.tolist()],
                                'bbox': bbox
                            })
                        elif output_choice == 'yolo':
                            # YOLO mode: relative Xs and Ys
                            x_s, y_s = roi[:, 0] / image.width, roi[:, 1] / image.height
                            # BBOX in YOLO: X-Center Y-Center Width Height
                            bbox = x_s.mean(), y_s.mean(), x_s.max() - x_s.min(), y_s.max() - y_s.min()
                            # Relative coords must not exceed 1
                            if any(x > 1 for x in bbox):
                                raise Exception('Bbox calculated relative coord larger than 1 for ROI: ' + str(roi))
                            bbox = [str(x) for x in bbox]
                            txt_lines.append("{} {} {} {} {}\n".format(str(class_id), *bbox))
                # If YOLO, write TXT labels accumulated for the current image
                if len(txt_lines) > 0:
                    txt_path = os.path.join(txt_out_dir, dcm_prefix + '.txt')
                    if not os.path.exists(txt_path):
                        with open(txt_path, 'w') as f:
                            f.writelines(txt_lines)
        elif output_choice != 'mask':
            raise Exception("Image without mask")
# -- End of INBreast --- #
# --- CBIS-DDSM --- #
if 'cbis-ddsm' in chosen_datasets:
    # dicom_info.csv tells which jpeg corresponds to which dicom originally in DDSM.
    # CBIS-DDSM converted dicom to jpeg and removed the original .dcm files, so
    # build a dictionary linking previous .dcm dirs to current .jpg files.
    dcm_jpeg_dict = {}
    with open(os.path.join(cbis_csv, 'dicom_info.csv')) as f:
        list_of_dict = list(DictReader(f))
    for item in list_of_dict:
        # Skip cropped images, accept only full mammography and ROI series
        if 'crop' not in item['SeriesDescription']:
            # patient_dir (last path component of the original dicom location)
            dcm_path = Path(item['file_path'].strip()).parent.parts[-1]
            # patient_dir/jpeg_name.jpg
            jpeg_path = os.path.join(*Path(item['image_path'].strip()).parts[-2:])
            dcm_jpeg_dict[dcm_path] = jpeg_path
    # Paths start with CBIS-DDSM; that must be the folder holding csv/ and jpeg/
    cbis_base = str(Path(cbis_path).parent)
    csv_names = [
        'calc_case_description_train_set.csv',
        'calc_case_description_test_set.csv',
        'mass_case_description_test_set.csv',
        'mass_case_description_train_set.csv'
    ]
    # image jpeg -> list of mask jpegs
    image_mask_pairs = {}
    # mask jpeg -> class id
    mask_class_pairs = {}
    for csv_name in csv_names:
        # Class name comes from the CSV file prefix
        if csv_name[:4].lower() == 'calc':
            class_name = 'calcification'
        elif csv_name[:4].lower() == 'mass':
            class_name = 'mass'
        else:
            raise Exception(
                'Unexpected class name. CSV file prefixes for CBIS-DDSM should only start with calc or mass!')
        # Skip if class is not desired
        if class_name not in chosen_classes:
            continue
        with open(os.path.join(cbis_csv, csv_name)) as f:
            list_of_dict = list(DictReader(f))
        for item in list_of_dict:
            # Parent directory used as a key into dcm_jpeg_dict
            patient_dir = Path(item['image file path'].strip()).parent.parts[-1]
            # Only mass supports low/high (Bi-Rads assessment) for now
            cls_suffix = ''
            if low_high_mode:
                score = int(str(item['assessment']).strip())
                if class_name == 'mass':
                    if score <= 3:
                        cls_suffix = '_low'
                    elif score > 3:
                        cls_suffix = '_high'
            # Skip entries whose image is unknown
            if patient_dir not in dcm_jpeg_dict:
                continue
            jpeg_path = dcm_jpeg_dict[patient_dir]
            if jpeg_path not in image_mask_pairs:
                image_mask_pairs[jpeg_path] = []
            # Same lookup for the ROI mask file
            patient_dir = Path(item['ROI mask file path'].strip()).parent.parts[-1]
            if patient_dir not in dcm_jpeg_dict:
                continue
            mask_path = dcm_jpeg_dict[patient_dir]
            image_mask_pairs[jpeg_path].append(mask_path)
            mask_class_pairs[mask_path] = all_classes.index(class_name + cls_suffix)
    # Mask mode
    # Bug (known): multi-class masks not implemented (NOT_IMPLEMENTED)
    image_id = 0
    if output_choice == 'mask':
        for image_path, mask_paths in image_mask_pairs.items():
            image_id += 1
            image_name = 'cbm_' + str(image_id) + '.jpg'
            shutil.copy(os.path.join(cbis_jpeg, image_path), os.path.join(image_out_dir, image_name))
            mask_out_path = os.path.join(mask_out_dir, image_name)
            if len(mask_paths) == 1:
                shutil.copy(os.path.join(cbis_jpeg, mask_paths[0]), mask_out_path)
            elif len(mask_paths) > 1:
                # Merge all masks by pasting each onto the previous one
                masked = None
                for i in range(len(mask_paths) - 1):
                    img1 = Image.open(os.path.join(cbis_jpeg, mask_paths[i]))
                    img2 = Image.open(os.path.join(cbis_jpeg, mask_paths[i + 1]))
                    img1.paste(img2, (0, 0), mask=img2)
                    masked = img1
                masked.save(mask_out_path)
    # YOLO mode
    image_id = 0
    if output_choice == 'yolo':
        for image_path in image_mask_pairs.keys():
            image_id += 1  # Used only for the output name
            output_name = "cb_" + str(image_id)
            txt_lines = []
            for mask_path in image_mask_pairs[image_path]:
                # Read gray mask and binarize it
                mask = cv2.imread(os.path.join(cbis_jpeg, mask_path), cv2.IMREAD_GRAYSCALE)
                _, mask = cv2.threshold(mask, 127, 255, cv2.THRESH_BINARY)
                # Each external contour is treated as one polygon
                contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
                for contour in contours:
                    contour = np.array(contour).reshape(-1, 2)  # (N, 2) points
                    # Relative Xs and Ys
                    x_s = contour[:, 0] / mask.shape[1]
                    y_s = contour[:, 1] / mask.shape[0]
                    # BBOX in YOLO: X-Center Y-Center Width Height
                    bbox = x_s.mean(), y_s.mean(), x_s.max() - x_s.min(), y_s.max() - y_s.min()
                    if any(x > 1 for x in bbox):
                        raise Exception('Bbox calculated relative coord larger than 1 for ROI: ' + str(contour))
                    # Drop boxes below the minimum relative size
                    if bbox[2] >= bbox_length_threshold and bbox[3] >= bbox_length_threshold:
                        bbox = [str(x) for x in bbox]
                        class_id = mask_class_pairs[mask_path]
                        txt_lines.append("{} {} {} {} {}\n".format(str(class_id), *bbox))
            # Skip image entirely if txt_lines wasn't filled
            if len(txt_lines) > 0:
                output_path = os.path.join(txt_out_dir, output_name + ".txt")
                if not os.path.exists(output_path):
                    with open(output_path, 'w') as f:
                        f.writelines(txt_lines)
                # Copy image to output dir
                output_path = os.path.join(image_out_dir, output_name + ".jpg")
                if not os.path.exists(output_path):
                    shutil.copy(os.path.join(cbis_jpeg, image_path), output_path)
# --- End of CBIS-DDSM --- #
# --- MIAS --- #
if 'mias' in chosen_datasets:
    with open(mias_info) as f:
        for line in f.readlines():
            line_parts = line.split(' ')
            # Skip irrelevant lines (only mdbNNN records with full fields)
            if line[:3] != 'mdb' or len(line_parts) < 7:
                continue
            image_name = line_parts[0]
            # Read the PGM image as grayscale
            image = cv2.imread(os.path.join(mias_path, image_name + '.pgm'), cv2.IMREAD_GRAYSCALE)
            # Label txt lines to be filled
            txt_lines = []
            for class_name in chosen_classes:
                # .get avoids a KeyError for classes without MIAS codes
                if line_parts[2] in mias_chosen.get(class_name, []):
                    # Malignant ('M') => high, otherwise low
                    cls_suffix = ''
                    if low_high_mode:
                        cls_suffix = '_high' if line_parts[3] == 'M' else '_low'
                    # Center and radius of the abnormality in pixels
                    x_center, y_center, radius = int(line_parts[4]), int(line_parts[5]), int(line_parts[6])
                    # Convert to relative coordinates
                    # NOTE(review): width/height use the radius, not the
                    # diameter — confirm this matches downstream expectations.
                    height, width = image.shape
                    bbox = [x_center / width, y_center / height, radius / width, radius / height]
                    # Fixed: original referenced the undefined name `contour` here
                    if any(x > 1 for x in bbox):
                        raise Exception('Bbox calculated relative coord larger than 1 for ROI: ' + str(bbox))
                    class_id = all_classes.index(class_name + cls_suffix)
                    bbox = [str(x) for x in bbox]
                    txt_lines.append("{} {} {} {} {}\n".format(str(class_id), *bbox))
            # Skip images with no labels (or with non-chosen labels)
            if len(txt_lines) > 0:
                # Write to labels/ (out_f avoids shadowing the Info.txt handle)
                output_path = os.path.join(txt_out_dir, image_name + ".txt")
                if not os.path.exists(output_path):
                    with open(output_path, 'w') as out_f:
                        out_f.writelines(txt_lines)
                # Save to images/
                output_path = os.path.join(image_out_dir, image_name + '.jpg')
                if not os.path.exists(output_path):
                    cv2.imwrite(output_path, image)
# --- End of MIAS --- #
# Train / Val / Test split
if split_mode:
    # Which label artifacts accompany each image
    out_dir = txt_out_dir
    file_suffix = '.txt'
    if output_choice == 'mask':
        out_dir = 'masks'
        file_suffix = '.jpg'
    # Create train/val/test directory trees, warning once about leftovers
    warning_said = False
    for directory1 in split_dirs:
        if os.path.exists(directory1):
            # Fixed: the original re-ran os.mkdir on existing dirs once the
            # warning had been printed, raising FileExistsError.
            if not warning_said:
                print("Train/Val/Test directories exist! You must remove these directories before running the program.\n" +
                      "Otherwise this may cause random extra images in these directories.", file=sys.stderr)
                warning_said = True
        else:
            os.mkdir(directory1)
        for directory2 in [image_out_dir, out_dir]:
            directory = os.path.join(directory1, directory2)
            if not os.path.exists(directory):
                os.mkdir(directory)
    # Shuffle label files and deal them out by cumulative ratio
    file_prefixes = [Path(x).stem for x in os.listdir(out_dir)]
    random.shuffle(file_prefixes)
    for i, file_prefix in enumerate(file_prefixes, start=1):
        ratio = i / len(file_prefixes)
        if ratio <= split_ratio[0]:
            dest_id = 0  # train
        elif ratio <= split_ratio[0] + split_ratio[1]:
            dest_id = 1  # val
        else:
            # Fixed: the original's fallthrough assigned `dest = 'train'`,
            # leaving dest_id stale/undefined; everything past train+val is test.
            dest_id = 2  # test
        # Move the image and its label/mask into the chosen split
        dest = os.path.join(split_dirs[dest_id], image_out_dir, file_prefix + '.jpg')
        if not os.path.exists(dest):
            shutil.move(os.path.join(image_out_dir, file_prefix + '.jpg'), dest)
        dest = os.path.join(split_dirs[dest_id], out_dir, file_prefix + file_suffix)
        if not os.path.exists(dest):
            shutil.move(os.path.join(out_dir, file_prefix + file_suffix), dest)
    # Create COCO annotations json regardless of output style choice (yolo/coco)
    if output_choice == 'yolo':
        for directory in split_dirs:
            yolo_to_coco(os.path.join(directory, txt_out_dir), os.path.join(directory, image_out_dir), directory + '.json')
    # dataset.yaml is required by YOLO training
    yaml_data = {
        'path': os.getcwd(),
        'train': os.path.join(split_dirs[0], image_out_dir),
        'val': os.path.join(split_dirs[1], image_out_dir),
        'test': os.path.join(split_dirs[2], image_out_dir),
        'names': all_classes
    }
    with open(yaml_out, 'w') as f:
        yaml.dump(yaml_data, f)
    # Clean up the now-empty top-level output directories
    for directory in [image_out_dir, out_dir]:
        if len(os.listdir(directory)) == 0:
            os.rmdir(directory)