
Commit

Merge pull request #78 from Emory-HITI/dev
Refactor PNG Extractor Slurm version
pradeeban authored Dec 9, 2020
2 parents 334089a + ded267d commit 1506dc8
Showing 4 changed files with 132 additions and 60 deletions.
37 changes: 25 additions & 12 deletions modules/png-extraction/ImageExtractor.py
@@ -18,7 +18,6 @@
import logging
from multiprocessing import Pool
import json
import errno
import sys

#pydicom imports needed to handle data errors
@@ -36,11 +35,13 @@
dicom_home = niffler['DICOMHome'] #the folder containing your dicom files
output_directory = niffler['OutputDirectory']
depth = niffler['Depth']
half_mode = niffler['UseHalfOfTheProcessorsOnly'] #use only half of the available processors.

png_destination = output_directory + '/extracted-images/'
csvDestination = output_directory + '/metadata.csv'
mappings= output_directory + '/mapping.csv'
failed = output_directory +'/failed-dicom/'

csv_destination = output_directory + '/metadata.csv'
mappings = output_directory + '/mapping.csv'
LOG_FILENAME = output_directory + '/ImageExtractor.out'
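For reference, all of the keys read above come from config.json; a minimal sketch that writes a valid config (paths and values are hypothetical, and the key names mirror the config.json reads shown in this diff, including the Slurm variant below):

import json

sample_config = {
    "PrintImages": True,                     # extract PNGs, not just metadata
    "CommonHeadersOnly": False,              # keep every header column in metadata.csv
    "DICOMHome": "/data/dicom",              # hypothetical input folder
    "OutputDirectory": "/data/niffler-out",  # hypothetical output folder
    "Depth": 3,                              # directory levels to search below DICOMHome
    "UseHalfOfTheProcessorsOnly": True       # halve the worker count to limit RAM use
}
with open('config.json', 'w') as f:
    json.dump(sample_config, f, indent=2)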


@@ -64,9 +65,10 @@
if not os.path.exists(failed + "/3"):
os.makedirs(failed + "/3")

if not os.path.exists(failed + "/4"):
os.makedirs(failed + "/4")

#%%Function for getting tuple for field,val pairs for this file
#plan is instance of dicom class, the data for single mammo file
#%%Function for getting tuple for field,val pairs
def get_tuples(plan, outlist = None, key = ""):
if len(key)>0:
key = key + "_"
@@ -95,8 +97,7 @@ def get_tuples(plan, outlist = None, key = "")
outlist.append((key + aa, value)) #appends name, value pair for this file. these are later concatenated to the dataframe
return outlist

#%%Function called by multiprocessing.Takes a tuple with a (index,dicomPath)
#ff is the file to be loaded. nn is the index of the file in the fileList

def extract_headers(f_list_elem):
nn,ff = f_list_elem # unpack enumerated list
plan = dicom.dcmread(ff, force=True) #reads in dicom file
@@ -135,7 +136,7 @@ def extract_images(i):
imName=os.path.split(filedata.iloc[i].loc['file'])[1][:-4] #get file name ex: IM-0107-0022
#check for existence of patient folder, create if needed
if not (os.path.exists(png_destination + folderName)): # multiple processes may run this check at the same time.
os.mkdir(png_destination+folderName)
os.mkdir(png_destination + folderName)

shape = ds.pixel_array.shape

@@ -148,7 +149,7 @@ def extract_images(i):
# Convert to uint
image_2d_scaled = np.uint8(image_2d_scaled)

pngfile = png_destination+folderName+'/' +imName +'.png'
pngfile = png_destination+folderName+'/' + imName + '.png'

# Write the PNG file
with open(pngfile , 'wb') as png_file:
@@ -162,10 +163,15 @@ def extract_images(i):
except ValueError as error:
found_err = error
fail_path = filedata.iloc[i].loc['file'], failed + '2/' + os.path.split(filedata.iloc[i].loc['file'])[1][:-4]+'.dcm'
except BaseException as error : #ramonNote added base exception catch. so i can also catch this one
except BaseException as error:
found_err = error
fail_path = filedata.iloc[i].loc['file'], failed + '3/' + os.path.split(filedata.iloc[i].loc['file'])[1][:-4]+'.dcm'
except:
found_err = sys.exc_info()[1] #a bare except cannot bind the exception to a name; recover it via sys.exc_info()
fail_path = filedata.iloc[i].loc['file'], failed + '4/' + os.path.split(filedata.iloc[i].loc['file'])[1][:-4]+'.dcm'
return (filemapping,fail_path,found_err)

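The four handlers above route unreadable files into numbered quarantine folders under failed/; a summary sketch (the folder mapping is read off the code above, the dict itself is illustrative):

# Illustrative only: which handler copies a failing DICOM into which subfolder.
FAILURE_FOLDERS = {
    'AttributeError': 'failed/1/',  # e.g. a missing attribute such as pixel_array
    'ValueError': 'failed/2/',      # pixel data present but not convertible
    'BaseException': 'failed/3/',   # any other exception
    'bare except': 'failed/4/',     # fallback; normally unreachable after BaseException
}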

#%%Function: when pydicom fails to read a value, attempt to read it as
#other types.
def fix_mismatch_callback(raw_elem, **kwargs):
@@ -214,10 +220,17 @@ def fix_mismatch(with_VRs=['PN', 'DS', 'IS']):
config.data_element_callback_kwargs = {
'with_VRs': with_VRs,
}

fix_mismatch()
core_count = int(os.cpu_count()/2) # use half the cores avoid high ram usage

if half_mode:
core_count = int(os.cpu_count()/2) # use half the cores to avoid high RAM usage
else:
core_count = int(os.cpu_count())
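A standalone sketch of the half-processor switch; that core_count later sizes the worker pool is an assumption based on the multiprocessing.Pool import above:

import os
from multiprocessing import Pool

half_mode = True  # stands in for config.json's UseHalfOfTheProcessorsOnly
core_count = os.cpu_count() // 2 if half_mode else os.cpu_count()
with Pool(core_count) as pool:  # assumed: the extractor sizes its pool this way
    pass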

#%% get set up to create dataframe
dirs = os.listdir(dicom_home)

#gets all dicom files. if editing this code, get filelist into the format of a list of strings,
#with each string as the file path to a different dicom file.
file_path = get_path(depth)
@@ -279,7 +292,7 @@ def fix_mismatch(with_VRs=['PN', 'DS', 'IS']):
data=pd.DataFrame(headerlist)

#%%export csv file of final dataframe
export_csv = data.to_csv (csvDestination, index = None, header=True)
export_csv = data.to_csv (csv_destination, index = None, header=True)

fields=df.keys()
count = 0 #potential pain point
@@ -18,7 +18,8 @@
import hashlib
from shutil import copyfile
import logging
import pickle #this is a temporary addition
import json
import pickle
from multiprocessing import Pool
from pydicom import config
from pydicom import datadict
@@ -27,23 +28,52 @@
#things needed for the slurm task array
task_id = int(os.environ['SLURM_ARRAY_TASK_ID'] )
num_task = int(os.environ['SLURM_ARRAY_TASK_COUNT'])
print("I am Task: " + str(task_id))
print(" There are this many others" + str(num_task))
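Each array task learns its own index and the array size from these two environment variables, then takes one slice of the file list via np.array_split (shown later in this file); a toy example with hypothetical file names:

import numpy as np

filelist = ['a.dcm', 'b.dcm', 'c.dcm', 'd.dcm', 'e.dcm']  # hypothetical
num_task, task_id = 2, 0   # from SLURM_ARRAY_TASK_COUNT and SLURM_ARRAY_TASK_ID
file_split = np.array_split(filelist, num_task)
print(list(file_split[task_id]))  # task 0 gets ['a.dcm', 'b.dcm', 'c.dcm']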
#%%CHANGE THESE FOR YOUR USE
print_images=True #do you want to print the images from these dicom files?
print_only_common_headers=False #do you want the resulting dataframe csv to contain only the common headers? See section 'find common fields'
root = '/labs/banerjeelab/ramon_chxcl/JACR_Jan_April_2020_full/' #the root directory for yor project
dicomHome = os.path.join(root,'JACR_Jan_April_2020/') #the folder containing your dicom files
png_destination = os.path.join(root ,'extracted-images/') #where you want the extracted images to print
csvDestination = root + 'metadata_'+str(task_id)+'.csv' #where you want the dataframe csv to print
mappings= root + 'mapping_'+str(task_id)+'.csv'
failed = root +'failed-dicom/'

LOG_FILENAME = root + 'ImageExtractor_'+str(task_id)+'.out'

with open('config.json', 'r') as f:
niffler = json.load(f)

#Get variables for the image extractor from config.json.
print_images = niffler['PrintImages']
print_only_common_headers = niffler['CommonHeadersOnly']
dicom_home = niffler['DICOMHome'] #the folder containing your dicom files
output_directory = niffler['OutputDirectory']
depth = niffler['Depth']
half_mode = niffler['UseHalfOfTheProcessorsOnly'] #use only half of the available processors.

png_destination = output_directory + '/extracted-images/'
failed = output_directory +'/failed-dicom/'

csv_destination = output_directory + '/metadata_'+str(task_id)+'.csv'
mappings= output_directory + '/mapping_'+str(task_id)+'.csv'
LOG_FILENAME = output_directory + '/ImageExtractor_'+str(task_id)+'.out'
pickle_file = output_directory +'/ImageExtractor.pickle'

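Because every path a task writes is suffixed with its task id, concurrent tasks never contend on one file; only the pickled file list is shared. For a hypothetical 2-task array the output directory would look like:

# output_directory/
#   metadata_0.csv    mapping_0.csv    ImageExtractor_0.out
#   metadata_1.csv    mapping_1.csv    ImageExtractor_1.out
#   ImageExtractor.pickle   # shared cache of the globbed file list
#   extracted-images/       # PNGs from all tasks
#   failed-dicom/           # quarantine subfolders 1-4, shared by all tasks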

if not os.path.exists(output_directory):
os.makedirs(output_directory)

logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
depth = 4
#%%Function for getting tuple for field,val pairs for this file
#plan is instance of dicom class, the data for single mammo file

if not os.path.exists(png_destination):
os.makedirs(png_destination)

if not os.path.exists(failed):
os.makedirs(failed)

if not os.path.exists(failed + "/1"):
os.makedirs(failed + "/1")

if not os.path.exists(failed + "/2"):
os.makedirs(failed + "/2")

if not os.path.exists(failed + "/3"):
os.makedirs(failed + "/3")

if not os.path.exists(failed + "/4"):
os.makedirs(failed + "/4")


#%%Function for getting tuple for field,val pairs
def get_tuples(plan, outlist = None, key = ""):
if len(key)>0:
key = key + "_"
@@ -53,8 +83,7 @@ def get_tuples(plan, outlist = None, key = "")
try:
hasattr(plan,aa)
except TypeError as e:
print(aa)
print(plan)
logging.warning('Type Error encountered')
if (hasattr(plan, aa) and aa!='PixelData'):
value = getattr(plan, aa)
if type(value) is dicom.sequence.Sequence:
@@ -72,6 +101,8 @@ def get_tuples(plan, outlist = None, key = "")
value = str(value)
outlist.append((key + aa, value)) #appends name, value pair for this file. these are later concatenated to the dataframe
return outlist


def extract_headers(f_list_elem):
nn,ff = f_list_elem # unpack enumerated list
plan = dicom.dcmread(ff, force=True) #reads in dicom file
@@ -91,6 +122,13 @@ def extract_headers(f_list_elem):
kv.append(('category','no image')) #adds my custom category field, makes note as imageless
return dict(kv)

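A hedged usage sketch for extract_headers, which the worker pool feeds (index, path) tuples from enumerate(filelist); the path is hypothetical:

hdr = extract_headers((0, '/data/dicom/example.dcm'))  # hypothetical file
print(hdr.get('category'))  # 'no image' is recorded when the file has no PixelData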

#%%Function to extract pixel array information
#takes an integer used to index into the global filedata dataframe
#returns tuple of
# filemapping: dicom to png paths (as str)
# fail_path: dicom to failed folder (as tuple)
# found_err: error code produced when processing
def extract_images(i):
ds = dicom.dcmread(filedata.iloc[i].loc['file'], force=True) #read file in
found_err=None
@@ -125,20 +163,21 @@ def extract_images(i):
w.write(png_file, image_2d_scaled)

filemapping = filedata.iloc[i].loc['file'] + ', ' + pngfile + '\n'
#fm.write(filemapping)
except AttributeError as error:
found_err = error
fail_path = filedata.iloc[i].loc['file'], failed + '1/' + os.path.split(filedata.iloc[i].loc['file'])[1][:-4]+'.dcm'
#copyfile(filedata.iloc[i].loc['file'], failed + '1/' + os.path.split(filedata.iloc[i].loc['file'])[1][:-4]+'.dcm')
except ValueError as error:
found_err = error
fail_path = filedata.iloc[i].loc['file'], failed + '2/' + os.path.split(filedata.iloc[i].loc['file'])[1][:-4]+'.dcm'
#copyfile(filedata.iloc[i].loc['file'], failed + '2/' + os.path.split(filedata.iloc[i].loc['file'])[1][:-4]+'.dcm')
except BaseException as error : #ramonNote added base exception catch. so i can also catch this one
except BaseException as error:
found_err = error
fail_path = filedata.iloc[i].loc['file'], failed + '3/' + os.path.split(filedata.iloc[i].loc['file'])[1][:-4]+'.dcm'
#copyfile(filedata.iloc[i].loc['file'], failed + '3/' + os.path.split(filedata.iloc[i].loc['file'])[1][:-4]+'.dcm')
except:
found_err = sys.exc_info()[1] #a bare except cannot bind a name; recover the exception explicitly (assumes sys is imported at the top of this file)
fail_path = filedata.iloc[i].loc['file'], failed + '4/' + os.path.split(filedata.iloc[i].loc['file'])[1][:-4]+'.dcm'
return (filemapping,fail_path,found_err)

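The pixel branch above rescales each slice to 8 bits and hands it to pypng; a minimal standalone sketch of that conversion, using synthetic data instead of a real DICOM, with the rescaling formula an assumption since this hunk elides it:

import numpy as np
import png  # pypng

image_2d = np.random.randint(0, 4096, (64, 64)).astype(float)  # synthetic 12-bit slice
image_2d_scaled = (np.maximum(image_2d, 0) / image_2d.max()) * 255.0  # assumed rescale
image_2d_scaled = np.uint8(image_2d_scaled)
shape = image_2d_scaled.shape
with open('example.png', 'wb') as png_file:  # hypothetical output path
    w = png.Writer(shape[1], shape[0], greyscale=True)
    w.write(png_file, image_2d_scaled)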

def fix_mismatch_callback(raw_elem, **kwargs):
try:
values.convert_value(raw_elem.VR, raw_elem)
@@ -154,6 +193,17 @@ def fix_mismatch_callback(raw_elem, **kwargs):
return raw_elem


def get_path(depth):
directory = dicom_home + '/'

i = 0
while i < depth:
directory += "*/"
i += 1

return directory + "*.dcm"

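get_path replaces the previously hardcoded '*/*/*/*.dcm' glob with a configurable depth; for example (dicom_home hypothetical):

# With dicom_home = '/data/dicom' and depth = 3:
#   get_path(3) -> '/data/dicom/*/*/*/*.dcm'
# i.e. glob matches .dcm files exactly three directory levels below dicom_home.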

def fix_mismatch(with_VRs=['PN', 'DS', 'IS']):
"""A callback function to check that RawDataElements are translatable
with their provided VRs. If not, re-attempt translation using
@@ -172,39 +222,38 @@ def fix_mismatch(with_VRs=['PN', 'DS', 'IS']):
config.data_element_callback_kwargs = {
'with_VRs': with_VRs,
}

fix_mismatch()

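fix_mismatch wires fix_mismatch_callback into pydicom's config hooks, so subsequent reads retry value conversion with the alternate VRs; a hedged usage sketch (file path hypothetical):

from pydicom import dcmread

fix_mismatch(with_VRs=['PN', 'DS', 'IS'])  # installs the callback defined above
ds = dcmread('/data/dicom/example.dcm', force=True)  # hypothetical file; mismatched VRs are now retried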
#%% get set up to create dataframe
dirs = os.listdir( root )
dirs = os.listdir(dicom_home)

file_path = get_path(depth)

#gets all dicom files. if editing this code, get filelist into the format of a list of strings,
#with each string as the file path to a different dicom file.
if os.path.isfile(root+'ramen.pickle'):
f= open(root +'ramen.pickle', 'rb')
if os.path.isfile(pickle_file):
f= open(pickle_file, 'rb')
filelist = pickle.load(f)
else:
filelist=glob.glob(dicomHome + '*/*/*/*.dcm', recursive=True) #this searches the folders at the depth we request and finds all dicoms
pickle.dump(filelist,open(root+'ramen.pickle','wb'))
filelist=glob.glob(file_path, recursive=True) #this searches the folders at the depth we request and finds all dicoms
pickle.dump(filelist,open(pickle_file,'wb'))

logging.info('Number of dicom files: ' + str(len(filelist)))
print('Original File list is:' + str(len(filelist)))
file_split = np.array_split(filelist,num_task)
filelist = file_split[task_id]
print('Task will have a nominal File list of' + str(len(filelist)))
ff = filelist[0] #load first file as a template to look at all attributes
plan = dicom.dcmread(ff, force=True)
logging.debug('Loaded the first file successfully')
#print(type(plan)) #is recorded as pydicom class, has attributes numerated in keys
#print(plan.dir()) #lists class attributes
keys = [(aa) for aa in plan.dir() if (hasattr(plan, aa) and aa!='PixelData')]
#print(keys) keys are attributes in this instance of the dicom class from the source file

#%%checks for images in fields and logs where they are
for field in plan.dir():
if (hasattr(plan, field) and field!='PixelData'):
entry = getattr(plan, field)
#print(field) #prints header
#print(str(entry)) #prints associated value
if type(entry) is bytes:
print(field)
print(str(entry))
logging.debug(field)
logging.debug(str(entry))


#set([ type(getattr(plan, field)) for field in plan.dir() if (hasattr(plan, field) and field!='PixelData')])
@@ -215,7 +264,6 @@ def fix_mismatch(with_VRs=['PN', 'DS', 'IS']):
#%%step through whole file list, read in file, append fields to future dataframe of all files
headerlist = []
#start up a multi processing pool
print('start with getting headers')
p = Pool(15)
stamp = time.time()
res= p.imap_unordered(extract_headers,enumerate(filelist))
@@ -226,8 +274,7 @@ def fix_mismatch(with_VRs=['PN', 'DS', 'IS']):
headerlist.append(e)
p.close()
p.join()
print('done with getting headers')
#RAMON NOTE: im assuming that the context manager handles closing for me
#Assuming that the context manager handles closing for me
#make dataframe containing all fields and all files
df = pd.DataFrame(headerlist)

@@ -240,7 +287,6 @@ def fix_mismatch(with_VRs=['PN', 'DS', 'IS']):
common_fields = set(np.asarray(df.columns)[mask_common_fields]) #define the common fields as those with more than 90% filled
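A toy illustration of the common-fields filter above; the exact mask computation is elided from this hunk, so the below-10%-null criterion here is an assumption matching the comment's "more than 90% filled":

import numpy as np
import pandas as pd

df = pd.DataFrame({'PatientID': ['a', 'b', 'c'], 'RareTag': [None, None, 'x']})
mask_common_fields = df.isnull().mean() < 0.1          # assumed criterion
common_fields = set(np.asarray(df.columns)[mask_common_fields])
print(common_fields)  # {'PatientID'}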


#print(headerlist) #list of all field,value arguments for all data
for nn,kv in enumerate(headerlist):
#print(kv) #all field,value tuples for this one in headerlist
for kk in list(kv.keys()):
@@ -254,7 +300,7 @@ def fix_mismatch(with_VRs=['PN', 'DS', 'IS']):
data=pd.DataFrame(headerlist)

#%%export csv file of final dataframe
export_csv = data.to_csv(csvDestination, index = None, header=True)
export_csv = data.to_csv(csv_destination, index = None, header=True)

fields=df.keys()

@@ -264,7 +310,6 @@ def fix_mismatch(with_VRs=['PN', 'DS', 'IS']):

#todo: in consumer loop add segment that checks if an error has occurred and updates error count
if print_images:
print("Start processing Images")
filedata=data
count =0
other =0
@@ -285,10 +330,8 @@ def fix_mismatch(with_VRs=['PN', 'DS', 'IS']):
logging.error( err_msg)
else:
fm.write(fmap)
print("Done working closing pool")
p.close()
p.join()
print("Pool Close we done boys")
fm.close()
#insert multiprocessing call here
#for i in range(len(filedata)):