
Commit

Merge pull request #78 from Emory-HITI/dev
Refactor PNG Extractor Slurm version
pradeeban authored Dec 9, 2020
2 parents 334089a + ded267d commit 1506dc8
Showing 4 changed files with 132 additions and 60 deletions.
37 changes: 25 additions & 12 deletions modules/png-extraction/ImageExtractor.py
@@ -18,7 +18,6 @@
import logging
from multiprocessing import Pool
import json
import errno
import sys

#pydicom imports needed to handle data errors
@@ -36,11 +35,13 @@
dicom_home = niffler['DICOMHome'] #the folder containing your dicom files
output_directory = niffler['OutputDirectory']
depth = niffler['Depth']
half_mode = niffler['UseHalfOfTheProcessorsOnly'] #use only half of the available processors.

png_destination = output_directory + '/extracted-images/'
csvDestination = output_directory + '/metadata.csv'
mappings= output_directory + '/mapping.csv'
failed = output_directory +'/failed-dicom/'

csv_destination = output_directory + '/metadata.csv'
mappings = output_directory + '/mapping.csv'
LOG_FILENAME = output_directory + '/ImageExtractor.out'
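For reference, all of the keys read above come from config.json; a minimal sketch that writes a valid config (paths and values are hypothetical, and the key names mirror the config.json reads shown in this diff, including the Slurm variant below):

import json

sample_config = {
    "PrintImages": True,                     # extract PNGs, not just metadata
    "CommonHeadersOnly": False,              # keep every header column in metadata.csv
    "DICOMHome": "/data/dicom",              # hypothetical input folder
    "OutputDirectory": "/data/niffler-out",  # hypothetical output folder
    "Depth": 3,                              # directory levels to search below DICOMHome
    "UseHalfOfTheProcessorsOnly": True       # halve the worker count to limit RAM use
}
with open('config.json', 'w') as f:
    json.dump(sample_config, f, indent=2)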


@@ -64,9 +65,10 @@
if not os.path.exists(failed + "/3"):
os.makedirs(failed + "/3")

if not os.path.exists(failed + "/4"):
os.makedirs(failed + "/4")

#%%Function for getting tuple for field,val pairs for this file
#plan is instance of dicom class, the data for single mammo file
#%%Function for getting tuple for field,val pairs
def get_tuples(plan, outlist = None, key = ""):
if len(key)>0:
key = key + "_"
@@ -95,8 +97,7 @@ def get_tuples(plan, outlist = None, key = "")
outlist.append((key + aa, value)) #appends name, value pair for this file. these are later concatenated to the dataframe
return outlist

#%%Function called by multiprocessing.Takes a tuple with a (index,dicomPath)
#ff is the file to be loaded. nn is the index of the file in the fileList

def extract_headers(f_list_elem):
nn,ff = f_list_elem # unpack enumerated list
plan = dicom.dcmread(ff, force=True) #reads in dicom file
@@ -135,7 +136,7 @@ def extract_images(i):
imName=os.path.split(filedata.iloc[i].loc['file'])[1][:-4] #get file name ex: IM-0107-0022
#check for existence of patient folder, create if needed
if not (os.path.exists(png_destination + folderName)): # multiple processes may run this check at the same time.
os.mkdir(png_destination+folderName)
os.mkdir(png_destination + folderName)

shape = ds.pixel_array.shape

@@ -148,7 +149,7 @@ def extract_images(i):
# Convert to uint
image_2d_scaled = np.uint8(image_2d_scaled)

pngfile = png_destination+folderName+'/' +imName +'.png'
pngfile = png_destination+folderName+'/' + imName + '.png'

# Write the PNG file
with open(pngfile , 'wb') as png_file:
@@ -162,10 +163,15 @@ def extract_images(i):
except ValueError as error:
found_err = error
fail_path = filedata.iloc[i].loc['file'], failed + '2/' + os.path.split(filedata.iloc[i].loc['file'])[1][:-4]+'.dcm'
except BaseException as error : #ramonNote added base exception catch. so i can also catch this one
except BaseException as error:
found_err = error
fail_path = filedata.iloc[i].loc['file'], failed + '3/' + os.path.split(filedata.iloc[i].loc['file'])[1][:-4]+'.dcm'
except:
found_err = sys.exc_info()[1] #a bare except cannot bind the exception to a name; recover it via sys.exc_info()
fail_path = filedata.iloc[i].loc['file'], failed + '4/' + os.path.split(filedata.iloc[i].loc['file'])[1][:-4]+'.dcm'
return (filemapping,fail_path,found_err)

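The four handlers above route unreadable files into numbered quarantine folders under failed/; a summary sketch (the folder mapping is read off the code above, the dict itself is illustrative):

# Illustrative only: which handler copies a failing DICOM into which subfolder.
FAILURE_FOLDERS = {
    'AttributeError': 'failed/1/',  # e.g. a missing attribute such as pixel_array
    'ValueError': 'failed/2/',      # pixel data present but not convertible
    'BaseException': 'failed/3/',   # any other exception
    'bare except': 'failed/4/',     # fallback; normally unreachable after BaseException
}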

#%%Function: when pydicom fails to read a value, attempt to read it as
#other types.
def fix_mismatch_callback(raw_elem, **kwargs):
@@ -214,10 +220,17 @@ def fix_mismatch(with_VRs=['PN', 'DS', 'IS']):
config.data_element_callback_kwargs = {
'with_VRs': with_VRs,
}

fix_mismatch()
core_count = int(os.cpu_count()/2) # use half the cores avoid high ram usage

if half_mode:
core_count = int(os.cpu_count()/2) # use half the cores to avoid high RAM usage
else:
core_count = int(os.cpu_count())
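A standalone sketch of the half-processor switch; that core_count later sizes the worker pool is an assumption based on the multiprocessing.Pool import above:

import os
from multiprocessing import Pool

half_mode = True  # stands in for config.json's UseHalfOfTheProcessorsOnly
core_count = os.cpu_count() // 2 if half_mode else os.cpu_count()
with Pool(core_count) as pool:  # assumed: the extractor sizes its pool this way
    pass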

#%% get set up to create dataframe
dirs = os.listdir(dicom_home)

#gets all dicom files. if editing this code, get filelist into the format of a list of strings,
#with each string as the file path to a different dicom file.
file_path = get_path(depth)
@@ -279,7 +292,7 @@ def fix_mismatch(with_VRs=['PN', 'DS', 'IS']):
data=pd.DataFrame(headerlist)

#%%export csv file of final dataframe
export_csv = data.to_csv (csvDestination, index = None, header=True)
export_csv = data.to_csv (csv_destination, index = None, header=True)

fields=df.keys()
count = 0 #potential pain point
@@ -18,7 +18,8 @@
import hashlib
from shutil import copyfile
import logging
import pickle #this is a temporary addition
import json
import pickle
from multiprocessing import Pool
from pydicom import config
from pydicom import datadict
@@ -27,23 +28,52 @@
#things needed for the slurm task array
task_id = int(os.environ['SLURM_ARRAY_TASK_ID'] )
num_task = int(os.environ['SLURM_ARRAY_TASK_COUNT'])
print("I am Task: " + str(task_id))
print(" There are this many others" + str(num_task))
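Each array task learns its own index and the array size from these two environment variables, then takes one slice of the file list via np.array_split (shown later in this file); a toy example with hypothetical file names:

import numpy as np

filelist = ['a.dcm', 'b.dcm', 'c.dcm', 'd.dcm', 'e.dcm']  # hypothetical
num_task, task_id = 2, 0   # from SLURM_ARRAY_TASK_COUNT and SLURM_ARRAY_TASK_ID
file_split = np.array_split(filelist, num_task)
print(list(file_split[task_id]))  # task 0 gets ['a.dcm', 'b.dcm', 'c.dcm']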
#%%CHANGE THESE FOR YOUR USE
print_images=True #do you want to print the images from these dicom files?
print_only_common_headers=False #do you want the resulting dataframe csv to contain only the common headers? See section 'find common fields'
root = '/labs/banerjeelab/ramon_chxcl/JACR_Jan_April_2020_full/' #the root directory for yor project
dicomHome = os.path.join(root,'JACR_Jan_April_2020/') #the folder containing your dicom files
png_destination = os.path.join(root ,'extracted-images/') #where you want the extracted images to print
csvDestination = root + 'metadata_'+str(task_id)+'.csv' #where you want the dataframe csv to print
mappings= root + 'mapping_'+str(task_id)+'.csv'
failed = root +'failed-dicom/'

LOG_FILENAME = root + 'ImageExtractor_'+str(task_id)+'.out'

with open('config.json', 'r') as f:
niffler = json.load(f)

#Get variables for the image extractor from config.json.
print_images = niffler['PrintImages']
print_only_common_headers = niffler['CommonHeadersOnly']
dicom_home = niffler['DICOMHome'] #the folder containing your dicom files
output_directory = niffler['OutputDirectory']
depth = niffler['Depth']
half_mode = niffler['UseHalfOfTheProcessorsOnly'] #use only half of the available processors.

png_destination = output_directory + '/extracted-images/'
failed = output_directory +'/failed-dicom/'

csv_destination = output_directory + '/metadata_'+str(task_id)+'.csv'
mappings= output_directory + '/mapping_'+str(task_id)+'.csv'
LOG_FILENAME = output_directory + '/ImageExtractor_'+str(task_id)+'.out'
pickle_file = output_directory +'/ImageExtractor.pickle'

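Because every path a task writes is suffixed with its task id, concurrent tasks never contend on one file; only the pickled file list is shared. For a hypothetical 2-task array the output directory would look like:

# output_directory/
#   metadata_0.csv    mapping_0.csv    ImageExtractor_0.out
#   metadata_1.csv    mapping_1.csv    ImageExtractor_1.out
#   ImageExtractor.pickle   # shared cache of the globbed file list
#   extracted-images/       # PNGs from all tasks
#   failed-dicom/           # quarantine subfolders 1-4, shared by all tasks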

if not os.path.exists(output_directory):
os.makedirs(output_directory)

logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
depth = 4
#%%Function for getting tuple for field,val pairs for this file
#plan is instance of dicom class, the data for single mammo file

if not os.path.exists(png_destination):
os.makedirs(png_destination)

if not os.path.exists(failed):
os.makedirs(failed)

if not os.path.exists(failed + "/1"):
os.makedirs(failed + "/1")

if not os.path.exists(failed + "/2"):
os.makedirs(failed + "/2")

if not os.path.exists(failed + "/3"):
os.makedirs(failed + "/3")

if not os.path.exists(failed + "/4"):
os.makedirs(failed + "/4")


#%%Function for getting tuple for field,val pairs
def get_tuples(plan, outlist = None, key = ""):
if len(key)>0:
key = key + "_"
@@ -53,8 +83,7 @@ def get_tuples(plan, outlist = None, key = "")
try:
hasattr(plan,aa)
except TypeError as e:
print(aa)
print(plan)
logging.warning('Type Error encountered')
if (hasattr(plan, aa) and aa!='PixelData'):
value = getattr(plan, aa)
if type(value) is dicom.sequence.Sequence:
@@ -72,6 +101,8 @@ def get_tuples(plan, outlist = None, key = "")
value = str(value)
outlist.append((key + aa, value)) #appends name, value pair for this file. these are later concatenated to the dataframe
return outlist


def extract_headers(f_list_elem):
nn,ff = f_list_elem # unpack enumerated list
plan = dicom.dcmread(ff, force=True) #reads in dicom file
@@ -91,6 +122,13 @@ def extract_headers(f_list_elem):
kv.append(('category','no image')) #adds my custom category field, makes note as imageless
return dict(kv)

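A hedged usage sketch for extract_headers, which the worker pool feeds (index, path) tuples from enumerate(filelist); the path is hypothetical:

hdr = extract_headers((0, '/data/dicom/example.dcm'))  # hypothetical file
print(hdr.get('category'))  # 'no image' is recorded when the file has no PixelData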

#%%Function to extract pixel array information
#takes an integer used to index into the global filedata dataframe
#returns tuple of
# filemapping: dicom to png paths (as str)
# fail_path: dicom to failed folder (as tuple)
# found_err: error code produced when processing
def extract_images(i):
ds = dicom.dcmread(filedata.iloc[i].loc['file'], force=True) #read file in
found_err=None
@@ -125,20 +163,21 @@ def extract_images(i):
w.write(png_file, image_2d_scaled)

filemapping = filedata.iloc[i].loc['file'] + ', ' + pngfile + '\n'
#fm.write(filemapping)
except AttributeError as error:
found_err = error
fail_path = filedata.iloc[i].loc['file'], failed + '1/' + os.path.split(filedata.iloc[i].loc['file'])[1][:-4]+'.dcm'
#copyfile(filedata.iloc[i].loc['file'], failed + '1/' + os.path.split(filedata.iloc[i].loc['file'])[1][:-4]+'.dcm')
except ValueError as error:
found_err = error
fail_path = filedata.iloc[i].loc['file'], failed + '2/' + os.path.split(filedata.iloc[i].loc['file'])[1][:-4]+'.dcm'
#copyfile(filedata.iloc[i].loc['file'], failed + '2/' + os.path.split(filedata.iloc[i].loc['file'])[1][:-4]+'.dcm')
except BaseException as error : #ramonNote added base exception catch. so i can also catch this one
except BaseException as error:
found_err = error
fail_path = filedata.iloc[i].loc['file'], failed + '3/' + os.path.split(filedata.iloc[i].loc['file'])[1][:-4]+'.dcm'
#copyfile(filedata.iloc[i].loc['file'], failed + '3/' + os.path.split(filedata.iloc[i].loc['file'])[1][:-4]+'.dcm')
except:
found_err = sys.exc_info()[1] #a bare except cannot bind a name; recover the exception explicitly (assumes sys is imported at the top of this file)
fail_path = filedata.iloc[i].loc['file'], failed + '4/' + os.path.split(filedata.iloc[i].loc['file'])[1][:-4]+'.dcm'
return (filemapping,fail_path,found_err)

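The pixel branch above rescales each slice to 8 bits and hands it to pypng; a minimal standalone sketch of that conversion, using synthetic data instead of a real DICOM, with the rescaling formula an assumption since this hunk elides it:

import numpy as np
import png  # pypng

image_2d = np.random.randint(0, 4096, (64, 64)).astype(float)  # synthetic 12-bit slice
image_2d_scaled = (np.maximum(image_2d, 0) / image_2d.max()) * 255.0  # assumed rescale
image_2d_scaled = np.uint8(image_2d_scaled)
shape = image_2d_scaled.shape
with open('example.png', 'wb') as png_file:  # hypothetical output path
    w = png.Writer(shape[1], shape[0], greyscale=True)
    w.write(png_file, image_2d_scaled)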

def fix_mismatch_callback(raw_elem, **kwargs):
try:
values.convert_value(raw_elem.VR, raw_elem)
@@ -154,6 +193,17 @@ def fix_mismatch_callback(raw_elem, **kwargs):
return raw_elem


def get_path(depth):
directory = dicom_home + '/'

i = 0
while i < depth:
directory += "*/"
i += 1

return directory + "*.dcm"

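get_path replaces the previously hardcoded '*/*/*/*.dcm' glob with a configurable depth; for example (dicom_home hypothetical):

# With dicom_home = '/data/dicom' and depth = 3:
#   get_path(3) -> '/data/dicom/*/*/*/*.dcm'
# i.e. glob matches .dcm files exactly three directory levels below dicom_home.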

def fix_mismatch(with_VRs=['PN', 'DS', 'IS']):
"""A callback function to check that RawDataElements are translatable
with their provided VRs. If not, re-attempt translation using
@@ -172,39 +222,38 @@ def fix_mismatch(with_VRs=['PN', 'DS', 'IS']):
config.data_element_callback_kwargs = {
'with_VRs': with_VRs,
}

fix_mismatch()

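fix_mismatch wires fix_mismatch_callback into pydicom's config hooks, so subsequent reads retry value conversion with the alternate VRs; a hedged usage sketch (file path hypothetical):

from pydicom import dcmread

fix_mismatch(with_VRs=['PN', 'DS', 'IS'])  # installs the callback defined above
ds = dcmread('/data/dicom/example.dcm', force=True)  # hypothetical file; mismatched VRs are now retried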
#%% get set up to create dataframe
dirs = os.listdir( root )
dirs = os.listdir(dicom_home)

file_path = get_path(depth)

#gets all dicom files. if editing this code, get filelist into the format of a list of strings,
#with each string as the file path to a different dicom file.
if os.path.isfile(root+'ramen.pickle'):
f= open(root +'ramen.pickle', 'rb')
if os.path.isfile(pickle_file):
f= open(pickle_file, 'rb')
filelist = pickle.load(f)
else:
filelist=glob.glob(dicomHome + '*/*/*/*.dcm', recursive=True) #this searches the folders at the depth we request and finds all dicoms
pickle.dump(filelist,open(root+'ramen.pickle','wb'))
filelist=glob.glob(file_path, recursive=True) #this searches the folders at the depth we request and finds all dicoms
pickle.dump(filelist,open(pickle_file,'wb'))

logging.info('Number of dicom files: ' + str(len(filelist)))
print('Original File list is:' + str(len(filelist)))
file_split = np.array_split(filelist,num_task)
filelist = file_split[task_id]
print('Task will have a nominal File list of' + str(len(filelist)))
ff = filelist[0] #load first file as a template to look at all attributes
plan = dicom.dcmread(ff, force=True)
logging.debug('Loaded the first file successfully')
#print(type(plan)) #is recorded as pydicom class, has attributes numerated in keys
#print(plan.dir()) #lists class attributes
keys = [(aa) for aa in plan.dir() if (hasattr(plan, aa) and aa!='PixelData')]
#print(keys) keys are attributes in this instance of the dicom class from the source file

#%%checks for images in fields and logs where they are
for field in plan.dir():
if (hasattr(plan, field) and field!='PixelData'):
entry = getattr(plan, field)
#print(field) #prints header
#print(str(entry)) #prints associated value
if type(entry) is bytes:
print(field)
print(str(entry))
logging.debug(field)
logging.debug(str(entry))


#set([ type(getattr(plan, field)) for field in plan.dir() if (hasattr(plan, field) and field!='PixelData')])
@@ -215,7 +264,6 @@ def fix_mismatch(with_VRs=['PN', 'DS', 'IS']):
#%%step through whole file list, read in file, append fields to future dataframe of all files
headerlist = []
#start up a multi processing pool
print('start with getting headers')
p = Pool(15)
stamp = time.time()
res= p.imap_unordered(extract_headers,enumerate(filelist))
@@ -226,8 +274,7 @@ def fix_mismatch(with_VRs=['PN', 'DS', 'IS']):
headerlist.append(e)
p.close()
p.join()
print('done with getting headers')
#RAMON NOTE: im assuming that the context manager handles closing for me
#Assuming that the context manager handles closing for me
#make dataframe containing all fields and all files
df = pd.DataFrame(headerlist)

@@ -240,7 +287,6 @@ def fix_mismatch(with_VRs=['PN', 'DS', 'IS']):
common_fields = set(np.asarray(df.columns)[mask_common_fields]) #define the common fields as those with more than 90% filled
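A toy illustration of the common-fields filter above; the exact mask computation is elided from this hunk, so the below-10%-null criterion here is an assumption matching the comment's "more than 90% filled":

import numpy as np
import pandas as pd

df = pd.DataFrame({'PatientID': ['a', 'b', 'c'], 'RareTag': [None, None, 'x']})
mask_common_fields = df.isnull().mean() < 0.1          # assumed criterion
common_fields = set(np.asarray(df.columns)[mask_common_fields])
print(common_fields)  # {'PatientID'}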


#print(headerlist) #list of all field,value arguments for all data
for nn,kv in enumerate(headerlist):
#print(kv) #all field,value tuples for this one in headerlist
for kk in list(kv.keys()):
@@ -254,7 +300,7 @@ def fix_mismatch(with_VRs=['PN', 'DS', 'IS']):
data=pd.DataFrame(headerlist)

#%%export csv file of final dataframe
export_csv = data.to_csv(csvDestination, index = None, header=True)
export_csv = data.to_csv(csv_destination, index = None, header=True)

fields=df.keys()

@@ -264,7 +310,6 @@ def fix_mismatch(with_VRs=['PN', 'DS', 'IS']):

#todo: in consumer loop add segment that checks if an error has occurred and updates error count
if print_images:
print("Start processing Images")
filedata=data
count =0
other =0
@@ -285,10 +330,8 @@ def fix_mismatch(with_VRs=['PN', 'DS', 'IS']):
logging.error( err_msg)
else:
fm.write(fmap)
print("Done working closing pool")
p.close()
p.join()
print("Pool Close we done boys")
fm.close()
#insert multiprocessing call here
#for i in range(len(filedata)):