From 61e82e267b4833797f669c044078bf2b4da10041 Mon Sep 17 00:00:00 2001 From: vsoch Date: Wed, 7 Jun 2017 21:00:38 -0400 Subject: [PATCH] modified: README.md modified: docs/dicom_import.md modified: sendit/apps/main/tasks.py modified: sendit/settings/main.py --- README.md | 18 ++--------- docs/dicom_import.md | 23 +++++++++++--- sendit/apps/main/tasks.py | 66 ++++++++++++++++++++++++++++++--------- sendit/settings/main.py | 4 +-- 4 files changed, 75 insertions(+), 36 deletions(-) diff --git a/README.md b/README.md index b7a3c2c..81c6c40 100644 --- a/README.md +++ b/README.md @@ -160,21 +160,9 @@ These can be tweaked as needed, and likely I will do this as I develop the appli # Questions -For the folder that is placed in `/data`, I am making the assumption that it will have the traditional DICOM storage format, meaning it looks something like this: -``` -("Patient -> Study -> Series -> Images hierarchy") -``` - -(The REDADME +- When there is an error reading a dicom file (I am using "force" so this will read most dicom files, but will have a KeyError for a non dicom file) I am logging the error and moving on, without deleting the file. Is this the approach we want to take? +- I am assuming that the fields in the list given by Susan to de-identify are the ones we want to save to DASHER as custom metadata (that aren't the obvious entity id, etc)? +- Should the id_source be the "human friendly" name, or the entire SOPInstanceUID? -In a dicom directory... - - can we expect a dicom file to ever have an extension other than `.dcm`? Always lowercase? I am currently expecting lowercase `.dcm` as my criteria for including a file. - - will the files always be on one level (i.e., no subdirectories), eg series/dicom1...N. I am using os.listdir which will only look one level into the folder, but this could be changed. - - can there be more than one series in a folder ever? 
I am currently checking each file - - How should I expect the images to be moved into the final folder? The standard dicom structure is like: - - -``` -So I think in the above, the top level folder would be the patient, the IMAGES would be the study, and REF would be the series? diff --git a/docs/dicom_import.md b/docs/dicom_import.md index bdcbe6e..7e0b82f 100644 --- a/docs/dicom_import.md +++ b/docs/dicom_import.md @@ -20,8 +20,21 @@ In the above, we see that `dcmqr` is used to call `C-MOVE` to dump a bunch of di # Dicom Import When the [watcher](watcher.md) detects a `FINISHED` session directory in the folder being watched (`/data` in the container, mapping to `data` in the application base folder on the host), the process of importing the images into the database is started. This means the following steps: - - although the name of the folder for the series must by default be unique compared to others around it, the series id itself is extracted from the dicom files. Thus, for the actual metadata, the folder name is irrelevant - - all files in the folder are assumed to be dicom, as it is the case the extensions may vary. If a file is attempted to be read as dicom fails, a warning is issued and the file skipped, but the process continued. - - each dicom file is read, and during reading, added as an `Image` object to the database. The study and session are also extracted from the header, and these are added as `Study` and `Session` objects, respectively. - - adding a file to the database means replicating it in the database (media) storage. This isn't completely necessary, but it allows for deletion of the folder in `/data` so a human observer can more easily see processing occurring on the filesystem. - - all the images found in a folder are considered to be a "batch," and when all files for a batch have been added, the function fires off the list to be deidentified. +## 1. 
Adding Models to Database +Each dicom file is read, and during reading, added as an `Image` object to the database. The study and series are also extracted from the header, and these are added as `Study` and `Series` objects, respectively. The Series and Study Ids are extracted from these fields in the dicom header, for each file separately: + +``` +StudyID +SeriesInstanceUID +``` + +If we run into some case where the fields are not defined, I have put a check that will use the folder name instead, prefixed with `series_` or `study_`. For example, a folder `ST-1234` with a dicom missing header information would have series `series_ST-1234` and study `study_ST-1234`. + + +## 2. Saving Dicoms +All files in the folder are assumed to be dicom, as it is the case the extensions may vary. If an attempt to read a file as dicom fails, a warning is issued and the file is skipped, but the process continues. The file is not removed, in case inspection is warranted later (is this how we want it?) (some notification?) + +The dicom file itself, when saved to the model, is saved with the application's media at `/images`. Once the file is saved here, it is deleted from its temporary folder in `/data`. This is to maximize space on the server, in case that is needed. + +## 3. Finishing Batch +All the images found in a folder are considered to be a "batch," and when all files for a batch have been added, the function fires off the list to be deidentified. If there were no files in the batch, the function is not fired. 
diff --git a/sendit/apps/main/tasks.py b/sendit/apps/main/tasks.py index ceefe06..5f1d8a5 100644 --- a/sendit/apps/main/tasks.py +++ b/sendit/apps/main/tasks.py @@ -36,6 +36,8 @@ Image ) +from sendit.apps.main.utils import save_image_dicom + from sendit.settings import ( DEIDENTIFY_RESTFUL, SEND_TO_ORTHANC, @@ -57,6 +59,7 @@ from pydicom.filereader import read_dicomdir from pydicom.dataset import Dataset from pydicom import read_file +from pydicom.errors import InvalidDicomError os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'sendit.settings') app = Celery('sendit') @@ -75,21 +78,51 @@ def import_dicomdir(dicom_dir): # Add in each dicom file to the series for dcm_file in dicom_files: - dcm = read_file(dcm_file) - study,created = Study.objects.get_or_create(uid=dcm.StudyID) - series,created = Series.objects.get_or_create(uid=dcm.SeriesInstanceUID, - study=study) - # A dicom instance number must be unique for its series - # Since the field isn't consistent, we will use file name - dicom_uid = os.path.basename(os.path.splitext(dcm_file)[0]) - dicom = Image.objects.create(series=series, - uid=dicom_uid) - dicom.image = dcm_file - dicom.save() - dicom_ids.append(dicom.id) - + + try: + dcm = read_file(dcm_file,force=True) + dcm_folder = os.path.basename( os.path.dirname(dcm_file) ) + + # If a series or studyID is missing, use folder name + # which should be the accession number + uids = {"study":dcm.StudyID, + "series":dcm.SeriesInstanceUID} + for uid_key,uid in uids.items(): + if uid in [None,''] + uids[uid_key] = "%s_%s" %(uid_key, + dcm_folder) + + study,created = Study.objects.get_or_create(uid=uids['study']) + series,created = Series.objects.get_or_create(uid=uids['series'], + study=study) + + # A dicom instance number must be unique for its series + dicom_uid = os.path.basename(os.path.splitext(dcm_file)[0]) + + # Create the Image object in the database + dicom = Image.objects.create(series=series, + uid=dicom_uid) + + # Save the dicom file to storage + dicom = 
save_image_dicom(dicom=dicom, + dicom_file=dcm_file) # Also saves + os.remove(dcm_file) + dicom_ids.append(dicom.id) + + + # Note that on error we don't remove files + except InvalidDicomError: + bot.error("%s is an invalid dicom file, skipping." %(dcm_file)) + + except KeyError: + bot.error("%s is possibly an invalid dicom file, skipping." %(dcm_file)) + + # At the end, submit the dicoms to be deidentified as a batch - get_identifiers.apply_async(kwargs={"dicom_ids":dicom_ids}) + if len(dicom_ids) > 0: + bot.debug("Submitting task to get_identifiers for series %s with %s dicoms." %(series.uid, + len(dicom_ids))) + get_identifiers.apply_async(kwargs={"dicom_ids":dicom_ids}) else: bot.warning('Cannot find %s' %dicom_dir) @@ -105,13 +138,18 @@ def get_identifiers(dicom_ids): ''' if DEIDENTIFY_RESTFUL is True: + identifiers = dict() for dcm_id in dicom_ids: try: dcm = Image.objects.get(id=dcm_id) except Image.DoesNotExist: bot.warning("Cannot find image with id %s" %dcm_id) + dicom = read_file(dcm.image.path,force=True) + entity_id = dicom.get("PatientID", None) + id_source = dicom.get("") + ##CURRENTLY WTITING THIS study,created = Study.objects.get_or_create(uid=dcm.StudyID) series,created = Series.objects.get_or_create(uid=dcm.SeriesInstanceUID, diff --git a/sendit/settings/main.py b/sendit/settings/main.py index 897926e..446bbe3 100644 --- a/sendit/settings/main.py +++ b/sendit/settings/main.py @@ -151,8 +151,8 @@ # Static files (CSS, JavaScript, Images) # https://docs.djangoproject.com/en/1.9/howto/static-files/ -MEDIA_ROOT = '/var/www/data' -MEDIA_URL = '/data/' +MEDIA_ROOT = '/var/www/images' +MEDIA_URL = '/images/' STATIC_ROOT = '/var/www/static' STATIC_URL = '/static/'