From ed868e911cd032418032050c9edbd20b2161c542 Mon Sep 17 00:00:00 2001 From: vsoch Date: Thu, 8 Jun 2017 01:07:57 -0400 Subject: [PATCH] modified: README.md --- README.md | 7 +- docs/README.md | 2 +- docs/deidentify.md | 232 ++++++++++++++++++++++++++++++++++++++ sendit/apps/main/tasks.py | 39 ++++--- 4 files changed, 259 insertions(+), 21 deletions(-) create mode 100644 docs/deidentify.md diff --git a/README.md b/README.md index 81c6c40..39c97ea 100644 --- a/README.md +++ b/README.md @@ -164,5 +164,8 @@ These can be tweaked as needed, and likely I will do this as I develop the appli - When there is error reading a dicom file (I am using "force" so this will read most dicom files, but will have a KeyError for a non dicom file) I am logging the error and moving on, without deleting the file. Is this the approach we want to take? - I am assuming that the fields that are in the list given by Susan to de-identify, those are ones we want to save to DASHER as custom metadata (that aren't the obvious entity id, etc)? - Should the id_source be the "human friendly" name, or the entire SOPInstanceUID? - - +- The request to the identifiers (uid) endpoint has an entity, and then a list of items. The entity maps nicely to whatever individual is relevant for a series of images, but it isn't clear to me how I know what the id_source is for the data. I could either assume all are from Stanford and call it "Stanford MRN", or I could use the source of the images, which would be incorrect because it's from a machine. + - What should we do if the dicom image doesn't have a PatientID (and thus we have no way to identify the patient?) Right now I'm skipping the image. +- For each item in a request, there is an `id_source` and the example is `GE PACS`. However, it's not clear if this is something that should come from the dicom data (or set by default by us, since we are pulling all from the same PACS) or if it should be one of the following (in the dicom header). 
Right now I am using `SOPInstanceUID`, but that isn't super human friendly. +- For the fields that we don't need to remove from the dicom images (eg, about the image data) I think it wouldn't be useful to have as `custom_fields`, so I am suggesting (and currently implementing) that we don't send it to dasher with each item. We can send these fields to datastore to be easily searched, if that functionality is wanted. +- I originally had the PatientID being used as the identifier's main id, but I think this should actually be AccessionNumber (and the PatientID represented in the custom_fields, because we don't even always have a patient, but we will have an accession number!) Right now I am using accession number, and I can change this if necessary. diff --git a/docs/README.md b/docs/README.md index 0f846f0..0a4a1c1 100644 --- a/docs/README.md +++ b/docs/README.md @@ -21,4 +21,4 @@ Reasonable updates would be: ## Steps in Pipeline 1. [Dicom Import](dicom_import.md): The logic for when a session directory is detected as finished by the Watcher. - 2. [Deidentify](): the defaults (and configuration) for the de-identification step of the pipeline (under development) + 2. [Deidentify](deidentify.md): the defaults (and configuration) for the de-identification step of the pipeline (under development) diff --git a/docs/deidentify.md b/docs/deidentify.md new file mode 100644 index 0000000..6fce248 --- /dev/null +++ b/docs/deidentify.md @@ -0,0 +1,232 @@ +# De-id Request + +Deidentification is a celery task defined in [main/tasks.py](../sendit/apps/main/tasks.py) that is triggered by the first task that adds a complete set of dicom files (belonging to a dataset with one accession number) to the database. In this task, we do the following: + +## 1. 
Datastructure Generated +The post to DASHER will look something like the following: + +```javascript +{ + "identifiers": [ + [ + { + "id": 12345678, + "id_source": "Stanford MRN", + "id_timestamp": {}, + "custom_fields": [ + { + "key": "studySiteID", + "value": 78329 + } + ], + "items": [ + { + "id": "A654321", + "id_source": "GE PACS", + "id_timestamp": {}, + "custom_fields": [ + { + "key": "studySiteID", + "value": 78329 + } + ] + } + ] + } + ] + ] +} + +``` + +A list of identifiers is given, and we can think of each thing in the list being an Entity, or corresponding to one Patient/Session. Each entry in this list has a set of id_* fields, and a list of items associated. This matches our dicom import model, as the identifiers will be associated with one Accession Number, and the items will be the corresponding dicom files for the series. + +**Important** A dicom file that doesn't have an Entity (`AccessionNumber`) OR `InstanceNumber` Item id will be skipped, as these fields are required. + +While it is assumed that one folder of files, corresponding to one accession number, will truly have that be the case, given that the headers present different information (eg, different series/study) we will post a call to the API for each separate Entity represented in the dataset. + +### Identifiers +We can only get so much information about an individual from a dicom image, so most of these will be default, or empty. `id`: will correspond to the `PatientID`. The `id_source`, since it is not provided in the data, will always (for now) default to `Stanford MRN`. The `id_timestamp` will be blank, because it's not clear to me how we could derive when the id was generated. 
Fields that are specific to the patient will be put into `custom_fields` for the patient, so it might look something like the following: + +``` + "id": 12345678, + "id_source": "Stanford MRN", + "id_timestamp": {}, + "custom_fields": [ + { + "key": "OtherPatientIDs","value": "value" + }, + { + "key": "OtherPatientNames","value": "value" + }, + { + "key": "OtherPatientIDsSequence","value": "value" + }, + { + "key": "PatientAddress", "value": "value" + }, + { + "key": "PatientBirthDate","value": "value" + }, + { + "key": "PatientBirthName","value": "value" + }, + { + "key": "PatientMotherBirthName","value": "value" + }, + { + "key": "PatientName","value": "value" + }, + { + "key": "PatientTelephoneNumbers","value": "value" + } + +``` + +## Items +A list of items is associated with each Entity (the example above). The id for the item will correspond to the InstanceNumber, and the `id_source` will correspond to the `InstanceCreatorUID`. The timestamp must be derived from `InstanceCreationDate` and `InstanceCreationTime`. + +``` + "items": [ + { + "id": "A654321", + "id_source": "GE PACS", + "id_timestamp": {}, + "custom_fields": [ + { + "key": "studySiteID", + "value": 78329 + } + ] + } +``` + +## 2. Mapping of Identifiers +We will be removing all PHI from the datasets before moving into the cloud, as specified per HIPAA. 
This means we will remove the following HIPAA identifiers: + +- Name +- Geographic information smaller than state +- Dates more precise than year, and all ages greater than or equal to 90 years of age +- Telephone numbers +- Fax numbers +- Email addresses +- Social security numbers +- Medical record numbers +- Account numbers +- Certificate or license number +- Vehicle identifiers and serial numbers including license plate +- Device identifiers and serial numbers +- URLs +- IP address numbers +- Biometric identifiers +- Full face photographic images and comparable images +- Health plan beneficiary numbers +- Any other unique identifying number, characteristic, or code + + +To be explicitly clear, here is a set of tables to describe **1** the dicom identifier, **2** if relevant, how it is mapped to a field for the DASHER API, **3** if the data is removed (meaning left as an empty string) before going into the cloud, meaning that it is considered in the HIPAA list above. Not all dicoms have all of these fields, and if the field is not found, no action is taken. + +### PHI Identifiers +For each of the below, a field under `DASHER` is assumed to be given with an Entity, one of which makes up a list of identifiers, for a `POST`. Removed does not mean that the field is deleted, but that it is made empty. If replacement is defined, the field from the `DASHER` response is substituted instead of ''. For most of the below, we give the PHI data as a `custom_field` (to be stored with `DASHER`) and put an empty string in its spot for the data uploaded to Storage. + + +| Dicom Header Field | DASHER | Removed? 
| Replacement | +| -----------------------|:-------------:| ------------:| ----------------------: +| AccessionNumber |`custom_fields`| Yes | `` | +| ContentDate |`custom_fields`| Yes | `` | +| ImageComments |`custom_fields`| Yes | `` | +| InstanceCreationDate |`custom_fields`| Yes | `jittered_timestamp` | +| InstanceCreationTime |`custom_fields`| Yes | `` | +| InstanceCreatorUID |`custom_fields`| Yes | `` | +| MedicalRecordLocator |`custom_fields`| Yes | `` | +| OtherPatientIDs |`custom_fields`| Yes | `` | +| OtherPatientNames |`custom_fields`| Yes | `` | +| OtherPatientIDsSequence|`custom_fields`| Yes | `` | +| PatientAddress |`custom_fields`| Yes | `` | +| PatientBirthDate |`custom_fields`| Yes | `` | +| PatientBirthName |`custom_fields`| Yes | `` | +| PatientID | `id` (Entity) | Yes | `suid` | +| PatientMotherBirthName |`custom_fields`| Yes | `` | +| PatientName |`custom_fields`| Yes | `` | +| PatientTelephoneNumbers|`custom_fields`| Yes | `` | +| ReferringPhysicianName |`custom_fields`| Yes | `` | +| SeriesDate |`custom_fields`| Yes | `` | +| SeriesInstanceUID |`custom_fields`| Yes | `` | +| SeriesNumber |`custom_fields`| Yes | `` | +| SOPClassUID |`custom_fields`| Yes | `` | +| SOPInstanceUID |`custom_fields`| Yes | `` | +| SpecimenAccessionNumber|`custom_fields`| Yes | `` | +| StudyDate |`custom_fields`| Yes | `` | +| StudyID |`custom_fields`| Yes | `` | +| StudyInstanceUID |`custom_fields`| Yes | `` | +| StudyTime |`custom_fields`| Yes | `` | + + +The following fields are not considered PHI. For example, the InstanceNumber is not enough to uniquely identify an image - it could be the number '1', and this information is essential for researchers to have to reconstruct sequences. Thus, we don't need to remove / replace it, and we don't need to provide it in `custom_fields` for `DASHER`. We will, however, send it as metadata about the images to be searchable in Google Datastore. 
+ + +| Dicom Header Field | +| ------------------------------------| +| BitsAllocated | +| BitsStored | +| Columns | +| ConversionType | +| DataSetTrailingPadding | +| DateOfSecondaryCapture | +| HighBit | +| InstanceNumber | +| Manufacturer | +| Modality | +| NumberOfFrames | +| PatientOrientation | +| PatientSex | +| PhotometricInterpretation | +| PixelData | +| PixelRepresentation | +| Rows | +| SamplesPerPixel | +| SecondaryCaptureDeviceManufacturer | +| TimezoneOffsetFromUTC | + + + +# De-id Response +The response might look like the following: + +``` + +{ + "results": [ + [ + { + "id": 12345678, + "id_source": "Stanford MRN", + "suid": "103e", + "jittered_timestamp": {}, + "custom_fields": [ + { + "key": "studySiteID", + "value": 78329 + } + ], + "items": [ + { + "id": "A654321", + "id_source": "GE PACS", + "suid": "103e", + "jittered_timestamp": {}, + "custom_fields": [ + { + "key": "studySiteID", + "value": 78329 + } + ] + } + ] + } + ] + ] +} +``` + +**MORE TO COME** not done yet :) diff --git a/sendit/apps/main/tasks.py b/sendit/apps/main/tasks.py index 5f1d8a5..2743ca0 100644 --- a/sendit/apps/main/tasks.py +++ b/sendit/apps/main/tasks.py @@ -37,6 +37,7 @@ ) from sendit.apps.main.utils import save_image_dicom +from som.api.identifiers.dicom import get_identifiers from sendit.settings import ( DEIDENTIFY_RESTFUL, @@ -56,8 +57,6 @@ ) import os -from pydicom.filereader import read_dicomdir -from pydicom.dataset import Dataset from pydicom import read_file from pydicom.errors import InvalidDicomError @@ -145,28 +144,32 @@ def get_identifiers(dicom_ids): dcm = Image.objects.get(id=dcm_id) except Image.DoesNotExist: bot.warning("Cannot find image with id %s" %dcm_id) - dicom = read_file(dcm.image.path,force=True) + + # Returns dictionary with {"id": {"identifiers"...}} + ids = get_identifiers(dicom_file=dcm.image.path) + + for uid,identifiers in ids.items(): + + # STOPPED HERE - I'm not sure why we need to keep + # study given that we represent things as 
batches of dicom + # It might be more suitable to model as a Batch, + # where a batch is a grouping of dicoms (that might actually + # be more than one series. Then we would store as Batch, + # and use the batch ID to pass around and get the images. + # Stopping here for tonight. + # Will need to test this out: + replacements = SeriesIdentifiers.objects.create(series=) - entity_id = dicom.get("PatientID", None) - id_source = dicom.get("") - ##CURRENTLY WTITING THIS - - study,created = Study.objects.get_or_create(uid=dcm.StudyID) - series,created = Series.objects.get_or_create(uid=dcm.SeriesInstanceUID, - study=study) - - #TODO here: put into data structures to send of to deidentify endpoint + + replace_identifiers.apply_async(kwargs={"dicom_ids":dicom_ids}) - bot.debug("Getting identifiers for %s" %(series)) - bot.warning('Vanessa write me!!') - # Send off task here to replace identifiers, which will send to storage else: - bot.debug("Vanessa write me!") - # Otherwise, just fire off function to send to storage as is. + bot.debug("Restful de-identification skipped [DEIDENTIFY_RESTFUL is False]") + upload_storage.apply_async(kwargs={"dicom_ids":dicom_ids}) @shared_task -def replace_identifiers(sid): +def replace_identifiers(dicom_ids): '''replace identifiers is called from get_identifiers, given that the user has asked to deidentify_restful. This function will do the replacement, and then trigger the function to send to storage