diff --git a/deid/config/__init__.py b/deid/config/__init__.py index fe966545..745e9276 100644 --- a/deid/config/__init__.py +++ b/deid/config/__init__.py @@ -91,28 +91,40 @@ def _get_section(self, name): """ section = None if self.deid is not None: - if name in self.deid: - section = self.deid[name] + section = self.deid.get(name) return section + # Get Sections + def get_format(self): """return the format of the loaded deid, if one exists """ return self._get_section("format") + def _get_named_section(self, section_name, name=None): + """a helper function to return an entire section, or if a name is + provided, a named section under it. If the section is not + defined, we appropriately return None. + """ + section = self._get_section(section_name) + if name is not None and section is not None: + section = section.get(name, []) + return section + def get_filters(self, name=None): """return all filters for a deid recipe, or a set based on a name """ - filters = self._get_section("filter") - if name is not None and filters is not None: - filters = filters[name] - return filters + return self._get_named_section("filter", name) - def ls_filters(self): - """list names of filter groups + def get_values_lists(self, name=None): + """return a values list by name """ - filters = self._get_section("filter") - return list(filters.keys()) + return self._get_named_section("values", name) + + def get_fields_lists(self, name=None): + """return a fields list by name + """ + return self._get_named_section("fields", name) def get_actions(self, action=None, field=None): """get deid actions to perform on a header, or a subset based on a type @@ -137,6 +149,38 @@ def get_actions(self, action=None, field=None): return header + # Boolean properties + + def _has_list_content(self, name): + return len(self.deid.get(name, [])) > 0 + + def has_fields_lists(self): + return self._has_list_content("fields") + + def has_values_lists(self): + return self._has_list_content("values") + + def 
has_actions(self): + return self._has_list_content("header") + + # Listing + + def listof(self, section): + """return a list of keys for a section""" + listing = self._get_section(section) + return list(listing.keys()) + + def ls_filters(self): + return self.listof("filter") + + def ls_valuelists(self): + return self.listof("values") + + def ls_fieldlists(self): + return self.listof("fields") + + # Init + def _init_deid(self, deid=None, base=False, default_base="dicom"): """initalize the recipe with one or more deids, optionally including the default. This function is called at init time. If you need to add diff --git a/deid/config/standards.py b/deid/config/standards.py index d1c05f25..2f3e3076 100644 --- a/deid/config/standards.py +++ b/deid/config/standards.py @@ -31,9 +31,9 @@ # Supported Header Actions actions = ("ADD", "BLANK", "JITTER", "KEEP", "REPLACE", "REMOVE", "LABEL") -# Supported Group actions -fields_actions = ["FIELD"] -values_actions = ["FIELD", "SPLIT"] +# Supported Group actions (SPLIT only supported for values) +groups = ["values", "fields"] +group_actions = ("FIELD", "SPLIT") # Valid actions for a filter action filters = ( diff --git a/deid/config/utils.py b/deid/config/utils.py index 93b9d12d..2922c8c8 100644 --- a/deid/config/utils.py +++ b/deid/config/utils.py @@ -35,8 +35,8 @@ actions, sections, filters, - fields_actions, - values_actions, + groups, + group_actions, ) from collections import OrderedDict import os @@ -145,15 +145,9 @@ def load_deid(path=None): if line.startswith("#"): continue - # Starts with Format? - elif bool(re.match("format", line, re.I)): - fmt = re.sub("FORMAT|(\s+)", "", line).lower() - if fmt not in formats: - bot.exit("%s is not a valid format." % fmt) - - # Set format - config["format"] = fmt - bot.debug("FORMAT set to %s" % fmt) + # Set format + elif bool(re.match("^format", line, re.I)): + config["format"] = parse_format(line) # A new section? 
elif line.startswith("%"): @@ -174,24 +168,20 @@ def load_deid(path=None): config=config, section=section, section_name=section_name ) - # An action (replace, blank, remove, keep, jitter) + # A %fields action (only field allowed), %values allows split + elif line.upper().startswith(group_actions) and section in groups: + print("SECTION %s" % section) + print(config) + config = parse_group_action( + section=section, section_name=section_name, line=line, config=config + ) + + # An action (ADD, BLANK, JITTER, KEEP, REPLACE, REMOVE, LABEL) elif line.upper().startswith(actions): # Start of a filter group if line.upper().startswith("LABEL") and section == "filter": - members = [] - keep_going = True - while keep_going is True: - next_line = spec[0] - if next_line.upper().strip().startswith("LABEL"): - keep_going = False - elif next_line.upper().strip().startswith("%"): - keep_going = False - else: - new_member = spec.pop(0) - members.append(new_member) - if len(spec) == 0: - keep_going = False + members = parse_filter_group(spec) # Add the filter label to the config config = parse_label( @@ -244,6 +234,48 @@ def find_deid(path=None): return path +def parse_format(line): + """given a line that starts with FORMAT, parse the format of the + file and check that it is supported. If not, exit on error. If yes, + return the format. + + Parameters + ========== + line: the line that starts with format. + """ + fmt = re.sub("FORMAT|(\s+)", "", line).lower() + if fmt not in formats: + bot.exit("%s is not a valid format." % fmt) + bot.debug("FORMAT set to %s" % fmt) + return fmt + + +def parse_filter_group(spec): + """given the specification (a list of lines) continue parsing lines + until the filter group ends, as indicated by the start of a new LABEL, + (case 1), the start of a new section (case 2) or the end of the spec + file (case 3). Returns a list of members (lines) that belong to the + filter group. The list (by way of using pop) is updated in the calling + function. 
+ + Parameters + ========== + spec: unparsed lines of the deid recipe file + """ + members = [] + keep_going = True + while keep_going and spec: + next_line = spec[0] + if next_line.upper().strip().startswith("LABEL"): + keep_going = False + elif next_line.upper().strip().startswith("%"): + keep_going = False + else: + new_member = spec.pop(0) + members.append(new_member) + return members + + def parse_label(section, config, section_name, members, label=None): """parse label will add a (optionally named) label to the filter section, including one or more criteria @@ -295,7 +327,10 @@ def parse_label(section, config, section_name, members, label=None): def parse_member(members, operator=None): - + """a parsing function for a filter member. Will return a single member + with fields, values, and an operator. In the case of multiple and/or + statements that are chained, will instead return a list. + """ main_operator = operator actions = [] @@ -388,7 +423,7 @@ def add_section(config, section, section_name=None): if section is None: bot.exit("You must define a section (e.g. %header) before any action.") - if section == "filter" and section_name is None: + if section in ["filter", "values", "fields"] and section_name is None: bot.exit("You must provide a name for a filter section.") if section not in sections: @@ -421,6 +456,55 @@ def _remove_comments(parts): return value.split("#")[0] # remove comments +def parse_group_action(section, line, config, section_name): + """parse a group action, either FIELD or SPLIT, which must belong to + either a fields or values section. + + Parameters + ========= + section: a valid section name from the deid config file + line: the line content to parse for the section/action + config: the growing/current config dictionary + section_name: optionally, a section name + """ + if not line.upper().startswith(group_actions): + bot.exit("%s is not a valid group action." 
% line) + + if not line.upper().startswith("FIELD") and section == "fields": + bot.exit("%fields only supports FIELD actions.") + + # We may have to deal with cases of spaces + bot.debug("%s: adding %s" % (section, line)) + parts = line.split(" ") + action = parts.pop(0).replace(" ", "") + + # Both require some parts + if not parts: + bot.exit("%s action %s requires additional arguments" % (section, action)) + + # For both, the second is always a field or field expander + field = parts.pop(0) + + # Fields supports one or more fields with expanders (no third arguments) + if section == "fields": + config[section][section_name].append({"action": action, "field": field}) + + # Values supports FIELD or SPLIT + elif section == "values": + + # If we have a third set of arguments + if parts: + value = _remove_comments(parts) + print(value) + config[section][section_name].append( + {"action": action, "field": field, "value": value} + ) + else: + config[section][section_name].append({"action": action, "field": field}) + + return config + + def parse_config_action(section, line, config, section_name=None): """add action will take a line from a deid config file, a config (dictionary), and an active section name (eg header) and add an entry to the config file to perform @@ -434,7 +518,6 @@ def parse_config_action(section, line, config, section_name=None): section_name: optionally, a section name """ - if not line.upper().startswith(actions): bot.exit("%s is not a valid action line." 
% line) diff --git a/deid/dicom/__init__.py b/deid/dicom/__init__.py index 480186bb..44982861 100644 --- a/deid/dicom/__init__.py +++ b/deid/dicom/__init__.py @@ -6,7 +6,5 @@ ) from .utils import get_files - from .fields import extract_sequence - from .pixels import has_burned_pixels, DicomCleaner diff --git a/deid/dicom/actions.py b/deid/dicom/actions.py index ba1c9630..038b348a 100644 --- a/deid/dicom/actions.py +++ b/deid/dicom/actions.py @@ -25,7 +25,7 @@ from deid.logger import bot from deid.config.standards import actions as valid_actions -from .fields import expand_field_expression +from .fields import expand_field_expression, find_by_values from deid.utils import get_timestamp, parse_value @@ -51,7 +51,7 @@ def perform_action(dicom, action, item=None, fields=None, return_seen=False): "action" (eg, REPLACE) what to do with the field "value": if needed, the field from the response to replace with """ - field = action.get("field") # e.g: PatientID, endswith:ID + field = action.get("field") # e.g: PatientID, endswith:ID, values:name, fields:name value = action.get("value") # "suid" or "var:field" action = action.get("action") # "REPLACE" @@ -60,8 +60,30 @@ def perform_action(dicom, action, item=None, fields=None, return_seen=False): bot.warning("%s in not a valid choice. Defaulting to blanked." % action) action = "BLANK" - # If there is an expander applied to field, we iterate over - fields = expand_field_expression(field=field, dicom=dicom, contenders=fields) + # If values or fields is provided, ids is required + if re.search("^(values|fields)", field): + if not item: + bot.exit( + "An item lookup must be provided to reference a list of values or fields." 
+ ) + + # A values list returns fields with the value + if re.search("^values", field): + values = item.get(re.sub("^values:", "", field), []) + fields = find_by_values(values=values, dicom=dicom) + + # A fields list is used verbatim + elif re.search("^fields", field): + listing = [] + for contender in item.get(re.sub("^fields:", "", field), []): + listing += expand_field_expression( + field=contender, dicom=dicom, contenders=fields + ) + fields = listing + + else: + # If there is an expander applied to field, we iterate over + fields = expand_field_expression(field=field, dicom=dicom, contenders=fields) # Keep track of fields we have seen seen = [] diff --git a/deid/dicom/fields.py b/deid/dicom/fields.py index 16c2a44d..f0492a66 100644 --- a/deid/dicom/fields.py +++ b/deid/dicom/fields.py @@ -87,6 +87,22 @@ def extract_sequence(sequence, prefix=None): return items +def find_by_values(values, dicom): + """Given a list of values, find fields in the dicom that contain any + of those values, as determined by a regular expression search. + """ + fields = [] + contenders = get_fields(dicom) + + # Create single regular expression to search by + regexp = "(%s)" % "|".join(values) + for field, value in contenders.items(): + if re.search(regexp, value): + fields.append(field) + + return fields + + def expand_field_expression(field, dicom, contenders=None): """Get a list of fields based on an expression. If no expression found, return single field. 
Options for fields include: @@ -95,8 +111,7 @@ def expand_field_expression(field, dicom, contenders=None): startswith: filter to fields that start with the expression contains: filter to fields that contain the expression allfields: include all fields - exceptfields: filter to all fields except those listed ( | separated) - + exceptfields: filter to all fields except those listed ( | separated) """ # Expanders that don't have a : must be checked for expanders = ["all"] diff --git a/deid/dicom/groups.py b/deid/dicom/groups.py new file mode 100644 index 00000000..dbc31d32 --- /dev/null +++ b/deid/dicom/groups.py @@ -0,0 +1,108 @@ +""" + +groups: functions to derive groups of fields or values + +Copyright (c) 2020 Vanessa Sochat + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ +""" + + +from deid.logger import bot +from .tags import remove_sequences +from .fields import get_fields, expand_field_expression + +import os + + +def extract_values_list(dicom, actions): + """Given a list of actions for a named group (a list) extract values from + the dicom based on the list of actions provided. This function + always returns a list intended to update some lookup to be used + to further process the dicom. + """ + values = [] + fields = get_fields(dicom) + for action in actions: + + # Just grab the entire value string for a field, no parsing + if action["action"] == "FIELD": + subset = expand_field_expression( + field=action["field"], dicom=dicom, contenders=fields + ) + [values.append(dicom.get(field)) for field in subset] + + # Split action, can optionally have a "by" and/or minlength parameter + elif action["action"] == "SPLIT": + subset = expand_field_expression( + field=action["field"], dicom=dicom, contenders=fields + ) + + # Default values for split are length 1 and character empty space + bot.debug("Parsing action %s" % action) + split_by = " " + minlength = 1 + + if "value" in action: + for param in action["value"].split(";"): + param_name, param_val = param.split("=") + + # Set a custom parameter length + if param_name == "minlength": + minlength = int(param_val) + bot.debug("Minimum length set to %s" % minlength) + elif param_name == "by": + split_by = param_val.strip("'").strip('"') + bot.debug("Splitting value set to %s" % split_by) + + for field in subset: + new_values = dicom.get(field, "").split(split_by) + for new_value in new_values: + if len(new_value) > minlength: + values.append(new_value) + + else: + bot.warning( + "Unrecognized action %s for values list extraction." % action["action"] + ) + + return values + + +def extract_fields_list(dicom, actions): + """Given a list of actions for a named group (a list) extract fields from + the dicom based on the list of actions provided. 
This function + always returns a list intended to update some lookup to be used + to further process the dicom. + """ + subset = [] + fields = get_fields(dicom) + for action in actions: + + if action["action"] == "FIELD": + subset += expand_field_expression( + field=action["field"], dicom=dicom, contenders=fields + ) + + else: + bot.warning( + "Unrecognized action %s for fields list extraction." % action["action"] + ) + return subset diff --git a/deid/dicom/header.py b/deid/dicom/header.py index 922f5220..b55beadf 100644 --- a/deid/dicom/header.py +++ b/deid/dicom/header.py @@ -29,6 +29,7 @@ from deid.utils import read_json from .tags import remove_sequences +from .groups import extract_values_list, extract_fields_list from deid.dicom.tags import get_private from deid.config import DeidRecipe @@ -261,7 +262,22 @@ def replace_identifiers( dicom = remove_sequences(dicom) if recipe.deid is not None: + if dicom_file in ids: + + # Prepare additional lists of values and fields (updates item) + if deid.has_values_lists(): + for group, actions in deid.get_values_lists().items(): + ids[dicom_file][group] = extract_values_list( + dicom=dicom, actions=actions + ) + + if deid.has_fields_lists(): + for group, actions in deid.get_fields_lists().items(): + ids[dicom_file][group] = extract_fields_list( + dicom=dicom, actions=actions + ) + for action in deid.get_actions(): dicom = perform_action( dicom=dicom, item=ids[dicom_file], action=action diff --git a/deid/tests/test_config.py b/deid/tests/test_config.py index 71136f94..03dbf094 100644 --- a/deid/tests/test_config.py +++ b/deid/tests/test_config.py @@ -95,7 +95,7 @@ def test_standards(self): self.assertEqual(len(unknown), 0) print("Testing standards: allowed sections") - default_sections = ["header", "labels", "filter"] + default_sections = ["header", "labels", "filter", "fields", "values"] [self.assertTrue(x in sections) for x in default_sections] unknown = [x for x in sections if x not in default_sections] 
self.assertEqual(len(unknown), 0) diff --git a/deid/tests/test_utils.py b/deid/tests/test_utils.py index 3ce09842..2f41f3ba 100644 --- a/deid/tests/test_utils.py +++ b/deid/tests/test_utils.py @@ -92,7 +92,8 @@ def test_write_read_files(self): tmpfile = tempfile.mkstemp()[1] os.remove(tmpfile) write_json(good_json, tmpfile) - content = json.load(open(tmpfile, "r")) + with open(tmpfile, "r") as fd: + content = json.loads(fd.read()) self.assertTrue(isinstance(content, dict)) self.assertTrue("Wakkawakkawakka" in content) diff --git a/docs/_docs/examples/index.md b/docs/_docs/examples/index.md index 662d16a9..3dc8e9c8 100644 --- a/docs/_docs/examples/index.md +++ b/docs/_docs/examples/index.md @@ -19,7 +19,6 @@ want a quick start overview of deid. - [Tutorial]({{ site.baseurl }}/examples/recipe) - Recipes Files provided as [examples](https://github.com/pydicom/deid/tree/master/examples/deid) or [installed with deid](https://github.com/pydicom/deid/tree/master/deid/data). - ## Header Manipulation - [Replace with Function]({{ site.baseurl }}/examples/func-replace/) shows how to dynamically replace or update header values from a function. diff --git a/docs/_docs/examples/recipe.md b/docs/_docs/examples/recipe.md index df0ab5f1..892e45ff 100644 --- a/docs/_docs/examples/recipe.md +++ b/docs/_docs/examples/recipe.md @@ -9,10 +9,11 @@ and performing actions on headers (for replacement), are controlled by a text fi a deid recipe. If you want a reminder about how to write this text file, [read here]({{ site.baseurl }}/getting-started/dicom-config), and we hope to at some point have an interactive way as well (let us know your feedback!). -The basic gist of the file is that we have sections. In the `%header` -section we have a list of actions to take on header fields, and in each -`filter` section we have lists of criteria to check image headers against, -and given a match, we flag the image as belonging to the group. 
+The basic gist of the file is that we have different sections. + + - In the `%header` section we have a list of actions to take on header fields + - We can define groups, either field names `%fields` or values from fields `%values` to reference in header actions + - In the `%filter` section we have lists of criteria to check image headers against, and given a match, we flag the image as belonging to the group. In this small tutorial, we will walk through the basic steps of loading a recipe, interacting with it, and then using it to replace identifiers. If you want to @@ -20,6 +21,7 @@ jump in, then go straight to the [script](https://github.com/pydicom/deid/blob/m that describes this example. ## Recipe Management + The following sections will describe creating and combining recipes. ### Create a DeidRecipe @@ -72,7 +74,7 @@ recipe = DeidRecipe(deid=deid_file) I would strongly recommended starting with an example, and building your custom recipe from it. If you have an example that you think others would find useful, -please contribute it to the repository. +please contribute it to the repository in the examples folder. ### Combine Recipes @@ -117,10 +119,12 @@ that others might use, please [contribute it](https://github.com/pydicom/deid/bl ## Sections -Now let's discuss the sections that a recipe can include, including a header, labels, and filters. +Now let's discuss the sections that a recipe can include, including a header, labels, filters, and +groups for lists of values or fields. -## Recipe Filters +## Filters + The process of flagging images comes down to writing a set of filters to check if each image meets some criteria of interest. For example, I might create a filter called "xray" that is triggered when the Modality is CT or XR. @@ -141,6 +145,7 @@ recipe.get_filters('blacklist') ``` ## Header Actions + A header action is a step (e.g., replace, remove, blank) to be applied to a dicom image header. The headers are also part of the deid recipe. 
You don't need to necessarily use header actions and filters at the same time, but since @@ -185,6 +190,22 @@ recipe.get_actions(field='PatientID') # and logically, both recipe.get_actions(field='PatientID', action="REMOVE") # [{'action': 'REMOVE', 'field': 'PatientID'}] + +# If you have lists of fields or values defined, you can retrieve them too +recipe.get_fields_lists() +# OrderedDict([('instance_fields', +# [{'action': 'FIELD', 'field': 'contains:Instance'}])]) + +recipe.get_values_lists() +# OrderedDict([('cookie_names', +# [{'action': 'SPLIT', +# 'field': 'PatientID', +# 'value': 'by="^";minlength=4'}]), +# ('operator_names', +# [{'action': 'FIELD', 'field': 'startswith:Operator'}])]) + +recipe.get_values_lists("cookie_names") +# [{'action': 'SPLIT', 'field': 'PatientID', 'value': 'by="^";minlength=4'}] ``` If you have need for more advanced functions, please [file an issue](https://www.github.com/pydicom/deid/issues). @@ -309,3 +330,76 @@ cleaned_files = replace_identifiers(dicom_files=dicom_files, output_folder='/home/vanessa/Desktop', overwrite=True) ``` + +## Groups + +More advanced usage of header actions would be to define a group of values (the content of the +header fields) or field names (the names themselves) to use in an action. This corresponds +to `%fields` (a list of fields) and `%values` (a list of values from fields) to parse +at the onset of the dicom load, and use later in a recipe. Here is how that might look +in a recipe: + +``` +FORMAT dicom + +%values cookie_names +SPLIT PatientID by=" ";minlength=4 + +%values operator_names +FIELD startswith:Operator + +%fields instance_fields +FIELD contains:Instance + +%header + +ADD PatientIdentityRemoved Yes +REPLACE values:cookie_names var:id +REPLACE values:operator_names var:source_id +REMOVE fields:instance_fields +``` + +In the above, we define two lists of values (operator_names and cookie_names) +and a list of fields (instance_fields). 
The sections read as follows: + + - create a list of values called `cookie_names` that are from the PatientID field that is split by a space with a minimum length of 4 + - create a list of values called `operator_names` that includes any values from fields that start with "Operator" + - create a list of field names, `instance_fields` that includes any field that contains "Instance" + +And then in our `%header` section we take the following actions: + + - replace all fields that have any of the cookie names as a value with the variable defined by "id" + - replace all fields that have any of the operator_names as a value with the variable defined by source_id + - remove all fields defined in the list of instance_fields + +Let's give this a try with an example. We'll load a recipe, and then look +at the loaded deid (recipe.deid). + +```python +from deid.config import DeidRecipe +recipe = DeidRecipe("examples/deid/deid.dicom-groups") +recipe.deid +OrderedDict([('format', 'dicom'), + ('values', + OrderedDict([('cookie_names', + [{'action': 'SPLIT', + 'field': 'PatientID', + 'value': 'by="^";minlength=4'}]), + ('operator_names', + [{'action': 'FIELD', + 'field': 'startswith:Operator'}])])), + ('fields', + OrderedDict([('instance_fields', + [{'action': 'FIELD', + 'field': 'contains:Instance'}])])), + ('header', + [{'action': 'ADD', + 'field': 'PatientIdentityRemoved', + 'value': 'Yes'}, + {'action': 'REPLACE', + 'field': 'values:cookie_names', + 'value': 'var:id'}, + {'action': 'REPLACE', + 'field': 'values:operator_names', + 'value': 'var:source_id'}])]) +``` diff --git a/examples/deid/deid.dicom-groups b/examples/deid/deid.dicom-groups new file mode 100644 index 00000000..56daf358 --- /dev/null +++ b/examples/deid/deid.dicom-groups @@ -0,0 +1,17 @@ +FORMAT dicom + +%values cookie_names +SPLIT PatientID by="^";minlength=4 + +%values operator_names +FIELD startswith:Operator + +%fields instance_fields +FIELD contains:Instance + +%header + +ADD PatientIdentityRemoved Yes 
+REPLACE values:cookie_names var:id +REPLACE values:operator_names var:source_id +REMOVE fields:instance_fields