Creates a 'caseInsensitive' flag (#95)
* Add 'caseInsensitive' option to field mapping

* Bump version to 0.6.0
pipliggins authored Oct 2, 2024
1 parent 2d70bfa commit 8e4c96d
Showing 8 changed files with 67 additions and 27 deletions.
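In short, when a values-mapping rule sets `caseInsensitive = true`, both the incoming value and the mapping keys are lowercased before lookup. A minimal self-contained sketch of that idea (illustrative only — `map_value` is not adtl's actual API; the real logic lives in `get_value_unhashed` in the diff below):

```python
from typing import Any


def map_value(value: Any, rule: dict) -> Any:
    """Sketch of a 'values' lookup with the new caseInsensitive flag."""
    values = rule["values"]
    if rule.get("caseInsensitive") and isinstance(value, str):
        # Lowercase both sides so 'Type 1', 'TYPE 1' and 'type 1' all match
        value = value.lower()
        values = {k.lower(): v for k, v in values.items()}
    if rule.get("ignoreMissingKey"):
        # Unmapped values pass through unchanged
        return values.get(value, value)
    return values.get(value)


rule = {"values": {"Type 1": "E10", "TYPE 2": "E11"}, "caseInsensitive": True}
assert map_value("type 1", rule) == "E10"
assert map_value("TYPE 2", rule) == "E11"
```

Note that the committed code lowercases `rule["values"]` in place rather than working on a copy, as the first hunk below shows.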
54 changes: 36 additions & 18 deletions adtl/__init__.py
```diff
@@ -104,19 +104,25 @@ def get_value_unhashed(row: StrDict, rule: Rule, ctx: Context = None) -> Any:
                 value = getattr(tf, transformation)(value, *params)
             except AttributeError:
                 raise AttributeError(
-                    f"Error using a data transformation: Function {transformation} has not been defined."
+                    f"Error using a data transformation: Function {transformation} "
+                    "has not been defined."
                 )
         else:
             try:
                 value = getattr(tf, transformation)(value)
             except AttributeError:
                 raise AttributeError(
-                    f"Error using a data transformation: Function {transformation} has not been defined."
+                    f"Error using a data transformation: Function {transformation} "
+                    "has not been defined."
                 )
         return value
     if value == "":
         return None
     if "values" in rule:
+        if rule.get("caseInsensitive") and isinstance(value, str):
+            value = value.lower()
+            rule["values"] = {k.lower(): v for k, v in rule["values"].items()}
+
         if rule.get("ignoreMissingKey"):
             value = rule["values"].get(value, value)
         else:
@@ -127,10 +133,10 @@ def get_value_unhashed(row: StrDict, rule: Rule, ctx: Context = None) -> Any:
         assert "source_date" not in rule and "date" not in rule
         source_unit = get_value(row, rule["source_unit"])
         unit = rule["unit"]
-        if type(source_unit) != str:
+        if not isinstance(source_unit, str):
             logging.debug(
-                f"Error converting source_unit {source_unit} to {unit!r} with rule: {rule}, "
-                "defaulting to assume source_unit is {unit}"
+                f"Error converting source_unit {source_unit} to {unit!r} with "
+                "rule: {rule}, defaulting to assume source_unit is {unit}"
             )
             return float(value)
         try:
@@ -198,7 +204,8 @@ def parse_if(
             cast_value = type(value)(attr_value)
         except ValueError:
             logging.debug(
-                f"Error when casting value {attr_value!r} with rule: {rule}, defaulting to False"
+                f"Error when casting value {attr_value!r} with rule: {rule}, defaulting"
+                " to False"
             )
             return False
         if cmp == ">":
@@ -227,7 +234,8 @@ def parse_if(
             cast_value = type(value)(attr_value)
         except ValueError:
             logging.debug(
-                f"Error when casting value {attr_value!r} with rule: {rule}, defaulting to False"
+                f"Error when casting value {attr_value!r} with rule: {rule}, defaulting"
+                " to False"
             )
             return False
         return cast_value == value
@@ -371,7 +379,8 @@ def replace_val(
     for_expr = match.pop("for")
     if not isinstance(for_expr, dict):
         raise ValueError(
-            f"for expression {for_expr!r} is not a dictionary of variables to list of values or a range"
+            f"for expression {for_expr!r} is not a dictionary of variables to list "
+            "of values or a range"
         )

     # Expand ranges when available
@@ -390,7 +399,8 @@ def replace_val(
             pass
         else:
             raise ValueError(
-                f"for expression {for_expr!r} can only have lists or ranges for variables"
+                f"for expression {for_expr!r} can only have lists or ranges for "
+                "variables"
             )
     loop_vars = sorted(for_expr.keys())
     loop_assignments = [
@@ -564,12 +574,14 @@ def __init__(
                     res = requests.get(schema)
                     if res.status_code != 200:
                         logging.warning(
-                            f"Could not fetch schema for table {table!r}, will not validate"
+                            f"Could not fetch schema for table {table!r}, will not "
+                            "validate"
                         )
                         continue
                 except ConnectionError:  # pragma: no cover
                     logging.warning(
-                        f"Could not fetch schema for table {table!r}, will not validate"
+                        f"Could not fetch schema for table {table!r}, will not "
+                        "validate"
                     )
                     continue
                 self.schemas[table] = make_fields_optional(
@@ -618,7 +630,8 @@ def validate_spec(self):
                 )
             if group_field is not None and aggregation != "lastNotNull":
                 raise ValueError(
-                    f"groupBy needs aggregation=lastNotNull to be set for table: {table}"
+                    "groupBy needs aggregation=lastNotNull to be set for table: "
+                    f"{table}"
                 )

     def _set_field_names(self):
@@ -632,7 +645,8 @@ def _set_field_names(self):
             else:
                 if table not in self.schemas:
                     print(
-                        f"Warning: no schema found for {table!r}, field names may be incomplete!"
+                        f"Warning: no schema found for {table!r}, field names may be "
+                        "incomplete!"
                     )
                 self.fieldnames[table] = list(
                     self.tables[table].get("common", {}).keys()
@@ -734,7 +748,8 @@ def update_table(self, table: str, row: StrDict):

                 if combined_type in ["all", "any", "min", "max"]:
                     values = [existing_value, value]
-                    # normally calling eval() is a bad idea, but here values are restricted, so okay
+                    # normally calling eval() is a bad idea, but here
+                    # values are restricted, so okay
                     self.data[table][group_key][attr] = eval(combined_type)(
                         values
                     )
@@ -812,7 +827,8 @@ def parse_rows(self, rows: Iterable[StrDict], skip_validation=False):
         """Transform rows from an iterable according to specification

         Args:
-            rows: Iterable of rows, specified as a dictionary of (field name, field value) pairs
+            rows: Iterable of rows, specified as a dictionary of
+                (field name, field value) pairs
             skip_validation: Whether to skip validation, default off

         Returns:
@@ -879,7 +895,8 @@ def write_csv(

         Args:
             table: Table that should be written to CSV
-            output: (optional) Output file name. If not specified, defaults to parser name + table name
+            output: (optional) Output file name. If not specified, defaults to parser
+                name + table name
                 with a csv suffix.
         """

@@ -960,8 +977,9 @@ def show_report(self):
         print("|---------------|-------|-------|----------------|")
         for table in self.report["total"]:
             print(
-                f"|{table:14s}\t|{self.report['total_valid'][table]}\t|{self.report['total'][table]}\t"
-                f"|{self.report['total_valid'][table]/self.report['total'][table]:%} |"
+                f"|{table:14s}\t|{self.report['total_valid'][table]}\t"
+                f"|{self.report['total'][table]}\t"
+                f"|{self.report['total_valid'][table]/self.report['total'][table]:%} |"  # noqa:E501
             )
         print()
         for table in self.report["validation_errors"]:
```
1 change: 0 additions & 1 deletion docs/_static/style.css
```diff
@@ -111,4 +111,3 @@ div.bodywrapper h4 {
     padding-right: 0;
   }
 }
-
```
6 changes: 3 additions & 3 deletions docs/conf.py
```diff
@@ -17,7 +17,7 @@

 project = "adtl"
 copyright = "2023, Global.health"
-release = "0.5.0"
+release = "0.6.0"

 # -- General configuration ---------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
@@ -38,12 +38,12 @@

 html_theme = "better"
 html_static_path = ["_static"]
-html_theme_path=[better.better_theme_path]
+html_theme_path = [better.better_theme_path]
 html_short_title = "Home"

 html_theme_options = {
     "rightsidebar": True,
     "sidebarwidth": "25rem",
     "cssfiles": ["_static/style.css"],
     "showheader": False,
-}
\ No newline at end of file
+}
```
16 changes: 16 additions & 0 deletions docs/specification.md
````diff
@@ -287,6 +287,22 @@ values = { 1 = true, 2 = false }
 description = "Dementia"
 ```

+If the data for this field has a range of different capitalisations and you wish to
+capture them all without specifying each variant, you can add `caseInsensitive = true`
+to the rule:
+
+```toml
+[table.sex_at_birth]
+field = "sex"
+values = { homme = "male", femme = "female" }
+caseInsensitive = true
+```
+
+When the parser encounters e.g. `Homme` or `FEMME` in the data it will still match them
+to `male` and `female` respectively. Different spellings are still not matched,
+however: e.g. `Home` will return `null`.
+
+
 ### Combined type

 Refers to multiple fields in the source format. Requires
````
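As a quick sanity check of the documented behaviour, the TOML rule above is equivalent to the following lookup (plain Python for illustration only; adtl users would normally write just the TOML):

```python
from typing import Optional

# keys from the TOML rule above
values = {"homme": "male", "femme": "female"}


def match(raw: str) -> Optional[str]:
    # caseInsensitive = true: compare lowercased forms on both sides
    return {k.lower(): v for k, v in values.items()}.get(raw.lower())


assert match("Homme") == "male"
assert match("FEMME") == "female"
assert match("Home") is None  # a misspelling still returns null (None)
```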
2 changes: 1 addition & 1 deletion pyproject.toml
```diff
@@ -7,7 +7,7 @@ packages = ["adtl"]

 [project]
 name = "adtl"
-version = "0.5.0"
+version = "0.6.0"
 description = "Another data transformation language"
 authors = [
    {name = "Abhishek Dasgupta", email = "abhishek.dasgupta@dtc.ox.ac.uk"},
```
5 changes: 2 additions & 3 deletions tests/parsers/stop-overwriting.toml
```diff
@@ -12,13 +12,13 @@
 [visit.subject_id]
 field = "subjid"
 description = "Subject ID"
-
+
 [visit.earliest_admission]
 combinedType = "min"
 fields = [
    { field = "first_admit" },
 ]
-
+
 [visit.start_date]
 combinedType = "firstNonNull"
 fields = [
@@ -44,4 +44,3 @@
    { field = "overall_antiviral_dc___2", values = { 1 = "Lopinavir" } },
    { field = "overall_antiviral_dc___3", values = { 1 = "Interferon" } },
 ]
-
```
2 changes: 1 addition & 1 deletion tests/sources/stop-overwriting.csv
```diff
@@ -8,4 +8,4 @@ subjid,redcap,first_admit,enrolment,icu_admission_date,daily_antiviral_type___1,
 2,day1,,,2020-11-30,0,1,0,0,0,0
 3,admit,,2020-02-20,,0,0,0,0,0,0
 3,discharge,,,,0,0,0,0,1,1
-3,day1,,,,1,0,0,0,0,0
\ No newline at end of file
+3,day1,,,,1,0,0,0,0,0
```
8 changes: 8 additions & 0 deletions tests/test_parser.py
```diff
@@ -66,6 +66,12 @@
     "ignoreMissingKey": True,
 }

+RULE_CASEINSENSITIVE = {
+    "field": "diabetes_mhyn",
+    "values": {"Type 1": "E10", "TYPE 2": "E11"},  # ICD-10 codes
+    "caseInsensitive": True,
+}
+
 ROW_CONDITIONAL = {"outcome_date": "2022-01-01", "outcome_type": 4}
 RULE_CONDITIONAL_OK = {"field": "outcome_date", "if": {"outcome_type": 4}}
 RULE_CONDITIONAL_FAIL = {"field": "outcome_date", "if": {"outcome_type": {"<": 4}}}
@@ -445,6 +451,8 @@ def _subdict(d: Dict, keys: Iterable[Any]) -> Dict[str, Any]:
         (({"first": "", "second": False}, RULE_COMBINED_FIRST_NON_NULL), False),
         (({"diabetes_mhyn": "type 1"}, RULE_IGNOREMISSINGKEY), "E10"),
         (({"diabetes_mhyn": "gestational"}, RULE_IGNOREMISSINGKEY), "gestational"),
+        (({"diabetes_mhyn": "type 2"}, RULE_CASEINSENSITIVE), "E11"),
+        (({"diabetes_mhyn": "TYPE 1"}, RULE_CASEINSENSITIVE), "E10"),
         ((ROW_CONDITIONAL, RULE_CONDITIONAL_OK), "2022-01-01"),
         ((ROW_CONDITIONAL, RULE_CONDITIONAL_FAIL), None),
         ((ROW_UNIT_MONTH, RULE_UNIT), 1.5),
```
