diff --git a/adtl/__init__.py b/adtl/__init__.py index 66c67b7..3df939a 100644 --- a/adtl/__init__.py +++ b/adtl/__init__.py @@ -20,8 +20,10 @@ import requests import fastjsonschema from tqdm import tqdm +import warnings import adtl.transformations as tf +from adtl.transformations import AdtlTransformationWarning SUPPORTED_FORMATS = {"json": json.load, "toml": tomli.load} DEFAULT_DATE_FORMAT = "%Y-%m-%d" @@ -77,6 +79,7 @@ def get_value_unhashed(row: StrDict, rule: Rule, ctx: Context = None) -> Any: if "apply" in rule: # apply data transformations. transformation = rule["apply"]["function"] + params = None if "params" in rule["apply"]: params = [] for i in range(len(rule["apply"]["params"])): @@ -100,21 +103,25 @@ def get_value_unhashed(row: StrDict, rule: Rule, ctx: Context = None) -> Any: else: params.append(rule["apply"]["params"][i]) - try: - value = getattr(tf, transformation)(value, *params) - except AttributeError: - raise AttributeError( - f"Error using a data transformation: Function {transformation} " - "has not been defined." - ) - else: - try: - value = getattr(tf, transformation)(value) - except AttributeError: - raise AttributeError( - f"Error using a data transformation: Function {transformation} " - "has not been defined." - ) + try: + with warnings.catch_warnings(): + warnings.simplefilter("error", category=AdtlTransformationWarning) + if params: + value = getattr(tf, transformation)(value, *params) + else: + value = getattr(tf, transformation)(value) + except AttributeError: + raise AttributeError( + f"Error using a data transformation: Function {transformation} " + "has not been defined." 
+ ) + except AdtlTransformationWarning as e: + if ctx and ctx.get("returnUnmatched"): + warnings.warn(str(e), AdtlTransformationWarning) + return value + else: + logging.error(str(e)) + return None return value if value == "": return None @@ -123,10 +130,14 @@ def get_value_unhashed(row: StrDict, rule: Rule, ctx: Context = None) -> Any: value = value.lower() rule["values"] = {k.lower(): v for k, v in rule["values"].items()} - if rule.get("ignoreMissingKey"): + if rule.get("ignoreMissingKey") or (ctx and ctx.get("returnUnmatched")): value = rule["values"].get(value, value) else: value = rule["values"].get(value) + + # recheck if value is empty after mapping (use to map values to None) + if value == "": + return None # Either source_unit / unit OR source_date / date triggers conversion # do not parse units if value is empty if "source_unit" in rule and "unit" in rule: @@ -142,6 +153,9 @@ def get_value_unhashed(row: StrDict, rule: Rule, ctx: Context = None) -> Any: try: value = pint.Quantity(float(value), source_unit).to(unit).m except ValueError: + if ctx and ctx.get("returnUnmatched"): + logging.debug(f"Could not convert {value} to a floating point") + return value raise ValueError(f"Could not convert {value} to a floating point") if "source_date" in rule or (ctx and ctx.get("is_date")): assert "source_unit" not in rule and "unit" not in rule @@ -156,6 +170,8 @@ def get_value_unhashed(row: StrDict, rule: Rule, ctx: Context = None) -> Any: value = datetime.strptime(value, source_date).strftime(target_date) except (TypeError, ValueError): logging.info(f"Could not parse date: {value}") + if ctx and ctx.get("returnUnmatched"): + return value return None return value elif "combinedType" in rule: @@ -609,6 +625,7 @@ def ctx(self, attribute: str): if self.header.get("skipFieldPattern") else False ), + "returnUnmatched": self.header.get("returnUnmatched", False), } def validate_spec(self): @@ -1042,6 +1059,10 @@ def main(argv=None): include_defs = args.include_def or [] 
spec = Parser(args.spec, include_defs=include_defs, quiet=args.quiet) + # check for incompatible options + if spec.header.get("returnUnmatched") and args.parquet: + raise ValueError("returnUnmatched and parquet options are incompatible") + # run adtl adtl_output = spec.parse(args.file, encoding=args.encoding) adtl_output.save(args.output or spec.name, args.parquet) diff --git a/adtl/transformations.py b/adtl/transformations.py index 9598ab8..c6107b5 100644 --- a/adtl/transformations.py +++ b/adtl/transformations.py @@ -1,6 +1,5 @@ """ Functions which can be applied to source fields, allowing extensibility """ -import logging from typing import Any, Optional, List from datetime import datetime, timedelta, date @@ -17,6 +16,12 @@ from typing import Literal, Union +import warnings + + +class AdtlTransformationWarning(UserWarning): + pass + def isNotNull(value: Optional[str]) -> bool: "Returns whether value is not null or an empty string" @@ -55,6 +60,12 @@ def wordSubstituteSet(value: str, *params) -> List[str]: for match, subst in sub_map.items(): if re.search(r"\b" + match + r"\b", value, re.IGNORECASE): out.append(subst) + if not out and (value not in [None, ""]): + warnings.warn( + f"No matches found for: '{value}'", + AdtlTransformationWarning, + stacklevel=2, + ) return sorted(set(out)) if out else None @@ -118,7 +129,8 @@ def yearsElapsed( bd_format: str = "%Y-%m-%d", cd_format: str = "%Y-%m-%d", ): - """Returns the number of years elapsed between two dates, useful for calculating ages + """ + Returns the number of years elapsed between two dates, useful for calculating ages Args: birthdate: Start date of duration @@ -126,10 +138,10 @@ def yearsElapsed( epoch: Epoch year after which dates will be converted to the last century. As an example, if epoch is 2022, then the date 1/1/23 will be converted to the January 1, 1923. - bd_format: Date format for *birthdate* specified using :manpage:`strftime(3)` conventions. 
- Defaults to ISO format ("%Y-%m-%d") - cd_format: Date format for *currentdate* specified using :manpage:`strftime(3)` conventions. - Defaults to ISO format ("%Y-%m-%d") + bd_format: Date format for *birthdate* specified using :manpage:`strftime(3)` + conventions. Defaults to ISO format ("%Y-%m-%d") + cd_format: Date format for *currentdate* specified using :manpage:`strftime(3)` + conventions. Defaults to ISO format ("%Y-%m-%d") Returns: int | None: Number of years elapsed or None if invalid dates were encountered @@ -145,8 +157,15 @@ def yearsElapsed( cd = datetime.strptime(currentdate, cd_format) - days = cd - bd - return pint.Quantity(days.days, "days").to("years").m + try: + days = cd - bd + return pint.Quantity(days.days, "days").to("years").m + except ValueError: + warnings.warn( + f"Failed calculation yearsElapsed: {birthdate}, {currentdate}", + AdtlTransformationWarning, + stacklevel=2, + ) def durationDays(startdate: str, currentdate: str) -> int: @@ -210,15 +229,19 @@ def makeDate(year: str, month: str, day: str) -> str: try: year, month, day = int(year), int(month), int(day) except ValueError: - logging.error( - f"Error in casting to integer: year={year}, month={month}, day={day}" + warnings.warn( + f"Could not construct date from: year={year}, month={month}, day={day}", + AdtlTransformationWarning, + stacklevel=2, ) return None try: return date(year, month, day).isoformat() except ValueError: - logging.error( - f"Could not construct date from: year={year}, month={month}, day={day}" + warnings.warn( + f"Could not construct date from: year={year}, month={month}, day={day}", + AdtlTransformationWarning, + stacklevel=2, ) return None @@ -245,8 +268,10 @@ def makeDateTimeFromSeconds( tzinfo=zoneinfo.ZoneInfo(timezone) ) except ValueError: - logging.error( - f"Could not convert date {date!r} from date format {date_format!r}" + warnings.warn( + f"Could not convert date {date!r} from date format {date_format!r}", + AdtlTransformationWarning, + stacklevel=2, ) 
return None if time_seconds == "": @@ -279,8 +304,10 @@ def makeDateTime( tzinfo=zoneinfo.ZoneInfo(timezone) ) except ValueError: - logging.error( - f"Could not convert date {date!r} from date format {date_format!r}" + warnings.warn( + f"Could not convert date {date!r} from date format {date_format!r}", + AdtlTransformationWarning, + stacklevel=2, ) return None @@ -315,6 +342,11 @@ def splitDate( elif option == "day": return sd.day else: + warnings.warn( + f"Invalid option {option!r} for splitDate", + AdtlTransformationWarning, + stacklevel=2, + ) return None @@ -330,7 +362,8 @@ def startYear( Use to calculate year e.g. of birth from date (e.g. current date) and duration (e.g. age) - The date can be provided as a list of possible dates (if a hierarchy needs searching through) + The date can be provided as a list of possible dates (if a hierarchy needs + searching through) Args: duration: Duration value @@ -442,7 +475,11 @@ def correctOldDate(date: str, epoch: float, format: str, return_datetime: bool = try: cd = datetime.strptime(date, format) except ValueError: - logging.error(f"Could not convert date {date!r} from date format {format!r}") + warnings.warn( + f"Could not convert date {date!r} from date format {format!r}", + AdtlTransformationWarning, + stacklevel=2, + ) return None if cd.year >= epoch and "y" in format: diff --git a/docs/specification.md b/docs/specification.md index ec46008..26ddcfd 100644 --- a/docs/specification.md +++ b/docs/specification.md @@ -53,6 +53,13 @@ if not present in a datafile, following the same syntax as `fieldPattern` key. * **defaultDateFormat**: Default source date format, applied to all fields with either "date_" / "_date" in the field name or that have format date set in the JSON schema +* **returnUnmatched**: Returns all values that are not able to be converted + according to the provided rules and formats. For fields with [value mappings](#field-with-value-mapping), it is equivalent to using `ignoreMissingKeys`. 
Fields using [data transformation functions](#data-transformations-(apply)) will issue a warning to the + terminal describing the error in the transformation. Transformations requiring multiple + parameters will only return the current field value that was not transformed. + > :warning: This is likely to return columns with non-matching datatypes. External JSON + validation may fail. This option is incompatible with the `--parquet` option to save + outputs as parquet files (which requires a consistent type down each column). ## Validation diff --git a/tests/__snapshots__/test_parser.ambr b/tests/__snapshots__/test_parser.ambr index 1a51152..1016ddb 100644 --- a/tests/__snapshots__/test_parser.ambr +++ b/tests/__snapshots__/test_parser.ambr @@ -47,6 +47,13 @@ ''' # --- +# name: test_return_unmapped + ''' + age,date_death,date_of_birth,diabetes_type,has_smoking,pregnancy_birth_weight_kg,subject_id + fifteen,2023,2023-11-20,no diabetes,today,eight,1 + + ''' +# --- # name: test_show_report ''' diff --git a/tests/parsers/return-unmapped.toml b/tests/parsers/return-unmapped.toml new file mode 100644 index 0000000..d0a4037 --- /dev/null +++ b/tests/parsers/return-unmapped.toml @@ -0,0 +1,46 @@ +[adtl] + name = "test-return-unmapped" + description = "Example using returnUnmatched to return unmapped fields" + returnUnmatched = true + +[adtl.tables.subject] + kind = "groupBy" + groupBy = "subject_id" + aggregation = "lastNotNull" + +[subject] + + [subject.subject_id] + field = "subjid" + description = "Subject ID" + + [subject.date_of_birth] + field = "first_admit" + source_date = "%m" + + [subject.age] + field = "age" + apply = {function = "getFloat"} + + [subject.pregnancy_birth_weight_kg] + field = "weight" + unit = "kg" + source_unit = "lbs" + + [subject.has_smoking] + field = "smoking" + values = { 1 = "current", 2 = "never", 3 = "former" } + + [subject.diabetes_type] + field = "diabetes_type" + + apply.function = "wordSubstituteSet" + apply.params = [ + 
["type[\\s\\-]?1", "type-1"], + ["type[\\s\\-]?2", "type-2"] + ] + + [subject.date_death] + field = "death_year" + apply.function = "makeDate" + apply.params = ["$death_month", "$death_day"] diff --git a/tests/sources/return-unmapped.csv b/tests/sources/return-unmapped.csv new file mode 100644 index 0000000..0c6e40f --- /dev/null +++ b/tests/sources/return-unmapped.csv @@ -0,0 +1,2 @@ +subjid,redcap,first_admit,age,weight,smoking,diabetes_type,death_year,death_month,death_day +1,admit,2023-11-20,fifteen,eight,today,no diabetes,2023,11,80 diff --git a/tests/test_parser.py b/tests/test_parser.py index ca48b3f..072bf1e 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -1227,6 +1227,22 @@ def test_main_parquet(): Path("output-table.parquet").unlink() +def test_main_parquet_error(): + ARG = [ + str(TEST_PARSERS_PATH / "return-unmapped.toml"), + str(TEST_SOURCES_PATH / "return-unmapped.csv"), + "-o", + "output", + "--encoding", + "utf-8", + ] + + with pytest.raises( + ValueError, match="returnUnmatched and parquet options are incompatible" + ): + parser.main(ARG + ["--parquet"]) + + @responses.activate def test_main_web_schema(snapshot): # test with schema on the web @@ -1360,3 +1376,12 @@ def test_no_overwriting(): .read_table("visit") ) assert overwriting_output == OVERWRITE_OUTPUT + + +def test_return_unmapped(snapshot): + transformed_csv_data = ( + parser.Parser(TEST_PARSERS_PATH / "return-unmapped.toml") + .parse(TEST_SOURCES_PATH / "return-unmapped.csv") + .write_csv("subject") + ) + assert transformed_csv_data == snapshot