enh: impute nan-valued feature data if corresponding response was 0
paulmueller committed Nov 10, 2023
1 parent 232d51e commit c9c5dbf
Showing 6 changed files with 121 additions and 16 deletions.
6 changes: 5 additions & 1 deletion CHANGELOG
@@ -1,4 +1,8 @@
3.7.4
4.0.0
- BREAKING CHANGE: Due to the new imputation of nan-valued features,
rating inference might change slightly. For the "zef18" dataset,
inference was off by about 0.1 on average.
- enh: impute nan-valued feature data if corresponding response was 0
- enh: allow empty-valued groups in rating HDF5 file
- enh: when encountering inf values in a training set, replace them
with twice the maximum of that feature
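To illustrate the two preprocessing steps mentioned above, here is a minimal sketch (not taken from the commit; the array values are made up) of how zero-rated nan imputation and inf replacement act on a single feature column:

import numpy as np

# toy feature column and the corresponding response (rating)
feat = np.array([1.1, 1.2, np.nan, np.inf])
resp = np.array([0.0, 0.0, 0.0, 5.0])

# impute nan-valued entries whose response is 0 with the mean of the
# non-nan entries of that feature that also have a response of 0
zero = resp == 0
nans = np.isnan(feat)
feat[zero & nans] = np.mean(feat[zero & ~nans])  # -> 1.15

# replace inf values with 2 * sign * max(abs(finite values))
isinf = np.isinf(feat)
feat[isinf] = 2 * np.sign(feat[isinf]) * np.max(np.abs(feat[~isinf]))  # -> 2.4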
78 changes: 71 additions & 7 deletions nanite/rate/rater.py
@@ -1,5 +1,6 @@
import pathlib
from pkg_resources import resource_filename
from typing import List, Literal

import numpy as np
from sklearn.pipeline import make_pipeline
@@ -148,14 +149,52 @@ def get_training_set_path(label="zef18"):
return resp_path

@classmethod
def load_training_set(cls, path=None, names=None, which_type=None,
replace_inf=True, remove_nan=True,
ret_names=False):
def load_training_set(
cls,
path: pathlib.Path | str = None,
names: List[str] = None,
which_type: Literal["all", "binary", "continuous"] | List = None,
replace_inf: bool = True,
impute_zero_rated_nan: bool = True,
remove_nan: bool = True,
ret_names: bool = False):
"""Load a training set from a directory
By default, only the "continuous" features are imported. The
"binary" features are not needed for training; they are used
to sort out new force-distance data.
Parameters
----------
path: pathlib.Path or str
Optional path to the training set directory. If none
is specified, the default "zef18" is loaded.
names: list of str
List of features to use, defaults to all features.
which_type: str
Which type of feature to return; see :const:`.VALID_FEATURE_TYPES`
for valid options. By default, only the "continuous" features
are imported. The "binary" features are not needed for training;
they are used to sort out new force-distance data.
replace_inf: bool
Replace infinity-valued feature values with
`2 * sign * max(abs(values))`.
impute_zero_rated_nan: bool
If there are nan-valued features that have a zero response
(rated worst), replace those feature values with the mean of
that feature's zero-response values that are not nan-valued.
remove_nan: bool
Remove any samples that still contain nan-valued features
(after `impute_zero_rated_nan` was applied). This is necessary,
since sklearn cannot handle nan-valued sample values.
ret_names: bool
Return the names of the features in addition to the samples
and response.
Returns
-------
samples: 2d ndarray
Sample values with axes `(data_size, num_features)`
response: 1d ndarray
Response array of length `data_size`
names: list, optional
List of feature names corresponding to axis `1` in `samples`
"""
if which_type is None:
which_type = ["continuous"]
@@ -173,14 +212,39 @@ def load_training_set(cls, path=None, names=None, which_type=None,
samples = [np.loadtxt(sp, dtype=float, ndmin=2) for sp in sample_paths]
samples = np.concatenate(samples, axis=1)
response = np.loadtxt(resp_path, dtype=float)

# Deal with NaN-valued feature data with a response of 0.
if impute_zero_rated_nan:
resp0 = response == 0
# For each feature, find values that are NaN where the
# response is zero. Those values are then set to the mean of
# the non-NaN values of that feature where the response is zero.
for ii, fn in enumerate(fnames):
# locations where the feature is nan
fdat = samples[:, ii]
fnans = np.isnan(fdat)
# locations where feature is nan AND response is 0
# (those are the locations we would like to change)
coloc = np.logical_and(resp0, fnans)
# location where the feature is not nan AND response is 0
# (those are the reference locations)
ref = np.logical_and(resp0, ~fnans)
if np.any(coloc) and np.any(ref):
# We have reference values; use their mean for imputation.
refval = np.mean(fdat[ref])
samples[coloc, ii] = refval

# Deal with remaining NaN-valued feature data.
if remove_nan:
# Remove nan-values from training set
valid = ~np.array(np.sum(np.isnan(samples), axis=1), dtype=bool)
samples = samples[valid, :]
# remove corresponding responses
response = response[valid]

# Deal with infinite feature data.
if replace_inf:
for ii in range(samples.shape[1]):
for ii in range(len(fnames)):
si = samples[:, ii]
isinf = np.isinf(si)
if np.any(isinf):
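For reference, a minimal usage sketch of the updated classmethod, based on the signature and docstring above and assuming `IndentationRater` is importable from `nanite.rate.rater`; the new `impute_zero_rated_nan` keyword defaults to True and is only spelled out here for clarity:

from nanite.rate.rater import IndentationRater

# load the default "zef18" training set with the new imputation enabled
samples, response, names = IndentationRater.load_training_set(
    which_type="continuous",
    impute_zero_rated_nan=True,  # new in this commit (default)
    replace_inf=True,            # default
    ret_names=True,
)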
6 changes: 3 additions & 3 deletions tests/test_cli_rating.py
@@ -66,6 +66,6 @@ def test_fit_data_with_zef18():
rating.fit_perform(path=jpkfile2, path_results=pout, profile_path=name)
stats = np.loadtxt(pout / "statistics.tsv", skiprows=1, usecols=(1, 2, 3))
assert np.all(stats[:, 0] == [109, 129, 416])
assert stats[0, 2] == 9.5
assert stats[1, 2] == 2.4
assert stats[2, 2] == 4.9
assert stats[0, 2] == 9.6
assert stats[1, 2] == 2.5
assert stats[2, 2] == 5.0
6 changes: 3 additions & 3 deletions tests/test_qmap.py
@@ -136,9 +136,9 @@ def test_feat_rating():

qd = qm.get_qmap("fit: rating", qmap_only=True)
vals = qd.flat[~np.isnan(qd.flat)]
assert np.allclose(vals[0], 9.370435813605962), "gray matter"
assert np.allclose(vals[2], 4.942804081687071), "white matter"
assert np.allclose(vals[1], 2.432396277782555), "background"
assert np.allclose(vals[0], 9.495637970738416), "gray matter"
assert np.allclose(vals[2], 4.9471000727759815), "white matter"
assert np.allclose(vals[1], 2.443830486100795), "background"


def test_feat_rating_nofit():
2 changes: 1 addition & 1 deletion tests/test_rate_io.py
@@ -81,7 +81,7 @@ def test_rate_manager_basic():
assert np.allclose(
np.ndarray.item(rmg.get_rates(which="Extra Trees",
training_set="zef18")),
3.5492840783289035)
3.3822603687594004)


def test_rate_manager_crossval():
39 changes: 38 additions & 1 deletion tests/test_rate_training_set.py
@@ -41,6 +41,43 @@ def test_user_training_set():
assert 4 < r2 < 5, "with the given random state we end up at 4.55"


def test_training_set_impute_nans():
tdir = setup_training_set()
# edit one of the training feature data to contain nan values
fpath = tdir / "train_feat_con_apr_flatness.txt"
rpath = tdir / "train_response.txt"

fdat = np.loadtxt(fpath)
fdat[10] = 1.1
fdat[11] = 1.2
fdat[12] = np.nan
fdat[13] = np.nan
np.savetxt(fpath, fdat)

rdat = np.loadtxt(rpath)
rdat[rdat == 0] = 1
rdat[10] = 0
rdat[11] = 0
rdat[12] = 0
rdat[13] = 0
np.savetxt(rpath, rdat)

samples, response, names = IndentationRater.load_training_set(
path=tdir,
which_type="continuous",
impute_zero_rated_nan=True, # This should be the default
replace_inf=True, # This should be the default
ret_names=True,
)
idf = names.index("feat_con_apr_flatness")
data = samples[:, idf]

assert np.allclose(data[10], 1.1)
assert np.allclose(data[11], 1.2)
assert np.allclose(data[12], 1.15)
assert np.allclose(data[13], 1.15)


def test_training_set_inf_values():
tdir = setup_training_set()
# edit one of the training feature data to contain an inf value
@@ -54,7 +91,7 @@ def test_training_set_inf_values():

samples, response, names = IndentationRater.load_training_set(
path=tdir,
which_type=["continuous"],
which_type="continuous",
replace_inf=True, # This should be the default
ret_names=True,
)
