diff --git a/CHANGELOG b/CHANGELOG
index 67473e7..f2cb5c3 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,4 +1,8 @@
-3.7.4
+4.0.0
+ - BREAKING CHANGE: Due to the new imputation of nan-valued features,
+   rating inference might change slightly. For the "zef18" dataset,
+   inference was off by about 0.1 on average.
+ - enh: impute nan-valued feature data if corresponding response was 0
  - enh: allow empty-valued groups in rating HDF5 file
  - enh: when encountering inf values in a training set, replace them
    with twice the maximum of that feature
diff --git a/nanite/rate/rater.py b/nanite/rate/rater.py
index a88d4f1..e5c47aa 100644
--- a/nanite/rate/rater.py
+++ b/nanite/rate/rater.py
@@ -1,5 +1,6 @@
 import pathlib
 from pkg_resources import resource_filename
+from typing import List, Literal
 
 import numpy as np
 from sklearn.pipeline import make_pipeline
@@ -148,14 +149,52 @@ def get_training_set_path(label="zef18"):
         return resp_path
 
     @classmethod
-    def load_training_set(cls, path=None, names=None, which_type=None,
-                          replace_inf=True, remove_nan=True,
-                          ret_names=False):
+    def load_training_set(
+            cls,
+            path: pathlib.Path | str = None,
+            names: List[str] = None,
+            which_type: Literal["all", "binary", "continuous"] | List = None,
+            replace_inf: bool = True,
+            impute_zero_rated_nan: bool = True,
+            remove_nan: bool = True,
+            ret_names: bool = False):
         """Load a training set from a directory
 
-        By default, only the "continuous" features are imported. The
-        "binary" features are not needed for training; they are used
-        to sort out new force-distance data.
+        Parameters
+        ----------
+        path: pathlib.Path or str
+            Optional path to the training set directory. If none
+            is specified, the default "zef18" training set is loaded.
+        names: list of str
+            List of features to use; defaults to all features.
+        which_type: str or list of str
+            Which type of feature to return; see :const:`.VALID_FEATURE_TYPES`
+            for valid options. By default, only the "continuous" features
+            are imported. The "binary" features are not needed for training;
+            they are used to sort out new force-distance data.
+        replace_inf: bool
+            Replace infinity-valued feature values with
+            `2 * sign * max(abs(values))`.
+        impute_zero_rated_nan: bool
+            If there are nan-valued features that have a zero response
+            (rated worst), replace those feature values with the mean
+            of the zero-response feature values that are not nan-valued.
+        remove_nan: bool
+            Remove any nan-valued features (after `impute_zero_rated_nan`
+            has been applied). This is necessary, since scikit-learn
+            cannot handle nan-valued sample values.
+        ret_names: bool
+            Return the names of the features in addition to the samples
+            and response.
+
+        Returns
+        -------
+        samples: 2d ndarray
+            Sample values with axes `(data_size, num_features)`
+        response: 1d ndarray
+            Response array of length `data_size`
+        names: list, optional
+            List of feature names corresponding to axis `1` in `samples`
         """
         if which_type is None:
             which_type = ["continuous"]
@@ -173,14 +212,39 @@ def load_training_set(cls, path=None, names=None, which_type=None,
         samples = [np.loadtxt(sp, dtype=float, ndmin=2) for sp in sample_paths]
         samples = np.concatenate(samples, axis=1)
         response = np.loadtxt(resp_path, dtype=float)
+
+        # Deal with NaN-valued feature data that have a response of 0.
+        if impute_zero_rated_nan:
+            resp0 = response == 0
+            # For each feature, find values that are NaN where the
+            # response is zero. Those values are then set to the mean of
+            # the values where the response is zero and the feature is
+            # not NaN.
+            for ii, fn in enumerate(fnames):
+                # locations where the feature is nan
+                fdat = samples[:, ii]
+                fnans = np.isnan(fdat)
+                # locations where the feature is nan AND the response is 0
+                # (these are the locations we would like to impute)
+                coloc = np.logical_and(resp0, fnans)
+                # locations where the feature is not nan AND the response is 0
+                # (these are the reference locations)
+                ref = np.logical_and(resp0, ~fnans)
+                if np.any(coloc) and np.any(ref):
+                    # We have reference values; impute their mean.
+                    refval = np.mean(fdat[ref])
+                    samples[coloc, ii] = refval
+
+        # Deal with remaining NaN-valued feature data.
         if remove_nan:
             # Remove nan-values from training set
             valid = ~np.array(np.sum(np.isnan(samples), axis=1), dtype=bool)
             samples = samples[valid, :]
             # remove corresponding responses
             response = response[valid]
+
+        # Deal with infinite feature data.
         if replace_inf:
-            for ii in range(samples.shape[1]):
+            for ii in range(len(fnames)):
                 si = samples[:, ii]
                 isinf = np.isinf(si)
                 if np.any(isinf):
diff --git a/tests/test_cli_rating.py b/tests/test_cli_rating.py
index d43c383..0c817d4 100644
--- a/tests/test_cli_rating.py
+++ b/tests/test_cli_rating.py
@@ -66,6 +66,6 @@ def test_fit_data_with_zef18():
     rating.fit_perform(path=jpkfile2, path_results=pout, profile_path=name)
     stats = np.loadtxt(pout / "statistics.tsv", skiprows=1, usecols=(1, 2, 3))
     assert np.all(stats[:, 0] == [109, 129, 416])
-    assert stats[0, 2] == 9.5
-    assert stats[1, 2] == 2.4
-    assert stats[2, 2] == 4.9
+    assert stats[0, 2] == 9.6
+    assert stats[1, 2] == 2.5
+    assert stats[2, 2] == 5.0
diff --git a/tests/test_qmap.py b/tests/test_qmap.py
index a72612d..6f433ad 100644
--- a/tests/test_qmap.py
+++ b/tests/test_qmap.py
@@ -136,9 +136,9 @@ def test_feat_rating():
     qd = qm.get_qmap("fit: rating", qmap_only=True)
     vals = qd.flat[~np.isnan(qd.flat)]
-    assert np.allclose(vals[0], 9.370435813605962), "gray matter"
-    assert np.allclose(vals[2], 4.942804081687071), "white matter"
-    assert np.allclose(vals[1], 2.432396277782555), "background"
+    assert np.allclose(vals[0], 9.495637970738416), "gray matter"
+    assert np.allclose(vals[2], 4.9471000727759815), "white matter"
+    assert np.allclose(vals[1], 2.443830486100795), "background"
 
 
 def test_feat_rating_nofit():
diff --git a/tests/test_rate_io.py b/tests/test_rate_io.py
index fd88a6d..51d1c50 100644
--- a/tests/test_rate_io.py
+++ b/tests/test_rate_io.py
@@ -81,7 +81,7 @@ def test_rate_manager_basic():
     assert np.allclose(
         np.ndarray.item(rmg.get_rates(which="Extra Trees", training_set="zef18")),
-        3.5492840783289035)
+        3.3822603687594004)
 
 
 def test_rate_manager_crossval():
diff --git a/tests/test_rate_training_set.py b/tests/test_rate_training_set.py
index 78b303e..7064e42 100644
--- a/tests/test_rate_training_set.py
+++ b/tests/test_rate_training_set.py
@@ -41,6 +41,43 @@ def test_user_training_set():
     assert 4 < r2 < 5, "with the given random state we end up at 4.55"
 
 
+def test_training_set_impute_nans():
+    tdir = setup_training_set()
+    # edit one of the training feature files to contain nan values
+    fpath = tdir / "train_feat_con_apr_flatness.txt"
+    rpath = tdir / "train_response.txt"
+
+    fdat = np.loadtxt(fpath)
+    fdat[10] = 1.1
+    fdat[11] = 1.2
+    fdat[12] = np.nan
+    fdat[13] = np.nan
+    np.savetxt(fpath, fdat)
+
+    rdat = np.loadtxt(rpath)
+    rdat[rdat == 0] = 1
+    rdat[10] = 0
+    rdat[11] = 0
+    rdat[12] = 0
+    rdat[13] = 0
+    np.savetxt(rpath, rdat)
+
+    samples, response, names = IndentationRater.load_training_set(
+        path=tdir,
+        which_type="continuous",
+        impute_zero_rated_nan=True,  # This should be the default
+        replace_inf=True,  # This should be the default
+        ret_names=True,
+    )
+    idf = names.index("feat_con_apr_flatness")
+    data = samples[:, idf]
+
+    assert np.allclose(data[10], 1.1)
+    assert np.allclose(data[11], 1.2)
+    assert np.allclose(data[12], 1.15)
+    assert np.allclose(data[13], 1.15)
+
+
 def test_training_set_inf_values():
     tdir = setup_training_set()
     # edit one of the training feature data to contain an inf value
@@ -54,7 +91,7 @@ def test_training_set_inf_values():
 
     samples, response, names = IndentationRater.load_training_set(
         path=tdir,
-        which_type=["continuous"],
+        which_type="continuous",
         replace_inf=True,  # This should be the default
         ret_names=True,
     )
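
A minimal standalone sketch of the zero-rated NaN imputation added in
nanite/rate/rater.py above (illustration only, not part of the patch;
the example arrays are made up):

    import numpy as np

    # Made-up example data: two features (columns), four samples (rows).
    samples = np.array([[1.1, 0.2],
                        [1.2, 0.3],
                        [np.nan, 0.4],
                        [2.0, np.nan]])
    response = np.array([0.0, 0.0, 0.0, 1.0])

    resp0 = response == 0
    for ii in range(samples.shape[1]):
        fdat = samples[:, ii]
        fnans = np.isnan(fdat)
        # nan-valued features of zero-rated samples are imputed with the
        # mean of the zero-rated samples that are not nan for that feature
        coloc = np.logical_and(resp0, fnans)
        ref = np.logical_and(resp0, ~fnans)
        if np.any(coloc) and np.any(ref):
            samples[coloc, ii] = np.mean(fdat[ref])

    # Feature 0: the nan at index 2 becomes mean(1.1, 1.2) = 1.15.
    # Feature 1: the nan at index 3 is left untouched (its response is 1,
    # not 0) and would later be dropped by the `remove_nan` step.
    print(samples)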