enh: impute nan-valued feature data if corresponding response was 0
paulmueller committed Nov 10, 2023
1 parent 232d51e commit c9c5dbf
Showing 6 changed files with 121 additions and 16 deletions.
6 changes: 5 additions & 1 deletion CHANGELOG
@@ -1,4 +1,8 @@
3.7.4
4.0.0
- BREAKING CHANGE: Due to the new imputation of nan-valued features,
rating inference might change slightly. For the "zef18" dataset,
inference was off by about 0.1 on average.
- enh: impute nan-valued feature data if corresponding response was 0
- enh: allow empty-valued groups in rating HDF5 file
- enh: when encountering inf values in a training set, replace them
with twice the maximum of that feature
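To illustrate the two preprocessing steps mentioned above, here is a minimal sketch (not taken from the commit; the array values are made up) of how zero-rated nan imputation and inf replacement act on a single feature column:

import numpy as np

# toy feature column and the corresponding response (rating)
feat = np.array([1.1, 1.2, np.nan, np.inf])
resp = np.array([0.0, 0.0, 0.0, 5.0])

# impute nan-valued entries whose response is 0 with the mean of the
# non-nan entries of that feature that also have a response of 0
zero = resp == 0
nans = np.isnan(feat)
feat[zero & nans] = np.mean(feat[zero & ~nans])  # -> 1.15

# replace inf values with 2 * sign * max(abs(finite values))
isinf = np.isinf(feat)
feat[isinf] = 2 * np.sign(feat[isinf]) * np.max(np.abs(feat[~isinf]))  # -> 2.4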
78 changes: 71 additions & 7 deletions nanite/rate/rater.py
@@ -1,5 +1,6 @@
import pathlib
from pkg_resources import resource_filename
from typing import List, Literal

import numpy as np
from sklearn.pipeline import make_pipeline
@@ -148,14 +149,52 @@ def get_training_set_path(label="zef18"):
return resp_path

@classmethod
def load_training_set(cls, path=None, names=None, which_type=None,
replace_inf=True, remove_nan=True,
ret_names=False):
def load_training_set(
cls,
path: pathlib.Path | str = None,
names: List[str] = None,
which_type: Literal["all", "binary", "continuous"] | List = None,
replace_inf: bool = True,
impute_zero_rated_nan: bool = True,
remove_nan: bool = True,
ret_names: bool = False):
"""Load a training set from a directory
By default, only the "continuous" features are imported. The
"binary" features are not needed for training; they are used
to sort out new force-distance data.
Parameters
----------
path: pathlib.Path or str
Optional path to the training set directory. If none
is specified, the default "zef18" is loaded.
names: list of str
List of features to use, defaults to all features.
which_type: str
Which type of feature to return; see :const:`.VALID_FEATURE_TYPES`
for valid options. By default, only the "continuous" features
are imported. The "binary" features are not needed for training;
they are used to sort out new force-distance data.
replace_inf: bool
Replace infinity-valued feature values with
`2 * sign * max(abs(values))`.
impute_zero_rated_nan: bool
If there are nan-valued features that have a zero response
(rated worst), replace those feature values with the mean of
that feature's zero-response values that are not nan-valued.
remove_nan: bool
Remove any samples that still contain nan-valued features
(after `impute_zero_rated_nan` was applied). This is necessary,
since sklearn cannot handle nan-valued sample values.
ret_names: bool
Return the names of the features in addition to the samples
and response.
Returns
-------
samples: 2d ndarray
Sample values with axes `(data_size, num_features)`
response: 1d ndarray
Response array of length `data_size`
names: list, optional
List of feature names corresponding to axis `1` in `samples`
"""
if which_type is None:
which_type = ["continuous"]
@@ -173,14 +212,39 @@ def load_training_set(cls, path=None, names=None, which_type=None,
samples = [np.loadtxt(sp, dtype=float, ndmin=2) for sp in sample_paths]
samples = np.concatenate(samples, axis=1)
response = np.loadtxt(resp_path, dtype=float)

# Deal with NaN-valued feature data with a response of 0.
if impute_zero_rated_nan:
resp0 = response == 0
# For each feature, find values that are NaN where the
# response is zero. Those values are then set to the mean of
# the non-NaN values of that feature where the response is zero.
for ii, fn in enumerate(fnames):
# locations where the feature is nan
fdat = samples[:, ii]
fnans = np.isnan(fdat)
# locations where feature is nan AND response is 0
# (those are the locations we would like to change)
coloc = np.logical_and(resp0, fnans)
# location where the feature is not nan AND response is 0
# (those are the reference locations)
ref = np.logical_and(resp0, ~fnans)
if np.any(coloc) and np.any(ref):
# We have reference values; use their mean for imputation.
refval = np.mean(fdat[ref])
samples[coloc, ii] = refval

# Deal with remaining NaN-valued feature data.
if remove_nan:
# Remove nan-values from training set
valid = ~np.array(np.sum(np.isnan(samples), axis=1), dtype=bool)
samples = samples[valid, :]
# remove corresponding responses
response = response[valid]

# Deal with infinite feature data.
if replace_inf:
for ii in range(samples.shape[1]):
for ii in range(len(fnames)):
si = samples[:, ii]
isinf = np.isinf(si)
if np.any(isinf):
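For reference, a minimal usage sketch of the updated classmethod, based on the signature and docstring above and assuming `IndentationRater` is importable from `nanite.rate.rater`; the new `impute_zero_rated_nan` keyword defaults to True and is only spelled out here for clarity:

from nanite.rate.rater import IndentationRater

# load the default "zef18" training set with the new imputation enabled
samples, response, names = IndentationRater.load_training_set(
    which_type="continuous",
    impute_zero_rated_nan=True,  # new in this commit (default)
    replace_inf=True,            # default
    ret_names=True,
)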
6 changes: 3 additions & 3 deletions tests/test_cli_rating.py
@@ -66,6 +66,6 @@ def test_fit_data_with_zef18():
rating.fit_perform(path=jpkfile2, path_results=pout, profile_path=name)
stats = np.loadtxt(pout / "statistics.tsv", skiprows=1, usecols=(1, 2, 3))
assert np.all(stats[:, 0] == [109, 129, 416])
assert stats[0, 2] == 9.5
assert stats[1, 2] == 2.4
assert stats[2, 2] == 4.9
assert stats[0, 2] == 9.6
assert stats[1, 2] == 2.5
assert stats[2, 2] == 5.0
6 changes: 3 additions & 3 deletions tests/test_qmap.py
@@ -136,9 +136,9 @@ def test_feat_rating():

qd = qm.get_qmap("fit: rating", qmap_only=True)
vals = qd.flat[~np.isnan(qd.flat)]
assert np.allclose(vals[0], 9.370435813605962), "gray matter"
assert np.allclose(vals[2], 4.942804081687071), "white matter"
assert np.allclose(vals[1], 2.432396277782555), "background"
assert np.allclose(vals[0], 9.495637970738416), "gray matter"
assert np.allclose(vals[2], 4.9471000727759815), "white matter"
assert np.allclose(vals[1], 2.443830486100795), "background"


def test_feat_rating_nofit():
2 changes: 1 addition & 1 deletion tests/test_rate_io.py
@@ -81,7 +81,7 @@ def test_rate_manager_basic():
assert np.allclose(
np.ndarray.item(rmg.get_rates(which="Extra Trees",
training_set="zef18")),
3.5492840783289035)
3.3822603687594004)


def test_rate_manager_crossval():
39 changes: 38 additions & 1 deletion tests/test_rate_training_set.py
@@ -41,6 +41,43 @@ def test_user_training_set():
assert 4 < r2 < 5, "with the given random state we end up at 4.55"


def test_training_set_impute_nans():
tdir = setup_training_set()
# edit one of the training feature data to contain nan values
fpath = tdir / "train_feat_con_apr_flatness.txt"
rpath = tdir / "train_response.txt"

fdat = np.loadtxt(fpath)
fdat[10] = 1.1
fdat[11] = 1.2
fdat[12] = np.nan
fdat[13] = np.nan
np.savetxt(fpath, fdat)

rdat = np.loadtxt(rpath)
rdat[rdat == 0] = 1
rdat[10] = 0
rdat[11] = 0
rdat[12] = 0
rdat[13] = 0
np.savetxt(rpath, rdat)

samples, response, names = IndentationRater.load_training_set(
path=tdir,
which_type="continuous",
impute_zero_rated_nan=True, # This should be the default
replace_inf=True, # This should be the default
ret_names=True,
)
idf = names.index("feat_con_apr_flatness")
data = samples[:, idf]

assert np.allclose(data[10], 1.1)
assert np.allclose(data[11], 1.2)
assert np.allclose(data[12], 1.15)
assert np.allclose(data[13], 1.15)


def test_training_set_inf_values():
tdir = setup_training_set()
# edit one of the training feature data to contain an inf value
@@ -54,7 +91,7 @@ def test_training_set_inf_values():

samples, response, names = IndentationRater.load_training_set(
path=tdir,
which_type=["continuous"],
which_type="continuous",
replace_inf=True, # This should be the default
ret_names=True,
)
