Skip to content

Commit

Permalink
Merge pull request #70 from chenyangkang/beta
Browse files Browse the repository at this point in the history
Reconcile for occupancy model #68
  • Loading branch information
chenyangkang authored Nov 20, 2024
2 parents aae2122 + 3029c1e commit b19de92
Show file tree
Hide file tree
Showing 11 changed files with 395 additions and 130 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/run_pytest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ on:

jobs:
pytesting:
runs-on: ubuntu-latest
runs-on: ubuntu-20.04
strategy:
fail-fast: true
matrix:
Expand All @@ -16,7 +16,7 @@ jobs:
- name: Set Swap Space
uses: pierotofy/set-swap-space@master
with:
swap-size-gb: 10
swap-size-gb: 20
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
Expand All @@ -33,8 +33,8 @@ jobs:
- name: Run pytest
run: |
pytest -n auto --cov --no-cov-on-fail --cov-report=term-missing:skip-covered --cov-report xml:coverage.xml
timeout-minutes: 300 # Set the timeout to 1.5 hours for this step
pytest -n auto --cov --no-cov-on-fail --cov-report=term-missing:skip-covered --cov-report xml:coverage.xml -v
timeout-minutes: 1000 # Set the timeout to 1000 minutes (~16.7 hours) for this step

- name: Upload coverage reports to Codecov
uses: codecov/codecov-action@v4
Expand Down
266 changes: 178 additions & 88 deletions stemflow/model/AdaSTEM.py

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions stemflow/model/Hurdle.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def fit(self, X_train: Union[pd.core.frame.DataFrame, np.ndarray], y_train: Sequ
"""
binary_ = np.unique(np.where(y_train > 0, 1, 0))
if len(binary_) == 1:
warnings.warn("Warning: only one class presented. Replace with dummy classifier & regressor.")
# warnings.warn("Warning: only one class presented. Replace with dummy classifier & regressor.")
self.classifier = dummy_model1(binary_[0])
self.regressor = dummy_model1(binary_[0])
return
Expand Down Expand Up @@ -88,7 +88,7 @@ def predict(self, X_test: Union[pd.core.frame.DataFrame, np.ndarray]) -> np.ndar
reg_res = self.regressor.predict(X_test)
# reg_res = np.where(reg_res>=0, reg_res, 0) ### we constrain the reg value to be positive
res = np.where(cls_res > 0, reg_res, cls_res)
return res.reshape(-1, 1)
return res.flatten()

def predict_proba(self, X_test: Union[pd.core.frame.DataFrame, np.ndarray]) -> np.ndarray:
"""Predicting probability
Expand Down Expand Up @@ -253,7 +253,7 @@ def predict(
# reg_res = np.where(reg_res>=0, reg_res, 0) ### we constrain the reg value to be positive
res = np.where(cls_res < 0.5, 0, cls_res)
res = np.where(cls_res > 0.5, reg_res, cls_res)
return res.reshape(-1, 1)
return res.flatten()

def predict_proba(
self,
Expand Down
20 changes: 14 additions & 6 deletions stemflow/model/STEM.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@ def __init__(
plot_empty: bool = False,
completely_random_rotation: bool = False,
lazy_loading: bool = False,
lazy_loading_dir: Union[str, None] = None
lazy_loading_dir: Union[str, None] = None,
min_class_sample: int = 1
):
"""Make a STEM object
Expand Down Expand Up @@ -118,6 +119,8 @@ def __init__(
If True, ensembles of models will be saved to disk and only loaded when being used (e.g., prediction phase); the ensembles of models are dumped back to disk once they have been used.
lazy_loading_dir:
If lazy_loading, the directory to temporarily save the models to. Defaults to None, in which case a random number will be generated as the folder name.
min_class_sample:
Minimum number of samples needed to train the classifier in each stixel. If the sample count does not satisfy this, a dummy model is fitted instead. This parameter does not influence regression tasks.
Raises:
AttributeError: Base model does not have method 'fit' or 'predict'
Expand Down Expand Up @@ -175,7 +178,8 @@ def __init__(
plot_empty=plot_empty,
completely_random_rotation=completely_random_rotation,
lazy_loading=lazy_loading,
lazy_loading_dir=lazy_loading_dir
lazy_loading_dir=lazy_loading_dir,
min_class_sample=min_class_sample
)

self.grid_len = grid_len
Expand Down Expand Up @@ -240,7 +244,8 @@ def __init__(
plot_empty: bool = False,
completely_random_rotation: bool = False,
lazy_loading: bool = False,
lazy_loading_dir: Union[str, None] = None
lazy_loading_dir: Union[str, None] = None,
min_class_sample: int = 1
):
super().__init__(
base_model=base_model,
Expand Down Expand Up @@ -272,7 +277,8 @@ def __init__(
plot_empty=plot_empty,
completely_random_rotation=completely_random_rotation,
lazy_loading=lazy_loading,
lazy_loading_dir=lazy_loading_dir
lazy_loading_dir=lazy_loading_dir,
min_class_sample=min_class_sample
)

self.grid_len = grid_len
Expand Down Expand Up @@ -337,7 +343,8 @@ def __init__(
plot_empty: bool = False,
completely_random_rotation: bool = False,
lazy_loading: bool = False,
lazy_loading_dir: Union[str, None] = None
lazy_loading_dir: Union[str, None] = None,
min_class_sample: int = 1
):
super().__init__(
base_model=base_model,
Expand Down Expand Up @@ -369,7 +376,8 @@ def __init__(
plot_empty=plot_empty,
completely_random_rotation=completely_random_rotation,
lazy_loading=lazy_loading,
lazy_loading_dir=lazy_loading_dir
lazy_loading_dir=lazy_loading_dir,
min_class_sample=min_class_sample
)

self.grid_len = grid_len
25 changes: 17 additions & 8 deletions stemflow/model/SphereAdaSTEM.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,8 @@ def __init__(
plot_empty: bool = False,
radius: float = 6371.0,
lazy_loading: bool = False,
lazy_loading_dir: Union[str, None] = None
lazy_loading_dir: Union[str, None] = None,
min_class_sample: int = 1
):
"""Make a Spherical AdaSTEM object
Expand Down Expand Up @@ -157,8 +158,9 @@ def __init__(
If True, ensembles of models will be saved to disk and only loaded when being used (e.g., prediction phase); the ensembles of models are dumped back to disk once they have been used.
lazy_loading_dir:
If lazy_loading, the directory to temporarily save the models to. Defaults to None, in which case a random number will be generated as the folder name.
min_class_sample:
Minimum number of samples needed to train the classifier in each stixel. If the sample count does not satisfy this, a dummy model is fitted instead. This parameter does not influence regression tasks.
Raises:
AttributeError: Base model does not have method 'fit' or 'predict'
AttributeError: task not in one of ['regression', 'classification', 'hurdle']
Expand Down Expand Up @@ -214,7 +216,8 @@ def __init__(
verbosity=verbosity,
plot_empty=plot_empty,
lazy_loading=lazy_loading,
lazy_loading_dir=lazy_loading_dir
lazy_loading_dir=lazy_loading_dir,
min_class_sample=min_class_sample
)

if not self.Spatio1 == "longitude":
Expand Down Expand Up @@ -550,7 +553,8 @@ def __init__(
verbosity=0,
plot_empty=False,
lazy_loading=False,
lazy_loading_dir=None
lazy_loading_dir=None,
min_class_sample: int = 1
):
super().__init__(
base_model=base_model,
Expand Down Expand Up @@ -581,7 +585,8 @@ def __init__(
verbosity=verbosity,
plot_empty=plot_empty,
lazy_loading=lazy_loading,
lazy_loading_dir=lazy_loading_dir
lazy_loading_dir=lazy_loading_dir,
min_class_sample=min_class_sample
)

self.predict = MethodType(AdaSTEMClassifier.predict, self)
Expand Down Expand Up @@ -641,7 +646,8 @@ def __init__(
verbosity=0,
plot_empty=False,
lazy_loading=False,
lazy_loading_dir=None
lazy_loading_dir=None,
min_class_sample: int = 1
):
super().__init__(
base_model=base_model,
Expand Down Expand Up @@ -672,5 +678,8 @@ def __init__(
verbosity=verbosity,
plot_empty=plot_empty,
lazy_loading=lazy_loading,
lazy_loading_dir=lazy_loading_dir
lazy_loading_dir=lazy_loading_dir,
min_class_sample=min_class_sample
)

self.predict = MethodType(AdaSTEMRegressor.predict, self)
3 changes: 2 additions & 1 deletion stemflow/model/dummy_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,9 @@ def predict(self, X_test):
"""Fake predict"""
return np.array([self.the_value] * X_test.shape[0])

def predict_proba(self, X_test):
def predict_proba(self, X_test, **additional_parameters_for_base_model):
"""Fake predict_proba"""

if self.the_value == 0:
return np.array([[1, 0]] * X_test.shape[0])
elif self.the_value == 1:
Expand Down
40 changes: 21 additions & 19 deletions stemflow/model/static_func_AdaSTEM.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ def train_one_stixel(
sample_weights_for_classifier: bool,
subset_x_names: bool,
stixel_X_train: pd.core.frame.DataFrame,
min_class_sample: int,
) -> Tuple[Union[None, BaseEstimator], list]:
"""Train one stixel
Expand All @@ -44,6 +45,7 @@ def train_one_stixel(
sample_weights_for_classifier (bool): Whether to balance the sample weights in classifier for imbalanced samples.
subset_x_names (bool): Whether to only store variables with std > 0 for each stixel.
stixel_X_train (pd.core.frame.DataFrame): Input training dataframe for THE stixel.
min_class_sample (int): Minimum number of samples needed to train the classifier in each stixel. If the sample count does not satisfy this, a dummy model is fitted instead.
Returns:
tuple[Union[None, BaseEstimator], list, str]: trained_model, stixel_specific_x_names, status message (e.g. "Success")
Expand All @@ -61,8 +63,11 @@ def train_one_stixel(
if nan_count > 0:
return (None, [], "Contain_Nan")

sample_count_each_class = {i:np.sum(np.where(sub_y_train > 0, 1, 0)==i) for i in unique_sub_y_train_binary}
min_sample_count_each_class = min([sample_count_each_class[i] for i in sample_count_each_class])

# fit
if (not task == "regression") and (len(unique_sub_y_train_binary) == 1):
if (not task == "regression") and ((len(unique_sub_y_train_binary) == 1) or min_sample_count_each_class < min_class_sample):
trained_model = dummy_model1(float(unique_sub_y_train_binary[0]))
return (trained_model, [], "Success")
else:
Expand All @@ -84,23 +89,15 @@ def train_one_stixel(
if (not task == "regression") and sample_weights_for_classifier:
sample_weights = class_weight.compute_sample_weight(
class_weight="balanced", y=np.where(sub_y_train > 0, 1, 0)
)

try:
trained_model.fit(sub_X_train[stixel_specific_x_names], sub_y_train, sample_weight=sample_weights)

except Exception as e:
print(e)
# raise
return (None, [], "Base_model_fitting_error(non-regression, balanced weight)")
).astype('float32')
class_weights = class_weight.compute_class_weight(
class_weight="balanced", classes=np.array([0,1]), y=np.where(sub_y_train > 0, 1, 0)
).astype('float32')
trained_model.fit(sub_X_train[stixel_specific_x_names], sub_y_train, sample_weight=sample_weights)
trained_model.my_class_weights = class_weights

else:
try:
trained_model.fit(sub_X_train[stixel_specific_x_names], sub_y_train)

except Exception as e:
print(e)
# raise
return (None, [], "Base_model_fitting_error(regression)")
trained_model.fit(sub_X_train[stixel_specific_x_names], sub_y_train)

return (trained_model, stixel_specific_x_names, "Success")

Expand Down Expand Up @@ -434,17 +431,20 @@ def predict_one_stixel(
X_test_stixel: pd.core.frame.DataFrame,
task: str,
model_x_names_tuple: Tuple[Union[None, BaseEstimator], list],
**base_model_prediction_param
) -> pd.core.frame.DataFrame:
"""predict_one_stixel
Args:
X_test_stixel (pd.core.frame.DataFrame): Input testing variables
task (str): One of 'regression', 'classification' and 'hurdle'
model_x_names_tuple (tuple[Union[None, BaseEstimator], list]): A tuple of (model, stixel_specific_x_names)
base_model_prediction_param: Additional parameter passed to base_model.predict_proba or base_model.predict
Returns:
A Dataframe of predicted results. With 'index' the same as the input indexes.
"""

if model_x_names_tuple[0] is None:
return None

Expand All @@ -453,9 +453,11 @@ def predict_one_stixel(

# get test data
if task == "regression":
pred = model_x_names_tuple[0].predict(np.array(X_test_stixel[model_x_names_tuple[1]]))
pred = model_x_names_tuple[0].predict(X_test_stixel[model_x_names_tuple[1]])
else:
pred = model_x_names_tuple[0].predict_proba(np.array(X_test_stixel[model_x_names_tuple[1]]))[:, 1]
pred = model_x_names_tuple[0].predict_proba(X_test_stixel[model_x_names_tuple[1]], **base_model_prediction_param)
pred = pred[:,1]


res = pd.DataFrame({"index": list(X_test_stixel.index), "pred": np.array(pred).flatten()}).set_index("index")

Expand Down
2 changes: 2 additions & 0 deletions stemflow/utils/plot_gif.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,8 @@ def round_to_same_decimal_places(A, B):
if log_scale
else np.max(data[col].values)
)

print(vmin, vmax)
norm = Normalize(vmin=vmin, vmax=vmax)

# Prepare colormap
Expand Down
Loading

0 comments on commit b19de92

Please sign in to comment.