Merge pull request #70 from chenyangkang/beta

Reconcile for occupancy model #68
chenyangkang · Nov 20, 2024 · b19de92 · b19de92
2 parents aae2122 + 3029c1e
commit b19de92
Show file tree

Hide file tree

Showing 11 changed files with 395 additions and 130 deletions.
diff --git a/.github/workflows/run_pytest.yml b/.github/workflows/run_pytest.yml
@@ -7,7 +7,7 @@ on:
 
 jobs:
   pytesting:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-20.04
     strategy:
       fail-fast: true
       matrix:
@@ -16,7 +16,7 @@ jobs:
     - name: Set Swap Space
       uses: pierotofy/set-swap-space@master
       with:
-        swap-size-gb: 10
+        swap-size-gb: 20
     - uses: actions/checkout@v4
     - name: Set up Python ${{ matrix.python-version }}
       uses: actions/setup-python@v5
@@ -33,8 +33,8 @@ jobs:
 
     - name: Run pytest
       run: |
-        pytest -n auto --cov --no-cov-on-fail --cov-report=term-missing:skip-covered --cov-report xml:coverage.xml
-      timeout-minutes: 300  # Set the timeout to 1.5 hours for this step
+        pytest -n auto --cov --no-cov-on-fail --cov-report=term-missing:skip-covered --cov-report xml:coverage.xml -v
+      timeout-minutes: 1000  # Set the timeout to 1000 minutes (~16.7 hours) for this step
 
     - name: Upload coverage reports to Codecov
       uses: codecov/codecov-action@v4

diff --git a/stemflow/model/AdaSTEM.py b/stemflow/model/AdaSTEM.py
diff --git a/stemflow/model/Hurdle.py b/stemflow/model/Hurdle.py
@@ -50,7 +50,7 @@ def fit(self, X_train: Union[pd.core.frame.DataFrame, np.ndarray], y_train: Sequ
         """
         binary_ = np.unique(np.where(y_train > 0, 1, 0))
         if len(binary_) == 1:
-            warnings.warn("Warning: only one class presented. Replace with dummy classifier & regressor.")
+            # warnings.warn("Warning: only one class presented. Replace with dummy classifier & regressor.")
             self.classifier = dummy_model1(binary_[0])
             self.regressor = dummy_model1(binary_[0])
             return
@@ -88,7 +88,7 @@ def predict(self, X_test: Union[pd.core.frame.DataFrame, np.ndarray]) -> np.ndar
         reg_res = self.regressor.predict(X_test)
         # reg_res = np.where(reg_res>=0, reg_res, 0) ### we constrain the reg value to be positive
         res = np.where(cls_res > 0, reg_res, cls_res)
-        return res.reshape(-1, 1)
+        return res.flatten()
 
     def predict_proba(self, X_test: Union[pd.core.frame.DataFrame, np.ndarray]) -> np.ndarray:
         """Predicting probability
@@ -253,7 +253,7 @@ def predict(
         # reg_res = np.where(reg_res>=0, reg_res, 0) ### we constrain the reg value to be positive
         res = np.where(cls_res < 0.5, 0, cls_res)
         res = np.where(cls_res > 0.5, reg_res, cls_res)
-        return res.reshape(-1, 1)
+        return res.flatten()
 
     def predict_proba(
         self,

diff --git a/stemflow/model/STEM.py b/stemflow/model/STEM.py
@@ -48,7 +48,8 @@ def __init__(
         plot_empty: bool = False,
         completely_random_rotation: bool = False,
         lazy_loading: bool = False,
-        lazy_loading_dir: Union[str, None] = None
+        lazy_loading_dir: Union[str, None] = None,
+        min_class_sample: int = 1
     ):
         """Make a STEM object
 
@@ -118,6 +119,8 @@ def __init__(
                 If True, ensembles of models will be saved to disk and only loaded when they are used (e.g., in the prediction phase); each ensemble is dumped back to disk once it has been used.
             lazy_loading_dir:
                 If lazy_loading, the directory in which the models are temporarily saved. Defaults to None, in which case a random number will be generated as the folder name.
+            min_class_sample:
+                Minimum number of samples required for each class in order to train the classifier in each stixel. If any class has fewer samples than this, a dummy model is fitted instead. This parameter does not influence regression tasks.
 
         Raises:
             AttributeError: Base model do not have method 'fit' or 'predict'
@@ -175,7 +178,8 @@ def __init__(
             plot_empty=plot_empty,
             completely_random_rotation=completely_random_rotation,
             lazy_loading=lazy_loading,
-            lazy_loading_dir=lazy_loading_dir
+            lazy_loading_dir=lazy_loading_dir,
+            min_class_sample=min_class_sample
         )
 
         self.grid_len = grid_len
@@ -240,7 +244,8 @@ def __init__(
         plot_empty: bool = False,
         completely_random_rotation: bool = False,
         lazy_loading: bool = False,
-        lazy_loading_dir: Union[str, None] = None
+        lazy_loading_dir: Union[str, None] = None,
+        min_class_sample: int = 1
     ):
         super().__init__(
             base_model=base_model,
@@ -272,7 +277,8 @@ def __init__(
             plot_empty=plot_empty,
             completely_random_rotation=completely_random_rotation,
             lazy_loading=lazy_loading,
-            lazy_loading_dir=lazy_loading_dir
+            lazy_loading_dir=lazy_loading_dir,
+            min_class_sample=min_class_sample
         )
 
         self.grid_len = grid_len
@@ -337,7 +343,8 @@ def __init__(
         plot_empty: bool = False,
         completely_random_rotation: bool = False,
         lazy_loading: bool = False,
-        lazy_loading_dir: Union[str, None] = None
+        lazy_loading_dir: Union[str, None] = None,
+        min_class_sample: int = 1
     ):
         super().__init__(
             base_model=base_model,
@@ -369,7 +376,8 @@ def __init__(
             plot_empty=plot_empty,
             completely_random_rotation=completely_random_rotation,
             lazy_loading=lazy_loading,
-            lazy_loading_dir=lazy_loading_dir
+            lazy_loading_dir=lazy_loading_dir,
+            min_class_sample=min_class_sample
         )
 
         self.grid_len = grid_len
diff --git a/stemflow/model/SphereAdaSTEM.py b/stemflow/model/SphereAdaSTEM.py
@@ -85,7 +85,8 @@ def __init__(
         plot_empty: bool = False,
         radius: float = 6371.0,
         lazy_loading: bool = False,
-        lazy_loading_dir: Union[str, None] = None
+        lazy_loading_dir: Union[str, None] = None,
+        min_class_sample: int = 1
     ):
         """Make a Spherical AdaSTEM object
 
@@ -157,8 +158,9 @@ def __init__(
                 If True, ensembles of models will be saved to disk and only loaded when they are used (e.g., in the prediction phase); each ensemble is dumped back to disk once it has been used.
             lazy_loading_dir:
                 If lazy_loading, the directory in which the models are temporarily saved. Defaults to None, in which case a random number will be generated as the folder name.
-
-
+            min_class_sample:
+                Minimum number of samples required for each class in order to train the classifier in each stixel. If any class has fewer samples than this, a dummy model is fitted instead. This parameter does not influence regression tasks.
+                
         Raises:
             AttributeError: Base model do not have method 'fit' or 'predict'
             AttributeError: task not in one of ['regression', 'classification', 'hurdle']
@@ -214,7 +216,8 @@ def __init__(
             verbosity=verbosity,
             plot_empty=plot_empty,
             lazy_loading=lazy_loading,
-            lazy_loading_dir=lazy_loading_dir
+            lazy_loading_dir=lazy_loading_dir,
+            min_class_sample=min_class_sample
         )
 
         if not self.Spatio1 == "longitude":
@@ -550,7 +553,8 @@ def __init__(
         verbosity=0,
         plot_empty=False,
         lazy_loading=False,
-        lazy_loading_dir=None
+        lazy_loading_dir=None,
+        min_class_sample: int = 1
     ):
         super().__init__(
             base_model=base_model,
@@ -581,7 +585,8 @@ def __init__(
             verbosity=verbosity,
             plot_empty=plot_empty,
             lazy_loading=lazy_loading,
-            lazy_loading_dir=lazy_loading_dir
+            lazy_loading_dir=lazy_loading_dir,
+            min_class_sample=min_class_sample
         )
 
         self.predict = MethodType(AdaSTEMClassifier.predict, self)
@@ -641,7 +646,8 @@ def __init__(
         verbosity=0,
         plot_empty=False,
         lazy_loading=False,
-        lazy_loading_dir=None
+        lazy_loading_dir=None,
+        min_class_sample: int = 1
     ):
         super().__init__(
             base_model=base_model,
@@ -672,5 +678,8 @@ def __init__(
             verbosity=verbosity,
             plot_empty=plot_empty,
             lazy_loading=lazy_loading,
-            lazy_loading_dir=lazy_loading_dir
+            lazy_loading_dir=lazy_loading_dir,
+            min_class_sample=min_class_sample
         )
+
+        self.predict = MethodType(AdaSTEMRegressor.predict, self)
diff --git a/stemflow/model/dummy_model.py b/stemflow/model/dummy_model.py
@@ -25,8 +25,9 @@ def predict(self, X_test):
         """Fake predict"""
         return np.array([self.the_value] * X_test.shape[0])
 
-    def predict_proba(self, X_test):
+    def predict_proba(self, X_test, **additional_parameters_for_base_model):
         """Fake predict_proba"""
+
         if self.the_value == 0:
             return np.array([[1, 0]] * X_test.shape[0])
         elif self.the_value == 1:

diff --git a/stemflow/model/static_func_AdaSTEM.py b/stemflow/model/static_func_AdaSTEM.py
@@ -33,6 +33,7 @@ def train_one_stixel(
     sample_weights_for_classifier: bool,
     subset_x_names: bool,
     stixel_X_train: pd.core.frame.DataFrame,
+    min_class_sample: int,
 ) -> Tuple[Union[None, BaseEstimator], list]:
     """Train one stixel
 
@@ -44,6 +45,7 @@ def train_one_stixel(
         sample_weights_for_classifier (bool): Whether to balance the sample weights in classifier for imbalanced samples.
         subset_x_names (bool): Whether to only store variables with std > 0 for each stixel.
         stixel_X_train (pd.core.frame.DataFrame): Input training dataframe for THE stixel.
+        min_class_sample (int): Minimum number of samples required for each class in order to train the classifier in each stixel. If any class has fewer samples than this, a dummy model is fitted instead.
 
     Returns:
         tuple[Union[None, BaseEstimator], list]: trained_model, stixel_specific_x_names
@@ -61,8 +63,11 @@ def train_one_stixel(
     if nan_count > 0:
         return (None, [], "Contain_Nan")
 
+    sample_count_each_class = {i:np.sum(np.where(sub_y_train > 0, 1, 0)==i) for i in unique_sub_y_train_binary}
+    min_sample_count_each_class = min([sample_count_each_class[i] for i in sample_count_each_class])
+
     # fit
-    if (not task == "regression") and (len(unique_sub_y_train_binary) == 1):
+    if (not task == "regression") and ((len(unique_sub_y_train_binary) == 1) or min_sample_count_each_class < min_class_sample):
         trained_model = dummy_model1(float(unique_sub_y_train_binary[0]))
         return (trained_model, [], "Success")
     else:
@@ -84,23 +89,15 @@ def train_one_stixel(
         if (not task == "regression") and sample_weights_for_classifier:
             sample_weights = class_weight.compute_sample_weight(
                 class_weight="balanced", y=np.where(sub_y_train > 0, 1, 0)
-            )
-
-            try:
-                trained_model.fit(sub_X_train[stixel_specific_x_names], sub_y_train, sample_weight=sample_weights)
-
-            except Exception as e:
-                print(e)
-                # raise
-                return (None, [], "Base_model_fitting_error(non-regression, balanced weight)")
+            ).astype('float32')
+            class_weights = class_weight.compute_class_weight(
+                class_weight="balanced", classes=np.array([0,1]), y=np.where(sub_y_train > 0, 1, 0)
+            ).astype('float32')
+            trained_model.fit(sub_X_train[stixel_specific_x_names], sub_y_train, sample_weight=sample_weights)
+            trained_model.my_class_weights = class_weights
+
         else:
-            try:
-                trained_model.fit(sub_X_train[stixel_specific_x_names], sub_y_train)
-
-            except Exception as e:
-                print(e)
-                # raise
-                return (None, [], "Base_model_fitting_error(regression)")
+            trained_model.fit(sub_X_train[stixel_specific_x_names], sub_y_train)
 
     return (trained_model, stixel_specific_x_names, "Success")
 
@@ -434,17 +431,20 @@ def predict_one_stixel(
     X_test_stixel: pd.core.frame.DataFrame,
     task: str,
     model_x_names_tuple: Tuple[Union[None, BaseEstimator], list],
+    **base_model_prediction_param
 ) -> pd.core.frame.DataFrame:
     """predict_one_stixel
 
     Args:
         X_test_stixel (pd.core.frame.DataFrame): Input testing variables
         task (str): One of 'regression', 'classification' and 'hurdle'
         model_x_names_tuple (tuple[Union[None, BaseEstimator], list]): A tuple of (model, stixel_specific_x_names)
+        base_model_prediction_param: Additional parameter passed to base_model.predict_proba or base_model.predict
 
     Returns:
         A Dataframe of predicted results. With 'index' the same as the input indexes.
     """
+
     if model_x_names_tuple[0] is None:
         return None
 
@@ -453,9 +453,11 @@ def predict_one_stixel(
 
     # get test data
     if task == "regression":
-        pred = model_x_names_tuple[0].predict(np.array(X_test_stixel[model_x_names_tuple[1]]))
+        pred = model_x_names_tuple[0].predict(X_test_stixel[model_x_names_tuple[1]])
     else:
-        pred = model_x_names_tuple[0].predict_proba(np.array(X_test_stixel[model_x_names_tuple[1]]))[:, 1]
+        pred = model_x_names_tuple[0].predict_proba(X_test_stixel[model_x_names_tuple[1]], **base_model_prediction_param)
+        pred = pred[:,1]
+
 
     res = pd.DataFrame({"index": list(X_test_stixel.index), "pred": np.array(pred).flatten()}).set_index("index")
 

diff --git a/stemflow/utils/plot_gif.py b/stemflow/utils/plot_gif.py
@@ -122,6 +122,8 @@ def round_to_same_decimal_places(A, B):
             if log_scale
             else np.max(data[col].values)
         )
+
+    print(vmin, vmax)
     norm = Normalize(vmin=vmin, vmax=vmax)
 
     # Prepare colormap