pre-release fixes in extractors and industrial repo init
v1docq committed Dec 22, 2023
1 parent 419dcb8 commit 2a79ed6
Showing 9 changed files with 200 additions and 46 deletions.
15 changes: 15 additions & 0 deletions benchmark/abstract_bench.py
@@ -58,6 +58,8 @@ def evaluate_loop(self, dataset, experiment_setup: dict = None):
matplotlib.use('TkAgg')
train_data, test_data = DataLoader(dataset_name=dataset).load_data()
experiment_setup['output_folder'] = experiment_setup['output_folder'] + f'/{dataset}'
if 'tuning_params' in experiment_setup.keys():
del experiment_setup['tuning_params']
model = FedotIndustrial(**experiment_setup)
model.fit(train_data)
prediction = model.predict(test_data)
@@ -71,6 +73,19 @@ def evaluate_loop(self, dataset, experiment_setup: dict = None):
print('No_visualisation')
gc.collect()
return prediction, model.predict_data.target

def finetune_loop(self, dataset, experiment_setup: dict = None):
train_data, test_data = DataLoader(dataset_name=dataset).load_data()
experiment_setup['output_folder'] = experiment_setup['output_folder'] + f'/{dataset}'
tuning_params = experiment_setup['tuning_params']
del experiment_setup['tuning_params']
model = FedotIndustrial(**experiment_setup)
model.load(path=experiment_setup['output_folder'] + '/0_pipeline_saved')
model.finetune(train_data, tuning_params=tuning_params)
prediction = model.finetune_predict(test_data)
gc.collect()
return prediction, model.predict_data.target
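
A side note on the setup handling in finetune_loop: reading 'tuning_params' and then deleting it can be collapsed into one dict.pop call. A minimal equivalent sketch, using the same names as above:

tuning_params = experiment_setup.pop('tuning_params')  # read and remove in one step
model = FedotIndustrial(**experiment_setup)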

def collect_results(self, output_dir):
"""Collect the results of the benchmark.
27 changes: 25 additions & 2 deletions benchmark/benchmark_TSC.py
@@ -55,7 +55,8 @@ def run(self):
metric = Accuracy(target, prediction).metric()
metric_dict.update({dataset_name: metric})
basic_results.loc[dataset_name, 'Fedot_Industrial'] = metric
- dataset_path = os.path.join(self.experiment_setup['output_folder'], f'{dataset_name}', 'metrics_report.csv')
+ dataset_path = os.path.join(self.experiment_setup['output_folder'], f'{dataset_name}',
+                             'metrics_report.csv')
basic_results.to_csv(dataset_path)
except Exception:
print('Skip dataset')
@@ -64,6 +65,29 @@ def run(self):
basic_results.to_csv(basic_path)
self.logger.info("Benchmark test finished")

def finetune(self):
self.logger.info('Benchmark finetune started')
for dataset_name in self.custom_datasets:
try:
composed_model_path = PROJECT_PATH + self.path_to_save + f'/{dataset_name}' + '/0_pipeline_saved'
if os.path.isdir(composed_model_path):
self.experiment_setup['output_folder'] = PROJECT_PATH + self.path_to_save
experiment_setup = deepcopy(self.experiment_setup)
prediction, target = self.finetune_loop(dataset_name, experiment_setup)
metric = Accuracy(target, prediction).metric()
dataset_path = os.path.join(self.experiment_setup['output_folder'], f'{dataset_name}',
'metrics_report.csv')
fedot_results = pd.read_csv(dataset_path, index_col=0)
fedot_results.loc[dataset_name, 'Fedot_Industrial_finetuned'] = metric

fedot_results.to_csv(dataset_path)
else:
print(f"No composed model for dataset - {dataset_name}")
except Exception:
print('Skip dataset')
gc.collect()
self.logger.info("Benchmark finetune finished")

def load_local_basic_results(self, path: str = None):
if path is None:
path = PROJECT_PATH + self.path_to_result
@@ -83,4 +107,3 @@ def load_web_results(self):
sota_results = get_estimator_results(estimators=sota_estimators['classification'].values.tolist())
sota_results_df = pd.DataFrame(sota_results)
return sota_results_df

74 changes: 39 additions & 35 deletions examples/benchmark/time_series_uni_clf_benchmark.py
@@ -27,6 +27,9 @@
'initial_assumption': None,
'max_pipeline_fit_time': 10,
'with_tuning': False,
'tuning_params': {'tuning_timeout': 10,
'tuning_iterations': 1000,
'tuning_early_stop': 50},
'early_stopping_iterations': 5,
'early_stopping_timeout': 90,
'optimizer': IndustrialEvoOptimizer}
@@ -36,41 +39,41 @@
custom_datasets=[
# "Beef",
# "BeetleFly",
"BirdChicken",
"BME",
"Car",
"CBF",
"Chinatown",
"ChlorineConcentration",
"CinCECGTorso",
"Coffee",
"Computers",
"CricketX",
"CricketY",
"CricketZ",
"Crop",
"DiatomSizeReduction",
"DistalPhalanxOutlineCorrect",
"DistalPhalanxOutlineAgeGroup",
"DistalPhalanxTW",
"Earthquakes",
"ECG200",
"ECG5000",
"ECGFiveDays",
"ElectricDevices",
"EOGHorizontalSignal",
"EOGVerticalSignal",
"EthanolLevel",
"FaceAll",
"FaceFour",
"FacesUCR",
"FiftyWords",
"Fish",
"FordA",
"FordB",
"FreezerRegularTrain",
"FreezerSmallTrain",
"Fungi",
# "BirdChicken",
# "BME",
# "Car",
# "CBF",
# "Chinatown",
# "ChlorineConcentration",
# "CinCECGTorso",
# "Coffee",
# "Computers",
# "CricketX",
# "CricketY",
# "CricketZ",
# "Crop",
# "DiatomSizeReduction",
# "DistalPhalanxOutlineCorrect",
# "DistalPhalanxOutlineAgeGroup",
# "DistalPhalanxTW",
# "Earthquakes",
# "ECG200",
# "ECG5000",
# "ECGFiveDays",
# "ElectricDevices",
# "EOGHorizontalSignal",
# "EOGVerticalSignal",
# "EthanolLevel",
# "FaceAll",
# "FaceFour",
# "FacesUCR",
# "FiftyWords",
# "Fish",
# "FordA",
# "FordB",
# "FreezerRegularTrain",
# "FreezerSmallTrain",
# "Fungi",
"GunPoint",
"GunPointAgeSpan",
"GunPointMaleVersusFemale",
@@ -147,3 +150,4 @@
],
use_small_datasets=True)
benchmark.run()
#benchmark.finetune()
52 changes: 49 additions & 3 deletions fedot_ind/api/main.py
@@ -1,4 +1,5 @@
import logging
from functools import partial
from pathlib import Path
from typing import Tuple, Union

@@ -7,7 +8,10 @@
from fedot.api.main import Fedot
from fedot.core.pipelines.pipeline import Pipeline
from fedot.core.pipelines.pipeline_builder import PipelineBuilder

from fedot.core.pipelines.tuning.tuner_builder import TunerBuilder
from fedot.core.repository.quality_metrics_repository import ClassificationMetricsEnum
from golem.core.tuning.simultaneous import SimultaneousTuner
from golem.core.tuning.sequential import SequentialTuner
from fedot_ind.api.utils.checkers_collections import DataCheck
from fedot_ind.api.utils.path_lib import default_path_to_save_results
from fedot_ind.core.operation.transformation.splitter import TSTransformer
@@ -98,7 +102,6 @@ def _preprocessing_strategy(self, input_data):
self.logger.info(f'Dataset size before preprocessing - {input_data.features.shape}')
self.logger.info('PCA transformation was applied to input data due to dataset size')
if len(input_data.features.shape) == 3:
- #self.preprocessing_model = PipelineBuilder().add_node('eigen_basis', params={'tensor_approximation': True}).build()
self.preprocessing_model = PipelineBuilder().add_node('pca', params={'n_components': 0.9}).build()
else:
self.preprocessing_model = PipelineBuilder().add_node('pca', params={'n_components': 0.9}).build()
@@ -161,6 +164,48 @@ def predict_proba(self, predict_data, **kwargs) -> np.ndarray:
self.logger.info(f'Test Dataset size after preprocessing - {self.predict_data.features.shape}')
return self.solver.predict_proba(self.predict_data)

def finetune(self, train_data, tuning_params) -> None:
"""
Method to fine-tune the composed Industrial pipeline via hyperparameter tuning.
Args:
train_data: raw train data
tuning_params: dict with tuning settings ('tuning_timeout', 'tuning_iterations', 'tuning_early_stop')
"""
train_data = DataCheck(input_data=train_data, task=self.config_dict['problem']).check_input_data()
if train_data.num_classes > 2:
metric = ClassificationMetricsEnum.f1
else:
metric = ClassificationMetricsEnum.accuracy
# alternative: tuning_method = partial(SequentialTuner, inverse_node_order=True)
tuning_method = SimultaneousTuner
pipeline_tuner = TunerBuilder(train_data.task) \
.with_tuner(tuning_method) \
.with_metric(metric) \
.with_timeout(tuning_params['tuning_timeout']) \
.with_early_stopping_rounds(tuning_params['tuning_early_stop']) \
.with_iterations(tuning_params['tuning_iterations']) \
.build(train_data)
self.current_pipeline = pipeline_tuner.tune(self.current_pipeline)
self.current_pipeline.fit(train_data)

def finetune_predict(self, test_data) -> np.ndarray:
"""
Method to obtain predictions from the fine-tuned Industrial model.
Args:
test_data: raw test data
Returns:
the array with predicted labels
"""
self.predict_data = DataCheck(input_data=test_data, task=self.config_dict['problem']).check_input_data()
return self.current_pipeline.predict(self.predict_data, 'labels').predict
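
Taken together, load, finetune and finetune_predict form the new fine-tuning entry point added in this commit. A minimal usage sketch, assuming a pipeline was already composed and saved (paths and tuning values are illustrative):

model = FedotIndustrial(**experiment_setup)
model.load(path=experiment_setup['output_folder'] + '/0_pipeline_saved')
model.finetune(train_data, tuning_params={'tuning_timeout': 10,
                                          'tuning_iterations': 1000,
                                          'tuning_early_stop': 50})
labels = model.finetune_predict(test_data)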

def get_metrics(self, **kwargs) -> dict:
"""
Method to obtain quality metrics
@@ -210,7 +255,8 @@ def load(self, path):
path (str): path to the model
"""
- raise NotImplementedError()
+ self.current_pipeline = Pipeline(use_input_preprocessing=self.solver.params.get('use_input_preprocessing'))
+ self.current_pipeline.load(path)

def save_optimization_history(self, **kwargs):
"""Plot prediction of the model"""
2 changes: 1 addition & 1 deletion fedot_ind/core/models/base_extractor.py
@@ -106,7 +106,7 @@ def get_statistical_features(self,

for method in list_of_methods:
try:
- features.append(method[1](time_series))
+ features.append(method[1](time_series).reshape(-1,1))
names.append(method[0])
except ValueError:
continue
2 changes: 1 addition & 1 deletion fedot_ind/core/models/signal/signal_extractor.py
@@ -24,7 +24,7 @@ def __init__(self, params: Optional[OperationParameters] = None):

def _transform(self, input_data: InputData) -> np.array:
wavelet_basis = self.wavelet_basis({'n_components': self.n_components,
-                                    'wavelet': self.wavelet})
+                                    'wavelet':self.wavelet})
transformed_features = wavelet_basis.transform(input_data)
predict = self._clean_predict(transformed_features.predict)
return predict
@@ -77,9 +77,10 @@ def _transform(self, input_data: Union[InputData, pd.DataFrame]) -> np.array:
features = np.array(ListMonad(*input_data.features.tolist()).value)
else:
features = np.array(ListMonad(*input_data.tolist()).value)
#features = np.array([series[~np.isnan(series)] for series in features])
if len(features.shape) == 2 and features.shape[1] == 1:
features = features.reshape(1, -1)
elif len(features.shape) == 3 and features.shape[1] == 1:
features = features.squeeze()
parallel = Parallel(n_jobs=self.n_processes, verbose=0, pre_dispatch="2*n_jobs")
v = parallel(delayed(self._transform_one_sample)(sample) for sample in features)
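
The reshapes above normalise the input to a 2-D array of series before the parallel map: a single-column 2-D array is treated as one long series, while a 3-D array with a singleton channel axis is squeezed back to 2-D. A small illustrative check with numpy (shapes are arbitrary):

import numpy as np

single = np.zeros((100, 1))        # one series of length 100
assert single.reshape(1, -1).shape == (1, 100)

batch = np.zeros((32, 1, 100))     # 32 univariate series
assert batch.squeeze().shape == (32, 100)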

48 changes: 48 additions & 0 deletions fedot_ind/core/repository/initializer_industrial_models.py
@@ -3,10 +3,16 @@
from enum import Enum
from typing import Sequence
from random import choice, sample
from typing import List, Iterable, Union, Optional

import numpy as np

from fedot.core.data.array_utilities import atleast_4d

from fedot.api.api_utils.api_composer import ApiComposer
from fedot.api.api_utils.api_params_repository import ApiParamsRepository
from fedot.core.composer.gp_composer.specific_operators import parameter_change_mutation, boosting_mutation
from fedot.core.data.merge.data_merger import ImageDataMerger
from fedot.core.pipelines.adapters import PipelineAdapter
from fedot.core.pipelines.node import PipelineNode
from fedot.core.pipelines.pipeline import Pipeline
@@ -309,6 +315,46 @@ def has_no_data_flow_conflicts_in_industrial_pipeline(pipeline: Pipeline):
return True


def preprocess_predicts(*args) -> List[np.array]:
predicts = args[1]
if len(predicts[0].shape) <= 3:
return predicts
else:
reshaped_predicts = list(map(atleast_4d, predicts))

# And check image sizes
img_wh = [predict.shape[1:3] for predict in reshaped_predicts]
invalid_sizes = len(set(img_wh)) > 1 # Can merge only images of the same size
if invalid_sizes:
raise ValueError("Can't merge images of different sizes: " + str(img_wh))
return reshaped_predicts


def merge_predicts(*args) -> np.array:
predicts = args[1]
predicts = [x.reshape(-1, 1) if len(x.shape) == 1 else x for x in predicts]

channel_shape, elem_shape = [(x.shape[1], x.shape[2]) if len(x.shape) > 2 else (1, x.shape[0]) for x in predicts][0]

channel_concat = [x.shape[1] == channel_shape if len(x.shape) > 2
                  else 1 == channel_shape for x in predicts]

element_wise_concat = [x.shape[2] == elem_shape if len(x.shape) > 2
else x.shape[1] == elem_shape for x in predicts]

if all(channel_concat) and all(element_wise_concat):
try:
return np.concatenate(predicts, axis=1)
except Exception:
return np.concatenate(predicts, axis=0)
elif not all(channel_concat) and not all(element_wise_concat):
prediction_2d = np.concatenate([x.reshape(x.shape[0], x.shape[1] * x.shape[2]) if len(x.shape) > 2
else x for x in predicts], axis=1)
return prediction_2d.reshape(prediction_2d.shape[0], 1, prediction_2d.shape[1])
else:
return np.concatenate(predicts, axis=1)
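
To make the branching in merge_predicts concrete: when every predict agrees on both the channel and the element dimension, the arrays are stacked along the channel axis; when both dimensions disagree, each predict is flattened to 2-D, concatenated feature-wise, and restored as a single-channel 3-D array. A small worked example with illustrative shapes:

import numpy as np

a = np.ones((16, 3, 8))
b = np.ones((16, 3, 8))
np.concatenate([a, b], axis=1).shape  # (16, 6, 8): both dims match, channel concat

c = np.ones((16, 2, 4))               # neither dim matches `a`
flat = np.concatenate([x.reshape(x.shape[0], -1) for x in (a, c)], axis=1)
flat.reshape(flat.shape[0], 1, flat.shape[1]).shape  # (16, 1, 32)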


class IndustrialModels:
def __init__(self):
self.industrial_data_operation_path = pathlib.Path(PROJECT_PATH, 'fedot_ind',
@@ -341,6 +387,8 @@ def setup_repository(self):

setattr(PipelineSearchSpace, "get_parameters_dict", get_industrial_search_space)
setattr(ApiParamsRepository, "_get_default_mutations", _get_default_industrial_mutations)
setattr(ImageDataMerger, "preprocess_predicts", preprocess_predicts)
setattr(ImageDataMerger, "merge_predicts", merge_predicts)
class_rules.append(has_no_data_flow_conflicts_in_industrial_pipeline)
MutationStrengthEnum = MutationStrengthEnumIndustrial
# common_rules.append(has_no_data_flow_conflicts_in_industrial_pipeline)
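
setup_repository swaps FEDOT behaviour at runtime by assigning replacement functions onto the imported classes, which is why module-level functions such as merge_predicts receive self through *args. A minimal sketch of the same setattr monkey-patching pattern with a toy class (not the real FEDOT API):

class Merger:
    def merge_predicts(self, predicts):
        return predicts[0]            # stock behaviour

def industrial_merge(self, predicts):
    return sum(predicts)              # replacement behaviour

setattr(Merger, 'merge_predicts', industrial_merge)
assert Merger().merge_predicts([1, 2, 3]) == 6  # every instance now uses the patch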
23 changes: 20 additions & 3 deletions fedot_ind/core/tuning/search_space.py
@@ -11,7 +11,7 @@
'wavelet_basis':
{'n_components': {'hyperopt-dist': hp.uniformint, 'sampling-scope': [2, 10]},
'wavelet': {'hyperopt-dist': hp.choice,
-              'sampling-scope': [['mexh', 'shan', 'morl', 'cmor', 'fbsp', 'db5', 'sym5']]}},
+              'sampling-scope': [['mexh', 'morl', 'db5', 'sym5']]}},
'fourier_basis':
{'threshold': {'hyperopt-dist': hp.uniformint, 'sampling-scope': [10000, 50000]}},
'topological_extractor':
@@ -26,8 +26,9 @@
# 'rec_metric': (hp.choice, [['chebyshev', 'cosine', 'euclidean', 'mahalanobis']])
},
'signal_extractor':
- {'wavelet': {'hyperopt-dist': hp.choice,
-              'sampling-scope': [['mexh', 'shan', 'morl', 'cmor', 'fbsp', 'db5', 'sym5']]}},
+ {'n_components': {'hyperopt-dist': hp.uniformint, 'sampling-scope': [2, 10]},
+  'wavelet': {'hyperopt-dist': hp.choice,
+              'sampling-scope': [['mexh', 'morl', 'db5', 'sym5']]}},
'minirocket_extractor':
{'num_features': {'hyperopt-dist': hp.choice, 'sampling-scope': [[x for x in range(5000, 20000, 1000)]]}},
'patch_tst_model':
@@ -320,6 +321,22 @@ def get_industrial_search_space(self):
'sampling-scope': [1, 365],
'type': 'discrete'}
},
'mlp': {
'hidden_layer_sizes': {
'hyperopt-dist': hp.choice,
'sampling-scope': [[(256, 128, 64, 32), (1028, 512, 64,)]],
'type': 'categorical'},
'activation': {
'hyperopt-dist': hp.choice,
'sampling-scope': [['logistic', 'tanh', 'relu']],
'type': 'categorical'},
'max_iter': {'hyperopt-dist': hp.uniformint,
'sampling-scope': [1000, 2000],
'type': 'discrete'},
'learning_rate': {'hyperopt-dist': hp.choice,
'sampling-scope': [['constant', 'adaptive']],
'type': 'categorical'}
},
'ar': {
'lag_1': {
'hyperopt-dist': hp.uniform,
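
Each search-space entry couples a hyperopt distribution with its sampling scope. A minimal sketch of drawing one random configuration from the new 'mlp' block (assuming the hyperopt package; the labels are arbitrary):

from hyperopt import hp
from hyperopt.pyll.stochastic import sample

mlp_space = {
    'hidden_layer_sizes': hp.choice('hls', [(256, 128, 64, 32), (1028, 512, 64)]),
    'activation': hp.choice('act', ['logistic', 'tanh', 'relu']),
    'max_iter': hp.uniformint('iter', 1000, 2000),
    'learning_rate': hp.choice('lr', ['constant', 'adaptive']),
}
print(sample(mlp_space))  # one candidate, e.g. {'activation': 'tanh', 'max_iter': 1542, ...}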
