diff --git a/examples/benchmark_example/time_series_multi_clf_benchmark.py b/examples/benchmark_example/time_series_multi_clf_benchmark.py
index 07c30cc64..30c84910a 100644
--- a/examples/benchmark_example/time_series_multi_clf_benchmark.py
+++ b/examples/benchmark_example/time_series_multi_clf_benchmark.py
@@ -33,7 +33,7 @@ if __name__ == "__main__":
     benchmark = BenchmarkTSC(experiment_setup=experiment_setup,
                              custom_datasets=[
-                                 'EthanolConcentration',
+                                 # 'EthanolConcentration',
                                  'Handwriting',
                                  'StandWalkJump',
                                  'EigenWorms',
diff --git a/fedot_ind/core/models/manifold/riemann_embeding.py b/fedot_ind/core/models/manifold/riemann_embeding.py
index f840491f3..8320401f2 100644
--- a/fedot_ind/core/models/manifold/riemann_embeding.py
+++ b/fedot_ind/core/models/manifold/riemann_embeding.py
@@ -79,18 +79,22 @@ def extract_riemann_features(self, input_data: InputData) -> InputData:
             SPD = self.shinkage.fit_transform(SPD)
             ref_point = self.tangent_projector.fit_transform(SPD)
             self.fit_stage = False
+            self.classes_ = np.unique(input_data.target)
         return ref_point

     def extract_centroid_distance(self, input_data: InputData):
-        self.classes_ = np.unique(input_data.target)
-        if not self.fit_stage:
-            SPD = self.covarince_transformer.transform(input_data.features)
+        input_data.target = input_data.target.astype(int)
+        if self.fit_stage:
+            SPD = self.covarince_transformer.fit_transform(input_data.features, input_data.target)
             SPD = self.shinkage.transform(SPD)
+        else:
-            SPD = self.covarince_transformer.fit_transform(input_data.features, input_data.target)
+            SPD = self.covarince_transformer.transform(input_data.features)
             SPD = self.shinkage.fit_transform(SPD)
-        self.covmeans_ = [mean_covariance(SPD[input_data.target.flatten() == ll], metric=self.covariance_metric)
-                          for ll in self.classes_]
+
+        self.covmeans_ = [mean_covariance(SPD[np.array(input_data.target == ll).flatten()],
+                                          metric=self.covariance_metric) for ll in self.classes_]

         n_centroids = len(self.covmeans_)
         dist = [distance(SPD, self.covmeans_[m], self.distance_metric) for m in range(n_centroids)]
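The hunk above inverts the fit/transform branches so covariance estimators are fitted only on training data, and captures classes_ at fit time. A minimal standalone sketch of the centroid-distance idea with pyriemann; the toy arrays X_train/y_train and the metric choices are illustrative, not taken from this repo:

    import numpy as np
    from pyriemann.estimation import Covariances
    from pyriemann.utils.mean import mean_covariance
    from pyriemann.utils.distance import distance

    X_train = np.random.default_rng(42).standard_normal((20, 3, 64))  # (n_trials, n_channels, n_times)
    y_train = np.array([0, 1] * 10)

    # fit stage: one SPD matrix per trial via a shrunk (Ledoit-Wolf) estimator
    SPD = Covariances(estimator='lwf').fit_transform(X_train)
    classes_ = np.unique(y_train)

    # one Riemannian mean (centroid) per class, as in extract_centroid_distance
    covmeans_ = [mean_covariance(SPD[y_train == c], metric='riemann') for c in classes_]

    # distance of every trial to every centroid -> feature matrix (n_trials, n_classes)
    dist = np.array([[distance(s, m, metric='riemann') for m in covmeans_] for s in SPD])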
diff --git a/fedot_ind/core/repository/industrial_implementations/abstract.py b/fedot_ind/core/repository/industrial_implementations/abstract.py
index 3fae8db5f..da126ada6 100644
--- a/fedot_ind/core/repository/industrial_implementations/abstract.py
+++ b/fedot_ind/core/repository/industrial_implementations/abstract.py
@@ -1,21 +1,159 @@
 from copy import copy
+from functools import partial
+
 import pandas as pd
+from fedot.core.constants import default_data_split_ratio_by_task
 from fedot.core.data.array_utilities import atleast_4d
-from fedot.core.data.merge.data_merger import DataMerger
+from fedot.core.data.cv_folds import cv_generator
+from fedot.core.data.data_split import _split_input_data_by_indexes
+from fedot.core.data.multi_modal import MultiModalData
 from fedot.core.operations.evaluation.operation_implementations.data_operations.ts_transformations import \
     transform_features_and_target_into_lagged
 from fedot.core.operations.operation_parameters import OperationParameters
+from fedot.core.optimisers.objective import DataSource
 from fedot.core.pipelines.tuning.tuner_builder import TunerBuilder
 from fedot.core.repository.dataset_types import DataTypesEnum
+from fedot.core.repository.tasks import TaskTypesEnum
 from fedot.preprocessing.data_types import TYPE_TO_ID
+from sklearn.model_selection import train_test_split

 from fedot_ind.core.architecture.preprocessing.data_convertor import NumpyConverter
 from fedot_ind.core.architecture.settings.computational import backend_methods as np
 from fedot_ind.core.repository.constanst_repository import FEDOT_HEAD_ENSEMBLE
 from typing import Optional, Tuple, Union, Sequence, List, Dict
-from golem.core.optimisers.timer import Timer
 from fedot.core.data.data import InputData, OutputData
-from fedot.core.pipelines.node import PipelineNode

+
+def split_any(data: InputData,
+              split_ratio: float,
+              shuffle: bool,
+              stratify: bool,
+              random_seed: int,
+              **kwargs):
+    """ Split any data except time series into train and test parts.
+
+    :param data: InputData object to split
+    :param split_ratio: share of train data, between 0 and 1
+    :param shuffle: whether the data should be shuffled
+    :param stratify: whether to make a stratified sample
+    :param random_seed: random seed used for shuffling
+    """
+    stratify_labels = data.target if stratify else None
+
+    def __split_loop(data, ratio, shuffle, stratify_labels):
+        train_ids, test_ids = train_test_split(np.arange(0, len(data.target)),
+                                               test_size=1 - ratio,
+                                               shuffle=shuffle,
+                                               random_state=random_seed,
+                                               stratify=stratify_labels)
+        train_data = _split_input_data_by_indexes(data, index=train_ids)
+        test_data = _split_input_data_by_indexes(data, index=test_ids)
+        correct_split = np.unique(test_data.target).shape[0] == np.unique(train_data.target).shape[0]
+        return train_data, test_data, correct_split
+
+    for ratio in [split_ratio, 0.6, 0.5, 0.4, 0.3, 0.1]:
+        train_data, test_data, correct_split = __split_loop(data, ratio, shuffle, stratify_labels)
+        if correct_split:
+            break
+    return train_data, test_data
+
+
+def _are_stratification_allowed(data: Union[InputData, MultiModalData], split_ratio: float) -> bool:
+    """ Check that stratification may be done.
+
+    :param data: data to split
+    :param split_ratio: ratio of train data length to the whole data length
+    :return bool: whether stratification is allowed
+    """
+    # stratification is only meaningful for classification tasks
+    return data.task.task_type is TaskTypesEnum.classification
+
+
+def _are_cv_folds_allowed(data: Union[InputData, MultiModalData], split_ratio: float, cv_folds: int) -> bool:
+    try:
+        # fast way
+        classes = np.unique(data.target, return_counts=True)
+    except Exception:
+        # slow way
+        from collections import Counter
+        classes = Counter(data.target)
+        classes = [list(classes), list(classes.values())]
+
+    # check that every class has enough labels for both samples
+    if not all(x > 1 for x in classes[1]):
+        if __debug__:
+            # tests often use very small datasets that are not suitable for data splitting;
+            # stratification is disabled for tests
+            return False
+        else:
+            raise ValueError(("There is only one value for some classes:"
+                              f" {', '.join(str(val) for val, count in zip(*classes) if count == 1)}."
+                              f" Data split can not be done for the {data.task.task_type.name} task."))

+    # check that the split ratio allows every class to appear in both samples
+    test_size = round(len(data.target) * (1. - split_ratio))
+    labels_count = len(classes[0])
+    if test_size < labels_count:
+        return None
+    else:
+        return cv_folds
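split_any retries the split with progressively smaller train ratios until both sides contain every class. A self-contained sketch of that retry strategy on plain numpy labels; the helper name split_with_retry is hypothetical:

    import numpy as np
    from sklearn.model_selection import train_test_split

    def split_with_retry(y, split_ratio=0.8, seed=42):
        # mirror the fallback ladder used by split_any
        for ratio in [split_ratio, 0.6, 0.5, 0.4, 0.3, 0.1]:
            train_ids, test_ids = train_test_split(np.arange(len(y)),
                                                   test_size=1 - ratio,
                                                   shuffle=True,
                                                   random_state=seed,
                                                   stratify=y)
            if np.unique(y[train_ids]).size == np.unique(y[test_ids]).size:
                return train_ids, test_ids   # every class present on both sides
        return train_ids, test_ids           # fall back to the last attempt

    y = np.array([0] * 18 + [1] * 2)         # heavily imbalanced toy labels
    train_ids, test_ids = split_with_retry(y)

With these labels the first attempt can leave the rare class out of the test fold, so the loop degrades the ratio until both folds see both classes.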
+
+
+def _build(self, data: Union[InputData, MultiModalData]) -> DataSource:
+    # define split_ratio
+    self.split_ratio = self.split_ratio or default_data_split_ratio_by_task[data.task.task_type]
+
+    # check cv_folds
+    if self.cv_folds is not None:
+        try:
+            self.cv_folds = int(self.cv_folds)
+        except ValueError:
+            raise ValueError(f"cv_folds is not an integer: {self.cv_folds}")
+        if self.cv_folds < 2:
+            self.cv_folds = None
+        elif self.cv_folds > data.target.shape[0] - 1:
+            raise ValueError((f"cv_folds ({self.cv_folds}) is greater than"
+                              f" the maximum allowed count {data.target.shape[0] - 1}"))
+
+    # calculate the number of validation blocks for time series forecasting
+    if data.task.task_type is TaskTypesEnum.ts_forecasting and self.validation_blocks is None:
+        self._propose_cv_folds_and_validation_blocks(data)
+
+    # check split_ratio
+    if self.cv_folds is None and not (0 < self.split_ratio < 1):
+        raise ValueError(f'split_ratio is {self.split_ratio} but should be between 0 and 1')
+
+    if self.stratify:
+        # check that stratification can be done;
+        # for cross-validation the split ratio is defined as validation_size / all_data_size
+        split_ratio = self.split_ratio if self.cv_folds is None else (1 - 1 / (self.cv_folds + 1))
+        self.stratify = _are_stratification_allowed(data, split_ratio)
+        self.cv_folds = _are_cv_folds_allowed(data, split_ratio, self.cv_folds)
+        if not self.stratify:
+            self.log.info("Stratified splitting of data is disabled.")
+
+    # stratification can not be done without shuffling
+    self.shuffle |= self.stratify
+
+    # the random seed only matters when shuffling
+    self.random_seed = (self.random_seed or 42) if self.shuffle else None
+
+    # split data
+    if self.cv_folds is not None:
+        self.log.info("K-fold cross-validation is applied.")
+        data_producer = partial(cv_generator,
+                                data=data,
+                                shuffle=self.shuffle,
+                                cv_folds=self.cv_folds,
+                                random_seed=self.random_seed,
+                                stratify=self.stratify,
+                                validation_blocks=self.validation_blocks)
+    else:
+        self.log.info("Hold-out validation is applied.")
+        data_producer = self._build_holdout_producer(data)
+
+    return data_producer
+

 def build_tuner(self, model_to_tune, tuning_params, train_data, mode):
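_build returns a data producer: either a cv_generator partial or a hold-out producer, both yielding (train, test) pairs so downstream code can iterate over them uniformly. A toy sketch of that producer pattern on plain lists; holdout_producer and cv_producer are illustrative stand-ins, not FEDOT APIs:

    from functools import partial

    def holdout_producer(data, split_ratio):
        n_train = int(len(data) * split_ratio)
        yield data[:n_train], data[n_train:]               # a single (train, test) pair

    def cv_producer(data, cv_folds):
        fold = len(data) // cv_folds
        for k in range(cv_folds):
            test = data[k * fold:(k + 1) * fold]
            train = data[:k * fold] + data[(k + 1) * fold:]
            yield train, test                              # one pair per fold

    data = list(range(10))
    producer = partial(cv_producer, data=data, cv_folds=5)  # mirrors the cv branch of _build
    for train, test in producer():
        assert set(train).isdisjoint(test)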
""" return merged_predicts - def transform_lagged(self, input_data: InputData): train_data = copy(input_data) forecast_length = train_data.task.task_params.forecast_length diff --git a/fedot_ind/core/repository/initializer_industrial_models.py b/fedot_ind/core/repository/initializer_industrial_models.py index 39a90025d..79cba4449 100644 --- a/fedot_ind/core/repository/initializer_industrial_models.py +++ b/fedot_ind/core/repository/initializer_industrial_models.py @@ -1,11 +1,14 @@ import pathlib +import types from fedot.api.api_utils.api_composer import ApiComposer from fedot.api.api_utils.api_params_repository import ApiParamsRepository +import fedot.core.data.data_split as fedot_data_split from fedot.core.data.merge.data_merger import ImageDataMerger, TSDataMerger from fedot.core.operations.evaluation.operation_implementations.data_operations.ts_transformations import \ LaggedImplementation, TsSmoothingImplementation from fedot.core.operations.operation import Operation +from fedot.core.optimisers.objective.data_source_splitter import DataSourceSplitter from fedot.core.pipelines.tuning.search_space import PipelineSearchSpace from fedot.core.pipelines.verification import class_rules from fedot.core.repository.operation_types_repository import OperationTypesRepository @@ -14,7 +17,7 @@ from fedot_ind.api.utils.path_lib import PROJECT_PATH from fedot_ind.core.repository.industrial_implementations.abstract import merge_predicts, preprocess_predicts, \ predict_for_fit, predict, predict_operation, postprocess_predicts, update_column_types, transform_lagged, \ - transform_lagged_for_fit, transform_smoothing + transform_lagged_for_fit, transform_smoothing, _build, split_any from fedot_ind.core.repository.industrial_implementations.optimisation import _get_default_industrial_mutations, \ MutationStrengthEnumIndustrial, has_no_data_flow_conflicts_in_industrial_pipeline, _crossover_by_type from fedot_ind.core.tuning.search_space import get_industrial_search_space @@ -52,21 +55,25 @@ def setup_repository(self): 'default_tags': []}}) OperationTypesRepository.assign_repo( 'model', self.industrial_model_path) - + ## replace mutations setattr(PipelineSearchSpace, "get_parameters_dict", get_industrial_search_space) setattr(ApiParamsRepository, "_get_default_mutations", _get_default_industrial_mutations) setattr(Crossover, '_crossover_by_type', _crossover_by_type) - + ## replace data merger setattr(ImageDataMerger, "preprocess_predicts", preprocess_predicts) setattr(ImageDataMerger, "merge_predicts", merge_predicts) setattr(TSDataMerger, 'postprocess_predicts', postprocess_predicts) - + ## replace data split + setattr(DataSourceSplitter, "build", _build) + setattr(fedot_data_split, "_split_any", split_any) + # setattr(TSDataMerger, 'postprocess_predicts', postprocess_predicts) + ## replace predict operations setattr(Operation, "_predict", predict_operation) setattr(Operation, "predict", predict) setattr(Operation, "predict_for_fit", predict_for_fit) - + ## replace ts forecasting operations setattr(LaggedImplementation, '_update_column_types', update_column_types) setattr(LaggedImplementation, 'transform', transform_lagged) diff --git a/fedot_ind/core/tuning/search_space.py b/fedot_ind/core/tuning/search_space.py index a417240ee..c5640bd11 100644 --- a/fedot_ind/core/tuning/search_space.py +++ b/fedot_ind/core/tuning/search_space.py @@ -26,12 +26,17 @@ 'riemann_extractor': {'estimator': {'hyperopt-dist': hp.choice, 'sampling-scope': [['corr', 'cov', 'lwf', 'mcd', 'hub']]}, - 'tangent_metric': 
diff --git a/fedot_ind/core/tuning/search_space.py b/fedot_ind/core/tuning/search_space.py
index a417240ee..c5640bd11 100644
--- a/fedot_ind/core/tuning/search_space.py
+++ b/fedot_ind/core/tuning/search_space.py
@@ -26,12 +26,17 @@
     'riemann_extractor': {'estimator': {'hyperopt-dist': hp.choice,
                                         'sampling-scope': [['corr', 'cov', 'lwf', 'mcd', 'hub']]},
-                          'tangent_metric': {'hyperopt-dist': hp.choice,
-                                             'sampling-scope': [['euclid', 'logeuclid',
-                                                                 'riemann',
-                                                                 'wasserstein']]},
-                          'SPD_metric': {'hyperopt-dist': hp.choice,
-                                         'sampling-scope': [['ale', 'alm', 'euclid',
-                                                             'identity', 'kullback_sym',
-                                                             'logeuclid', 'riemann', 'wasserstein']]}},
+                          'tangent_metric': {'hyperopt-dist': hp.choice,
+                                             'sampling-scope': [[
+                                                 'euclid',
+                                                 'logeuclid',
+                                                 'riemann'
+                                             ]]},
+                          'SPD_metric': {'hyperopt-dist': hp.choice,
+                                         'sampling-scope': [[
+                                             # 'ale',
+                                             # 'alm',
+                                             'euclid',
+                                             'identity',
+                                             'logeuclid', 'riemann']]}},
     'recurrence_extractor': {'window_size': {'hyperopt-dist': hp.choice,
                                              'sampling-scope': [[x for x in range(5, 50, 5)]]},
                              'stride': {'hyperopt-dist': hp.choice,
                                         'sampling-scope': [[x for x in range(1, 10, 1)]]},
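Each 'sampling-scope' list feeds hp.choice, so the tuner draws one of the listed values per iteration; pruning 'wasserstein' and the commented-out SPD metrics shrinks that choice set. A sketch of how such a space is sampled with raw hyperopt, assuming a constant objective as a stand-in for a real pipeline fit:

    from hyperopt import hp, fmin, tpe

    space = {
        'tangent_metric': hp.choice('tangent_metric', ['euclid', 'logeuclid', 'riemann']),
        'SPD_metric': hp.choice('SPD_metric', ['euclid', 'identity', 'logeuclid', 'riemann']),
    }

    def objective(params):
        # stand-in score; a real run would fit riemann_extractor with these metrics
        return 0.0 if params['tangent_metric'] == 'riemann' else 1.0

    best = fmin(objective, space, algo=tpe.suggest, max_evals=10)
    print(best)  # indices into each choice list, e.g. {'SPD_metric': 2, 'tangent_metric': 2}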