Exclude wasserstein metric. Refactoring stratified split
v1docq committed Mar 12, 2024
1 parent 8981efc commit 6b89059
Showing 5 changed files with 173 additions and 22 deletions.
@@ -33,7 +33,7 @@
 if __name__ == "__main__":
     benchmark = BenchmarkTSC(experiment_setup=experiment_setup,
                              custom_datasets=[
-                                 'EthanolConcentration',
+                                 #'EthanolConcentration',
                                  'Handwriting',
                                  'StandWalkJump',
                                  'EigenWorms',
14 changes: 8 additions & 6 deletions fedot_ind/core/models/manifold/riemann_embeding.py
@@ -75,16 +75,18 @@ def extract_riemann_features(self, input_data: InputData) -> InputData:
         SPD = self.covarince_transformer.fit_transform(input_data.features, input_data.target)
         ref_point = self.tangent_projector.fit_transform(SPD)
         self.fit_stage = False
+        self.classes_ = np.unique(input_data.target)
         return ref_point
 
     def extract_centroid_distance(self, input_data: InputData):
-        self.classes_ = np.unique(input_data.target)
-        if not self.fit_stage:
-            SPD = self.covarince_transformer.transform(input_data.features)
-        else:
+        input_data.target = input_data.target.astype(int)
+        if self.fit_stage:
             SPD = self.covarince_transformer.fit_transform(input_data.features, input_data.target)
-        self.covmeans_ = [mean_covariance(SPD[input_data.target == ll], metric=self.covariance_metric)
-                          for ll in self.classes_]
+        else:
+            SPD = self.covarince_transformer.transform(input_data.features)
+
+        self.covmeans_ = [mean_covariance(SPD[np.array(input_data.target == ll).flatten()],
+                                          metric=self.covariance_metric) for ll in self.classes_]
 
         n_centroids = len(self.covmeans_)
         dist = [distance(SPD, self.covmeans_[m], self.distance_metric) for m in range(n_centroids)]
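Side note on the refactored method: extract_centroid_distance implements the Riemannian minimum-distance-to-mean scheme — each trial is mapped to an SPD covariance matrix, a mean covariance ("centroid") is computed per class, and the distances to the centroids become the feature columns. A minimal sketch of the same idea with pyriemann on toy data (all names below are illustrative, not taken from the repository):

import numpy as np
from pyriemann.estimation import Covariances
from pyriemann.utils.mean import mean_covariance
from pyriemann.utils.distance import distance

# toy multivariate time series: 60 trials, 3 channels, 100 time points
rng = np.random.default_rng(0)
X = rng.standard_normal((60, 3, 100))
y = rng.integers(0, 2, size=60)

# each trial becomes an SPD covariance matrix, as covarince_transformer does above
SPD = Covariances(estimator='lwf').fit_transform(X)

# one mean covariance ("centroid") per class
classes = np.unique(y)
covmeans = [mean_covariance(SPD[y == c], metric='riemann') for c in classes]

# distance of every trial to every centroid -> (n_trials, n_classes) feature block
dist = np.column_stack([distance(SPD, m, metric='riemann') for m in covmeans])
print(dist.shape)  # (60, 2)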
145 changes: 141 additions & 4 deletions fedot_ind/core/repository/industrial_implementations/abstract.py
@@ -1,21 +1,159 @@
from copy import copy
from functools import partial

import pandas as pd
from fedot.core.constants import default_data_split_ratio_by_task
from fedot.core.data.array_utilities import atleast_4d
from fedot.core.data.merge.data_merger import DataMerger
from fedot.core.data.cv_folds import cv_generator
from fedot.core.data.data_split import _split_input_data_by_indexes
from fedot.core.data.multi_modal import MultiModalData
from fedot.core.operations.evaluation.operation_implementations.data_operations.ts_transformations import \
transform_features_and_target_into_lagged
from fedot.core.operations.operation_parameters import OperationParameters
from fedot.core.optimisers.objective import DataSource
from fedot.core.pipelines.tuning.tuner_builder import TunerBuilder
from fedot.core.repository.dataset_types import DataTypesEnum
from fedot.core.repository.tasks import TaskTypesEnum
from fedot.preprocessing.data_types import TYPE_TO_ID
from sklearn.model_selection import train_test_split

from fedot_ind.core.architecture.preprocessing.data_convertor import NumpyConverter
from fedot_ind.core.architecture.settings.computational import backend_methods as np
from fedot_ind.core.repository.constanst_repository import FEDOT_HEAD_ENSEMBLE
from typing import Optional, Tuple, Union, Sequence, List, Dict
from golem.core.optimisers.timer import Timer
from fedot.core.data.data import InputData, OutputData
from fedot.core.pipelines.node import PipelineNode
def split_any(data: InputData,
split_ratio: float,
shuffle: bool,
stratify: bool,
random_seed: int,
**kwargs):
""" Split any data except timeseries into train and test parts
:param data: InputData object to split
:param split_ratio: share of train data between 0 and 1
:param shuffle: is data needed to be shuffled or not
:param stratify: make stratified sample or not
:param random_seed: random_seed for shuffle
"""

stratify_labels = data.target if stratify else None
def __split_loop(data, ratio, shuffle, stratify_labels):
train_ids, test_ids = train_test_split(np.arange(0, len(data.target)),
test_size=1 - ratio,
shuffle=shuffle,
random_state=random_seed,
stratify=stratify_labels)

train_data = _split_input_data_by_indexes(data, index=train_ids)
test_data = _split_input_data_by_indexes(data, index=test_ids)
correct_split = np.unique(test_data.target).shape[0] == np.unique(train_data.target).shape[0]
return train_data, test_data, correct_split

for ratio in [split_ratio, 0.6, 0.5, 0.4, 0.3, 0.1]:
train_data, test_data, correct_split = __split_loop(data, ratio, shuffle, stratify_labels)
if correct_split:
break
return train_data, test_data
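The retry loop above guards against a split that drops a class entirely from one side: it first tries the requested ratio, then falls back through coarser ratios until train and test contain the same set of class labels. A stand-alone rendition of that logic (toy labels, not repository code):

import numpy as np
from sklearn.model_selection import train_test_split

# a rare class makes an 80/20 stratified split fragile
y = np.array([0] * 50 + [1] * 50 + [2] * 2)

for ratio in [0.8, 0.6, 0.5, 0.4, 0.3, 0.1]:
    train_ids, test_ids = train_test_split(np.arange(len(y)),
                                           test_size=1 - ratio,
                                           shuffle=True,
                                           random_state=42,
                                           stratify=y)
    # accept the first ratio that keeps every class on both sides
    if np.unique(y[train_ids]).size == np.unique(y[test_ids]).size:
        print(f'accepted ratio={ratio}')
        break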


def _are_stratification_allowed(data: Union[InputData, MultiModalData], split_ratio: float) -> bool:
""" Check that stratification may be done
:param data: data for split
:param split_ratio: relation between train data length and all data length
:return bool: stratification is allowed"""

# check task_type
if data.task.task_type is not TaskTypesEnum.classification:
return False
else:
return True


def _are_cv_folds_allowed(data: Union[InputData, MultiModalData], split_ratio: float, cv_folds: int) -> bool:
try:
# fast way
classes = np.unique(data.target, return_counts=True)
except Exception:
# slow way
from collections import Counter
classes = Counter(data.target)
classes = [list(classes), list(classes.values())]

# check that there are enough labels for two samples
if not all(x > 1 for x in classes[1]):
if __debug__:
# tests often use very small datasets that are not suitable for data splitting
# stratification is disabled for tests
return False
else:
raise ValueError(("There is the only value for some classes:"
f" {', '.join(str(val) for val, count in zip(*classes) if count == 1)}."
f" Data split can not be done for {data.task.task_type.name} task."))

    # check that the split ratio leaves room for every class in the test sample
test_size = round(len(data.target) * (1. - split_ratio))
labels_count = len(classes[0])
if test_size < labels_count:
return None
else:
return cv_folds
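Both checks reduce to simple label arithmetic: stratification needs a classification task, and cross-validation needs every class to have at least two samples plus a test part large enough to hold one sample of each class. The core counting can be reproduced directly (toy labels, illustrative only):

import numpy as np

y = np.array([0, 0, 1, 1, 2])  # class 2 has a single sample
values, counts = np.unique(y, return_counts=True)
singletons = values[counts == 1]
print(singletons.tolist())  # [2] -> stratified splitting must be refused or relaxed

# the split-ratio check: the test part must be able to hold one sample per class
split_ratio = 0.8
test_size = round(len(y) * (1. - split_ratio))
print(test_size < len(values))  # True -> cv_folds gets disabled (returns None)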

def _build(self, data: Union[InputData, MultiModalData]) -> DataSource:
# define split_ratio
self.split_ratio = self.split_ratio or default_data_split_ratio_by_task[data.task.task_type]

# Check cv_folds
if self.cv_folds is not None:
try:
self.cv_folds = int(self.cv_folds)
except ValueError:
raise ValueError(f"cv_folds is not integer: {self.cv_folds}")
if self.cv_folds < 2:
self.cv_folds = None
if self.cv_folds > data.target.shape[0] - 1:
raise ValueError((f"cv_folds ({self.cv_folds}) is greater than"
f" the maximum allowed count {data.target.shape[0] - 1}"))

# Calculate the number of validation blocks for timeseries forecasting
if data.task.task_type is TaskTypesEnum.ts_forecasting and self.validation_blocks is None:
self._propose_cv_folds_and_validation_blocks(data)

# Check split_ratio
if self.cv_folds is None and not (0 < self.split_ratio < 1):
raise ValueError(f'split_ratio is {self.split_ratio} but should be between 0 and 1')

if self.stratify:
# check that stratification can be done
# for cross validation split ratio is defined as validation_size / all_data_size
split_ratio = self.split_ratio if self.cv_folds is None else (1 - 1 / (self.cv_folds + 1))
self.stratify = _are_stratification_allowed(data, split_ratio)
self.cv_folds = _are_cv_folds_allowed(data, split_ratio, self.cv_folds)
if not self.stratify:
self.log.info("Stratificated splitting of data is disabled.")

# Stratification can not be done without shuffle
self.shuffle |= self.stratify

# Random seed depends on shuffle
self.random_seed = (self.random_seed or 42) if self.shuffle else None

# Split data
if self.cv_folds is not None:
self.log.info("K-folds cross validation is applied.")
data_producer = partial(cv_generator,
data=data,
shuffle=self.shuffle,
cv_folds=self.cv_folds,
random_seed=self.random_seed,
stratify=self.stratify,
validation_blocks=self.validation_blocks)
else:
self.log.info("Hold out validation is applied.")
data_producer = self._build_holdout_producer(data)

return data_producer
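_build dispatches between a cv_generator-based producer and a hold-out producer after validating cv_folds, split_ratio and stratification. Assuming fedot's DataSourceSplitter API (the constructor arguments below are an assumption, not shown in this diff), the patched method would be exercised roughly like this:

from fedot.core.optimisers.objective.data_source_splitter import DataSourceSplitter

# hypothetical usage: the splitter attributes mirror what _build reads above;
# input_data is an InputData instance prepared elsewhere
splitter = DataSourceSplitter(cv_folds=5, shuffle=True)
data_producer = splitter.build(input_data)  # returns partial(cv_generator, ...) here
for train_fold, test_fold in data_producer():
    ...  # fit on train_fold, validate on test_fold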



def build_tuner(self, model_to_tune, tuning_params, train_data, mode):
@@ -42,7 +180,6 @@ def postprocess_predicts(self, merged_predicts: np.array) -> np.array:
""" Post-process merged predictions (e.g. reshape). """
return merged_predicts


def transform_lagged(self, input_data: InputData):
train_data = copy(input_data)
forecast_length = train_data.task.task_params.forecast_length
17 changes: 12 additions & 5 deletions fedot_ind/core/repository/initializer_industrial_models.py
@@ -1,11 +1,14 @@
import pathlib
import types

from fedot.api.api_utils.api_composer import ApiComposer
from fedot.api.api_utils.api_params_repository import ApiParamsRepository
import fedot.core.data.data_split as fedot_data_split
from fedot.core.data.merge.data_merger import ImageDataMerger, TSDataMerger
from fedot.core.operations.evaluation.operation_implementations.data_operations.ts_transformations import \
LaggedImplementation, TsSmoothingImplementation
from fedot.core.operations.operation import Operation
from fedot.core.optimisers.objective.data_source_splitter import DataSourceSplitter
from fedot.core.pipelines.tuning.search_space import PipelineSearchSpace
from fedot.core.pipelines.verification import class_rules
from fedot.core.repository.operation_types_repository import OperationTypesRepository
@@ -14,7 +17,7 @@
 from fedot_ind.api.utils.path_lib import PROJECT_PATH
 from fedot_ind.core.repository.industrial_implementations.abstract import merge_predicts, preprocess_predicts, \
     predict_for_fit, predict, predict_operation, postprocess_predicts, update_column_types, transform_lagged, \
-    transform_lagged_for_fit, transform_smoothing
+    transform_lagged_for_fit, transform_smoothing, _build, split_any
 from fedot_ind.core.repository.industrial_implementations.optimisation import _get_default_industrial_mutations, \
     MutationStrengthEnumIndustrial, has_no_data_flow_conflicts_in_industrial_pipeline, _crossover_by_type
 from fedot_ind.core.tuning.search_space import get_industrial_search_space
@@ -52,21 +55,25 @@ def setup_repository(self):
'default_tags': []}})
OperationTypesRepository.assign_repo(
'model', self.industrial_model_path)

## replace mutations
setattr(PipelineSearchSpace, "get_parameters_dict",
get_industrial_search_space)
setattr(ApiParamsRepository, "_get_default_mutations",
_get_default_industrial_mutations)
setattr(Crossover, '_crossover_by_type', _crossover_by_type)

## replace data merger
setattr(ImageDataMerger, "preprocess_predicts", preprocess_predicts)
setattr(ImageDataMerger, "merge_predicts", merge_predicts)
setattr(TSDataMerger, 'postprocess_predicts', postprocess_predicts)

## replace data split
setattr(DataSourceSplitter, "build", _build)
setattr(fedot_data_split, "_split_any", split_any)
# setattr(TSDataMerger, 'postprocess_predicts', postprocess_predicts)
## replace predict operations
setattr(Operation, "_predict", predict_operation)
setattr(Operation, "predict", predict)
setattr(Operation, "predict_for_fit", predict_for_fit)

## replace ts forecasting operations
setattr(LaggedImplementation,
'_update_column_types', update_column_types)
setattr(LaggedImplementation, 'transform', transform_lagged)
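All of these replacements rely on the same mechanism: setattr on an imported class or module rebinds the attribute, so every later lookup — including from code inside fedot itself — resolves to the industrial implementation. A minimal self-contained illustration of the pattern (toy classes, not repository code):

class Greeter:
    def greet(self):
        return 'hello'

def industrial_greet(self):
    return 'hello, industrial'

# rebinding the method on the class affects all instances, existing and future
setattr(Greeter, 'greet', industrial_greet)
print(Greeter().greet())  # -> 'hello, industrial'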
17 changes: 11 additions & 6 deletions fedot_ind/core/tuning/search_space.py
@@ -26,12 +26,17 @@
'riemann_extractor':
{'estimator': {'hyperopt-dist': hp.choice, 'sampling-scope': [['corr',
'cov', 'lwf', 'mcd', 'hub']]},
'tangent_metric': {'hyperopt-dist': hp.choice, 'sampling-scope': [['euclid', 'logeuclid',
'riemann',
'wasserstein']]},
'SPD_metric': {'hyperopt-dist': hp.choice, 'sampling-scope': [['ale', 'alm', 'euclid',
'identity', 'kullback_sym',
'logeuclid', 'riemann', 'wasserstein']]}},
'tangent_metric': {'hyperopt-dist': hp.choice, 'sampling-scope': [[
'euclid',
'logeuclid',
'riemann'
]]},
'SPD_metric': {'hyperopt-dist': hp.choice, 'sampling-scope': [[
# 'ale',
# 'alm',
'euclid',
'identity',
'logeuclid', 'riemann']]}},
'recurrence_extractor':
{'window_size': {'hyperopt-dist': hp.choice, 'sampling-scope': [[x for x in range(5, 50, 5)]]},
'stride': {'hyperopt-dist': hp.choice, 'sampling-scope': [[x for x in range(1, 10, 1)]]},
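The tuner consumes these dictionaries as hyperopt search spaces: each hyperopt-dist callable is applied to its sampling-scope to build a sampling expression. A quick way to see what the trimmed riemann_extractor space now yields (a sketch; the labels passed to hp.choice are arbitrary):

from hyperopt import hp
from hyperopt.pyll.stochastic import sample

# mirrors the riemann_extractor entry after this commit (wasserstein excluded)
space = {
    'estimator': hp.choice('estimator', ['corr', 'cov', 'lwf', 'mcd', 'hub']),
    'tangent_metric': hp.choice('tangent_metric', ['euclid', 'logeuclid', 'riemann']),
    'SPD_metric': hp.choice('SPD_metric', ['euclid', 'identity', 'logeuclid', 'riemann']),
}
print(sample(space))  # e.g. {'SPD_metric': 'riemann', 'estimator': 'lwf', ...}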
