channel filtration, knee method to rsvd, update api examples

v1docq committed Apr 2, 2024
1 parent 0dddde4 commit 1837164
Showing 8 changed files with 108 additions and 55 deletions.
@@ -4,9 +4,10 @@
from fedot_ind.tools.loader import DataLoader

if __name__ == "__main__":
dataset_name = 'PhonemeSpectra'
dataset_name = 'Handwriting'
finetune = True
initial_assumption = PipelineBuilder().add_node('channel_filtration').add_node('quantile_extractor').add_node('rf')
initial_assumption = PipelineBuilder().add_node('channel_filtration').\
add_node('quantile_extractor').add_node('rf')

industrial = FedotIndustrial(problem='classification',
metric='f1',
@@ -26,5 +27,5 @@
metrics = industrial.get_metrics(target=test_data[1],
rounding_order=3,
metric_names=['f1', 'accuracy', 'precision', 'roc_auc'])
# industrial.finetune(train_data)
print(metrics)
_ = 1
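
For orientation, here is the updated classification example assembled end to end as a minimal sketch. The constructor arguments not visible in the hunks above (timeout, n_jobs, logging_level) and the predict/predict_proba calls are assumptions that mirror the regression example in the next file.

from fedot.core.pipelines.pipeline_builder import PipelineBuilder

from fedot_ind.api.main import FedotIndustrial
from fedot_ind.tools.loader import DataLoader

if __name__ == "__main__":
    dataset_name = 'Handwriting'
    finetune = True
    initial_assumption = PipelineBuilder().add_node('channel_filtration').\
        add_node('quantile_extractor').add_node('rf')

    industrial = FedotIndustrial(problem='classification',
                                 metric='f1',
                                 timeout=5,  # assumed, mirrors the regression example
                                 initial_assumption=initial_assumption,
                                 n_jobs=2,  # assumed
                                 logging_level=20)  # assumed

    train_data, test_data = DataLoader(dataset_name=dataset_name).load_data()
    model = industrial.finetune(train_data) if finetune else industrial.fit(train_data)
    labels = industrial.predict(test_data)
    probs = industrial.predict_proba(test_data)  # needed before asking get_metrics for roc_auc
    metrics = industrial.get_metrics(target=test_data[1],
                                     rounding_order=3,
                                     metric_names=['f1', 'accuracy', 'precision', 'roc_auc'])
    print(metrics)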
@@ -1,23 +1,30 @@
from fedot.core.pipelines.pipeline_builder import PipelineBuilder

from fedot_ind.api.main import FedotIndustrial
from fedot_ind.tools.loader import DataLoader
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error

if __name__ == "__main__":
dataset_name = 'AppliancesEnergy'
dataset_name = 'IEEEPPG'  # BeijingPM10Quality
finetune = True
initial_assumption = PipelineBuilder().add_node('channel_filtration').add_node('quantile_extractor').add_node('treg')

industrial = FedotIndustrial(problem='regression',
metric='rmse',
timeout=5,
initial_assumption=initial_assumption,
n_jobs=2,
logging_level=20)

train_data, test_data = DataLoader(dataset_name=dataset_name).load_data()
if finetune:
model = industrial.finetune(train_data)
else:
model = industrial.fit(train_data)

model = industrial.fit(train_data)

y_predicted = industrial.predict(test_data)

print('Metrics:')
print(
f'RMSE: {round(mean_squared_error(test_data[1], y_predicted, squared=False), 3)}')
print(
f'MAPE: {round(mean_absolute_percentage_error(test_data[1], y_predicted), 3)}')
labels = industrial.predict(test_data)
probs = industrial.predict_proba(test_data)
metrics = industrial.get_metrics(target=test_data[1],
rounding_order=3,
metric_names=('r2', 'rmse', 'mae'))
print(metrics)
_ = 1
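
The example now delegates scoring to get_metrics() instead of hand-rolled sklearn calls. For cross-checking the reported values, a self-contained sketch of the equivalent manual computation on synthetic stand-ins for test_data[1] and the predicted labels:

import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

y_true = np.array([3.1, 2.4, 5.0, 4.2])  # stand-in for test_data[1]
y_pred = np.array([3.0, 2.9, 4.6, 4.0])  # stand-in for industrial.predict(test_data)
print({'rmse': round(mean_squared_error(y_true, y_pred, squared=False), 3),  # squared=False -> RMSE
       'mae': round(mean_absolute_error(y_true, y_pred), 3),
       'r2': round(r2_score(y_true, y_pred), 3)})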
16 changes: 12 additions & 4 deletions fedot_ind/api/main.py
@@ -282,7 +282,9 @@ def finetune(self,
"""

train_data = DataCheck(input_data=train_data, task=self.config_dict['problem']).check_input_data()
input_preproc = DataCheck(input_data=train_data, task=self.config_dict['problem'])
train_data = input_preproc.check_input_data()
self.target_encoder = input_preproc.get_target_encoder()
tuning_params = ApiConverter.tuning_params_is_none(tuning_params)
tuned_metric = 0
tuning_params['metric'] = FEDOT_TUNING_METRICS[self.config_dict['problem']]
@@ -328,11 +330,17 @@ def get_metrics(self,
'Predicted probabilities are not available. Use `predict_proba()` method first')

valid_shape = target.shape
return FEDOT_GET_METRICS[problem](target=target.flatten(),
if self.condition_check.solver_have_target_encoder(self.target_encoder):
new_target = self.target_encoder.transform(target.flatten())
labels = self.target_encoder.transform(self.predicted_labels).reshape(valid_shape)
else:
new_target = target.flatten()
labels = self.predicted_labels.reshape(valid_shape)

return FEDOT_GET_METRICS[problem](target=new_target,
metric_names=metric_names,
rounding_order=rounding_order,
labels=self.predicted_labels.reshape(
valid_shape),
labels=labels,
probs=self.predicted_probs)

def save_predict(self, predicted_data, **kwargs) -> None:
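
The get_metrics() change above routes both the target and the predicted labels through the fitted target encoder whenever one exists, so string class labels are scored in a consistent integer space. A minimal sketch of that round trip; sklearn's LabelEncoder stands in for whatever encoder DataCheck.get_target_encoder() actually returns, which is an assumption here:

import numpy as np
from sklearn.preprocessing import LabelEncoder

target = np.array(['cat', 'dog', 'dog', 'cat'])
predicted_labels = np.array(['cat', 'dog', 'cat', 'cat'])

encoder = LabelEncoder().fit(target)  # stand-in for self.target_encoder
valid_shape = target.shape
new_target = encoder.transform(target.flatten())
labels = encoder.transform(predicted_labels).reshape(valid_shape)
# new_target and labels are now integer-encoded and comparable by any metric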
@@ -3,6 +3,7 @@
from fedot_ind.core.operation.transformation.regularization.spectrum import singular_value_hard_threshold, \
sv_to_explained_variance_ratio
import math
from fedot_ind.core.operation.filtration.channel_filtration import _detect_knee_point


class RSVDDecomposition:
@@ -42,11 +43,13 @@ def _matrix_approx_regularization(self, low_rank, Ut, block, tensor):
list_of_rank = list(range(1, low_rank + 1, 1))
reconstr_matrix = [self._compute_matrix_approximation(
Ut, block, tensor, rank) for rank in list_of_rank]
fro_norms = [abs(np.linalg.norm(tensor - reconstr_m, 'fro')/np.linalg.norm(tensor)*100)
fro_norms = [abs(np.linalg.norm(tensor - reconstr_m, 'fro') / np.linalg.norm(tensor) * 100)
for reconstr_m in reconstr_matrix]
deriviate_of_error = abs(np.diff(fro_norms))
regularized_rank = len(
deriviate_of_error[deriviate_of_error > 1]) + 1
regularized_rank = _detect_knee_point(values=fro_norms, indices=list(range(len(fro_norms))))[0]
regularized_rank = len(regularized_rank)  # number of ranks above the knee; [0] unpacks the helper's tuple return
# deriviate_of_error = abs(np.diff(fro_norms))
# regularized_rank = len(
# deriviate_of_error[deriviate_of_error > 1]) + 1
return regularized_rank

def rsvd(self,
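
The hunk above swaps the hard 1% derivative threshold for the shared knee detector: the relative Frobenius reconstruction error is computed for every candidate rank, and the rank is cut where that curve bends. A sketch on an illustrative error curve, using the helper added in channel_filtration.py below (the [0] indexing unpacks its tuple return, as in the channel-filtration callers):

from fedot_ind.core.operation.filtration.channel_filtration import _detect_knee_point

fro_norms = [42.0, 18.0, 7.5, 4.9, 4.1, 3.8, 3.7]  # relative Frobenius error (%) per candidate rank
ranks_above_knee = _detect_knee_point(values=fro_norms,
                                      indices=list(range(len(fro_norms))))[0]
regularized_rank = len(ranks_above_knee)
print(regularized_rank)  # -> 2: the error curve flattens after two ranks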
65 changes: 39 additions & 26 deletions fedot_ind/core/operation/filtration/channel_filtration.py
@@ -11,6 +11,30 @@

from fedot_ind.core.architecture.settings.computational import backend_methods as np
from fedot_ind.core.operation.IndustrialCachableOperation import IndustrialCachableOperationImplementation
from fedot_ind.core.repository.constanst_repository import DISTANCE_METRICS


def _detect_knee_point(values, indices):
"""Find elbow point.The elbow cut method is a method to determine a point in
a curve where significant change can be observed, e.g., from a steep slope to almost flat curve"""
n_points = len(values) # number_of_channels
all_coords = np.vstack((range(n_points), values)).T # coordinate of each channel projected in chosen centroid
first_point = all_coords[0]
line_vec = all_coords[-1] - all_coords[0]
line_vec_norm = line_vec / np.sqrt(np.sum(line_vec ** 2))
vec_from_first = all_coords - first_point # line coord from first point to last
scalar_prod = np.sum(vec_from_first * np.tile(line_vec_norm, (n_points, 1)), axis=1)
# "angle" between each point and line
vec_from_first_parallel = np.outer(scalar_prod, line_vec_norm)
vec_to_line = vec_from_first - vec_from_first_parallel
dist_to_line = np.sqrt(np.sum(vec_to_line ** 2, axis=1)) # find distance from all points to line
knee_idx = np.argmax(dist_to_line)
knee = values[knee_idx]
best_dims = [idx for (elem, idx) in zip(values, indices) if elem > knee]
if len(best_dims) == 0:
return [knee_idx], knee_idx

return best_dims,
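
A quick illustration of the helper just added, on a toy curve of centroid distances sorted in descending order as the callers below do; values above the knee survive the cut:

from fedot_ind.core.operation.filtration.channel_filtration import _detect_knee_point

distances = [9.0, 6.5, 2.0, 1.6, 1.4, 1.3]  # illustrative per-channel distances, sorted descending
channel_ids = list(range(len(distances)))
selected = _detect_knee_point(distances, channel_ids)[0]
print(selected)  # -> [0, 1]: the two channels left of the knee at 2.0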


class ChannelCentroidFilter(IndustrialCachableOperationImplementation):
@@ -51,7 +75,9 @@ def __init__(self, params: Optional[OperationParameters] = None):

self.distance = params.get('distance', None) # “manhattan” “chebyshev”
self.shrink = params.get('shrink', 1e-5)
self.centroid_metric = params.get('centroid_metric', 'euclidean') # “manhattan” “chebyshev”
self.centroid_metric = params.get('centroid_metric', 'euclidean')
self.sample_metric = params.get('sample_metric', 'euclidean')
self.sample_metric = DISTANCE_METRICS[self.sample_metric]
self.channel_selection_strategy = params.get('selection_strategy', 'sum')
self.channels_selected = []

@@ -64,42 +90,23 @@ def __init__(self, params: Optional[OperationParameters] = None):
else:
self.distance_ = self.distance

def _detect_knee_point(self, values, indices):
"""Find elbow point.The elbow cut method is a method to determine a point in
a curve where significant change can be observed, e.g., from a steep slope to almost flat curve"""
n_points = len(values) # number_of_channels
all_coords = np.vstack((range(n_points), values)).T # coordinate of each channel projected in chosen centroid
first_point = all_coords[0]
line_vec = all_coords[-1] - all_coords[0]
line_vec_norm = line_vec / np.sqrt(np.sum(line_vec ** 2))
vec_from_first = all_coords - first_point # line coord from first point to last
scalar_prod = np.sum(vec_from_first * np.tile(line_vec_norm, (n_points, 1)), axis=1)
# "angle" between each point and line
vec_from_first_parallel = np.outer(scalar_prod, line_vec_norm)
vec_to_line = vec_from_first - vec_from_first_parallel
dist_to_line = np.sqrt(np.sum(vec_to_line ** 2, axis=1)) # find distance from all points to line
knee_idx = np.argmax(dist_to_line)
knee = values[knee_idx]
best_dims = [idx for (elem, idx) in zip(values, indices) if elem > knee]
if len(best_dims) == 0:
return [knee_idx], knee_idx

return best_dims,

def eval_distance_from_centroid(self, centroid_frame):
"""Create distance matrix."""
# distance from each class to each without repetitions. Number of pairs is n_cls(n_cls-1)/2
distance_pair = list(itertools.combinations(range(0, centroid_frame.shape[0]), 2))
# distance_metrics = []
# for metric in DISTANCE_METRICS.values():
distance_frame = pd.DataFrame()
for class_ in distance_pair:
class_pair = []
# calculate the distance of centroid here
for _, (q, t) in enumerate(zip(centroid_frame[class_[0], :],
centroid_frame[class_[1], :], )):
class_pair.append(euclidean(q, t))
class_pair.append(self.sample_metric(q, t))
dict_ = {f"Centroid_{[class_[0]]}_{[class_[1]]}": class_pair}

distance_frame = pd.concat([distance_frame, pd.DataFrame(dict_)], axis=1)
#distance_metrics.append(distance_frame)

return distance_frame

@@ -122,14 +129,14 @@ def _channel_sum(self):
self.distance_frame = pd.Series(self.distance_frame.sum(axis=1))
distance = self.distance_frame.sort_values(ascending=False).values
indices = self.distance_frame.sort_values(ascending=False).index
self.channels_selected = self._detect_knee_point(distance, indices)[0]
self.channels_selected = _detect_knee_point(distance, indices)[0]

def _channel_pairwise(self, centroids_by_channel):
self.distance_frame = self.eval_distance_from_centroid(centroids_by_channel)
for pairdistance in self.distance_frame.items():
distance = pairdistance[1].sort_values(ascending=False).values
indices = pairdistance[1].sort_values(ascending=False).index
self.channels_selected.extend(self._detect_knee_point(distance, indices)[0])
self.channels_selected.extend(_detect_knee_point(distance, indices)[0])
self.channels_selected = list(set(self.channels_selected))

def _transform(self, input_data: InputData):
@@ -150,6 +157,12 @@ def _transform(self, input_data: InputData):
return input_data.features
else:
if len(self.channels_selected) == 0:
if input_data.task.task_type.value == 'regression':
bins = [np.quantile(input_data.target, x) for x in np.arange(0, 1, 0.2)]
labels = [x for x in range(len(bins) - 1)]
input_data.target = pd.cut(input_data.target,
bins=bins,
labels=labels).codes
# step 1. create channel centroids
centroids_by_channel = self.create_centroid(input_data.features, input_data.target)
# step 2. create distance matrix
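
The _transform hunk above adds a fallback for regression tasks: before class centroids can be formed, the continuous target is discretized into quantile bins that act as pseudo-classes. A self-contained sketch of that binning on synthetic data:

import numpy as np
import pandas as pd

target = np.random.default_rng(0).normal(size=200)  # continuous regression target
bins = [np.quantile(target, x) for x in np.arange(0, 1, 0.2)]  # 0.0 .. 0.8 quantile edges
labels = [x for x in range(len(bins) - 1)]
pseudo_classes = pd.cut(target, bins=bins, labels=labels).codes
# pseudo_classes holds bin ids; values at or below the minimum edge and above the
# 0.8 quantile fall outside the bins and get code -1 (open interval edges)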
28 changes: 22 additions & 6 deletions fedot_ind/core/repository/constanst_repository.py
@@ -26,6 +26,8 @@
PersistenceDiagramsExtractor, PersistenceEntropyFeature, RadiusAtMaxBNFeature, RelevantHolesNumber, \
SimultaneousAliveHolesFeature, SumHoleLifetimeFeature
from fedot_ind.core.operation.transformation.data.hankel import HankelMatrix
from scipy.spatial.distance import euclidean, cosine, cityblock, correlation, chebyshev, \
jensenshannon, mahalanobis, minkowski


def beta_thr(beta):
@@ -88,6 +90,16 @@ class FeatureConstant(Enum):
'petrosian_fractal_dimension_': pfd
}

METRICS_DICT = {'euclidean': euclidean,
'cosine': cosine,
'cityblock': cityblock,
'correlation': correlation,
'chebyshev': chebyshev,
#'jensenshannon': jensenshannon,
#'mahalanobis': mahalanobis,
'minkowski': minkowski
}

PERSISTENCE_DIAGRAM_FEATURES = {'HolesNumberFeature': HolesNumberFeature(),
'MaxHoleLifeTimeFeature': MaxHoleLifeTimeFeature(),
'RelevantHolesNumber': RelevantHolesNumber(),
@@ -195,10 +207,12 @@ class FedotOperationConstant(Enum):
'classification': calculate_classification_metric}
FEDOT_TUNING_METRICS = {'classification': ClassificationMetricsEnum.accuracy,
'regression': RegressionMetricsEnum.RMSE}
FEDOT_TUNER_STRATEGY = {'sequential': partial(SequentialTuner, inverse_node_order=True),
'simultaneous': SimultaneousTuner,
# 'IOptTuner': IOptTuner,
'optuna': OptunaTuner}
FEDOT_TUNER_STRATEGY = {
'sequential': partial(SequentialTuner, inverse_node_order=True),
'simultaneous': SimultaneousTuner,
# 'IOptTuner': IOptTuner,
'optuna': OptunaTuner
}
FEDOT_HEAD_ENSEMBLE = {'regression': 'treg',
'classification': 'logit'}
FEDOT_ATOMIZE_OPERATION = {'regression': 'fedot_regr',
@@ -223,8 +237,9 @@
]

FEDOT_ASSUMPTIONS = {
'classification': PipelineBuilder().add_node('quantile_extractor').add_node('logit'),
'regression': PipelineBuilder().add_node('quantile_extractor').add_node('treg'),
'classification': PipelineBuilder().add_node('channel_filtration').add_node('quantile_extractor').add_node(
'logit'),
'regression': PipelineBuilder().add_node('channel_filtration').add_node('quantile_extractor').add_node('treg'),
'ts_forecasting': PipelineBuilder().add_node('ssa_forecaster')
}

@@ -578,6 +593,7 @@ class BenchmarkDatasets(Enum):
WAVELET_SCALES = FeatureConstant.WAVELET_SCALES.value
SINGULAR_VALUE_MEDIAN_THR = FeatureConstant.SINGULAR_VALUE_MEDIAN_THR.value
SINGULAR_VALUE_BETA_THR = FeatureConstant.SINGULAR_VALUE_BETA_THR
DISTANCE_METRICS = FeatureConstant.METRICS_DICT.value

AVAILABLE_REG_OPERATIONS = FedotOperationConstant.AVAILABLE_REG_OPERATIONS.value
AVAILABLE_CLS_OPERATIONS = FedotOperationConstant.AVAILABLE_CLS_OPERATIONS.value
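
The new DISTANCE_METRICS constant maps metric names to SciPy distance callables, so a metric chosen by name (for example by the tuner) resolves to a function of two vectors. A minimal usage sketch:

import numpy as np
from fedot_ind.core.repository.constanst_repository import DISTANCE_METRICS

q = np.array([1.0, 0.0, 2.0])
t = np.array([0.0, 1.0, 2.0])
for name in ('euclidean', 'cosine', 'cityblock'):
    print(name, round(DISTANCE_METRICS[name](q, t), 3))  # e.g. euclidean -> 1.414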
@@ -88,7 +88,7 @@ def _are_cv_folds_allowed(data: Union[InputData, MultiModalData], split_ratio: f
if __debug__:
# tests often use very small datasets that are not suitable for data splitting
# stratification is disabled for tests
return False
return None
else:
raise ValueError(("There is the only value for some classes:"
f" {', '.join(str(val) for val, count in zip(*classes) if count == 1)}."
5 changes: 5 additions & 0 deletions fedot_ind/core/tuning/search_space.py
@@ -2,6 +2,8 @@

from hyperopt import hp

from fedot_ind.core.repository.constanst_repository import DISTANCE_METRICS

NESTED_PARAMS_LABEL = 'nested_label'

industrial_search_space = {
@@ -51,6 +53,9 @@
'sampling-scope': [['manhattan', 'euclidean', 'chebyshev']]},
'centroid_metric': {'hyperopt-dist': hp.choice,
'sampling-scope': [['manhattan', 'euclidean', 'chebyshev']]},
'sample_metric': {'hyperopt-dist': hp.choice,
'sampling-scope': [list(DISTANCE_METRICS.keys())]},

'selection_strategy': {'hyperopt-dist': hp.choice,
'sampling-scope': [['sum', 'pairwise']]}
},
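
The new 'sample_metric' entry lets the tuner draw a metric name from the same DISTANCE_METRICS registry. A sketch of how such an entry is sampled, using hyperopt's stochastic sampler:

from hyperopt import hp
from hyperopt.pyll import stochastic
from fedot_ind.core.repository.constanst_repository import DISTANCE_METRICS

sample_metric = hp.choice('sample_metric', list(DISTANCE_METRICS.keys()))
print(stochastic.sample(sample_metric))  # e.g. 'cosine'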
