From a087ed6e8cd8fc1352058142a23069c809258a8a Mon Sep 17 00:00:00 2001 From: Christian O'Leary Date: Thu, 8 Aug 2024 20:09:24 +0100 Subject: [PATCH] minor changes, fix typo --- .github/workflows/pylint.yml | 2 +- src/emmv/metrics.py | 49 ++++++++++++++++++------------------ 2 files changed, 26 insertions(+), 25 deletions(-) diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml index 2629955..fbba871 100644 --- a/.github/workflows/pylint.yml +++ b/.github/workflows/pylint.yml @@ -17,7 +17,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install tests/requirements.txt + pip install -r tests/requirements.txt pip install -e . - name: Analysing the code with pylint run: | diff --git a/src/emmv/metrics.py b/src/emmv/metrics.py index 768e7fd..428b7df 100644 --- a/src/emmv/metrics.py +++ b/src/emmv/metrics.py @@ -38,7 +38,7 @@ def emmv_scores( scoring_func = default_scoring_func # Specify limits, volume, and levels for uniform sampling - lim_inf, lim_sup, volume_support, levels = calculate_limits(df) + lim_inf, lim_sup, volume, levels = calculate_limits(df) # Perform uniform sampling try: @@ -52,8 +52,8 @@ def emmv_scores( # Calculate and return EM and MV scores return ( - excess_mass(levels, em_min, volume_support, uniform_scores, anomaly_scores), - mass_volume(alpha_min, alpha_max, volume_support, uniform_scores, anomaly_scores), + float(np.mean(excess_mass(levels, em_min, volume, uniform_scores, anomaly_scores))), + float(np.mean(mass_volume(alpha_min, alpha_max, volume, uniform_scores, anomaly_scores))), ) @@ -69,22 +69,22 @@ def calculate_limits(df: pd.DataFrame, offset: float = 1e-60) -> tuple: lim_sup = df.max(axis=0) # Volume of rectangle containing all data in X - volume_support = float((lim_sup - lim_inf).prod()) + offset + volume = float((lim_sup - lim_inf).prod()) + offset # An "array of levels, on which we want to evaluate # EM_s(t) on samples X from an underlying density f." - levels = np.arange(0, 100 / volume_support, 0.01 / volume_support) + levels = np.arange(0, 100 / volume, 0.01 / volume) - return lim_inf, lim_sup, volume_support, levels + return lim_inf, lim_sup, volume, levels def excess_mass( levels: np.ndarray, em_min: float, - volume_support: float, + volume: float, uniform_scores: np.ndarray, anomaly_scores: np.ndarray, -) -> float: +) -> np.ndarray: """Calculate Excess-Mass scores. Variables explained here: https://github.com/ngoix/EMMV_benchmarks/issues/2 @@ -96,38 +96,38 @@ def excess_mass( :param np.ndarray levels: Levels on which to evaluate EM_s(t) on samples X :param float em_min: Beginning of EM curve. - :param float volume_support: Volume of rectangle containing all data in X. + :param float volume: Volume of rectangle containing all data in X. :param np.ndarray uniform_scores: s(U), used to estimate Leb(s>t) :param np.ndarray anomaly_scores: s(X), s evaluated on a sample, used to estimate P(s>t) - :return float: Mean EM score + :return np.ndarray: EM scores """ n_samples = anomaly_scores.shape[0] unique_anomaly_scores = np.unique(anomaly_scores) excess_mass_scores = np.zeros(levels.shape[0]) excess_mass_scores[0] = 1.0 - for u in unique_anomaly_scores: - excess_mass_scores = np.maximum( - excess_mass_scores, - 1.0 / n_samples * (anomaly_scores > u).sum() - - levels * (uniform_scores > u).sum() / len(uniform_scores) * volume_support, - ) + for score in unique_anomaly_scores: + anomaly_fraction = 1.0 / n_samples * (anomaly_scores > score).sum() + uniform = levels * (uniform_scores > score).sum() / len(uniform_scores) + excess_mass_scores = np.maximum(excess_mass_scores, anomaly_fraction - (uniform * volume)) + index = int(np.argmax(excess_mass_scores <= em_min).flatten()[0]) + 1 if index == 1: logger.warning('Failed to achieve em_min') index = -1 # em_auc = auc(levels[:index], excess_mass_scores[:index]) - return float(np.mean(excess_mass_scores)) + return excess_mass_scores def mass_volume( alpha_min: float, alpha_max: float, - volume_support: float, + volume: float, uniform_scores: np.ndarray, anomaly_scores: np.ndarray, -) -> float: + alpha_count: int = 1000, +) -> np.ndarray: """Calculate Mass-Volume (MV) scores. Variables explained here: https://github.com/ngoix/EMMV_benchmarks/issues/2 @@ -139,10 +139,11 @@ def mass_volume( :param float alpha_min: Minimum alpha axis value :param float alpha_max: Maximum alpha axis value - :param float volume_support: Volume of rectangle containing all data in X. + :param float volume: Volume of rectangle containing all data in X. :param np.ndarray uniform_scores: s(U), used to estimate Leb(s>t) :param np.ndarray anomaly_scores: s(X), s evaluated on a sample, used to estimate P(s>t) - :return tuple: Mean MV score + :param int alpha_count: Number of levels + :return np.ndarray: MV scores """ n_samples = anomaly_scores.shape[0] @@ -150,7 +151,7 @@ def mass_volume( mass = 0.0 count = 0 score = anomaly_scores[sorted_indices[-1]] # i.e. 'u' - axis_alpha = np.arange(alpha_min, alpha_max, 0.0001) + axis_alpha = np.linspace(alpha_min, alpha_max, alpha_count) # Calculate MV scores mv_scores = np.zeros(axis_alpha.shape[0]) @@ -161,7 +162,7 @@ def mass_volume( mass = 1.0 / n_samples * count # sum(s_X > u) score_count = float((uniform_scores >= float(score)).sum()) - mv_scores[i] = (score_count / len(uniform_scores)) * volume_support + mv_scores[i] = (score_count / len(uniform_scores)) * volume # mv_auc = auc(axis_alpha, mv_scores) - return float(np.mean(mv_scores)) + return mv_scores