Skip to content

Commit

Permalink
minor changes, fix typo
Browse files Browse the repository at this point in the history
  • Loading branch information
christian-oleary committed Aug 8, 2024
1 parent e16ba36 commit a087ed6
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 25 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/pylint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install tests/requirements.txt
pip install -r tests/requirements.txt
pip install -e .
- name: Analysing the code with pylint
run: |
Expand Down
49 changes: 25 additions & 24 deletions src/emmv/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def emmv_scores(
scoring_func = default_scoring_func

# Specify limits, volume, and levels for uniform sampling
lim_inf, lim_sup, volume_support, levels = calculate_limits(df)
lim_inf, lim_sup, volume, levels = calculate_limits(df)

# Perform uniform sampling
try:
Expand All @@ -52,8 +52,8 @@ def emmv_scores(

# Calculate and return EM and MV scores
return (
excess_mass(levels, em_min, volume_support, uniform_scores, anomaly_scores),
mass_volume(alpha_min, alpha_max, volume_support, uniform_scores, anomaly_scores),
float(np.mean(excess_mass(levels, em_min, volume, uniform_scores, anomaly_scores))),
float(np.mean(mass_volume(alpha_min, alpha_max, volume, uniform_scores, anomaly_scores))),
)


Expand All @@ -69,22 +69,22 @@ def calculate_limits(df: pd.DataFrame, offset: float = 1e-60) -> tuple:
lim_sup = df.max(axis=0)

# Volume of rectangle containing all data in X
volume_support = float((lim_sup - lim_inf).prod()) + offset
volume = float((lim_sup - lim_inf).prod()) + offset

# An "array of levels, on which we want to evaluate
# EM_s(t) on samples X from an underlying density f."
levels = np.arange(0, 100 / volume_support, 0.01 / volume_support)
levels = np.arange(0, 100 / volume, 0.01 / volume)

return lim_inf, lim_sup, volume_support, levels
return lim_inf, lim_sup, volume, levels


def excess_mass(
levels: np.ndarray,
em_min: float,
volume_support: float,
volume: float,
uniform_scores: np.ndarray,
anomaly_scores: np.ndarray,
) -> float:
) -> np.ndarray:
"""Calculate Excess-Mass scores.
Variables explained here: https://github.com/ngoix/EMMV_benchmarks/issues/2
Expand All @@ -96,38 +96,38 @@ def excess_mass(
:param np.ndarray levels: Levels on which to evaluate EM_s(t) on samples X
:param float em_min: Beginning of EM curve.
:param float volume_support: Volume of rectangle containing all data in X.
:param float volume: Volume of rectangle containing all data in X.
:param np.ndarray uniform_scores: s(U), used to estimate Leb(s>t)
:param np.ndarray anomaly_scores: s(X), s evaluated on a sample, used to estimate P(s>t)
:return float: Mean EM score
:return np.ndarray: EM scores, one per entry in `levels`
"""
n_samples = anomaly_scores.shape[0]
unique_anomaly_scores = np.unique(anomaly_scores)
excess_mass_scores = np.zeros(levels.shape[0])
excess_mass_scores[0] = 1.0

for u in unique_anomaly_scores:
excess_mass_scores = np.maximum(
excess_mass_scores,
1.0 / n_samples * (anomaly_scores > u).sum()
- levels * (uniform_scores > u).sum() / len(uniform_scores) * volume_support,
)
for score in unique_anomaly_scores:
anomaly_fraction = 1.0 / n_samples * (anomaly_scores > score).sum()
uniform = levels * (uniform_scores > score).sum() / len(uniform_scores)
excess_mass_scores = np.maximum(excess_mass_scores, anomaly_fraction - (uniform * volume))

index = int(np.argmax(excess_mass_scores <= em_min).flatten()[0]) + 1
if index == 1:
logger.warning('Failed to achieve em_min')
index = -1

# em_auc = auc(levels[:index], excess_mass_scores[:index])
return float(np.mean(excess_mass_scores))
return excess_mass_scores


def mass_volume(
alpha_min: float,
alpha_max: float,
volume_support: float,
volume: float,
uniform_scores: np.ndarray,
anomaly_scores: np.ndarray,
) -> float:
alpha_count: int = 1000,
) -> np.ndarray:
"""Calculate Mass-Volume (MV) scores.
Variables explained here: https://github.com/ngoix/EMMV_benchmarks/issues/2
Expand All @@ -139,18 +139,19 @@ def mass_volume(
:param float alpha_min: Minimum alpha axis value
:param float alpha_max: Maximum alpha axis value
:param float volume_support: Volume of rectangle containing all data in X.
:param float volume: Volume of rectangle containing all data in X.
:param np.ndarray uniform_scores: s(U), used to estimate Leb(s>t)
:param np.ndarray anomaly_scores: s(X), s evaluated on a sample, used to estimate P(s>t)
:return tuple: Mean MV score
:param int alpha_count: Number of evenly spaced alpha values between alpha_min and alpha_max
:return np.ndarray: MV scores, one per alpha value
"""

n_samples = anomaly_scores.shape[0]
sorted_indices = anomaly_scores.argsort()
mass = 0.0
count = 0
score = anomaly_scores[sorted_indices[-1]] # i.e. 'u'
axis_alpha = np.arange(alpha_min, alpha_max, 0.0001)
axis_alpha = np.linspace(alpha_min, alpha_max, alpha_count)

# Calculate MV scores
mv_scores = np.zeros(axis_alpha.shape[0])
Expand All @@ -161,7 +162,7 @@ def mass_volume(
mass = 1.0 / n_samples * count # sum(s_X > u)

score_count = float((uniform_scores >= float(score)).sum())
mv_scores[i] = (score_count / len(uniform_scores)) * volume_support
mv_scores[i] = (score_count / len(uniform_scores)) * volume

# mv_auc = auc(axis_alpha, mv_scores)
return float(np.mean(mv_scores))
return mv_scores

0 comments on commit a087ed6

Please sign in to comment.