-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy patheval_utils.py
135 lines (119 loc) · 3.88 KB
/
eval_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#!/usr/bin/env python3
"""
Evaluation utilities: threshold tuning and F1 metrics for multi-label predictions.
"""
import numpy as np
from typing import List, Tuple, Dict
from sklearn.metrics import f1_score
# Named subsets of the integer ids 0-27 (presumably class ids, matching the
# 28 classes used by eval_all_metrics -- confirm against the dataset).
# Two groupings are stored side by side, each of which covers every id
# exactly once:
#   * easy / medium / hard
#   * object / context / other
SUBSET2IDS = {
    # First grouping (looks like a difficulty split, judging by the key names).
    "easy": [0, 7, 19],
    "medium": [1, 3, 4, 6, 9, 10, 11, 12, 13, 14, 15, 16, 18, 22, 26],
    "hard": [2, 5, 8, 17, 20, 21, 23, 24, 25, 27],
    # Second grouping (looks like a category split, judging by the key names).
    "object": [0, 3, 10, 11, 12, 16, 23],
    "context": [7, 8],
    "other": [
        1, 2, 4, 5, 6, 9, 13, 17, 18, 19, 20, 22, 25, 27, 14, 15, 21, 24, 26,
    ],
}
def eval_all_metrics(
    val_scores: np.ndarray,
    test_scores: np.ndarray,
    val_targets: List[List[int]],
    test_targets: List[List[int]],
    num_classes: int = 28,
) -> dict:
    """
    compute validation and test results

    The decision threshold is tuned on the validation set (the threshold
    with the best macro F1 is kept) and then applied unchanged to the
    test set.

    args:
        val_scores: np.ndarray of shape (val_num, num_classes),
        test_scores: np.ndarray of shape (test_num, num_classes),
        val_targets: List[List[int]], label ids of every validation sample
        test_targets: List[List[int]], label ids of every test sample
        num_classes: width of the multi-hot target encoding; defaults to
            28, the value that used to be hard-coded here
    returns:
        dict with keys "val_micro", "val_samples", "val_macro", "val_none",
        "test_micro", "test_samples", "test_macro" and "test_none".
        The "*_none" entries are the per-class F1 arrays
        (f1_score with average=None); the others are scalars.
    """
    # get optimal threshold using val set
    val_multihot = multihot(val_targets, num_classes)
    best = get_best_f1_scores(val_multihot, val_scores)

    # get results on the test set using the threshold found
    test_multihot = multihot(test_targets, num_classes)
    test_micro, test_samples, test_macro, test_none = compute_f1(
        test_multihot, test_scores, best["threshold"]
    )

    return {
        "val_micro": best["micro"],
        "val_samples": best["samples"],
        "val_macro": best["macro"],
        "val_none": best["none"],
        "test_micro": test_micro,
        "test_samples": test_samples,
        "test_macro": test_macro,
        "test_none": test_none,
    }
def get_best_f1_scores(
    multihot_targets: np.ndarray,
    scores: np.ndarray,
    threshold_end: float = 0.05
) -> Dict[str, float]:
    """
    sweep the decision threshold and keep the one with the best macro f1

    Thresholds run from ``threshold_end`` (the lower end of the sweep,
    despite its name) up to 0.95 inclusive, in steps of roughly 0.05.

    returns a dict with the keys "micro", "macro", "samples", "threshold"
    and "none", all taken at the best-macro threshold. Note that "none"
    holds whatever compute_f1 returns for average=None (a per-class
    array), so it is not a plain float even though the annotation says so.
    """
    num_steps = int(np.round((0.95 - threshold_end) / 0.05)) + 1
    candidates = np.linspace(threshold_end, 0.95, num_steps, endpoint=True)

    # One (micro, samples, macro, none) tuple per candidate threshold.
    per_threshold = [
        tuple(compute_f1(multihot_targets, scores, candidate))
        for candidate in candidates
    ]
    macro_values = [entry[2] for entry in per_threshold]

    best_macro = max(macro_values)
    # argmax returns the first index on ties, i.e. the lowest threshold wins.
    best_idx = np.argmax(macro_values)
    best_micro, best_samples, _, best_none = per_threshold[best_idx]

    return {
        "micro": best_micro,
        "macro": best_macro,
        "samples": best_samples,
        "threshold": candidates[best_idx],
        "none": best_none,
    }
def compute_f1(
    multihot_targets: np.ndarray, scores: np.ndarray, threshold: float = 0.5
) -> Tuple[float, float, float, np.ndarray]:
    """
    binarize scores at a threshold and compute f1 under four averagings

    args:
        multihot_targets: multi-hot ground truth, passed to f1_score as y_true
        scores: prediction scores; an entry counts as a positive prediction
            when it is strictly greater than ``threshold``
        threshold: decision threshold applied to ``scores``
    returns:
        (micro, samples, macro, none) where the first three are the f1
        scores for average="micro" / "samples" / "macro" and ``none`` is
        the result of average=None (one f1 value per class).
    """
    # change scores to predict_labels
    # NOTE: the builtin ``int`` replaces ``np.int``, which was only an alias
    # for it and was removed in NumPy 1.24 (AttributeError on newer NumPy).
    predict_labels = (scores > threshold).astype(int)

    # get f1 scores, in the order they are returned
    micro, samples, macro, per_class = (
        f1_score(y_true=multihot_targets, y_pred=predict_labels, average=avg)
        for avg in ("micro", "samples", "macro", None)
    )
    return micro, samples, macro, per_class
def multihot(x: List[List[int]], nb_classes: int) -> np.ndarray:
    """transform to multihot encoding
    Arguments:
        x: list of multi-class integer labels, in the range
            [0, nb_classes-1]; one (possibly empty) list per sample
        nb_classes: number of classes for the multi-hot vector
    Returns:
        multihot: multihot vector of type int, (num_samples, nb_classes)
    Raises:
        IndexError: if a label is >= nb_classes
    """
    # NOTE: the builtin ``int`` replaces ``np.int``, which was only an alias
    # for it and was removed in NumPy 1.24. Allocating with the final dtype
    # directly also avoids the former int32 -> int round trip.
    encoded = np.zeros((len(x), nb_classes), dtype=int)
    for row, labels in enumerate(x):
        for label in labels:
            encoded[row, label] = 1
    return encoded