metrics.py
import argparse
import json
import os

import numpy as np
from nltk import word_tokenize
from simcse import SimCSE
from tqdm import tqdm


def get_args():
    parser = argparse.ArgumentParser()
    # Name of the generation run; generations are read from outputs/<run_name>.json
    # and the computed metrics are written to evaluations/<run_name>.json.
    parser.add_argument("--run_name", default="llama-7b-greedy", type=str)
    parser.set_defaults(bottleneck=True)
    parser.set_defaults(augment=True)
    args = parser.parse_args()
    return args


def compute_rep_n(text, n):
    # rep-n: percentage of repeated n-grams in the word-tokenized text.
    tokens = word_tokenize(text)
    ngrams = [tuple(tokens[i : i + n]) for i in range(len(tokens) - n + 1)]
    # The +1 in the denominator avoids division by zero for very short responses.
    rep_n = 100 * (1.0 - len(set(ngrams)) / (len(ngrams) + 1))
    return rep_n
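
# Illustrative example (assuming NLTK's default word tokenizer): the text
# "the cat sat on the mat the cat" has 7 bigrams, 6 of them unique, so
# rep_2 = 100 * (1 - 6 / (7 + 1)) = 25.0.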


def compute_diversity(text):
    # Diversity: product of (1 - rep_n / 100) over n = 2, 3, 4.
    diversity = 1.0
    for n in range(2, 5):
        rep_n_val = compute_rep_n(text, n)
        diversity *= 1.0 - rep_n_val / 100
    return diversity
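
# Continuing the illustrative example above: for "the cat sat on the mat the cat",
# rep_3 ≈ 14.3 and rep_4 ≈ 16.7, so diversity ≈ 0.75 * 0.857 * 0.833 ≈ 0.54.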


def clean(text, sep="###"):
    return text.split(sep)[0]


def average(entries):
    return sum(entries) / len(entries)


def compute_coherence(prompts, responses):
    # Coherence: mean SimCSE similarity between each prompt and its matched response.
    model = SimCSE("princeton-nlp/sup-simcse-bert-base-uncased")
    similarities = np.array(model.similarity(prompts, responses))
    return similarities.trace() / len(similarities)
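
# A note on the computation above (assuming SimCSE's list-vs-list behaviour):
# model.similarity(prompts, responses) returns an N x N matrix of pairwise scores,
# so the trace divided by N is the average similarity of the matched
# (prompt_i, response_i) pairs on the diagonal.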


if __name__ == "__main__":
    args = get_args()

    # Load the generations produced by the run.
    path = os.path.join("outputs", f"{args.run_name}.json")
    generations = json.load(open(path, "r"))

    entries = []
    for generation in tqdm(generations):
        prompt = generation["prompt"]
        # Strip the prompt prefix and truncate at the first follow-up turn marker.
        response = clean(clean(generation["response"][len(prompt) :], "###Human:"), "\n\nHuman:")
        if len(response) == 0:
            response = " "
        rep_2 = compute_rep_n(response, 2)
        rep_3 = compute_rep_n(response, 3)
        rep_4 = compute_rep_n(response, 4)
        diversity = compute_diversity(response)
        entries.append(
            {
                "prompt": prompt,
                "response": response,
                "original_response": generation["response"][len(prompt) :],
                "rep_2": rep_2,
                "rep_3": rep_3,
                "rep_4": rep_4,
                "diversity": diversity,
                "response_length": len(response),
                "elapsed": generation["elapsed"],
            }
        )

    # Aggregate the per-example metrics into corpus-level averages.
    evaluations = {
        "rep_2": average([entry["rep_2"] for entry in entries]),
        "rep_3": average([entry["rep_3"] for entry in entries]),
        "rep_4": average([entry["rep_4"] for entry in entries]),
        "diversity": average([entry["diversity"] for entry in entries]),
        "coherence": compute_coherence(
            [entry["prompt"] for entry in entries], [entry["response"] for entry in entries]
        ),
        "response_length": average([entry["response_length"] for entry in entries]),
        "elapsed": average([entry["elapsed"] for entry in entries]),
        "entries": entries,
    }

    # Ensure the output directory exists before writing the evaluation summary.
    os.makedirs("evaluations", exist_ok=True)
    eval_path = os.path.join("evaluations", f"{args.run_name}.json")
    json.dump(evaluations, open(eval_path, "w"), indent=2)
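
# Expected usage (illustrative invocation; the input file must already exist):
#
#   python metrics.py --run_name llama-7b-greedy
#
# outputs/llama-7b-greedy.json should hold a list of records with at least
# "prompt", "response" (with the prompt prefix included), and "elapsed" fields;
# the aggregated metrics are written to evaluations/llama-7b-greedy.json.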