# gpt4-eval.py
import os

import pandas as pd
from openai import OpenAI
from tqdm import tqdm

# Globals
DATA_DIR = "data/"
DATA_FILENAME = "pubmed_eval_samples123.csv"
openai_api_key = os.environ.get("OPENAI_API_KEY", "")  # or paste the key here directly
def prompt(user_prompt, system_prompt):
    """Send one chat-completion request and return the reply text."""
    model = "gpt-4"  # evaluator model; "gpt-3.5-turbo" variants were also tried
    temperature = 0

    client = OpenAI(
        api_key=openai_api_key,
    )
    response = client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": system_prompt,
            },
            {
                "role": "user",
                "content": user_prompt,
            },
        ],
        model=model,
        temperature=temperature,
        max_tokens=60,  # room for the score plus the ~30-word explanation requested below
    )
    reply = response.choices[0].message.content
    return reply
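
# Optional helper, not part of the original pipeline: long evaluation runs can hit
# transient rate limits (note the commented-out time.sleep in the loop below), so this
# sketch retries the call a few times with a short pause. The retry count and delay are
# assumptions; swap `prompt` for `prompt_with_retry` in get_geval_score to use it.
import time

def prompt_with_retry(user_prompt, system_prompt, retries=3, delay=5):
    for attempt in range(retries):
        try:
            return prompt(user_prompt, system_prompt)
        except Exception as exc:  # exact exception types vary across openai versions
            if attempt == retries - 1:
                raise
            print(f"API call failed ({exc}); retrying in {delay}s")
            time.sleep(delay)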
# load data
df = pd.read_csv(f"{DATA_DIR}/{DATA_FILENAME}")
print(df.columns)
print("# of samples: ", len(df))
# Evaluation prompt template based on G-Eval
EVALUATION_SYSTEM_PROMPT_TEMPLATE = """
You will be given one summary written for an article. Your task is to rate the summary on one metric.
Please make sure you read and understand these instructions very carefully.
Please keep this document open while reviewing, and refer to it as needed.
Evaluation Criteria:
{criteria}
Evaluation Steps:
{steps}
"""
EVALUATION_USER_PROMPT_TEMPLATE = """
Example:
Source Text:
{document}
Summary:
{summary}
Evaluation Form:
Please provide your response in two parts: first, the score as a numeric value, followed by an explanation for the score. Please limit your response to 30 words.
- {metric_name}
"""
# Metric 1: Coherence
COHERENCE_SCORE_CRITERIA = """
Coherence(1-5) - the collective quality of all sentences. \
We align this dimension with the DUC quality question of structure and coherence \
whereby "the summary should be well-structured and well-organized. \
The summary should not just be a heap of related information, but should build from sentence to sentence to a \
coherent body of information about a topic."
"""
COHERENCE_SCORE_STEPS = """
1. Read the article carefully and identify the main topic and key points.
2. Read the summary and compare it to the article. Check if the summary covers the main topic and key points of the article,
and if it presents them in a clear and logical order.
3. Assign a score for coherence on a scale of 1 to 5, where 1 is the lowest and 5 is the highest based on the Evaluation Criteria.
"""
# Metric 2: Fluency
FLUENCY_SCORE_CRITERIA = """
Fluency(1-3): the quality of the summary in terms of grammar, spelling, punctuation, word choice, and sentence structure.
1: Poor. The summary has many errors that make it hard to understand or sound unnatural.
2: Fair. The summary has some errors that affect the clarity or smoothness of the text, but the main points are still comprehensible.
3: Good. The summary has few or no errors and is easy to read and follow.
"""
FLUENCY_SCORE_STEPS = """
Read the summary and evaluate its fluency based on the given criteria. Assign a fluency score from 1 to 3.
"""
# Metric 3: Domain Adaptation
DOMAIN_ADAPTATION_SCORE_CRITERIA = """
Domain Adaptation(1-5) - the degree to which the summary adheres to domain-specific language. \
A good summary employs domain-specific terminology and conveys the sense that the model comprehends and encapsulates domain-specific knowledge. \
It resembles content that would be authored by a domain expert. \
Annotators were also asked to penalize summaries that did not adhere to domain-specific knowledge and instead used simple words.
"""
DOMAIN_ADAPTATION_SCORE_STEPS = """
1. Read the article carefully and understand the domain it belongs to.
2. Read the summary and check if it contains domain-specific terminology and concepts, and if it concisely summarizes the domain-specific concepts in the article.
3. Assign a score for domain adaptation based on the Evaluation Criteria.
"""
def get_geval_score(
    criteria: str, steps: str, document: str, summary: str, metric_name: str
):
    """Fill the G-Eval prompt templates for one metric and return the model's raw reply."""
    system_prompt = EVALUATION_SYSTEM_PROMPT_TEMPLATE.format(
        criteria=criteria,
        steps=steps,
    )
    user_prompt = EVALUATION_USER_PROMPT_TEMPLATE.format(
        metric_name=metric_name,
        document=document,
        summary=summary,
    )
    response = prompt(
        user_prompt=user_prompt,
        system_prompt=system_prompt,
    )
    return response
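
# Optional helper, not part of the original script: the user prompt asks for a numeric
# score followed by a short explanation, so the raw reply stored below is free text.
# This sketch pulls out the leading number (returning None if no number is found) and
# is used only by the optional post-processing example at the end of the file.
import re

def parse_score(reply):
    match = re.search(r"\d+(?:\.\d+)?", str(reply))
    return float(match.group()) if match else None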
evaluation_metrics = {
    "Domain_Adaptation": (DOMAIN_ADAPTATION_SCORE_CRITERIA, DOMAIN_ADAPTATION_SCORE_STEPS),
    "Coherence": (COHERENCE_SCORE_CRITERIA, COHERENCE_SCORE_STEPS),
    "Fluency": (FLUENCY_SCORE_CRITERIA, FLUENCY_SCORE_STEPS),
}
models_to_evaluate = [
    'meta-llama-Llama-2-7b-chat-hf_2-SHOT',
    'meta-llama-Llama-2-7b-hf-mtc-pubmed_0-SHOT',
    'meta-llama-Llama-2-70b-chat-hf_2-SHOT',
    'pegasusx_Finetuned',
]
eval_column_names = [
    f"{eval_metric}_{model}"
    for model in models_to_evaluate
    for eval_metric in evaluation_metrics.keys()
]
eval_column_names.insert(0, 'sample_id')
df_geval = pd.DataFrame(columns=eval_column_names)
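
# Optional sanity check, not part of the original script: the loop below assumes the
# CSV has an 'Unnamed: 0' index column, an 'article' column with the source text, and
# one summary column per entry in models_to_evaluate, so fail early if any is missing.
missing_columns = [
    col for col in ['Unnamed: 0', 'article'] + models_to_evaluate if col not in df.columns
]
if missing_columns:
    raise KeyError(f"Missing expected columns in {DATA_FILENAME}: {missing_columns}")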
for i, sample in tqdm(df.iterrows(), total=len(df)):
    eval_scores = []
    eval_scores.append(sample['Unnamed: 0'])  # original row index from the CSV, used as sample_id
    for model in models_to_evaluate:
        print(model)
        # time.sleep(5)
        article = sample.article
        summary = sample[model]
        for eval_type, (criteria, steps) in evaluation_metrics.items():
            print(eval_type)
            result = get_geval_score(criteria, steps, article, summary, eval_type)
            eval_scores.append(result)
    # one row per article: sample_id followed by one reply per model/metric pair
    df_geval.loc[len(df_geval)] = eval_scores
output_path = "pubmed_GPT4_Evaluation_final.xlsx"
df_geval.to_excel(output_path)
print(f"Evaluation scores stored at {output_path}")