clean_rule.py
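
"""Clean and summarize logical rules generated by an LLM.

Reads per-relation rule files, optionally asks an LLM to summarize the most
important rules for each rule head, then corrects misspelled relation names
against the dataset's relation vocabulary and rewrites each rule in a
chain-like format.
"""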
import argparse
import os
import re
from difflib import get_close_matches

from data import *   # Dataset
from utils import *  # query, num_tokens_from_message, shuffle_split_path_list


def extract_rules(content_list):
    """Extract the rules from the content, dropping explanations and any leading numbering."""
    rule_pattern = re.compile(r".*\(X,\s?Y\) <--.*")  # A rule always contains "(X,Y) <--"
    extracted_rules = [s.strip() for s in content_list if rule_pattern.match(s)]
    number_pattern = re.compile(r"^\d+\. ")  # e.g. "1. " prefixes from numbered lists
    cleaned_rules = [number_pattern.sub('', s) for s in extracted_rules]
    return list(set(cleaned_rules))  # Deduplicate via a set
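
# A minimal illustration on hypothetical model output:
#   extract_rules(["Here are the rules:", "1. parent(X,Y) <-- father(X,Y)"])
#   returns ["parent(X,Y) <-- father(X,Y)"]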


def summarize_rules_prompt(relname, k):
    """Generate the summarization prompt for the given relation."""
    if k != 0:
        prompt = f'\n\nPlease identify the most important {k} rules from the following rules for the rule head: "{relname}(X,Y)". '
    else:  # k == 0
        prompt = f'\n\nPlease identify as many of the most important rules for the rule head: "{relname}(X,Y)" as possible. '
    prompt += 'You can summarize the rules that have similar meanings as one rule, if you think they are important. ' \
              'Return the rules only without any explanations. '
    return prompt
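
# For example, summarize_rules_prompt("parent", 5) asks the model for the five
# most important rules with rule head "parent(X,Y)"; with k=0 it asks for as
# many important rules as possible. ("parent" is an illustrative relation name.)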


def get_valid_rules(input_filepath, output_filepath, valid_response_filepath):
    """Ask GPT-4 to validate each rule; keep only the rules it judges correct."""
    with open(input_filepath, "r") as f:
        sum_rule_list = [line.strip() for line in f]
    valid_prompt = ("Logical rules define the relationship between two entities: X and Y.\n"
                    "Now please analyse this relation rule path step by step to check whether it is correct.\n"
                    "If the rule is correct, please write (Correct) at the end of your analysis; otherwise, please write (Incorrect).\n\n")
    with open(output_filepath, "w") as f1, open(valid_response_filepath, 'w') as f2:
        for sum_rule in sum_rule_list:
            message = valid_prompt + sum_rule
            response = query(message, model="gpt-4")
            print(response)
            f2.write("Input Rule: " + sum_rule + "\n")
            f2.write("GPT-4 Response: \n" + response + '\n')
            f2.write("\n=======================================\n")
            if "incorrect" not in response.lower():
                f1.write(sum_rule + '\n')
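
# Note: get_valid_rules is a standalone helper (presumably behind the
# --valid_clean flag) and is not invoked by clean() below; it filters a
# summarized-rule file through a GPT-4 correctness check.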


def check_sample_times(content_list):
    """Count the "Sample N time:" markers; return True if there was only one sample."""
    sample_times = 0
    for line in content_list:
        if re.search(r'Sample \d+ time:', line):
            sample_times += 1
    return sample_times == 1
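
# e.g. a file containing exactly one "Sample 1 time:" marker yields True;
# files with several sampling rounds yield False.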


def summarize_rule(file, args):
    """Summarize the rules in a single rule file."""
    with open(file, 'r') as f:  # Load the file
        content = f.read()
    results = re.match(r"Rule_head:\s(.*)", content)  # The first line names the rule head
    rel_name = results.group(1)
    # rel_name = clean_symbol_in_rel(rel_name)
    content_list = content.split('\n')
    is_sample_once = check_sample_times(content_list)
    rule_list = extract_rules(content_list)  # Extract rules and drop any explanations
    if (is_sample_once or args.model == 'none') and not args.force_summarize:
        # Nothing to merge: return the whole rule list as-is
        return rule_list
    else:  # Summarize with the LLM and correct spelling errors
        summarize_prompt = summarize_rules_prompt(rel_name, args.k)
        summarize_prompt_len = num_tokens_from_message(summarize_prompt, args.model)
        # Split the rule list into chunks bounded by the prompt length and model
        list_of_rule_lists = shuffle_split_path_list(rule_list, summarize_prompt_len, args.model)
        response_list = []
        for rule_list in list_of_rule_lists:
            message = '\n'.join(rule_list) + summarize_prompt
            print('prompt: ', message)
            response = query(message, model=args.model)
            response_list.extend(response.split('\n'))
        response_rules = extract_rules(response_list)  # Extract rules from the summarized responses
        return response_rules
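
# The input file is expected to begin with "Rule_head: <relation>"; with
# --model none (or a single-sample file) the extracted rules are returned
# verbatim unless --force_summarize is set.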


def clean_rules(summarized_file_path, all_rels):
    """Correct misspelled relations in the rules and drop rules whose relations are unknown."""
    with open(summarized_file_path, 'r') as f:
        input_rules = [line.strip() for line in f]
    cleaned_rules = list()
    # Fix spelling/grammar errors in the relation names and remove rules with unknown relations.
    for rule in input_rules:
        if rule == "":
            continue
        try:
            # Get the rule head
            match = re.search(r"([\w\s'.-]+)\(X,\s?Y\)", rule)
            if not match:
                continue
            head = match.group(1).strip()
            if head not in all_rels:
                # Snap a misspelled head to the closest known relation
                best_match = get_close_matches(head, all_rels, n=1)
                if not best_match:
                    print("Cannot correct this rule, head not in relations: ", rule)
                    continue
                head = best_match[0].strip()
            # Get the rule body atoms and check that each relation is in the relation list
            condition_string = rule.split('<--')[1].strip()
            matches = re.findall(r"([\w\s'.-]+)\((\w+),\s*(\w+)\)", condition_string)
            last_subject = "X"
            body_list = []
            corrected = len(matches) > 0
            for match in matches:
                predicate = match[0].strip()
                subject = match[1].strip()
                obj = match[2].strip()
                if predicate not in all_rels:
                    best_match = get_close_matches(predicate, all_rels, n=1)
                    if not best_match:
                        corrected = False
                        print(f"Cannot correct this rule, body: {predicate} not in relations: ", rule)
                        break
                    predicate = best_match[0].strip()
                # Make sure the rule is in the chain-like format
                if subject == last_subject:
                    body_list.append(predicate)
                    last_subject = obj
                else:
                    # The atom is written backwards: flip it with the inverse relation
                    last_subject = subject
                    if "inv_" in predicate:
                        body_list.append(predicate.replace("inv_", ""))
                    else:
                        body_list.append(f"inv_{predicate}")
            # Keep the corrected rule if every body atom was resolved
            if corrected:
                cleaned_rules.append(f"{head} <-- {', '.join(body_list)}")
        except Exception as e:
            print(f"Processing {rule} failed.\n Error: {str(e)}")
    return cleaned_rules
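
# A hypothetical illustration: "parent(X,Y) <-- father(X,Z), mother(Z,Y)" is
# emitted as "parent <-- father, mother", while a reversed atom such as
# "father(Z,X)" would be replaced by "inv_father" to preserve the X-to-Y chain.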


def write_clean_rules_to_file(cleaned_rules, output_filepath, all_rels):
    """Write cleaned rules to the output file in a simplified format."""
    with open(output_filepath, "w") as output_file:
        for rule in cleaned_rules:
            try:
                match = re.search(r"([\w\s'.-]+)\(X,\s?Y\)", rule)  # Get the rule head
                if match:
                    head = match.group(1).strip()
                    if head not in all_rels:
                        raise KeyError(f"Relation {head} not found in the relation list")
                else:
                    continue
                # Get the rule body and write it out in the simplified format
                condition_string = rule.split('<--')[1].strip()
                matches = re.findall(r"([\w\s'.-]+)\(", condition_string)
                conditions = []
                for match in matches:
                    match = match.strip()
                    if match in all_rels:
                        conditions.append(match)
                    else:
                        raise KeyError(f"Relation {match} not found in the relation list")
                # Write to file
                output_file.write(f"{head} <-- {', '.join(conditions)}\n")
            except KeyError as e:
                print(f"Skipping rule {rule} due to error: {e}")
                continue


def clean(args):
    data_path = os.path.join(args.data_path, args.dataset) + '/'
    dataset = Dataset(data_root=data_path, inv=True)
    rdict = dataset.get_relation_dict()
    all_rels = list(rdict.rel2idx.keys())
    input_folder = os.path.join(args.rule_path, args.dataset, args.p)
    output_folder = os.path.join(args.output_path, args.dataset, args.p, args.model)
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    for filename in os.listdir(input_folder):
        if filename.endswith(".txt") and "query" not in filename:
            input_filepath = os.path.join(input_folder, filename)
            name, ext = os.path.splitext(filename)
            summarized_filepath = os.path.join(output_folder, f"{name}_summarized_rules.txt")
            clean_filepath = os.path.join(output_folder, f"{name}_cleaned_rules.txt")
            if not args.clean_only:
                # Step 1: Summarize rules from the input file
                print("Start summarizing: ", filename)
                summarized_rules = summarize_rule(input_filepath, args)
                print("Writing file", summarized_filepath)
                with open(summarized_filepath, "w") as f:
                    f.write('\n'.join(summarized_rules))
            # Step 2: Clean the summarized rules, keeping their format
            print(f"Cleaning file {summarized_filepath} while keeping the format")
            cleaned_rules = clean_rules(summarized_filepath, all_rels)
            with open(clean_filepath, "w") as f:
                f.write('\n'.join(cleaned_rules))
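
# Outputs land under <output_path>/<dataset>/<p>/<model>/:
#   <name>_summarized_rules.txt  (must already exist when --clean_only is set)
#   <name>_cleaned_rules.txt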


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', type=str, default='datasets', help='data directory')
    parser.add_argument("--rule_path", default="gen_rules", type=str, help="path to the rule files")
    parser.add_argument("--output_path", default="clean_rules", type=str, help="path to the output files")
    parser.add_argument('--dataset', default='family')
    parser.add_argument('--model', default='none', help='model name', choices=['none', 'gpt-4', 'gpt-3.5-turbo', 'gpt-3.5-turbo-16k'])
    parser.add_argument('-p', default='gpt-3.5-turbo-top-0-f-5-l-3', help='rule prefix')
    parser.add_argument('-k', type=int, default=0, help='number of summarized rules (0 keeps as many as possible)')
    parser.add_argument('--clean_only', action='store_true', help='load summarized rules and clean them only')
    parser.add_argument('--valid_clean', action='store_true', help='GPT-4 validation for rules')
    parser.add_argument('--force_summarize', action='store_true', help='force summarization of the rules')
    args = parser.parse_args()
    clean(args)
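
# Example invocation (assuming generated rules exist under
# gen_rules/family/gpt-3.5-turbo-top-0-f-5-l-3/):
#   python clean_rule.py --dataset family --model gpt-3.5-turbo -p gpt-3.5-turbo-top-0-f-5-l-3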