get_original.py
import sys
import os
import spacy
import json
import ast
import time
from os import listdir
from bs4 import BeautifulSoup
from tqdm import tqdm
start_time = time.time()
gigaword_path = sys.argv[1]
output_dir = sys.argv[2]
#spacy.prefer_gpu()
spacy.require_gpu()
nlp = spacy.load('en_core_web_trf')#, disable=["tagger","attribute_ruler","lemmatizer","ner"])
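# Example invocation (the paths below are hypothetical, not prescribed by the script):
#   python get_original.py /path/to/gigaword_eng_5 data/original
# sys.argv[1] is the Gigaword root that contains the gigaword_eng_5_d* directories,
# and sys.argv[2] is the directory the tokenized .jsonl files are written to.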
gigawords_dirs = ["gigaword_eng_5_d1",
                  "gigaword_eng_5_d2", "gigaword_eng_5_d3"]
source2dir = {"afp": 0, "apw": 0, "cna": 1,
              "ltw": 1, "nyt": 1, "wpb": 1, "xin": 2}
train_dirs = ["data/train/train_-11_10", "data/train/train_-31_30",
              "data/train/train_-101_100", "data/train/train_-301_300"]
relations = ["0", "1", "3", "5", "7"]
train_files = [f"{d}_{r}.txt" for d in train_dirs for r in relations] + [
    "data/train/train_negatives.txt"]
val_files = ["data/val/0.txt", "data/val/1.txt",
             "data/val/3.txt", "data/val/5.txt", "data/val/7.txt"]
test_files = ["data/test/0.txt", "data/test/1.txt",
              "data/test/3.txt", "data/test/5.txt", "data/test/7.txt"]
dynamic_val_files = ["data/dynamic_val/0.txt", "data/dynamic_val/1.txt",
                     "data/dynamic_val/3.txt", "data/dynamic_val/5.txt", "data/dynamic_val/7.txt"]
dynamic_test_files = ["data/dynamic_test/0.txt", "data/dynamic_test/1.txt",
                      "data/dynamic_test/3.txt", "data/dynamic_test/5.txt", "data/dynamic_test/7.txt"]
all_files = train_files + val_files + test_files + \
    dynamic_val_files + dynamic_test_files
#all_files = dynamic_test_files
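# Each line of the files above is a Python-literal dict read with ast.literal_eval.
# A minimal sketch of the fields this script relies on (values are illustrative,
# not taken from the real data):
#   {"doc_id": "AFP_ENG_19940512.0001", "sentence_id": 3,
#    "h": {"pos": [0, 2]}, "t": {"pos": [5, 6]}}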
def get_article_path():
    # collect gigaword paths and indices of all sentences from our data
    path2docid2sentid = {}
    for filename in all_files:
        for line in open(filename, "r"):
            dline = ast.literal_eval(line)
            docid = dline["doc_id"]
            sentid = dline["sentence_id"]
            source = (docid.split("_")[0]).lower()
            source_dir = gigawords_dirs[source2dir[source]]
            source_filename = (docid.split(".")[0]).lower()[:-2]
            source_filename_path = f"{gigaword_path}/{source_dir}/data/{source}_eng/{source_filename}"
            if source_filename_path not in path2docid2sentid:
                path2docid2sentid[source_filename_path] = {}
            if docid not in path2docid2sentid[source_filename_path]:
                path2docid2sentid[source_filename_path][docid] = []
            path2docid2sentid[source_filename_path][docid].append(sentid)
    for src in path2docid2sentid:
        for docid in path2docid2sentid[src]:
            path2docid2sentid[src][docid] = list(set(path2docid2sentid[src][docid]))
    return path2docid2sentid
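# The mapping returned above is nested as {gigaword_file_path: {doc_id: [sentence_ids]}},
# with duplicate sentence ids removed, e.g. (illustrative only):
#   {".../gigaword_eng_5_d1/data/afp_eng/afp_eng_199405":
#        {"AFP_ENG_19940512.0001": [3, 7]}}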
def process_articles(path2docid2sentid):
    # Extract sentences' paths and indices in gigaword.
    # Split document into sentences, then tokenize sentences that are in our dataset.
    # Save sentence indices and their tokenized sequences into a dictionary
    sentid2tokens = {}
    for path in tqdm(path2docid2sentid):
        docids = set(path2docid2sentid[path].keys())
        content = open(path, encoding="utf8").read()
        soup = BeautifulSoup(content, 'lxml')
        for doc in soup.find_all('doc'):
            docid = doc.attrs["id"]
            if docid not in docids:
                continue
            paras = doc.find_all('p')
            sentid = 0
            for p in paras:
                tmp = p.text
                tmp = " ".join(tmp.replace("\n", " ").strip().split())
                sentences = nlp(tmp).sents
                for sentence in sentences:
                    sentence = str(sentence).strip()
                    if sentence == "":
                        continue
                    sentid += 1
                    if sentid not in path2docid2sentid[path][docid]:
                        continue
                    sentence = nlp(sentence)
                    tokens = (str(sentence)).split(" ")
                    sentid2tokens[f"{docid}_{sentid}"] = tokens
    return sentid2tokens
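# sentid2tokens keys combine the document id and the 1-based sentence index as
# "{doc_id}_{sent_id}"; values are whitespace-split token lists, e.g. (illustrative):
#   {"AFP_ENG_19940512.0001_3": ["The", "price", "of", "oil", "rose", "."]}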
def output_with_text(input_file, prefix, sentid2tokens):
    fout_name = f"{output_dir}/{prefix}{input_file.split('/')[-1]}"
    fout_name = fout_name.replace(".txt", ".jsonl")
    fout = open(fout_name, 'w')
    inputlines = open(input_file).read().strip().split("\n")
    for line in inputlines:
        sentence = ast.literal_eval(line)
        docid = sentence["doc_id"]
        sentid = sentence["sentence_id"]
        if docid + "_" + str(sentid) not in sentid2tokens:
            print(f"{docid} {sentid} does not exist in sentid2tokens")
            continue
        tokens = sentid2tokens[f"{docid}_{sentid}"]
        sentence["tokens"] = tokens
        head_start, head_end = sentence["h"]["pos"][0], sentence["h"]["pos"][1]
        tail_start, tail_end = sentence["t"]["pos"][0], sentence["t"]["pos"][1]
        sentence["h"]["name"] = tokens[head_start:head_end]
        sentence["t"]["name"] = tokens[tail_start:tail_end]
        json.dump(sentence, fout)
        fout.write(",\n")
    fout.close()
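# Each output record is the input dict plus a "tokens" field, with "h"/"name" and
# "t"/"name" replaced by the token spans tokens[start:end]; records are written one
# per line, terminated by ",\n" as the function above does.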
print("collect all data paths")
sys.stdout.flush()
path2docid2sentid = get_article_path()
print("sentence splitting and tokenizing")
sys.stdout.flush()
sentid2tokens = process_articles(path2docid2sentid)
print("Tokenized")
sys.stdout.flush()
if not os.path.exists(output_dir):
os.makedirs(output_dir)
for filepath in train_files:
output_with_text(filepath, "", sentid2tokens)
for filepath in val_files:
output_with_text(filepath, "val_", sentid2tokens)
for filepath in test_files:
output_with_text(filepath, "test_", sentid2tokens)
for filepath in dynamic_val_files:
output_with_text(filepath, "dynamic_val_", sentid2tokens)
for filepath in dynamic_test_files:
output_with_text(filepath, "dynamic_test_", sentid2tokens)
end_time = time.time()
print(f"Spent {(end_time - start_time) / 3600.0} hours in total")