-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathabbr_preprocess_slovenian.py
220 lines (183 loc) · 11.6 KB
/
abbr_preprocess_slovenian.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
from dataclasses import dataclass
from typing import List, Dict
import json
from collections import defaultdict
from sklearn.model_selection import train_test_split
from utils.abbr_utils import AnnotatedSentence, read_slovene_conll, get_masked_text
def _get_multitok_abbrs(tokens_abbr, tokens_exp, labels):
"""
This works better when the mode is exp2abbr, otherwise the mapping is just repetitions og the same expansion given a fragmented abbreviation
Example: abbr2exp: B. - jev ---> Burgarjev || exp2abbr: Burgarjev ---> B.-jev
"""
multitok = defaultdict(list)
open_exp = []
curr_abbr = ""
for i, l in enumerate(labels):
if len(open_exp) > 0:
# if l == 'I-ABBR' is the STRICTEST way to catch multi-token abbreviations, however annotations are inconsistent: some are with consecutive [B-ABBR, B-ABBR, ...] instead of [[B-ABBR, I-ABBR, ...]].
# l != 'O' catches ALL consecutive abbreviations, however this has a lot of FalsePositives and introduces more entropy, is better to treat each abbreviation individually
if l == 'I-ABBR':
open_exp.append(tokens_exp[i])
elif l == 'O':
if len(open_exp) > 1:
multitok[curr_abbr].append(" ".join(open_exp))
open_exp = []
elif l == 'B-ABBR':
open_exp.append(tokens_exp[i])
curr_abbr = tokens_abbr[i]
else:
continue
return multitok
def get_dataset_stats(dataset: List[AnnotatedSentence], verbose: bool, use_naive_tokens: bool) -> Dict:
    """Displays the structured data. For debug purposes (put verbose=True), it also returns global statistics information from the whole corpus

    Args:
        dataset (List[AnnotatedSentence]): The list of clean structured corpus examples
        verbose (bool): Decide if print the entire corpus examples or not
        use_naive_tokens (bool): If True use naive space-based tokens instead of the CoNLL gold tokens

    Returns:
        Dict: It includes some relevant corpus stats that might be useful for "global" methods...
    """
    all_mappings = defaultdict(set)      # source form -> set of target forms (direction depends on sent.mode)
    mappings_counter = defaultdict(int)  # (abbreviation, expansion) pair -> frequency
    # General Stats Counters
    number_of_sentences, number_of_tokens = 0, 0
    all_tokens = []
    all_abbr_candidates, all_abbr_expansions = [], []
    all_multitok_abbrs = []
    for sent in dataset:
        number_of_sentences += 1
        number_of_tokens += len(sent.tokens)
        all_tokens += [tok.text for tok in sent.tokens]
        for item in sent.mapping:
            # Each mapping item is a triple; the order of (abbreviation, expansion) depends on the mode
            if sent.mode == 'abbr2exp':
                index, abbrev, expan = item
                all_mappings[abbrev].add(expan)
            else:
                index, expan, abbrev = item
                all_mappings[expan].add(abbrev)
            all_abbr_expansions.append(expan)
            all_abbr_candidates.append(abbrev)
            mappings_counter[(abbrev, expan)] += 1
        abbr_toks, exp_toks, lbls = sent.get_labeled_sequences(use_naive_tokens=use_naive_tokens)
        if verbose:
            print(" ".join(abbr_toks))
            print(" ".join(exp_toks))
            print(" ".join(lbls))
            print(sent.mapping)
            print('-------')
        multitok = _get_multitok_abbrs(abbr_toks, exp_toks, lbls)
        if len(multitok) > 0:
            all_multitok_abbrs.append(multitok)
    if verbose:
        # Most ambiguous entries first (more distinct targets), then most frequent pairs first
        for k, v in sorted(all_mappings.items(), key=lambda x: len(x[1]), reverse=True):
            print(f"{k} --> {v}")
        print("\n\n")
        for k, v in sorted(mappings_counter.items(), key=lambda x: x[1], reverse=True):
            print(f"{k} --> {v}")
    number_of_types = len(set(all_tokens))
    number_abbr_candidates = len(all_abbr_candidates)
    number_unique_abbr_candidates = len(set(all_abbr_candidates))
    number_abbr_expansions = len(all_abbr_expansions)
    number_unique_expansions = len(set(all_abbr_expansions))
    print(f"\tSentences = {number_of_sentences}\n\tTokens = {number_of_tokens}\n\tTypes = {number_of_types}\n\tAbbreviations = {number_abbr_candidates} (unique = {number_unique_abbr_candidates})\n\tExpansions = {number_abbr_expansions} (unique = {number_unique_expansions})")
    print(f"\tAbbr->Expansion Pairs = {len(mappings_counter)}")
    # Plain loop instead of a list comprehension used only for its print() side effects
    for multitok_entry in all_multitok_abbrs:
        print(multitok_entry)
    return {
        "exp2abbr_mapping": all_mappings,
        "exp_abbr_pairwise_mapping_counts": mappings_counter,
        "all_abbr_candidates": all_abbr_candidates
    }
def compare_data_partitions(train_stats: Dict, test_stats: Dict, label: str):
    """Print how many abbreviation candidates of the second partition never occur in the first.

    The difference is computed case-sensitively, then the unseen candidates are
    lower-cased before counting, so casing variants collapse to one entry.

    Args:
        train_stats (Dict): stats dict from get_dataset_stats() for the reference partition
        test_stats (Dict): stats dict from get_dataset_stats() for the held-out partition
        label (str): human-readable tag used in the printed report
    """
    unique_test_abbrs = set(test_stats['all_abbr_candidates']).difference(train_stats['all_abbr_candidates'])
    # Set comprehension instead of set([...]) around a list comprehension
    unique_candidates = {abbr.lower() for abbr in unique_test_abbrs}
    print(f"\nUnique Abbreviation Candidates in {label} (UNSEEN) = {len(unique_candidates)}")
def create_token_classification_data(data: List[AnnotatedSentence], use_naive_tokens: bool, output_path: str):
    """This function builds the dataset for STEP 1: Token Classification [ABBR, NO-ABBR] for each Token in the dataset

    We save it in a file so it can be later loaded as a HuggingFace Dataset and make batched experiments with it!

    Args:
        data (List[AnnotatedSentence]): The list of clean structured corpus examples
        use_naive_tokens (bool): If we can use the conll tokens or to do 'naive' space-based splitting
        output_path (str): Destination file; one JSON object per line with keys
            "token", "gold_label", "document_id" and "sent_id"
    """
    with open(output_path, "w", encoding='utf-8') as fout:
        for sent in data:
            # Expanded-side tokens are not needed for token classification, only the
            # abbreviated tokens and their BIO labels
            abbr_toks, _, lbls = sent.get_labeled_sequences(use_naive_tokens=use_naive_tokens)
            for tok, lbl in zip(abbr_toks, lbls):
                fout.write(json.dumps({"token": tok, "gold_label": lbl, "document_id": sent.doc_id, "sent_id": sent.sent_id}) + "\n")
def create_abbreviation_expansion_data(data: List[AnnotatedSentence], output_path: str, pre_expand_others: bool):
    """Write the expansion-prediction dataset: one JSON line per masked sentence.

    Each record carries the masked token sequence, the abbreviation candidate,
    its gold expansion and the index of the mask placeholder. When
    pre_expand_others is True, the other abbreviations in the sentence are
    pre-expanded by get_masked_text (presumably to give the masked one more
    explicit context — see the comments in the __main__ block).
    """
    with open(output_path, "w", encoding='utf-8') as fout:
        for doc in data:
            masked_doc = get_masked_text(doc, mask_token='<mask>', pre_expand_others=pre_expand_others)
            for sent_id, entry in masked_doc.items():
                record = {
                    'doc_id': doc.doc_id,
                    'sent_id': sent_id,
                    'masked_tokens': entry['sentence'],
                    'candidate': entry['candidate'],
                    'gold_expansion': entry['gold_expansion'],
                    'mask_index': entry['mask_index'],
                }
                fout.write(json.dumps(record) + "\n")
def save_document_data(data: List[AnnotatedSentence], output_path: str):
    """Serialize every document as one JSON line containing all token-level views.

    Both the gold CoNLL tokenization (use_naive_tokens=False) and the naive
    whitespace tokenization (use_naive_tokens=True) are stored, together with
    their label sequences, the raw mapping and the full token objects, so the
    output file is self-contained for downstream experiments.
    """
    with open(output_path, "w", encoding='utf-8') as fout:
        for doc in data:
            gold_abbr, gold_exp, gold_labels = doc.get_labeled_sequences(use_naive_tokens=False)
            naive_abbr, naive_exp, naive_lbls = doc.get_labeled_sequences(use_naive_tokens=True)
            record = {
                'document_id': doc.doc_id,
                'original_text': doc.source_text,
                'abbreviated_text': " ".join(naive_abbr),
                'expanded_text': " ".join(naive_exp),
                'mapping': doc.mapping,
                'token_objects': [tok.asdict() for tok in doc.tokens],
                'abbreviated_tokenized': gold_abbr,
                'expanded_tokenized': gold_exp,
                'token_labels': gold_labels,
                'token_labels_naive': naive_lbls
            }
            fout.write(json.dumps(record) + "\n")
def save_text_data(data: List[AnnotatedSentence], output_path: str):
    """Write one plain-text line per document: its naive (space-joined) abbreviated tokens."""
    with open(output_path, "w", encoding='utf-8') as fout:
        for doc in data:
            # Only the abbreviated-side tokens are needed; labels and expansions are discarded
            abbr_tokens = doc.get_labeled_sequences(use_naive_tokens=True)[0]
            fout.write(" ".join(abbr_tokens) + "\n")
def save_dataset_split_ids(filepath: str, dataset: List[AnnotatedSentence]) -> None:
    """Persist a split as {doc_id: [sent_id, ...]} JSON so the exact partition is reproducible.

    Args:
        filepath (str): destination JSON file
        dataset (List[AnnotatedSentence]): the examples belonging to this split
    """
    doc_dict = defaultdict(list)
    for doc in dataset:
        doc_dict[doc.doc_id].append(doc.sent_id)
    # encoding='utf-8' added for consistency with every other writer in this module
    with open(filepath, "w", encoding='utf-8') as fout:
        json.dump(doc_dict, fout, indent=4)
if __name__ == '__main__':
    # Load the SBL corpus in exp2abbr mode (expanded form is the source side; see _get_multitok_abbrs docstring)
    all_sentences = read_slovene_conll("resources/emnlp2022_data/sbl-51abbr-expan.conll", mode='exp2abbr')
    # Reproducible 80/10/10 split: 0.125 of the remaining 80% equals 10% of the whole corpus
    X_train, X_test = train_test_split(all_sentences, test_size=0.2, random_state=4239)
    X_train, X_dev = train_test_split(X_train, test_size=0.125, random_state=4239)
    print(len(X_train), len(X_dev), len(X_test))
    # Persist the doc/sentence ids of each partition so the exact split can be reconstructed later
    save_dataset_split_ids("data/sbl-51abbr.ids.train", X_train)
    save_dataset_split_ids("data/sbl-51abbr.ids.dev", X_dev)
    save_dataset_split_ids("data/sbl-51abbr.ids.test", X_test)
    # Compute Statistics for each given portion of the dataset
    use_naive_tokens=False # We want to report the GOLD statistics as they are manually curated on CoNLL Format
    print(f"\nComputing stats for {len(X_train)} examples in TRAIN")
    train_stats = get_dataset_stats(X_train, verbose=False, use_naive_tokens=use_naive_tokens)
    print(f"\nComputing stats for {len(X_dev)} examples in DEVELOPMENT")
    dev_stats = get_dataset_stats(X_dev, verbose=False, use_naive_tokens=use_naive_tokens)
    print(f"\nComputing stats for {len(X_test)} examples in TEST")
    test_stats = get_dataset_stats(X_test, verbose=False, use_naive_tokens=use_naive_tokens)
    # Overlap report: how many abbreviation candidates of the second partition are unseen in the first
    compare_data_partitions(train_stats, test_stats, label="TRAIN vs TEST")
    compare_data_partitions(train_stats, dev_stats, label="TRAIN vs DEV")
    compare_data_partitions(dev_stats, test_stats, label="DEV vs TEST")
    # DATA FOR EXPERIMENT 1: A SINGLE TOKEN BINARY CLASSIFIER (BERT) for Abbreviation or No_Abbreviation
    # NOTE(review): naive space-based tokens are used for the model-facing files — presumably to match
    # raw whitespace-split text at inference time; confirm against the training pipeline
    use_naive_tokens=True
    # Train Set for STEP 1
    create_token_classification_data(X_train, use_naive_tokens, output_path='data/sbl-51abbr.tok.train.json')
    save_document_data(X_train, output_path='data/sbl-51abbr.sentences.train.json')
    # Development Set for STEP 1
    create_token_classification_data(X_dev, use_naive_tokens, output_path='data/sbl-51abbr.tok.dev.json')
    save_document_data(X_dev, output_path='data/sbl-51abbr.sentences.dev.json')
    save_text_data(X_dev, output_path='data/sbl-51abbr.sentences.dev.txt')
    # Test Set for STEP 1
    create_token_classification_data(X_test, use_naive_tokens, output_path='data/sbl-51abbr.tok.test.json')
    save_document_data(X_test, output_path='data/sbl-51abbr.sentences.test.json')
    save_text_data(X_test, output_path='data/sbl-51abbr.sentences.test.txt')
    # EXPERIMENT 2: Save the Train/Dev/Test partitions with the documents. This will be used later to [MASK] them and prepare data (from STEP 1) for expansion prediction
    # The files say 'upperbound' because we know all abbreviation candidates inside the sentences are GOLD (which we won't know in the real-world scenario).
    # PreExpanded Dataset means that only one abbreviation at a time is treated. Assuming everything else has been correctly expanded already (meaning that the abbreviation expanded has more "explicit" context)
    create_abbreviation_expansion_data(X_train, output_path='data/sbl-51abbr.masked.upperbound.preexp.train.json', pre_expand_others=True)
    create_abbreviation_expansion_data(X_dev, output_path='data/sbl-51abbr.masked.upperbound.preexp.dev.json', pre_expand_others=True)
    create_abbreviation_expansion_data(X_test, output_path='data/sbl-51abbr.masked.upperbound.preexp.test.json', pre_expand_others=True)
    # Again, this is the realistic scenario, where each sentence has more than one abbreviation that needs to be identified
    create_abbreviation_expansion_data(X_train, output_path='data/sbl-51abbr.masked.upperbound.train.json', pre_expand_others=False)
    create_abbreviation_expansion_data(X_dev, output_path='data/sbl-51abbr.masked.upperbound.dev.json', pre_expand_others=False)
    create_abbreviation_expansion_data(X_test, output_path='data/sbl-51abbr.masked.upperbound.test.json', pre_expand_others=False)