-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathabbr_preprocess_german.py
167 lines (134 loc) · 8.74 KB
/
abbr_preprocess_german.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
from dataclasses import dataclass
from typing import List, Dict
import json
from collections import defaultdict
from sklearn.model_selection import train_test_split
from utils.abbr_utils import AnnotatedApisSentence, read_apis_conll, get_masked_text
def compare_data_partitions(train_stats: Dict, test_stats: Dict, label: str):
    """Print how many abbreviation candidates occur in test but not in train.

    Args:
        train_stats (Dict): Stats dict holding an 'all_abbr_candidates' iterable for the training split
        test_stats (Dict): Stats dict holding an 'all_abbr_candidates' iterable for the test split
        label (str): Text inserted into the printed message to identify the comparison
    """
    # The set difference is case-sensitive; lowercasing happens only afterwards,
    # so it merges case variants among the unseen candidates but does not match
    # them against differently-cased training candidates.
    test_only = set(test_stats['all_abbr_candidates']) - set(train_stats['all_abbr_candidates'])
    unseen_lowercased = {candidate.lower() for candidate in test_only}
    print(f"\nUnique Abbreviation Candidates in {label} (UNSEEN) = {len(unseen_lowercased)}")
def get_dataset_stats(dataset: List[AnnotatedApisSentence], verbose: bool, use_naive_tokens: bool) -> Dict:
    """Print corpus-level counts for ``dataset`` and return its abbreviation statistics.

    Args:
        dataset (List[AnnotatedApisSentence]): Sentences to summarise; ``get_mapping()`` is called on each one before its ``mapping`` is read
        verbose (bool): If True, also print every sentence's labeled sequences and mapping, followed by the collected abbreviation tables
        use_naive_tokens (bool): Forwarded unchanged to ``get_labeled_sequences``

    Returns:
        Dict: Three entries:
            "exp2abbr_mapping": despite the key name, maps each abbreviation to the set of expansions seen for it
            "exp_abbr_pairwise_mapping_counts": maps each (abbreviation, expansion) pair to its number of occurrences
            "all_abbr_candidates": every abbreviation occurrence, duplicates included
    """
    expansions_per_abbr = defaultdict(set)
    pair_frequencies = defaultdict(int)

    # Running totals over the whole dataset
    sentence_total = 0
    token_total = 0
    seen_token_texts = []
    abbr_candidates = []
    abbr_expansions = []

    for sentence in dataset:
        sentence_total += 1
        token_total += len(sentence.tokens)
        seen_token_texts.extend(token.text for token in sentence.tokens)

        sentence.get_mapping()
        # Each mapping item unpacks into three values; the first one is not used here
        for _position, abbr, expansion in sentence.mapping:
            expansions_per_abbr[abbr].add(expansion)
            abbr_expansions.append(expansion)
            abbr_candidates.append(abbr)
            pair_frequencies[(abbr, expansion)] += 1

        # The sequences are requested for every sentence, even when nothing is printed
        abbr_seq, expanded_seq, label_seq = sentence.get_labeled_sequences(use_naive_tokens=use_naive_tokens)
        if verbose:
            for sequence in (abbr_seq, expanded_seq, label_seq):
                print(" ".join(sequence))
            print(sentence.mapping)
            print('-------')

    if verbose:
        # Abbreviations with the most distinct expansions first
        by_ambiguity = sorted(expansions_per_abbr.items(), key=lambda entry: len(entry[1]), reverse=True)
        for abbr, expansions in by_ambiguity:
            print(f"{abbr} --> {expansions}")
        print("\n\n")
        # Most frequent (abbreviation, expansion) pairs first
        by_frequency = sorted(pair_frequencies.items(), key=lambda entry: entry[1], reverse=True)
        for pair, count in by_frequency:
            print(f"{pair} --> {count}")

    print(
        f"\tSentences = {sentence_total}"
        f"\n\tTokens = {token_total}"
        f"\n\tTypes = {len(set(seen_token_texts))}"
        f"\n\tAbbreviations = {len(abbr_candidates)} (unique = {len(set(abbr_candidates))})"
        f"\n\tExpansions = {len(abbr_expansions)} (unique = {len(set(abbr_expansions))})"
    )
    print(f"\tAbbr->Expansion Pairs = {len(pair_frequencies)}")

    return {
        "exp2abbr_mapping": expansions_per_abbr,
        "exp_abbr_pairwise_mapping_counts": pair_frequencies,
        "all_abbr_candidates": abbr_candidates
    }
def save_dataset_split_ids(filepath: str, dataset: List[AnnotatedApisSentence]) -> None:
    """Save the sentence ids of a data split, grouped by document id, as indented JSON.

    Args:
        filepath (str): Destination file; it is overwritten if it already exists
        dataset (List[AnnotatedApisSentence]): Items of the split; only ``doc_id`` and ``sent_id`` are read from each one
    """
    sent_ids_by_doc = defaultdict(list)
    for sent in dataset:
        sent_ids_by_doc[sent.doc_id].append(sent.sent_id)
    # Explicit encoding, like the other writers in this file, so the result
    # does not depend on the platform's locale default.
    with open(filepath, "w", encoding='utf-8') as fout:
        json.dump(sent_ids_by_doc, fout, indent=4)
def create_token_classification_data(data: List[AnnotatedApisSentence], use_naive_tokens: bool, output_path: str):
    """Write the STEP 1 token-classification dataset as one JSON object per token.

    Every line of the output file is a JSON object with the keys "token",
    "gold_label", "document_id" and "sent_id", so the file can be loaded later
    (e.g. as a HuggingFace Dataset) for batched experiments.

    Args:
        data (List[AnnotatedApisSentence]): The sentences to export
        use_naive_tokens (bool): Forwarded to ``get_labeled_sequences``; selects between the CoNLL tokens and 'naive' space-based splitting
        output_path (str): Destination file; it is overwritten if it already exists
    """
    with open(output_path, "w", encoding='utf-8') as fout:
        for sentence in data:
            # The expanded sequence (second element) is not needed for this task
            tokens, _expanded, labels = sentence.get_labeled_sequences(use_naive_tokens=use_naive_tokens)
            for token, label in zip(tokens, labels):
                record = {
                    "token": token,
                    "gold_label": label,
                    "document_id": sentence.doc_id,
                    "sent_id": sentence.sent_id,
                }
                fout.write(f"{json.dumps(record)}\n")
def save_document_data(data: List[AnnotatedApisSentence], output_path: str):
    """Write one JSON object per item of ``data`` with its text, tokens, labels and mapping.

    Both tokenizations are exported: the sequences obtained with
    ``use_naive_tokens=False`` go into the ``*_tokenized`` / ``token_labels``
    fields, while the ``use_naive_tokens=True`` sequences are space-joined into
    ``abbreviated_text`` / ``expanded_text`` and provide ``token_labels_naive``.

    Args:
        data (List[AnnotatedApisSentence]): The items to export
        output_path (str): Destination file; it is overwritten if it already exists
    """
    with open(output_path, "w", encoding='utf-8') as fout:
        for item in data:
            gold_abbr, gold_expanded, gold_labels = item.get_labeled_sequences(use_naive_tokens=False)
            naive_abbr, naive_expanded, naive_labels = item.get_labeled_sequences(use_naive_tokens=True)
            record = {
                'document_id': item.doc_id,
                'original_text': item.source_text,
                'abbreviated_text': " ".join(naive_abbr),
                'expanded_text': " ".join(naive_expanded),
                # Read as-is: this function does not call get_mapping() itself
                # NOTE(review): presumably populated by an earlier get_mapping() call - confirm
                'mapping': item.mapping,
                'token_objects': [token.asdict() for token in item.tokens],
                'abbreviated_tokenized': gold_abbr,
                'expanded_tokenized': gold_expanded,
                'token_labels': gold_labels,
                'token_labels_naive': naive_labels
            }
            fout.write(f"{json.dumps(record)}\n")
def save_text_data(data: List[AnnotatedApisSentence], output_path: str):
    """Write the naive abbreviated token sequence of each item as one space-joined line.

    Args:
        data (List[AnnotatedApisSentence]): The items to export
        output_path (str): Destination file; it is overwritten if it already exists
    """
    with open(output_path, "w", encoding='utf-8') as fout:
        for item in data:
            # Only the first of the three returned sequences is written out
            abbreviated, _expanded, _labels = item.get_labeled_sequences(use_naive_tokens=True)
            line = " ".join(abbreviated)
            fout.write(f"{line}\n")
if __name__ == '__main__':
    # Script entry point: split the German APIS corpus into train/test, print
    # statistics for both partitions and write the files used by Experiment 1.

    # Local import: only the script entry point needs filesystem helpers.
    import os

    INPUTS_PATH = "data/corpora/german/"
    OUTPUTS_PATH = "data/outputs/german/"

    # Every writer below opens its target with mode "w", which fails when the
    # directory is missing, so make sure it exists before writing anything.
    os.makedirs(OUTPUTS_PATH, exist_ok=True)

    # os.path.join avoids the doubled separator that "{PATH}/file" produced,
    # since both base paths already end with a slash.
    apis_tokens, apis_sents, apis_docs_ids = read_apis_conll(
        os.path.join(INPUTS_PATH, "t_3_2__apis_tokenized_annotated_cleaned_transformed.tsv")
    )

    # Fixed random_state keeps the 80/20 split reproducible across runs
    X_train, X_test = train_test_split(apis_sents, test_size=0.2, random_state=4239)
    print(len(X_train), len(X_test))

    save_dataset_split_ids(os.path.join(OUTPUTS_PATH, "apis-de-abbr.ids.train"), X_train)
    save_dataset_split_ids(os.path.join(OUTPUTS_PATH, "apis-de-abbr.ids.test"), X_test)

    # Compute Statistics for each given portion of the dataset
    use_naive_tokens = False  # We want to report the GOLD statistics as they are manually curated on CoNLL Format
    print(f"\nComputing stats for {len(X_train)} examples in TRAIN")
    train_stats = get_dataset_stats(X_train, verbose=False, use_naive_tokens=use_naive_tokens)
    print(f"\nComputing stats for {len(X_test)} examples in TEST")
    test_stats = get_dataset_stats(X_test, verbose=False, use_naive_tokens=use_naive_tokens)

    compare_data_partitions(train_stats, test_stats, label="TRAIN vs TEST")

    # DATA FOR EXPERIMENT 1: A SINGLE TOKEN BINARY CLASSIFIER (BERT) for Abbreviation or No_Abbreviation
    use_naive_tokens = True
    # Train Set for STEP 1
    create_token_classification_data(X_train, use_naive_tokens, output_path=os.path.join(OUTPUTS_PATH, 'apis-de-abbr.tok.train.json'))
    save_document_data(X_train, output_path=os.path.join(OUTPUTS_PATH, 'apis-de-abbr.sentences.train.json'))
    # Test Set for STEP 1
    create_token_classification_data(X_test, use_naive_tokens, output_path=os.path.join(OUTPUTS_PATH, 'apis-de-abbr.tok.test.json'))
    save_document_data(X_test, output_path=os.path.join(OUTPUTS_PATH, 'apis-de-abbr.sentences.test.json'))

    # NOTE(review): the disabled calls below use names that are not defined in this file
    # (create_abbreviation_expansion_data, X_dev) and 'sbl-51abbr' output paths - adapt before re-enabling.
    # # EXPERIMENT 2: Save the Train/Dev/Test partitions with the documents. This will be used later to [MASK] them and prepare data (from STEP 1) for expansion prediction
    # # The files say 'upperbound' because we know all abbreviation candidates inside the sentences are GOLD (which we won't know in the real-world scenario).
    # # PreExpanded Dataset means that only one abbreviation at a time is treated. Assuming everything else has been correctly expanded already (meaning that the abbreviation expanded has more "explicit" context)
    # create_abbreviation_expansion_data(X_train, output_path='data/sbl-51abbr.masked.upperbound.preexp.train.json', pre_expand_others=True)
    # create_abbreviation_expansion_data(X_dev, output_path='data/sbl-51abbr.masked.upperbound.preexp.dev.json', pre_expand_others=True)
    # create_abbreviation_expansion_data(X_test, output_path='data/sbl-51abbr.masked.upperbound.preexp.test.json', pre_expand_others=True)
    # # Again, this is the realistic scenario, where each sentence has more than one abbreviation that needs to be identified
    # create_abbreviation_expansion_data(X_train, output_path='data/sbl-51abbr.masked.upperbound.train.json', pre_expand_others=False)
    # create_abbreviation_expansion_data(X_dev, output_path='data/sbl-51abbr.masked.upperbound.dev.json', pre_expand_others=False)
    # create_abbreviation_expansion_data(X_test, output_path='data/sbl-51abbr.masked.upperbound.test.json', pre_expand_others=False)