-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpreprocess.py
125 lines (101 loc) · 4.37 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import re
import os
# Read a raw abstract, find all abbreviations and their corresponding full names, and return them as a dict.
def extract_abbreviations_helper(doc):
    """Build an abbreviation -> full-name mapping from a processed document.

    Every entry of ``doc._.abbreviations`` contributes ``str(entry)`` as the
    key and ``entry._.long_form.text`` as the value.  Entries whose full name
    contains an opening or closing parenthesis are treated as invalid and
    skipped.  If the same abbreviation appears more than once, the last valid
    occurrence wins.

    Args:
        doc: object exposing ``_.abbreviations`` (presumably a spaCy ``Doc``
            produced by a pipeline with an abbreviation detector -- confirm
            against the caller).

    Returns:
        dict mapping abbreviation text to its full name.
    """
    mapping = {}
    for short_form in doc._.abbreviations:
        full_name = short_form._.long_form.text
        # A full name that itself contains parentheses is considered invalid.
        if re.search(r'[\(\)]', full_name) is not None:
            continue
        mapping[str(short_form)] = full_name
    return mapping
def extract_abbreviations(data_path, nlp):
    """Extract abbreviations from every ``.txt`` file in a directory.

    Each ``.txt`` file directly inside ``data_path`` is read, passed through
    ``nlp``, and the resulting document is handed to
    ``extract_abbreviations_helper``.

    Args:
        data_path: directory to scan (not recursive).
        nlp: callable that takes the file's text and returns the document
            object expected by ``extract_abbreviations_helper``.

    Returns:
        dict mapping each file name without its ``.txt`` suffix to that
        file's abbreviation -> full-name dict.
    """
    suffix = ".txt"
    abrv = {}
    for file in os.listdir(data_path):
        if not file.endswith(suffix):
            continue
        file_path = os.path.join(data_path, file)
        with open(file_path, 'r') as f:
            text = f.read()
        doc = nlp(text)
        # Slice the suffix off exactly.  str.rstrip('.txt') would strip any
        # trailing '.', 't' or 'x' characters (e.g. "abstract.txt" ->
        # "abstrac"), corrupting the file id.
        file_id = file[:-len(suffix)]
        abrv[file_id] = extract_abbreviations_helper(doc)
    return abrv
# read in the annotation file and extract the intervention snippets.
def extract_snippets_helper(read_path):
    """Return the annotation lines of a file that mention ``Intervention``.

    The file at ``read_path`` is read line by line; every line containing the
    substring ``"Intervention"`` is kept, with leading and trailing
    whitespace stripped.

    Args:
        read_path: path of the annotation file to read.

    Returns:
        list of matching lines, in file order.
    """
    with open(read_path) as f_read:
        return [row.strip() for row in f_read if "Intervention" in row]
def extract_snippets(data_path):
    """Collect the intervention snippets of every ``.ann`` file in a directory.

    Args:
        data_path: directory to scan (not recursive).

    Returns:
        dict mapping each file name without its ``.ann`` suffix to the list
        returned by ``extract_snippets_helper`` for that file.
    """
    suffix = ".ann"
    inter = {}
    for file in os.listdir(data_path):
        if not file.endswith(suffix):
            continue
        file_path = os.path.join(data_path, file)
        # Slice the suffix off exactly.  str.rstrip(".ann") would strip any
        # trailing '.', 'a' or 'n' characters (e.g. "aspirin.ann" ->
        # "aspiri"), corrupting the file id.
        file_id = file[:-len(suffix)]
        inter[file_id] = extract_snippets_helper(file_path)
    return inter
# For each file, expand the abbreviations in the intervention snippets to their full names
# and remove parenthesised text (including the parentheses themselves).
# Returns a list of snippets; each snippet's attributes are stored in a dict.
def remove_parenthesis(file_id, abrv, inter):
    """Expand abbreviations and drop parenthesised text in intervention snippets.

    Each annotation line is split on tabs into an id, a second field, and the
    snippet text; the second field is split on single spaces and its second
    and third tokens are stored as the start and end positions.  For the
    snippet text, every ``(ABBR)`` occurrence is removed, every remaining
    occurrence of an abbreviation is replaced by its full name, any remaining
    parenthesised content is removed, and runs of spaces are collapsed to a
    single space.  A message is printed whenever the text changed.

    Args:
        file_id: identifier stored on every returned snippet.
        abrv: dict mapping abbreviation -> full name for this file.
        inter: iterable of annotation lines for this file.

    Returns:
        list of dicts with keys ``file_id``, ``snippet_id``, ``start_pos``,
        ``end_pos``, ``raw_text`` and ``processed`` (positions are kept as
        strings).

    Raises:
        IndexError: if a line does not have the expected tab/space layout.
    """
    snippets = []
    for item in inter:
        fields = item.split('\t')
        span = fields[1].split(' ')
        snippet = {
            'file_id': file_id,
            'snippet_id': fields[0].strip(),
            'start_pos': span[1].strip(),
            'end_pos': span[2].strip(),
            'raw_text': fields[2].strip(),
        }
        raw = snippet['raw_text']
        processed = raw
        for key, value in abrv.items():
            # Remove the abbreviation when it appears alone inside
            # parentheses.  The abbreviation is escaped because short forms
            # such as "Ca2+" contain regex metacharacters that would
            # otherwise change the pattern or raise re.error.
            processed = re.sub(rf'\(\s*{re.escape(key)}\s*\)', '', processed)
            # Replace the abbreviation with its full name.  A literal
            # replacement is used so neither the abbreviation nor the full
            # name is interpreted as regex / replacement-template syntax.
            # NOTE(review): this is a plain substring match, so an
            # abbreviation embedded in a longer word is expanded as well.
            processed = processed.replace(key, value)
        # Remove any remaining parenthesised content, parentheses included.
        processed = re.sub(r'\(.*?\)', '', processed)
        # Collapse runs of spaces left behind by the removals.
        processed = re.sub(r'( )+', ' ', processed)
        snippet['processed'] = processed
        if processed != raw:
            print(f'text changed:\n before:\t{raw}\nafter:\t{processed}')
        snippets.append(snippet)
    return snippets
# helper normalize the text
def normalize_text(text):
    """Normalize a snippet into a lower-cased, single-spaced string.

    The rewrite rules are applied in order: parentheses are dropped, runs of
    spaces are collapsed, hyphens become spaces, a period preceded by
    whitespace is replaced by a space, and ``+`` becomes `` plus ``.
    Non-breaking spaces are then turned into regular spaces, all whitespace
    runs are collapsed to one space, and the result is lower-cased and
    stripped.

    Args:
        text: the string to normalize.

    Returns:
        the normalized string.
    """
    # Ordered (pattern, replacement) rules; later rules see the output of
    # earlier ones, so the order must be kept.
    rewrite_rules = (
        (r'[\(\)]', ''),        # stray parentheses
        (r'( )+', ' '),         # runs of spaces
        (r'-', ' '),            # hyphens
        (r'\s\.', ' '),         # period preceded by whitespace
        (r'\+', ' plus '),      # plus sign spelled out
    )
    cleaned = text
    for pattern, replacement in rewrite_rules:
        cleaned = re.sub(pattern, replacement, cleaned)
    # Non-breaking spaces become ordinary spaces before the final collapse.
    cleaned = cleaned.replace(u'\u00a0', ' ')
    cleaned = re.sub(r'\s+', ' ', cleaned)
    return cleaned.lower().strip()
# preprocess
def run(data_file, nlp):
    """Run the full preprocessing pipeline over a data directory.

    Steps, each announced with a printed banner:
      1. extract abbreviations from the directory via ``extract_abbreviations``;
      2. extract intervention snippets via ``extract_snippets``;
      3. expand abbreviations / remove parenthesised text per file via
         ``remove_parenthesis``;
      4. normalize every snippet's ``processed`` text via ``normalize_text``.

    Args:
        data_file: directory passed to the extraction functions.
        nlp: callable forwarded to ``extract_abbreviations``.

    Returns:
        list of snippet dicts as produced by ``remove_parenthesis``, with
        ``processed`` replaced by its normalized form.

    Raises:
        KeyError: if a file id with snippets has no abbreviation entry.
    """
    # extract abbreviation
    print('-' * 25 + 'extracting abbreviations' + '-' * 25)
    abrv = extract_abbreviations(data_file, nlp)
    # extract intervention snippets
    print('-' * 25 + 'extracting intervention snippets' + '-' * 25)
    inter = extract_snippets(data_file)
    # remove parenthesis
    snippets = []
    print('-' * 25 + 'removing parenthesis' + '-' * 25)
    # Reuse the file ids already produced by extract_snippets instead of
    # re-listing the directory and re-deriving them with rstrip('.ann'):
    # rstrip strips a set of trailing characters, not a suffix, so it can
    # mangle ids, and a second derivation risks disagreeing with the keys.
    for file_id, file_snippets in inter.items():
        snippets.extend(remove_parenthesis(file_id, abrv[file_id], file_snippets))
    # text normalization
    print('-' * 25 + 'normalizing text' + '-' * 25)
    for snippet in snippets:
        snippet['processed'] = normalize_text(snippet['processed'])
    return snippets