-
Notifications
You must be signed in to change notification settings - Fork 0
/
str_utils.py
66 lines (50 loc) · 1.62 KB
/
str_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import re
def normalize_sent(sent):
    """Replace Penn-Treebank-style escape tokens in a sentence with plain characters.

    The input is converted with ``str()`` first, so non-string values are
    accepted and come back as their (normalized) string form.

    :param sent: sentence (or any object convertible with ``str()``)
    :return: the normalized string
    """
    # Applied in this order. Note that the square-bracket tokens
    # (-LSB-/-RSB-) are mapped to round parentheses, not to '[' and ']'.
    replacements = (
        ('``', '"'),
        ("''", '"'),
        ('-LRB-', '('),
        ('-RRB-', ')'),
        ('-LSB-', '('),
        ('-RSB-', ')'),
    )
    text = str(sent)
    for escaped, plain in replacements:
        text = text.replace(escaped, plain)
    return text
def collapse_role_type(role_type):
    """Collapse a fine-grained role type into its coarse form.

    Collapses role types from 36 to 28 following Bishan Yang 2016.
    Two rules are applied:

    * any type starting with ``'Time-'`` becomes ``'Time'``;
    * compound types such as ``'Beneficiary#Recipient'`` are cut at the
      first ``'#'``, keeping only the part before it.

    :param role_type: role type string
    :return: the collapsed role type string
    """
    if role_type.startswith('Time-'):
        return 'Time'
    # partition() yields the whole string as the head when no '#' is present.
    head, _sep, _tail = role_type.partition('#')
    return head
# def normalize_tok(tok, lower=False, normalize_digits=False):
#
# if lower:
# tok = tok.lower()
# if normalize_digits:
# tok = re.sub(r"\d", "0", tok)
# tok = re.sub(r"^(\d+[,])*(\d+)$", "0", tok)
# return tok
def normalize_tok(tok, lower=True, normalize_digits=True):
    """Normalize a single token.

    :param tok: the token to normalize
    :param lower: when true, run the token through ``normalize_sent``,
        lower-case it and strip surrounding whitespace
    :param normalize_digits: when true, apply the digit/number rewrites
        below (every digit becomes ``0``, numbers and percentages collapse
        to a single ``0``, ``%`` becomes ``percentage``, ``e.g.`` becomes
        ``for example``, and ``0/0`` becomes ``0``)
    :return: the normalized token
    """
    if lower:
        tok = normalize_sent(tok).lower().strip()
    if not normalize_digits:
        return tok

    number = r"\b\d+(?:[\.,']\d+)?\b"
    # Order matters: each rewrite operates on the output of the previous one.
    rewrites = (
        (r"\d", "0"),                      # every digit -> 0
        (r"^(\d+[,])*(\d+)$", "0"),        # whole-token integers like 0,000
        (number, '0'),                     # embedded numbers, incl. decimals
        (number + "%", '0'),               # number followed by a percent sign
        (r"%", 'percentage'),              # any remaining percent sign
        (r"e\.g\.", 'for example'),
        ('0/0', '0'),                      # fractions left over as 0/0
    )
    for pattern, replacement in rewrites:
        tok = re.sub(pattern, replacement, tok)
    return tok
def capitalize_first_char(sent):
    """Upper-case the first character of a string, leaving the rest untouched.

    Unlike ``str.capitalize()``, the remainder of the string keeps its
    original casing.

    :param sent: the string to capitalize
    :return: ``sent`` with its first character upper-cased; empty input is
        returned unchanged
    """
    # Guard against empty input: indexing sent[0] would raise IndexError.
    if not sent:
        return sent
    return str(sent[0]).upper() + sent[1:]
if __name__ == '__main__':
    # Manual smoke check: print the normalized form of a sample token.
    print(normalize_tok('0/0'))