-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathnormalize_lexicon.py
executable file
·77 lines (67 loc) · 2.55 KB
/
normalize_lexicon.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#!/usr/bin/env python3
import argparse
import sys
import codecs
from collections import defaultdict as dd
import re
import os.path
# Absolute path of the directory containing this script file.
scriptdir = os.path.dirname(os.path.abspath(__file__))
def main(argv=None):
    """Normalize a flat LRLP lexicon into short-phrase entries.

    Each input line is expected to hold three tab-separated fields:
    source, part of speech, and target(s). Source and target text are
    lowercased; the part-of-speech field is passed through unchanged.
    Only entries whose source is a single token (after removing
    parentheticals) are kept. Targets are stripped of parentheticals and
    anything following "e.g.", optionally split on ``;``, ``,``, ``/`` or
    " or ", filtered by word count, and have a leading infinitive "to"
    removed. Every surviving (source, pos, target) triple is written as
    one tab-separated line.

    A one-line summary (bad lines, multi-word sources, over-long targets,
    lines written) is written to stderr at the end.

    Args:
        argv: Optional list of command-line arguments. When ``None``
            (the default) ``sys.argv[1:]`` is parsed, as before.
    """
    parser = argparse.ArgumentParser(
        description="Given LRLP lexicon flat representation attempt to normalize it to short phrase form",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--infile", "-i", nargs='?', type=argparse.FileType('r'), default=sys.stdin, help="input lexicon file")
    parser.add_argument("--outfile", "-o", nargs='?', type=argparse.FileType('w'), default=sys.stdout, help="output instruction file")
    parser.add_argument("--nosplit", "-n", action='store_true', default=False, help="don't split target on commas/semicolons/or/slash")
    parser.add_argument("--targetlimit", "-l", type=int, default=4, help="maximum length of target entry after splitting")
    parser.add_argument("--earlytargetlimit", "-L", type=int, default=20, help="maximum length of target entry before splitting")

    try:
        args = parser.parse_args(argv)
    except IOError as msg:
        parser.error(str(msg))

    infile = args.infile
    outfile = args.outfile
    stderr = sys.stderr

    # Patterns are loop-invariant, so compile them once up front.
    src_paren_re = re.compile(r'\([^\(\)]+\)')
    trg_paren_re = re.compile(r'\(.*\)')  # harsh parenthetical stripping
    eg_re = re.compile(r'e\.g\..*')  # e.g. comes before garbage
    split_re = re.compile(r'[;,/]| or ')
    # Require whitespace (or end of string) after "to" so that hyphenated
    # words such as "to-do" are not mistaken for an infinitive marker.
    initial_to_re = re.compile(r"to(?:\s|$)", flags=re.I)

    bad = 0
    tmword = 0
    smword = 0
    wrote = 0

    try:
        for line in infile:
            try:
                src, pos, trgs = line.strip().split("\t")
            except ValueError:
                # Wrong number of tab-separated fields.
                stderr.write("Bad line: "+line)
                bad += 1
                continue

            src = src.lower()
            trgs = trgs.lower()

            # Reject very long target fields before doing any more work.
            if len(trgs.split()) > args.earlytargetlimit:
                tmword += 1
                continue

            # src singletons only (parentheticals do not count as words)
            src_tokens = src_paren_re.sub('', src).split()
            if len(src_tokens) != 1:
                smword += 1
                continue
            src = src_tokens[0]

            # get rid of parentheticals and split on commas or semicolons
            trgs = trg_paren_re.sub('', trgs)
            trgs = eg_re.sub('', trgs)
            candidates = [trgs] if args.nosplit else split_re.split(trgs)

            for trg in candidates:
                trg = trg.strip()
                # nothing too long
                if len(trg.split()) > args.targetlimit:
                    tmword += 1
                    continue
                # eliminate initial "to" (to walk -> walk); a bare "to"
                # becomes empty here and is dropped by the check below.
                if initial_to_re.match(trg):
                    trg = ' '.join(trg.split()[1:])
                if not trg:
                    continue
                outfile.write("%s\t%s\t%s\n" % (src, pos, trg))
                wrote += 1
    finally:
        # Close (and thereby flush) only the files argparse opened for us;
        # the process-wide standard streams are left alone.
        if infile is not sys.stdin:
            infile.close()
        if outfile is not sys.stdout:
            outfile.close()

    stderr.write("%d bad %d source mword %d target mword %d wrote\n" % (bad, smword, tmword, wrote))
# Run the normalizer only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()