import glob
import shutil

import pie.scripts.tag
import regex as re
from boudams.tagger import BoudamsTagger

from eval.eval import compute_score
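
# Pipeline layout, as used by the paths below ("developpe" = expanded,
# "abreg" = abbreviated, "avec/sans-espace" = with/without spaces):
#   01_htr/output/          raw HTR transcriptions
#   02_segmentation/        boudams word segmentation (models/, input/, output/)
#   03_normalisation/       pie normalisation (models/, input/, output/)
#   04_evaluation/to_eval/  candidate files scored against the ground truth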
# Prepare the HTR output.

### Files transcribed with spaces need no segmentation: strip newlines
### and slashes, then send them straight to evaluation.
devs = glob.glob("01_htr/output/*-developpe_avec-espace_p48.txt")
for file in devs:
    with open(file, 'r') as f:
        cleaned = f.read().replace('\n', '').replace('/', '')
    with open(file.replace("01_htr/output/", "04_evaluation/to_eval/"), 'w') as f:
        f.write(cleaned)
### Files transcribed without spaces get the same cleaning,
### then go through segmentation first.
htrout = glob.glob("01_htr/output/*_sans-espace_p48.txt")
for file in htrout:
    with open(file, 'r') as f:
        cleaned = f.read().replace('\n', '').replace('/', '')
    with open(file.replace("01_htr/output/", "02_segmentation/input/"), 'w') as f:
        f.write(cleaned)
######## Segment every input file with every boudams model
for model_path in glob.glob("02_segmentation/models/*"):
    modelname = model_path.split("/")[-1]
    tagger = BoudamsTagger.load(model_path, device="cpu")
    remove_line = True
    spaces = re.compile(r"\s+")
    apos = re.compile(r"['’]")
    for file in glob.glob("02_segmentation/input/*"):
        out_name = file.replace(".txt", "." + modelname + ".txt") \
                       .replace("02_segmentation/input/", "02_segmentation/output/")
        with open(file, 'r') as f:
            content = f.read()
        if remove_line:
            content = spaces.sub("", content)
        # Record apostrophe positions, strip the apostrophes before
        # segmentation, and reinject them afterwards.
        apos_positions = [i for i in range(len(content)) if content[i] in ["'", "’"]]
        content = apos.sub("", content)
        with open(out_name, "w") as out_io:
            out = ''
            for tokenized_string in tagger.annotate_text(content, batch_size=32):
                out = out + tokenized_string + " "
            # Reinject apostrophes: true_index walks the unsegmented text
            # and only advances on non-space characters of the segmented
            # string, so both indices stay aligned.
            true_index = 0
            for i in range(len(out) + len(apos_positions)):
                if true_index in apos_positions:
                    out = out[:i] + "'" + out[i:]
                    true_index = true_index + 1
                elif out[i] != ' ':
                    true_index = true_index + 1
            out_io.write(out)
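
# Illustrative trace of the reinjection (the segmenter output here is
# hypothetical): content "l'amour" gives apos_positions == [1]; the
# stripped "lamour" may come back as "l amour ", and the walk inserts
# the apostrophe after "l", yielding "l' amour ".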
### The Calfa files are now done: copy them to evaluation as-is.
for file in glob.glob("02_segmentation/output/*-developpe_sans-espace_p48.0*"):
    shutil.copy(file, file.replace("02_segmentation/output/", "04_evaluation/to_eval/"))
### The abbreviated files still need normalisation:
### tokenise on whitespace, one token per line, for pie.
for file in glob.glob("02_segmentation/output/*abreg*"):
    with open(file, 'r') as f:
        tokenised = re.sub(r'\s+', '\n', f.read())
    with open(file.replace("02_segmentation/output/", "03_normalisation/input/"), 'w') as f:
        f.write(tokenised)
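
# e.g. "ihesu crist " becomes "ihesu\ncrist\n" (sample tokens only):
# one token per line, matching the vrt (vertical text) input that the
# pie call below requests.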
files = glob.glob("03_normalisation/input/*.tar.txt")
for model in glob.glob("03_normalisation/models/*"):
    modelspec = [(model, ['normalised'])]
    for file in files:
        # batch_size is kept very small: the default is far too large
        # for a plain CPU.
        pie.scripts.tag.run(modelspec, file, beam_width=10, use_beam=True,
                            keep_boundaries=True, device="cpu",
                            batch_size=2, lower=False, max_sent_len=35, vrt=True)
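    # pie writes its result next to the input file with a "-pie.txt"
    # suffix; move each result to output/, tagging it with the model name.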
for file in glob.glob("03_normalisation/input/*-pie.txt"):
shutil.move(file, file.replace("-pie.txt", ("-pie" + model.split('/')[-1] + ".txt")).replace("input", "output") )
# Parse the resulting TSV files, keeping only the normalised forms
for file in glob.glob("03_normalisation/output/*-pie*.txt"):
    norms = []
    with open(file, 'r') as f:
        f.readline()  # skip the header line
        for line in f:
            if len(line.split('\t')) > 1:
                norms.append(re.sub(r'\s+', '', line.split('\t')[1].rstrip()) + ' ')
    # and write to evaluation
    with open(file.replace("03_normalisation/output/", "04_evaluation/to_eval/"), 'w') as f:
        f.writelines(norms)
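
# Assumed row layout for the TSV parses above and below: a header line,
# then "token<TAB>normalised" rows; e.g. "seignr\tseigneur" yields "seigneur ".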
for file in glob.glob("02_segmentation/train/src_expan/*.tsv"):
norms = []
with open(file, 'r') as f:
f.readline() # remove header
for line in f.readlines():
if len(line.split('\t')) > 1:
norms.append(re.sub('\s+', '', line.split('\t')[1].rstrip()) + ' ')
# and write to output
with open(file.replace(".tsv", ".txt"), 'w') as f:
f.writelines(norms)
# Run the evaluation: score every candidate in to_eval/ against the
# ground truth, then save the results.
gt = "04_evaluation/test_Folio11.txt"
folder = "04_evaluation/to_eval/*"
df = compute_score(gt, folder)
df.to_csv("04_evaluation/eval.csv")