forked from max-lancaster/namsel
-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathline_breaker.py
579 lines (468 loc) · 23.7 KB
/
line_breaker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
# encoding: utf-8
'''Line breaking'''
from numpy import array, float64, argmax, argmin, uint8, ones, floor, mean, std, where, argsort
import cv2 as cv
from utils import check_for_overlap
from fast_utils import ftrim, fadd_padding
import sys
from bisect import bisect, bisect_right
from feature_extraction import normalize_and_extract_features
from classify import load_cls, label_chars
cls = load_cls('logistic-cls')
class LineCut(object):
'''Line Cutting object - breaks lines in a page where lines are separated
by empty whitespace
Parameters:
--------------------
shapes: page_element object, (see page_elements.py)
thresh_scale: float, default=.9995
A threshold value for determining the breakline in the event that
there is black pixel noise between lines. Should be set high to avoid
setting line breaks through characters themselves.
Attributes:
-----------
lines_chars: list of lists, length=number of lines on page. Each sub-list
contains the indices for the bounding boxes/contours assigned to
its corresponding line.
line_indices: list of int, indices of breaklines with respect to page_array
baselines: list of int, the index of the baseline for each line where
baseline here is usually a line that goes through all the thick "head"
(Tibetan: mgo) parts found on most Tibetan letters
Methods:
--------
get_box: return the bounding box for a given index
get_contour: return the contour for a given index
The get_box, get_contour methods are mostly here for API compatibility with
LineCluster.
'''
def __init__(self, shapes, thresh_scale=.9995):
self.shapes = shapes
self.baselines = []
### Inflate chars to avoid vowels breaking off their lines
inflated = shapes.img_arr.copy()
INFL_ITER = shapes.conf['line_cut_inflation']
inflated = cv.erode(inflated, None, iterations=INFL_ITER)
self.vsum = inflated.sum(axis=1)
### Determine line indices
vsum_max = self.vsum.max()
threshold = vsum_max * thresh_scale
self.line_indices = []
for i, s in enumerate(self.vsum):
if s < threshold and self.vsum[i - 1] >= threshold:
self.line_indices.append(i - 1)
li = self.line_indices
self.k = len(self.line_indices)
### Calculate heights of lines. Split out lines that are too tall
diffs = [li[i+1] - li[i] - len(where(self.vsum[li[i]:li[i+1]] == vsum_max)[0]) for i in range(len(li[:-2]))]
too_tall = []
for i in range(len(diffs)):
otherdiffs = diffs[:i] + diffs[i+1:]
odmean = mean(otherdiffs)
odstd = std(otherdiffs)
if diffs[i] > 2*odstd + odmean:
#TODO: use better criteria for above condition
too_tall.append(i)
if shapes.conf['stop_line_cut']:
if len(too_tall) > 0:
return
#
### Separate lines deemed too tall
# added_lines = 0
# for i in too_tall:
# i += added_lines
# padding = floor(diffs[i]*.25)
# top_bound = self.line_indices[i] + padding
# bottom_bound = self.line_indices[i+1] - padding
# # print top_bound, bottom_bound
# extra_line = top_bound + self.shapes.img_arr[top_bound:bottom_bound].sum(axis=1).argmax()
# insert_point = bisect(self.line_indices, extra_line)
# self.line_indices.insert(insert_point, extra_line)
# added_lines += 1
# li = self.line_indices
# diffs = [li[i+1] - li[i] for i in range(len(li[:-2]))]
############## Draw line breaks on original page
# for l in self.line_indices:
# self.shapes.img_arr[l] = 0
# import Image
# Image.fromarray(self.shapes.img_arr*255).show()
##############
self.assign_char_indices()
# self._remove_small_noise()
def get_baselines(self):
if not self.baselines:
for i, k in enumerate(self.line_indices[:-1]):
vsum_vals = self.vsum[k:self.line_indices[i+1]]
if vsum_vals.any():
self.baselines.append(k +\
argmin(vsum_vals))
self.baselines.append(self.line_indices[-1]+\
argmin(self.vsum[self.line_indices[-1]:]))
return self.baselines
def assign_char_indices(self):
'''
Notes:
------
Complexity:
nlogn for sorting
linear for zip and index extraction
bisect is log n
'''
char_tops = zip(self.shapes.get_tops(), self.shapes.get_indices())
char_tops.sort(key=lambda x: x[0])
sorted_indices = [i[1] for i in char_tops]
_line_insert_indxs = []
_line_insert_indxs.extend([bisect(char_tops, (i - 1,))
for i in self.line_indices])
self.lines_chars = []
if not _line_insert_indxs: raise
for i, l in enumerate(_line_insert_indxs[:-1]):
self.lines_chars.append(sorted_indices[l:_line_insert_indxs[i+1]])
self.lines_chars.append(sorted_indices[_line_insert_indxs[-1]:])
self.k = len(self.line_indices)
## Small contour insertion
cctops = [self.shapes.get_boxes()[i][1] for i in self.shapes.small_contour_indices]
char_tops = zip(cctops, self.shapes.small_contour_indices)
char_tops.sort(key=lambda x: x[0])
sorted_indices = [i[1] for i in char_tops]
_line_insert_indxs = []
_line_insert_indxs.extend([bisect_right(char_tops, (i - 1,))
for i in self.line_indices])
self.small_cc_lines_chars = []
if not _line_insert_indxs: sys.exit()
for i, l in enumerate(_line_insert_indxs[:-1]):
self.small_cc_lines_chars.append(sorted_indices[l:_line_insert_indxs[i+1]])
self.small_cc_lines_chars.append(sorted_indices[_line_insert_indxs[-1]:])
self.small_cc_lines_chars = [self.small_cc_lines_chars[i] for i in range(len(self.lines_chars)) if self.lines_chars[i]]
if self.shapes.detect_o:
cctops = [self.shapes.get_boxes()[i][1] for i in self.shapes.naros]
char_tops = zip(cctops, self.shapes.naros)
char_tops.sort(key=lambda x: x[0])
sorted_indices = [i[1] for i in char_tops]
_line_insert_indxs = []
_line_insert_indxs.extend([bisect_right(char_tops, (i - 1,))
for i in self.line_indices])
if not _line_insert_indxs: sys.exit()
self.line_naros = []
for i, l in enumerate(_line_insert_indxs[:-1]):
self.line_naros.append(sorted_indices[l:_line_insert_indxs[i+1]])
self.line_naros.append(sorted_indices[_line_insert_indxs[-1]:])
self.line_naros = [self.line_naros[i] for i in range(len(self.lines_chars)) if self.lines_chars[i]]
if self.shapes.low_ink:
cctops = [lib[1] for lib in self.shapes.low_ink_boxes]
char_tops = zip(cctops, self.shapes.low_ink_boxes)
char_tops.sort(key=lambda x: x[0])
sorted_indices = [i[1] for i in char_tops]
_line_insert_indxs = []
_line_insert_indxs.extend([bisect_right(char_tops, (i - 1,))
for i in self.line_indices])
self.low_ink_boxes = []
if not _line_insert_indxs: sys.exit()
for i, l in enumerate(_line_insert_indxs[:-1]):
self.low_ink_boxes.append(sorted_indices[l:_line_insert_indxs[i+1]])
self.low_ink_boxes.append(sorted_indices[_line_insert_indxs[-1]:])
self.low_ink_boxes= [self.low_ink_boxes[i] for i in range(len(self.lines_chars)) if self.lines_chars[i]]
#
# for c in small_cc:
# t = self.shapes.get_boxes()[c][1]
# for i, line in enumerate(self.lines_chars):
# if i+1 < len(self.lines_chars):
# if self.lines_chars[i+1]:
# next_topmost = self.shapes.get_boxes()[self.lines_chars[i+1][0]][1]
# if topmost < t and t < next_topmost:
# self.small_cc_lines_chars[i].append(c)
# break
# topmost = next_topmost
#
def _remove_small_noise(self):
global_tsek_mean = self.shapes.tsek_mean
global_tsek_std = self.shapes.tsek_std
global_char_mean = self.shapes.char_mean
global_char_std = self.shapes.char_std
for l in self.lines_chars:
widths = [self.shapes.get_boxes()[i][2] for i in l]
char_mean, char_std, tsek_mean, tsek_std = self.shapes.char_gaussians(widths)
for i in l:
if self.shapes.get_boxes()[i][2] < (global_tsek_mean -
1 * global_tsek_std) and not char_mean < global_char_mean -.5* global_char_std:
l.remove(i)
# elif len(l) < 4 and global_tsek_mean - global_tsek_std <= self.shapes.get_boxes()[i][2] <= global_tsek_mean + global_tsek_std:
# l.remove(i)
# These two functions are for API compatability with LineCluster
def get_box(self, ind):
return self.shapes.get_boxes()[ind]
def get_contour(self, ind):
return self.shapes.contours[ind]
class LineCluster(object):
'''Line Cluster object - breaks lines by clustering according to tops of
bounding boxes. Useful in cases where it drawing a straight line between
page lines is difficult. Requires you know how many lines are on the page
beforehand.
Parameters:
--------------------
shapes: page_element object, (see page_elements.py)
k: int, required
number of lines on the page
Attributes:
-----------
lines_chars: list of lists, length=number of lines on page. Each sub-list
contains the indices for the bounding boxes/contours assigned to
its corresponding line.
line_indices: list of int, indices of breaklines with respect to page_array
baselines: list of int, the index of the baseline for each line where
baseline here is usually a line that goes through all the thick "head"
(Tibetan: mgo) parts found on most Tibetan letters
Methods:
--------
get_box: return the bounding box for a given index
get_contour: return the contour for a given index
The get_box, get_contour methods are mostly here for API compatibility with
LineCluster.
Notes:
------
This code is messy and in progress. Some of the logic in the end contains
hardcoded values particular to the Nyingma Gyudbum which is obviously
useless for general cases.
'''
def __init__(self, shapes, k):
from sklearn.cluster import KMeans
self.shapes = shapes
self.k = k
self.page_array = shapes.img_arr
if shapes.conf['line_cluster_pos'] == 'top':
tops = array(shapes.get_tops(), dtype=float64)
elif shapes.conf['line_cluster_pos'] == 'center':
tops = array(
[t[1] + .5*shapes.char_mean for t in shapes.get_boxes() if t[3] > 2* shapes.tsek_mean],
dtype=float64
)
else:
raise ValueError, "The line_cluster_pos argument must be either 'top' or 'center'"
tops.shape = (len(tops), 1)
kmeans = KMeans(n_clusters=k)
# print tops
kmeans.fit(tops)
##################
######## mark cluster centroids on original image and show them
# img_arr = shapes.img_arr.copy()
# for centroid in kmeans.cluster_centers_:
## print centroid[0]
# img_arr[centroid[0],:] = 0
#
# import Image
# Image.fromarray(img_arr*255).show()
#######################3
lines = [[] for i in range(k)]
ind = shapes.get_indices()
### Assign char pointers (ind) to the appropriate line ###
# [lines[kmeans.labels_[i]].append(ind[i]) for i in range(len(ind))]
[lines[kmeans.predict(shapes.get_boxes()[ind[i]][1])[0]].append(ind[i]) for i in range(len(ind))]
lines = [l for l in lines if l]
self.k = len(lines)
boxes = shapes.get_boxes()
### Sort indices so they are in order from top to bottom using y from the first box in each line
sort_inx = list(argsort([boxes[line[0]][1] for line in lines]))
lines.sort(key=lambda line: boxes[line[0]][1])
### Get breaklines for splitting up lines
### Uses the topmost box in each line cluster to determine breakline
try:
topmosts = [min([boxes[i][1] for i in line]) for line in lines]
except ValueError:
print 'failed to get topmosts...'
raise
vsums = self.page_array.sum(axis=1)
breaklines = []
delta = 25
for c in topmosts:
if c - delta < 0:
lower = 0
else:
lower = c-delta
e = argmax(vsums[lower:c+delta])
c = c - delta + e
if c < 0:
c = 0
breaklines.append(c)
breaklines.append(self.page_array.shape[0])
self.baselines = []
for i, br in enumerate(breaklines[:-1]):
try:
baseline_area = vsums[br:breaklines[i+1]]
if baseline_area.any():
self.baselines.append(br + argmin(baseline_area))
else:
print i
print 'No baseline info'
except ValueError:
print 'ValueError. exiting...HERE'
import traceback;traceback.print_exc()
raise
final_ind = dict((i, []) for i in range(len(lines)))
self.new_contours = {}
for j, br in enumerate(breaklines[1:-1]):
topcount = 0
bottomcount = 0
for i in lines[j]:
# if char extends into next line, break it
# 253 is roughly global line height avg + 1 std
# The following lines says that a box/char must be extending over
# breakline by a non trivial amount eg. 30 px and must itself
# be a tall-ish box (roughly the height of average line) in order
# for it to be broken.
# if (bounding[i][1] + bounding[i][3]) - br >= 30 and bounding[i][3] > 205:
if (boxes[i][1] + boxes[i][3]) - br >= 30 and \
(boxes[i][1] + boxes[i][3]) - topmosts[j] > self.shapes.char_mean*2.85:
chars = ones((boxes[i][3]+2, boxes[i][2]+2), dtype=uint8)
contours = shapes.contours
cv.drawContours(chars, [contours[i]], -1,0, \
thickness = -1, offset=(-boxes[i][0]+1,-boxes[i][1]+1))
cv.dilate(chars, None, chars)
y_offset = boxes[i][1]
new_br = br - y_offset
prd_cut = []
### Iterate through potential cut-points and
### and cut where top half has the highest probability
### that is not a tsek
# print 'bottom bound cut point', int(.75*shapes.tsek_mean)
for delta in range(-3, int(.75*shapes.tsek_mean), 1):
# for delta in range(-3, 100, 1):
cut_point = new_br + delta
# chars[cut_point, :] = 0
# import Image
# Image.fromarray(chars*255).show()
tchr = chars[:cut_point,:]
tchr = ftrim(tchr)
if not tchr.any():
continue
tchr = normalize_and_extract_features(tchr)
probs = cls.predict_proba(tchr)
max_prob_ind = argmax(probs)
chr = label_chars[max_prob_ind]
prd_cut.append((probs[0,max_prob_ind], chr, cut_point))
prd_cut = [q for q in prd_cut if q[1] != u'་']
try:
cut_point = max(prd_cut)[-1]
except:
print 'No max prob for vertical char break, using default breakline. Usually this means the top half of the attempted segmentation looks like a tsek blob'
cut_point = br-boxes[i][1]
#######FOLLWNG NOT WORKING ATTEMPTS TO GET A BETTER BREAK LINE
# br2 = br-bounding[i][1]
#
# csum = chars.sum(axis=1)
# bzone = csum[br2-25:br2+40]
# if bzone.any():
# br2 = np.argmax(bzone) + (br-25)
## print br, 'br'
# chars = chars*255
# nbr = br
# cv.line(chars, (0, br2), (chars.shape[1], br2), 0)
# Image.fromarray(chars).save('/tmp/outt.tiff')
# sys.exit()
#############
tarr = chars[:cut_point,:]
tarr, top_offset = ftrim(tarr, new_offset=True)
tarr = fadd_padding(tarr, 3)
barr = chars[cut_point:,:]
barr = ftrim(barr, sides='brt')
barr = fadd_padding(barr, 3)
c1, h = cv.findContours(image=tarr, mode=cv.RETR_LIST, method=cv.CHAIN_APPROX_SIMPLE, offset=(boxes[i][0]+top_offset['left'],boxes[i][1]))
c1 = c1[argmax([len(t) for t in c1])] # use the most complex contour
bnc1 = cv.boundingRect(c1)
c2, h = cv.findContours(barr, mode=cv.RETR_LIST,
method=cv.CHAIN_APPROX_SIMPLE,
offset=(boxes[i][0]-3,boxes[i][1]+cut_point-3))
c2 = c2[argmax([len(t) for t in c2])]
bnc2 = cv.boundingRect(c2)
topbox_name = 't%d_%d' % (j, topcount)
final_ind[j].append(topbox_name)
self.new_contours[topbox_name] = (bnc1, c1)
topcount += 1
if bnc2[-1] > 8: #only add bottom contour if not trivially small
bottombox_name = 'b%d_%d' % (j, bottomcount)
final_ind[j+1].append(bottombox_name)
self.new_contours[bottombox_name] = (bnc2, c2)
bottomcount += 1
else:
final_ind[j].append(i)
# Don't forget to include the last line
map(final_ind[len(lines)-1].append, lines[len(lines)-1])
self.lines_chars = final_ind
cctops = [self.shapes.get_boxes()[i][1] for i in self.shapes.small_contour_indices]
char_tops = zip(cctops, self.shapes.small_contour_indices)
char_tops.sort(key=lambda x: x[0])
sorted_indices = [i[1] for i in char_tops]
_line_insert_indxs = []
_line_insert_indxs.extend([bisect_right(char_tops, (i - 1,))
for i in breaklines])
self.small_cc_lines_chars = []
if not _line_insert_indxs: sys.exit()
for i, l in enumerate(_line_insert_indxs[:-1]):
self.small_cc_lines_chars.append(sorted_indices[l:_line_insert_indxs[i+1]])
self.small_cc_lines_chars.append(sorted_indices[_line_insert_indxs[-1]:])
self.small_cc_lines_chars = [self.small_cc_lines_chars[i] for i in range(len(self.lines_chars)) if self.lines_chars[i]]
cctops = [self.shapes.get_boxes()[i][1] for i in self.shapes.emph_symbols]
char_tops = zip(cctops, self.shapes.emph_symbols)
char_tops.sort(key=lambda x: x[0])
empred = [kmeans.predict(shapes.get_boxes()[i][1])[0] for i in self.shapes.emph_symbols]
self.emph_lines = [[] for i in range(k)]
for nn, e in enumerate(empred):
self.emph_lines[sort_inx.index(e)].append(self.shapes.emph_symbols[nn])
if self.shapes.detect_o:
cctops = [self.shapes.get_boxes()[i][1] for i in self.shapes.naros]
char_tops = zip(cctops, self.shapes.naros)
char_tops.sort(key=lambda x: x[0])
sorted_indices = [i[1] for i in char_tops]
_line_insert_indxs = []
_line_insert_indxs.extend([bisect_right(char_tops, (i - 1,))
for i in breaklines])
if not _line_insert_indxs: sys.exit()
self.line_naros = []
for i, l in enumerate(_line_insert_indxs[:-1]):
#
self.line_naros.append(sorted_indices[l:_line_insert_indxs[i+1]])
self.line_naros.append(sorted_indices[_line_insert_indxs[-1]:])
self.line_naros = [self.line_naros[i] for i in range(len(self.lines_chars)) if self.lines_chars[i]]
self.line_naro_spans = []
for ll, mm in enumerate(self.line_naros):
thisline = []
for nn, naro in enumerate(mm):
box = self.get_box(naro)
thisline.append(box)
thisline.sort(key=lambda x: x[0])
self.line_naros[ll].sort(key=lambda x: self.get_box(x)[0])
self.line_naro_spans.append(thisline)
if self.shapes.low_ink:
cctops = [lib[1] for lib in self.shapes.low_ink_boxes]
char_tops = zip(cctops, self.shapes.low_ink_boxes)
char_tops.sort(key=lambda x: x[0])
sorted_indices = [i[1] for i in char_tops]
_line_insert_indxs = []
_line_insert_indxs.extend([bisect_right(char_tops, (i - 1,))
for i in breaklines])
self.low_ink_boxes = []
if not _line_insert_indxs: sys.exit()
for i, l in enumerate(_line_insert_indxs[:-1]):
self.low_ink_boxes.append(sorted_indices[l:_line_insert_indxs[i+1]])
self.low_ink_boxes.append(sorted_indices[_line_insert_indxs[-1]:])
self.low_ink_boxes = [self.low_ink_boxes[i] for i in range(len(self.lines_chars)) if self.lines_chars[i]]
def check_naro_overlap(self, line_num, box):
line = self.line_naro_spans[line_num]
left_edges = [l[0] for l in line]
insert = bisect(left_edges, box[0])
for r in range(insert-1, insert + 1):
if 0 <= r < len(line):
sp = line[r]
if check_for_overlap(sp, box):
return self.line_naros[line_num][r]
return False
def get_box(self, ind):
try:
return self.shapes.get_boxes()[ind]
except (TypeError, IndexError):
return self.new_contours[ind][0]
def get_contour(self, ind):
try:
return self.shapes.contours[ind]
except (IndexError, TypeError):
return self.new_contours[ind][1]
def get_baselines(self):
return self.baselines