forked from fontify/fontify
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata.py
executable file
·344 lines (282 loc) · 12.4 KB
/
data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
#! /usr/bin/env python
# -*- coding: utf8 -*-
from __future__ import print_function, division
try:  # python 2/3 compatibility
    # Bare name lookup: succeeds when a ``unicode`` builtin exists, raises NameError otherwise.
    unicode
except NameError:
    # No ``unicode`` builtin available (Python 3): alias it to ``str`` so the
    # ``unicode(...)`` calls below keep working.
    unicode = str
from math import ceil
import string
# --- Constants to configure the application

# Options, see https://github.com/JazzCore/python-pdfkit
TMPL_OPTIONS = {
    # 'page-size': 'Letter',
    # 'print-media-type': False,
    'encoding': 'UTF-8',
    'margin-top': '0.5in',
    'margin-bottom': '0.5in',
    'margin-left': '0.5in',
    'margin-right': '0.5in',
    # 'user-style-sheet': '/tmp/static/css/template.css',
}

# WARNING don't change now! everything works, DON'T CHANGE THIS!
# Size of the grid of cells on one page.
ROWS = COLUMNS = 9
# print("Shape of the characters: on {} rows and {} columns...".format(ROWS, COLUMNS))  # DEBUG

PERCENTAGE_TO_CROP_SCAN_IMG = 0.008

# Character-set switches: each one is assigned exactly once, to its effective value.
EXTENDED = True           # use the extended charset (False is only meant for debugging)
FULL = False              # use the full charset
GREEK = False             # use the Greek characters
ACCENTS = False           # use the French/Spanish accented characters
FRENCHSPECIALS = True     # use extra French special characters
SPANISHSPECIALS = False   # use extra Spanish special characters

# kwargs_get_data = dict(extended=EXTENDED, full=FULL, greek=GREEK, accents=ACCENTS, frenchspecials=FRENCHSPECIALS, spanishspecials=SPANISHSPECIALS)

CROPPED_IMG_NAME = "cropped_picture"
CROPPED_IMG_EXT = "bmp"
CUT_CHAR_IMGS_DIR = "cutting_output_images"
# --- utility function
def str_to_hex(one_or_more_char):
    """Encode a character or a ligature as its hexadecimal code point(s).

    Every item of ``one_or_more_char`` is turned into ``hex(ord(item))`` and
    the pieces are joined with ``'_'``: ``u"a"`` gives ``'0x61'`` and
    ``u"ff"`` gives ``'0x66_0x66'``.  An empty input gives ``''``.

    The previous ``isinstance(..., str)`` shortcut sent every Python 3 string,
    whatever its length, to a single ``ord()`` call, which raises TypeError
    for multi-character ligatures.  Joining handles one or many characters
    uniformly.
    """
    return '_'.join(hex(ord(char)) for char in one_or_more_char)
def hex_of_str(one_or_more_hex):
    """Decode a ``'_'``-separated string of integer literals into a list of ints.

    Each piece is parsed with ``int(piece, 0)``, so the base prefix (``0x``...)
    of the literal is honoured.  A string without ``'_'`` gives a one-item list.
    """
    if '_' in one_or_more_hex:
        pieces = one_or_more_hex.split('_')
    else:
        pieces = [one_or_more_hex]
    return [int(piece, 0) for piece in pieces]
def _ljust(input, width, fillchar=None):
"""Either ljust on a string or a list of string. Extend with fillchar."""
if fillchar is None:
fillchar = ' '
if isinstance(input, str):
return input.ljust(width, fillchar)
else:
delta_len = width - len(input)
if delta_len <= 0:
return input
else:
return input + [fillchar for _ in range(delta_len)]
# --- get list of unicode characters or ligatures
def _get_flat_chars(extended=EXTENDED, full=FULL, greek=GREEK, accents=ACCENTS, frenchspecials=FRENCHSPECIALS, spanishspecials=SPANISHSPECIALS):
    """Return the flat list of single unicode characters, one list item per character.

    ASCII lowercase, ASCII uppercase and digits are always included, in that order.

    - ``extended``: when false, only ``string.punctuation`` is appended; when
      true, a hand-picked set of punctuation and symbols is appended, followed
      by the optional groups below.
    - ``accents``: French and Spanish accented letters.
    - ``frenchspecials``: a few ligature letters, the euro sign, the ellipsis
      and typographic quotes.
    - ``spanishspecials``: inverted exclamation and question marks.
    - ``greek``: upper and lower case Greek letters.
    - ``full``: when true, also append Unicode ligatures, more currency
      symbols, accented Greek letters and mathematical symbols.
    """
    # NOTE(review): the source this was restored from had lost its indentation.
    # The optional groups (accents, frenchspecials, spanishspecials, greek) are
    # assumed to live in the ``else`` branch, i.e. to apply only when
    # ``extended`` is true -- confirm against the upstream file.
    chars = []
    # ASCII letters
    chars += list(unicode(string.ascii_lowercase))
    chars += list(unicode(string.ascii_uppercase))
    # Numbers
    chars += list(unicode(string.digits))

    if not extended:
        chars += list(unicode(string.punctuation))
        # # French accents
        # chars += list(unicode(u"àçèéêëîïôÿúüû"))
        # chars += list(unicode(u"ÀÇÈÉÊËÎÏÔŸÚÜÛ"))
    else:
        # Punctuations and symbols
        chars += list(unicode(u"!?\"$&'(),-.:;"))
        chars += list(unicode(u"/\\#~{}[]|_@+*`§%^<>=£"))
        # French and Spanish accents
        if accents:
            chars += list(unicode(u"àáâäçèéêëîíïñòóôöŷÿùúüû"))
            chars += list(unicode(u"ÀÁÂÄÇÈÉÊËÎÍÏÑÒÓÔÖŶŸÙÚÜÛ"))
        if frenchspecials:
            # Some basic ligatures
            chars += list(unicode(u"æœßÆŒ"))
            # non ASCII symbols (currency etc)
            chars += list(unicode(u"€…"))
            chars += list(unicode(u"«»‘’“”"))
        if spanishspecials:
            chars += list(unicode(u"¡¿"))
        # chars += list(unicode(u"  "))  # space ' ' and unbreakable-space ' '  # XXX not needed anymore
        if greek:
            # Greek upper and lower
            chars += list(unicode(u"ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ"))
            chars += list(unicode(u"αβγδεζηθικλμνξοπρςστυφχψω"))

    if not full:
        return chars

    # Special ligatures
    # https://en.wikipedia.org/wiki/Typographic_ligature#Ligatures_in_Unicode_(Latin_alphabets)
    # The seven Latin ligature code points U+FB00..U+FB06 (ff, fi, fl, ffi,
    # ffl, long-s-t, st), written as escapes so that each one stays a single
    # character; spelled with plain letters they would only add duplicates of
    # ASCII letters already in the list.
    chars += list(unicode(u"\ufb00\ufb01\ufb02\ufb03\ufb04\ufb05\ufb06"))
    chars += list(unicode(u"🙰"))
    # non ASCII symbols (currency etc)
    chars += list(unicode(u"¥₩₹₺₽元"))
    # Greek special characters, nobody use that!
    # chars += list(unicode(u"΄΅Ά·ΈΉΊΌΎΏΐΰ"))
    chars += list(unicode(u"ΪΫάέήίϊϋόύώ"))
    # Maths
    # NOTE(review): the "not equal" sign appears in both strings below, so it is listed twice.
    chars += list(unicode(u"ℕℝℂℙℤℚ∀∂∃∅∇∩∪∫≠≤≥⊂⊃"))
    chars += list(unicode(u"°±×÷ø–—‰′″‴→↓↑←↔⇒⇔∈∉∏∑√∛∝∞∧∨∬∭∮∯∰∴∵≈≝≠≡≪≫⊄⊆⊈⊕"))
    return chars
def _get_flat_ligatures(extended=EXTENDED, full=FULL, greek=GREEK, accents=ACCENTS, frenchspecials=FRENCHSPECIALS, spanishspecials=SPANISHSPECIALS):
    """Return a list of unicode strings, one per ligature.

    WARNING: as written, the function returns ``[u"i", u"j", u"ij"]`` whatever
    the arguments are: the first ``return`` below (an experiment, see the FIXME)
    makes all the code after it unreachable.

    The unreachable part builds the real list: typographic ligatures, then,
    when ``extended`` is true, code-related ligatures, then, when ``full`` is
    true, a few whole-word ligatures.  ``greek``, ``accents``,
    ``frenchspecials`` and ``spanishspecials`` are accepted but never read.
    """
    ligatures = []
    # FIXME I'm experimenting on this, cf https://github.com/Naereen/fontify/issues/3
    ligatures += [u"i", u"j", ]
    ligatures += [u"ij", ]
    # Experimental early exit: everything below this line is currently dead code.
    return ligatures
    ligatures += [u"</", u"<!--", u"</>", u"/>", ]
    # A second, older early exit (unreachable as well).
    return ligatures

    # ligatures += [u"ff", u"fi", u"fl", u"ffi", u"ffl", u"ſt", u"st", u"🙰"]
    ligatures += [u"ff", u"fi", u"fl", u"ffi", u"ffl", u"ft", u"st", u"et"]
    # then from other ligatures
    # NOTE(review): u"ft" is already in the list above, so it would appear twice.
    ligatures += [u"fa", u"fe", u"fj", u"fo", u"fr", u"fs", u"ft", u"fft", u"fb", u"ffb", u"fh", u"ffh", u"fu", u"fy"]
    ligatures += [u"ct", u"ch", u"ck", u"tt", ]

    if not extended:
        return ligatures

    # and now from FiraCode, line by line, code-related ligatures
    # from https://github.com/tonsky/FiraCode/#solution
    ligatures += [u".=", u"..=", u".-", u":=", u"=:=", u"=!=", u"__", ]
    ligatures += [u"==", u"!=", u"===", u"!==", u"=/=", ]
    ligatures += [u"<-<", u"<<-", u"<--", u"<-", u"<->", u"->", u"-->", u"->>", u">->", ]
    ligatures += [u"<=<", u"<<=", u"<==", u"<=>", u"=>", u"==>", u"=>>", u">=>", ]
    ligatures += [u">>=", u">>-", u">-", u"<~>", u"-<", u"-<<", u"=<<", ]
    ligatures += [u"<~~", u"<~", u"~~", u"~>", u"~~>", ]
    ligatures += [u"<<<", u"<<", u"<=", u"<>", u">=", u">>", u">>>", ]
    ligatures += [u"{.", u"{|", u"[|", u"<:", u":>", u"|]", u"|}", u".}", ]
    ligatures += [u"<|||", u"<||", u"<|", u"<|>", u"|>", u"||>", u"|||>", ]
    ligatures += [u"<$", u"<$>", u"$>", ]
    ligatures += [u"<+", u"<+>", u"+>", ]
    ligatures += [u"<*", u"<*>", u"*>", ]
    ligatures += [u"/*", u"*/", u"///", u"//", ]
    ligatures += [u"</", u"<!--", u"</>", u"/>", ]
    # XXX these are harder, we would need to work on contextual vertical align...
    # ligatures += [u"0xf", u"10x10", ]  # WARNING
    # ligatures += [u"9:45", u"m+x", u"m-x", u"*ptr", ]  # WARNING
    ligatures += [u";;", u"::", u":::", u"[:]", u"..", u"...", u"..<", ]
    ligatures += [u"!!", u"??", u"%%", u"&&", u"||", u"?.", u"?:", ]
    ligatures += [u"++", u"+++", ]
    ligatures += [u"--", u"---", ]
    ligatures += [u"**", u"***", ]
    ligatures += [u"~=", u"~-", u"www", u"-~", u"~@", ]
    ligatures += [u"^=", u"?=", u"/=", u"/==", ]
    ligatures += [u"+=", u"-=", u"*=", ]  # mine
    ligatures += [u"-|", u"_|_", u"|-", u"|=", u"||=", ]
    ligatures += [u"#!", u"#=", u"##", u"#:", u"###", u"####", u"#####", u"######", ]  # mine also
    ligatures += [u"#{", u"#}", u"#[", u"]#", u"#(", u"#)", u"#?", u"#_", u"#_(", u"#_)", ]  # some are extra

    if not full:
        return ligatures

    # mine, extra... LaTeX related
    ligatures += [u"TeX", u"LaTeX", u"KaTeX", u"XeLaTeX", ]
    # FIXME don't do that... please...
    ligatures += [u"Lilian", u"Besson", u"Naereen", ]
    return ligatures
def get_flat_chars(**kwargs):
    """Return the flat list of single characters built by ``_get_flat_chars(**kwargs)``."""
    # FIXME remove: temporary shortcut, the ligatures are left out for now.
    # The disabled variant is:
    #     _get_flat_chars(**kwargs) + _get_flat_ligatures(**kwargs)
    flat_chars = _get_flat_chars(**kwargs)
    return flat_chars
def get_flat_ligatures(**kwargs):
    """Return the flat list of ligatures built by ``_get_flat_ligatures(**kwargs)``."""
    flat_ligatures = _get_flat_ligatures(**kwargs)
    return flat_ligatures
# --- get grouped data (unused)
def _get_grouped(get_flat_input, **kwargs):
    """Split the flat list given by ``get_flat_input(**kwargs)`` into rows.

    Returns a list of slices of the flat list, each holding at most COLUMNS
    items; only the last one can be shorter.  An empty flat list gives ``[]``.
    """
    inputs = get_flat_input(**kwargs)
    # A row is COLUMNS cells wide: the padding helper pads the last row to
    # COLUMNS and _get_by_page chunks by COLUMNS.  Chunking by ROWS only gave
    # the right result because both constants are currently equal.
    grouped_inputs = [
        inputs[start: start + COLUMNS]
        for start in range(0, len(inputs), COLUMNS)
    ]
    # print("grouped_inputs =", grouped_inputs)  # DEBUG
    return grouped_inputs
def get_grouped_chars(**kwargs):
    """Return the characters of ``get_flat_chars(**kwargs)`` grouped by ``_get_grouped``."""
    grouped = _get_grouped(get_flat_chars, **kwargs)
    return grouped
def get_grouped_ligatures(**kwargs):
    """Return the ligatures of ``get_flat_ligatures(**kwargs)`` grouped by ``_get_grouped``."""
    grouped = _get_grouped(get_flat_ligatures, **kwargs)
    return grouped
# --- get grouped and padded data (unused)
def _get_grouped_and_padded(get_grouped_inputs, **kwargs):
    """Pad the rows given by ``get_grouped_inputs(**kwargs)`` to a full grid.

    The last row is left-justified to COLUMNS items with ``_ljust``, then blank
    rows are appended until there are at least ROWS rows.  The list returned by
    ``get_grouped_inputs`` is modified in place and returned.

    Blank filler rows are strings of COLUMNS spaces (not lists).
    """
    inputs = get_grouped_inputs(**kwargs)
    # With no data at all there is no last row to pad: indexing [-1] would
    # raise IndexError, so only the blank rows are added in that case.
    if inputs:
        inputs[-1] = _ljust(inputs[-1], COLUMNS)
    # A blank row is as wide as any other row, i.e. COLUMNS cells (it used to be
    # sized with ROWS, which only worked because both constants are equal).
    inputs.extend(
        ' ' * COLUMNS
        for _ in range(len(inputs), ROWS)
    )
    # print("inputs =", inputs)  # DEBUG
    return inputs
def get_chars(**kwargs):
    """Return the grouped characters, padded by ``_get_grouped_and_padded``."""
    padded = _get_grouped_and_padded(get_grouped_chars, **kwargs)
    return padded
def get_ligatures(**kwargs):
    """Return the grouped ligatures, padded by ``_get_grouped_and_padded``."""
    padded = _get_grouped_and_padded(get_grouped_ligatures, **kwargs)
    return padded
# --- get data grouped by pages
def _get_by_page(get_flat_inputs, **kwargs):
    """Lay out the flat list given by ``get_flat_inputs(**kwargs)`` page by page.

    The flat list is cut into rows of COLUMNS items; the last row is padded to
    COLUMNS items with ``_ljust``; blank rows are appended until there are at
    least ROWS rows; the rows are then cut into pages of at most ROWS rows.

    Returns a list of pages, each page being a list of rows.  Data rows are
    lists, blank filler rows are strings of COLUMNS spaces.  When there are
    more than ROWS rows, the last page is not padded and can hold fewer than
    ROWS rows.  Empty input gives a single page of blank rows.
    """
    inputs = get_flat_inputs(**kwargs)
    grouped_inputs = [
        inputs[start: start + COLUMNS]
        for start in range(0, len(inputs), COLUMNS)
    ]
    # With no data at all there is no last row to pad: indexing [-1] would
    # raise IndexError, so only the blank rows are added in that case.
    if grouped_inputs:
        grouped_inputs[-1] = _ljust(grouped_inputs[-1], COLUMNS)
    # A blank row is as wide as any other row, i.e. COLUMNS cells (it used to be
    # sized with ROWS, which only worked because both constants are equal).
    grouped_inputs.extend(
        ' ' * COLUMNS
        for _ in range(len(grouped_inputs), ROWS)
    )
    # print("grouped_inputs =", grouped_inputs)  # DEBUG
    # print("len(grouped_inputs =", len(grouped_inputs))  # DEBUG
    # print("ROWS =", ROWS)  # DEBUG
    nb_page = int(ceil(len(grouped_inputs) / ROWS))
    # print("nb_page =", nb_page)  # DEBUG
    # Slicing stops at the end of the list, so the last page is simply shorter.
    grouped_inputs_by_page = [
        grouped_inputs[page * ROWS: (page + 1) * ROWS]
        for page in range(nb_page)
    ]
    # print("grouped_inputs_by_page =", grouped_inputs_by_page)  # DEBUG
    return grouped_inputs_by_page
def get_chars_by_page(**kwargs):
    """Return the characters of ``get_flat_chars(**kwargs)`` laid out by ``_get_by_page``."""
    pages = _get_by_page(get_flat_chars, **kwargs)
    return pages
def get_ligatures_by_page(**kwargs):
    """Return the ligatures of ``get_flat_ligatures(**kwargs)`` laid out by ``_get_by_page``."""
    pages = _get_by_page(get_flat_ligatures, **kwargs)
    return pages
# --- sample characters (in light gray in the template)
def _get_sample_chars(test_unicode=True):
# WARNING the leading space ' ' char is because there is a space in the data
if test_unicode:
return iter(u" AaΩω")
else:
return iter(u" AaZz")
def get_sample_chars_no_unicode():
    """Return the iterator given by ``_get_sample_chars(test_unicode=False)``."""
    samples = _get_sample_chars(test_unicode=False)
    return samples
def get_sample_chars():
    """Return the iterator given by ``_get_sample_chars(test_unicode=True)``."""
    samples = _get_sample_chars(test_unicode=True)
    return samples
def get_sample_ligatures():
    """Return an iterator over the two sample ligatures."""
    samples = [u"ff", u"st"]
    return iter(samples)
# --- Now we test everything
# --- Now we test everything
if __name__ == '__main__':
    # Debug self-test: call every public getter and print a summary of what it
    # returns, followed by the data itself.
    print("DEBUG: data.py")  # DEBUG
    for f in [
        # --- samples
        get_sample_chars, get_sample_chars_no_unicode,
        get_sample_ligatures,
        # --- chars
        get_flat_chars,
        # get_grouped_chars,
        # get_chars,
        get_chars_by_page,
        # --- ligatures
        get_flat_ligatures,
        # get_grouped_ligatures,
        # get_ligatures,
        get_ligatures_by_page,
    ]:
        # get name and get data
        name = f.__name__
        # list() drains an iterator and copies a list alike, so a single call
        # of the getter is enough (it used to be called a second time when it
        # returned an iterator).
        data = list(f())
        # DEBUG
        max_length_of_data = max(len(s) for s in data)
        print("\nThe function '{}' gives output of type {} and length {}, which is seen as data of type {}, each of max length {}...".format(name, type(data), len(data), type(data[0]), max_length_of_data))  # DEBUG
        # print(data)  # DEBUG
        # print the data correctly...
        if isinstance(data[0], list):
            if isinstance(data[0][0], list):
                # data is a list of pages, each page a list of rows
                for page, page_data in enumerate(data):
                    joined_data = [u", ".join(x) for x in page_data]
                    # The column count is the width of the widest row; the row
                    # count used to be printed in its place.
                    nb_columns = max(len(x) for x in page_data)
                    nb_rows = len(page_data)
                    print("Page {}/{} has {} columns and {} rows, with {} data of max length {}:".format(page + 1, len(data), nb_columns, nb_rows, sum(len(s) for s in page_data), max(max(len(c) for c in x) for x in page_data)))  # DEBUG
                    print(u"\t" + u"\n\t".join(joined_data))  # DEBUG
            else:
                # data is a list of rows
                for page_data in data:
                    print(u", ".join(page_data))  # DEBUG
        else:
            # data is a flat list of strings
            print(u", ".join(data))  # DEBUG