-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMakefile
210 lines (159 loc) · 4.55 KB
/
Makefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
# Makefile of _Plu Glosa Nota_
# By Marcos Cruz (programandala.net)
# Last modified 201902231204
# See change log at the end of the file
# ==============================================================
# Requirements
# - make
# - asciidoctor
# - asciidoctor-pdf
# - pandoc
# - pdfimages
# - tesseract
# ==============================================================
# Config
# Name of the output directory. Not referenced by any rule in this file (the
# rules spell out `target/` literally); kept for backward compatibility.
target := target

# Directories searched for prerequisites given without a path.
VPATH := ./src:./target

# Base name shared by the source document and every generated file.
book := plu_glosa_nota
# ==============================================================
# Interface
# User-facing goals. None of them names a real file, hence `.PHONY`.
.PHONY: all docbook epub

# Default goal: build the EPUB, HTML, ODT and PDF editions.
all: epub html odt pdf

# DocBook edition, which is also the input of every pandoc conversion.
docbook: target/$(book).adoc.xml

# EPUB edition, made by pandoc from the DocBook file.
epub: target/$(book).adoc.xml.pandoc.epub
# XXX OLD
# Put a symbolic link to `../src/pic` inside the <target> directory,
# replacing any existing entry of the same name (`--force`).
.PHONY: picdir
picdir:
	ln \
		--force \
		--symbolic \
		--target-directory=target \
		../src/pic
# One phony goal per remaining output format.
.PHONY: html odt pdf rtf

# Three HTML variants: asciidoctor default, asciidoctor without stylesheet,
# and pandoc from DocBook.
html: \
    target/$(book).adoc.html \
    target/$(book).adoc.plain.html \
    target/$(book).adoc.xml.pandoc.html

# OpenDocument text, made by pandoc from DocBook.
odt: target/$(book).adoc.xml.pandoc.odt

# PDF, made by asciidoctor-pdf.
pdf: target/$(book).adoc.pdf

# RTF, made by pandoc from DocBook. Not part of `all`.
rtf: target/$(book).adoc.xml.pandoc.rtf
# Delete every generated document from <target>. The list of extensions is
# expanded by make into one `target/*.EXT` glob per extension; the shell then
# expands the globs.
.PHONY: clean
clean:
	rm -f $(addprefix target/*.,epub html odt pdf rtf xml)
# ==============================================================
# Extract the texts from the original scanned PDF
# NOTE:
#
# This is only a tool to facilitate extracting texts from individual pages of the
# old issues of _Plu Glosa Nota_. The OCR-ed texts are not automatically
# integrated into the target documents.
#
# The original PDF files, from www.glosa.org, must be in the <original>
# directory.
# ----------------------------------------------
# OCR individual pages
# Usage example for extracting the text from page 5 of PGN 41:
#
# make original/pgn041-005.txt
# Extract one page of <original/NAME.pdf> as the image <original/NAME-PPP.ppm>.
#
# `pdfimages -p` names its output `<root>-PPP-NNN.<ext>` (page number, image
# number), so every rule passes the PDF name without extension as the image
# root and then renames the first image of the page to the target name.
#
# NOTE(review): this assumes each scanned page holds exactly one image that
# pdfimages saves with the `.ppm` extension -- confirm for monochrome scans.

original/%-001.ppm: original/%.pdf
	pdfimages -p -f 1 -l 1 $< $(basename $<)
	mv $(basename $<)-001-000.ppm $@

original/%-002.ppm: original/%.pdf
	pdfimages -p -f 2 -l 2 $< $(basename $<)
	mv $(basename $<)-002-000.ppm $@

original/%-003.ppm: original/%.pdf
	pdfimages -p -f 3 -l 3 $< $(basename $<)
	mv $(basename $<)-003-000.ppm $@

original/%-004.ppm: original/%.pdf
	pdfimages -p -f 4 -l 4 $< $(basename $<)
	mv $(basename $<)-004-000.ppm $@

original/%-005.ppm: original/%.pdf
	pdfimages -p -f 5 -l 5 $< $(basename $<)
	mv $(basename $<)-005-000.ppm $@

original/%-006.ppm: original/%.pdf
	pdfimages -p -f 6 -l 6 $< $(basename $<)
	mv $(basename $<)-006-000.ppm $@

original/%-007.ppm: original/%.pdf
	pdfimages -p -f 7 -l 7 $< $(basename $<)
	mv $(basename $<)-007-000.ppm $@

# OCR a page image. tesseract receives the target name without its extension
# as the output base.
original/%.txt: original/%.ppm
	tesseract $< $(basename $@)
# ----------------------------------------------
# OCR all PDF
# XXX TODO -- Finish and test.
# Sorted base names (no directory, no extension) of the PDF files found in
# <original>.
pdf_files = $(sort $(patsubst original/%.pdf,%,$(wildcard original/*.pdf)))

# Extract the images of every PDF into <original>, then run tesseract on each
# resulting `.ppm`, using the image file name itself as the output base.
# Commands are chained with `;`, so a failing step does not stop the loop.
.PHONY: ocr
ocr:
	for pdf in $(pdf_files); do \
		pdfimages original/$${pdf}.pdf original/$${pdf}; \
		for ppm in $$(ls original/$${pdf}*.ppm); do \
			tesseract $${ppm} $${ppm}; \
		done; \
	done
# ==============================================================
# Convert to DocBook
# Asciidoctor source -> DocBook 5. The source has no directory here, so make
# looks it up through VPATH.
target/$(book).adoc.xml: $(book).adoc
	asciidoctor \
		--backend=docbook5 \
		--out-file=$@ \
		$<
# ==============================================================
# Convert to EPUB
# DocBook -> EPUB with pandoc.
target/$(book).adoc.xml.pandoc.epub: target/$(book).adoc.xml
	pandoc --from=docbook --to=epub --output=$@ $<
# ==============================================================
# Convert to HTML
# Asciidoctor source -> HTML, in two variants. Both call `asciidoctor`, the
# tool named in the requirements list and used by the DocBook rule.

# Variant without any stylesheet.
target/$(book).adoc.plain.html: $(book).adoc
	asciidoctor \
		--attribute="stylesheet=none" \
		--quiet \
		--out-file=$@ \
		$<

# Variant with asciidoctor's default settings.
target/$(book).adoc.html: $(book).adoc
	asciidoctor --out-file=$@ $<
# DocBook -> standalone HTML with pandoc.
target/$(book).adoc.xml.pandoc.html: target/$(book).adoc.xml
	pandoc --from=docbook --to=html --standalone --output=$@ $<
# ==============================================================
# Convert to ODT
# DocBook -> ODT with pandoc. The `+RTS ... -RTS` group is addressed to
# pandoc's Haskell runtime, not to pandoc itself.
# NOTE(review): `-K15000000` presumably raises the stack limit for this
# document -- confirm it is still needed.
target/$(book).adoc.xml.pandoc.odt: target/$(book).adoc.xml
	pandoc +RTS -K15000000 -RTS \
		--from=docbook --to=odt \
		--output=$@ $<
# ==============================================================
# Convert to PDF
# Asciidoctor source -> PDF, straight from the source with asciidoctor-pdf
# (no DocBook step).
target/$(book).adoc.pdf: $(book).adoc
	asciidoctor-pdf \
		--out-file=$@ \
		$<
# ==============================================================
# Convert to RTF
# DocBook -> RTF with pandoc. `--standalone` is required here (see the
# change log entry of 2019-02-22).
target/$(book).adoc.xml.pandoc.rtf: target/$(book).adoc.xml
	pandoc --from=docbook --to=rtf --standalone --output=$@ $<
# ==============================================================
# Change log
# 2018-11-22: Start. Copy from the project _18 Steps to Fluency in Euro-Glosa_.
#
# 2018-12-10: Add rules to OCR the original PDFs.
#
# 2019-02-22: Fix RTF output: `--standalone` was missing.
#
# 2019-02-23: Don't create <picdir> link. Remove the 'target' variable.