-
Notifications
You must be signed in to change notification settings - Fork 562
/
Copy pathbbl2html.awk
495 lines (454 loc) · 17.1 KB
/
bbl2html.awk
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
#!/opt/local/bin/gawk -f
#
# bbl2html.awk v1.2c
#
# Released to the public domain (ie. use at your own risk)
# Rik Blok <rikblok@mail.com>
# December 13, 2000.
#
# Latest version available from
# http://rikblok.cjb.net/scripts/bbl2html.awk
#
# Converts a LaTeX .bbl file to (mostly) formatted html code. Probably
# also works if applied directly to a .tex file. Sets bookmarks
# to the keys so you can reference a citation from another page, eg.
# <a href="bib.html#key">[1]</a> will make a link to "key".
#
# bbl2html will use the default label unless you set (on the command-line)
# override=key or override=number in which case it will use the citation
# key or numeric format, respectively.
#
# I wrote this script out of dissatisfaction with other conversion tools
# available. Hopefully it will be of use to somebody. Feel free to
# modify the script to suit you. I've tested it with the bibliography
# styles abbrv, alpha, apalike, ieeetr, plain, prsty, siam, and unsrt
# and it works fairly well.
# For a sample of the output visit http://rikblok.cjb.net/lib/refs.html.
#
# Usage:
# awk -f bbl2html.awk head=<header> foot=<footer> \
# [override = key|number] [labelwidth=<width>]
# [bigtable = 0|1] [noabout=0|1] <infile> > <outfile>
#
# where
# <header> and <footer> may be formatted text (enclosed in escaped
# quotes if containing a space) to be placed at the beginning and end
# of the output, respectively. As a special case, if either begins with
# the symbol "@" it is assumed to be a filename and the text is read from
# the file specified. If neither a header nor a footer is specified
# <html><body> and </body></html> are used, respectively;
# override is an optional variable to change the displayed label to
# the citation key or numeric format;
# <width> is the width of the column label, in pixels or, if appended
# by "%", in percent (optional, defaults to 50 (pixels));
# bigtable is an optional variable which allows the page to be
# formatted as one big table (=1) or a separate table for each entry (=0)
# (multiple tables can be displayed incrementally as the page loads but
# a single table will be rendered faster. Optional, defaults to 0);
# noabout is an optional variable which tells bbl2html.awk not to
# print the "Generated by bbl2html.awk..." comment at the bottom of the
# page (optional, defaults to 0);
# <infile> is the name of the bibliography file (or LaTeX file?) to
# be converted; and
# <outfile> is the name of the html file to be generated.
#
# Sample usages:
# awk -f bbl2html.awk head=\<html\>\<body\> foot=\</body\>\</html\> bib.bbl > bib.html
# awk -f bbl2html.awk head=@bib.head foot=@bib.foot override=key bib.bbl > bib.html
# awk -f bbl2html.awk -f myOwnSubstitutions.awk bib.bbl > bib.html
# awk -f bbl2html.awk labelwidth=20% bigtable=1 bib.bbl > bib.html
#
# Notes:
#
# 1) You can add your own substitutions fairly easily by setting
# userfind[] and userreplace[] in the BEGIN action. You can also
# place the substitutions in a separate awk file (within a BEGIN
# action) to avoid modifying this script (see the last example above
# for a sample usage).
#
# 2) To generate a complete list of citations from a bibtex file
# myreferences.bib use the bbl file generated by this latex file:
#
# %%%% begin latex file
# \documentclass{article}
# \usepackage{url} % if your citations have any \url{} commands
# \begin{document}
# \nocite{*}
# \bibliographystyle{unsrt} % use whichever style you prefer
# \bibliography{myreferences} % use myreferences.bib
# \end{document}
# %%%% end latex file
#
# 3) bbl2html.awk needs GNU awk/gawk. On Solaris machines, neither
# /usr/bin/awk nor /usr/bin/nawk work.
#
# 4) If you download this file make sure it has the proper line-endings
# for your filesystem. Otherwise running the script will probably
# generate an "^ Invalid char" error. On Unix, process the script
# with dos2unix if you encounter this error.
#
# 5) Any occurrences of "<" and ">" must be escaped in header
# and footer. Eg. head=\<html\>\<body\> foot=\</body\>\</html\>
# (not needed in files header/footer may point to).
#
# Revisions:
# v1.2c December 13, 2000
# - added Unix shell header (!/opt/local/bin/gawk -f) to run script
# as a shell command. The path may need to be modified on your
# machine.
# - labelwidth now defaults to pixels. Append a percent symbol to
# use as percent (eg. "labelwidth=20%").
# - more linebreaks allowed in urls
# - fixed: end-of-line comments ("%\n") left in txt of \href{url}{txt}
# - adds "Generated by bbl2html.awk..." comment at bottom of file
# (can be disabled with "noabout=1" command-line parameter)
# v1.2b December 11, 2000
# - fixed: chokes on \href{url}%\n{text}
# - basic math support (italics, super- and sub-scripts)
# - now has bookmarks (<a name="...">) for both keys and labels
# (if different)
# v1.2 December 7, 2000
# - basic support for \href{url}{txt} (tries to guess how to format
# 'txt', either as text or as an url)
# - if neither header nor footer specified, defaults to
# head="<html><body>" and foot="</body></html>"
# v1.1 November 5, 2000
# - added labelwidth option (percentage)
# - rudimentary support for smallcaps
# - added notes 4 and 5
# v1.0c August 25, 2000
# - replaced userfind[]/userreplace[] indices with descriptive keys
# v1.0b July 31, 2000
# - replaceFormat() checks for multiple occurrences of formatting
# - now handles more general keys
# - supports most bibliography styles (that I know of)
# - defaults to using whatever labels are supplied, or numeric (if none)
# - can override label with override=key or override=number
# v1.0 July 28, 2000
# - initial release
#
# To do:
# - nothing urgent
#
# Thanks to:
# - Marc Mutz for bug-hunting and the math substitutions
#--------------------------------------------------------------------------
# Start-up: record script information, register the built-in substitution
# tables used by authorFormat(), skip the input up to and including the line
# containing "\begin{thebibliography}", and switch record/field splitting so
# that every record is one \bibitem and every field one \newblock.
BEGIN {
    # bbl2html.awk information
    version = "1.2c";
    home = "http://rikblok.cjb.net/scripts/index.html#bbl2html.awk";

    # put user-defined substitutions here
    # (userfind[k] is a regexp, userreplace[k] the matching gensub() replacement)

    # arXiv.org preprint archive
    userfind["arxiv"] = "arXiv:([a-zA-Z.-]+/[0-9][0-9][0-9][0-9][0-9][0-9][0-9])";
    userreplace["arxiv"] = "<a href=\"http://arXiv.org/abs/\\1\"><tt>arXiv:\\1</tt></a>";

    # siam style uses a horizontal line in place of repeating author names
    userfind["siam1"] = "\\\\leavevmode\\\\vrule height 2pt depth -1.6pt width 23pt";
    # "\\&" is a literal "&" in a gensub() replacement, so this emits eight
    # "&nbsp;" entities (plain spaces would collapse to one in HTML)
    userreplace["siam1"] = "<strike>\\&nbsp;\\&nbsp;\\&nbsp;\\&nbsp;\\&nbsp;\\&nbsp;\\&nbsp;\\&nbsp;</strike>";

    # some trivial math
    userfind["math"] = "([^\\\\])\\$([^\\$]*)\\$";
    userreplace["math"] = "\\1<var>\\2</var>";

    # for super and subscripts:
    userfind["math.sub"] = "([^\\\\])(\\$|<var>)(.*)_([a-zA-Z0-9]|{.*}|\\\\[a-zA-Z]+)([^\\$]*)(\\$|</var>)";
    userreplace["math.sub"] = "\\1<var>\\3<sub>\\4</sub>\\5</var>";
    userfind["math.sup"] = "([^\\\\])(\\$|<var>)(.*)\\^([a-zA-Z0-9]|{.*}|\\\\[a-zA-Z]+)([^\\$]*)(\\$|</var>)";
    userreplace["math.sup"] = "\\1<var>\\3<sup>\\4</sup>\\5</var>";

    # strip out everything before "\begin{thebibliography}";
    # stop at end of input (or on a read error) instead of looping forever
    while (line !~ /\\begin{thebibliography}/) {
        if ((getline line) <= 0) {
            print "bbl2html.awk: warning: \\begin{thebibliography} not found" > "/dev/stderr";
            break;
        }
    }

    NR = 0;                      # restart record numbering so the "NR == 1" rule fires on the first record
    RS = "\\\\bibitem";          # record separator = "\bibitem"
    FS = "\\\\newblock[ \n]+";   # field separator = "\newblock "
}
# First record only: by now the command-line assignments (head=, foot=,
# labelwidth=, bigtable=) have been read, so emit the page header and
# settle the defaults.
NR == 1 {
    # neither header nor footer supplied --> wrap output in a minimal page
    if (!(head || foot)) {
        head = "<html><body>";
        foot = "</body></html>";
    }

    # page header (literal text or "@file")
    printhf(head);

    # label column width falls back to 50 (pixels)
    labelwidth = labelwidth ? labelwidth : 50;

    # single-table layout: open the one table here
    if (bigtable) {
        print "<table width=\"100%\">";
    }
}
# Runs for every record: forget the label of the previous entry so the
# rules below start from an empty one.
{
    label = ""
}
# Record starts with "[": an explicit label "[...]" precedes the key.
# Extract it into 'label' and remove it from $1 so the key rule sees "{key}".
$1 ~ /^\[/ {
    # position of the "]" closing the "[" at index 1
    right = matchBrace($1, 1);

    # text between the brackets
    label = substr($1, 2, right - 2);

    # alpha style: "{\etalchar{+}}" --> "+"
    sub(/{\\etalchar{\+}}/, "+", label);

    # apalike style: labels may carry accents and formatting
    label = authorFormat(label);

    # drop "[...]" from the front of the first field
    $1 = substr($1, right + 1);
}
# Record starts with "{": a bibliography entry of the form
#   {key} text \newblock text \newblock ...
# Emits one table row: the left cell holds the label (plus named anchors for
# the key and, if different, the label), the right cell holds every
# \newblock field on its own line with LaTeX markup converted by authorFormat().
#
# Variables here are awk globals.  'label' may already have been set by the
# "[" rule; 'override', 'bigtable' and 'labelwidth' come from the command line.
$1 ~ /^{/ {
    keycount++;

    # if label not already set or override then set to number
    if (!label || override=="number") label = keycount;

    # get length of key from $1
    right = matchBrace($1,1);

    # set bookmark to key
    key = substr($1,2,right-2);

    # use key as label?
    if (override=="key") label = key;

    if (!bigtable) print "<table width=\"100%\">";
    print "<tr><td width=\"" labelwidth "\" valign=\"top\">";

    # Always print through "%s": key and label are data and may contain "%".
    # The anchors are closed immediately so they do not nest.
    printf("%s", "<a name=\"" key "\"></a>");
    print "[" label "]";
    if (key != label) printf("%s", "<a name=\"" label "\"></a>");
    printf("%s", "</td><td");

    # fixing width=100% looks better when using multiple tables
    if (!bigtable) printf("%s", " width=\"100%\"");
    printf("%s", ">");

    # strip key out of first line
    line = substr($1,right+1);

    # process each line (= each \newblock field)
    lineno = 1;
    while (lineno <= NF) {
        # if last line then check for "\end{thebibliography}"
        if (lineno == NF) sub(/\n\\end{thebibliography}/,"",line);

        # First take out any urls before any more processing: each \url{...}
        # is replaced by a "__URLn__" marker and stored in url[n].
        while ((left = match(line,/\\url{/)) > 0) {
            right = matchBrace(line,RSTART+4);
            if (right <= left) break;   # unbalanced brace: leave text as is, do not loop forever
            urlcnt++;
            url[urlcnt] = substr(line,left+5,right-left-5);
            txt[urlcnt] = "";           # a plain \url has no display text; clear any stale value
            line = substr(line,1,left-1) "__URL" urlcnt "__" substr(line,right+1);
        }

        # repeat for hrefs: \href{url}{txt}
        while ((left = match(line,/\\href{/)) > 0) {
            right = matchBrace(line,RSTART+5);
            if (right <= left) break;   # unbalanced brace: give up on this line's hrefs
            urlcnt++;
            url[urlcnt] = substr(line,left+6,right-left-6);
            txt[urlcnt] = "";
            # now find href text, starting with next '{'
            if ((left2 = match(substr(line,right+1),/{/)) > 0) {
                left2 += right;
                right2 = matchBrace(line,left2);
                # only accept the text group when its brace is balanced
                if (right2 > left2) {
                    txt[urlcnt] = substr(line,left2+1,right2-left2-1);
                    right = right2;
                }
            }
            # replace
            line = substr(line,1,left-1) "__URL" urlcnt "__" substr(line,right+1);
        }

        line = authorFormat(line);

        # re-insert formatted urls (highest marker first, counting urlcnt back to 0)
        while (urlcnt) {
            url[urlcnt] = urlFormat(url[urlcnt],txt[urlcnt]);
            urlmark = "__URL" urlcnt "__";
            # can't use sub() because url[urlcnt] may contain "&";
            # the marker is literal text, so locate it with index()
            if ((markpos = index(line,urlmark)) > 0) {
                line = substr(line,1,markpos-1) url[urlcnt] substr(line,markpos+length(urlmark));
            }
            urlcnt--;
        }

        print line "<br>";

        # get ready for next line
        lineno++;
        line = $lineno;
    }

    printf("%s", "</td></tr>");
    if (!bigtable) print "</table>";
}
# After the last record: close the single big table (if used), add the
# "Generated by" note unless noabout is set, then emit the footer.
END {
    if (bigtable) {
        print "</table>";
    }

    if (!noabout) {
        print ("<hr><font size=\"-1\"><address>Generated by bbl2html.awk v" version "</address></font>");
    }

    # footer is literal text or "@file"; nothing is printed when it is empty
    if (foot) {
        printhf(foot);
    }
}
#--------------------------------------------------------------------------
# authorFormat(s)
# Convert the LaTeX markup in an author-type string s to HTML and return it.
# Steps, in order: small caps ({\sc ...}), accents --> HTML entities, special
# characters and spacing, the userfind[]/userreplace[] substitutions, font
# styles (\em, \bf, \it, \tt), and finally removal of all remaining braces
# and backslashes.
#
# In the replacement strings below "\\&" yields a literal "&"
# (a bare "&" would mean "the matched text").
function authorFormat(  s,      # function parameter
                        i)      # local variable
{
    # first replace small-caps formatting style so accents are handled properly
    s = replaceFormatSC(s);

    # next, replace accents
    s = gensub(/\\'([AEIOUYaeiouy])/, "\\&\\1acute;", "g", s);
    s = gensub(/\\`([AEIOUaeiou])/, "\\&\\1grave;", "g", s);
    s = gensub(/\\\^([AEIOUaeiou])/, "\\&\\1circ;", "g", s);
    s = gensub(/\\~([AEINOUaeinou])/, "\\&\\1tilde;", "g", s);
    s = gensub(/\\[\.]([AEIOUaeiou])/, "\\&\\1ring;", "g", s);
    s = gensub(/\\\"([AEIOUaeiou])/, "\\&\\1uml;", "g", s);
    s = gensub(/\\([Oo])/, "\\&\\1slash;", "g", s);
    s = gensub(/\\(AE|ae)/, "\\&\\1lig;", "g", s);

    gsub(/\\ss/,"\\&szlig;",s);     # German sharp s
    gsub(/~/,"\\&nbsp;",s);         # replace nonbreaking spaces: ~ --> &nbsp;
    gsub(/\\[,@]/," ",s);           # replace spaces
    gsub(/``|''/,"\\&quot;",s);     # replace quotes
    gsub(/---/,"-",s);              # replace dashes
    gsub(/--/,"-",s);

    # these accents can't be displayed in HTML (with my charset) so delete 'em
    s = gensub(/\\[bcduvH]{([a-zA-Z])}/, "\\1", "g", s);   # eg. \u{o}
    gsub(/{\\AA}/,"A",s);           # \AA --> A
    gsub(/{\\aa}/,"a",s);           # \aa --> a

    # user-defined substitutions ('i' is local so callers' variables are untouched)
    for (i in userfind) {
        s = gensub(userfind[i], userreplace[i], "g", s);
    }

    # replace formatting styles
    s = replaceFormat(s,"\\em","<em>","</em>");   # replace emphasis
    s = replaceFormat(s,"\\bf","<b>","</b>");     # replace bold
    s = replaceFormat(s,"\\it","<i>","</i>");     # replace italics
    s = replaceFormat(s,"\\tt","<tt>","</tt>");   # replace teletype

    gsub(/{|}/,"",s);               # drop any remaining braces
    gsub(/\\/,"",s);                # drop any remaining slashes
    return s;
}
#--------------------------------------------------------------------------
# urlFormat(url [, display])
# Build the HTML link for 'url'.  'display' is the optional link text: when it
# is missing or itself looks like an url ("://" anywhere, or a leading
# "mailto:") it is shown in teletype with <wbr> break opportunities,
# otherwise it is formatted like author text.
function urlFormat(     url,        # function parameter
                        display)    # optional parameter
{
    # gsub(/\&/,"\\\\&",url);   # escape "&"s (hmm, apparently I don't need this...)

    # clean the target: no spaces, no "%<newline>" comments, no linebreaks
    gsub(/ /,"",url);
    gsub(/%\n/,"",url);
    gsub(/\n/,"",url);

    # the display text may carry end-of-line comments too
    gsub(/%\n/,"",display);

    if (display && tolower(display) !~ /:\/\/|^mailto:/) {
        # ordinary text: format author-like
        display = authorFormat(display);
    } else {
        # url-like (or absent, in which case the cleaned url itself is shown)
        if (!display) {
            display = url;
        }
        # allow linebreaks after punctuation symbols for display purposes
        display = "<tt>" gensub(/([^A-Za-z0-9 ])/,"\\1<wbr>","g",display) "</tt>";
    }

    return "<a href=\"" url "\">" display "</a>";
}
#--------------------------------------------------------------------------
# printhf(s)
# Print a header or footer.  A value starting with "@" names a file whose
# lines are copied to the output; anything else is printed as is.
function printhf(   s,      # function parameter
                    row)    # local variable
{
    if (s ~ /^@/) {
        s = substr(s,2);                    # file name follows the "@"
        while ((getline row < s) > 0) {
            print row;
        }
        close(s);
    } else {
        print s;
    }
}
#--------------------------------------------------------------------------
# replaceFormat(s, find, replaceleft, replaceright)
# Replace every LaTeX style switch 'find' in s by the opening tag
# 'replaceleft' and put 'replaceright' in front of the brace that closes the
# group, e.g. {\it et al.} --> {<i> et al.</i>} with
#   s = replaceFormat(s,"\\it","<i>","</i>");
# Returns the modified string.
function replaceFormat( s,find,replaceleft,replaceright,    # function parameters
                        at, closeAt)                        # local variables
{
    at = index(s,find);
    while (at > 0) {
        # "}" matching the "{" assumed to sit at the position of the switch
        closeAt = matchBrace(s,at,"{");

        # closing tag first, so 'at' stays valid for the next step
        if (closeAt > at) {
            s = substr(s,1,closeAt-1) replaceright substr(s,closeAt);
        }

        # swap the switch itself for the opening tag
        s = substr(s,1,at-1) replaceleft substr(s,at+length(find));

        at = index(s,find);
    }
    return s;
}
#--------------------------------------------------------------------------
# replaceFormatSC(s)
# Emulate small caps "{\sc ...}": every run of lowercase letters inside the
# group is uppercased and wrapped in <font size="-1">...</font>; an uppercase
# letter or digit ends the run.
# Must run before accents are replaced, so that "\'a" becomes "\'A" rather
# than "&aacute;" becoming "&AACUTE;".
# Not robust: only a small subset of LaTeX commands (such as accents) is
# expected inside the group; other commands will probably be uppercased too
# (which can be corrected with userfind[]/userreplace[] substitutions).
function replaceFormatSC(   s,                                      # function parameter
                            marker, openTag, closeTag,              # local variables
                            at, stop, before, inner, after,
                            out, k, n, small, ch)
{
    marker   = "\\sc";
    openTag  = "<font size=\"-1\">";
    closeTag = "</font>";

    while ((at = index(s,marker)) > 0) {
        # "}" matching the "{" assumed at the marker; without one, run to end of string
        stop = matchBrace(s,at,"{");
        if (stop < at) stop = length(s);

        # split s into text before the marker, the group body, and the rest
        before = substr(s,1,at-1);
        at    += length(marker);
        inner  = substr(s,at,stop-at+1);
        after  = substr(s,stop+1);

        # rebuild the body; 'small' is 1 while a <font> tag is open
        small = 0;
        out   = "";
        n     = length(inner);
        for (k = 1; k <= n; k++) {
            ch = substr(inner,k,1);
            if (ch ~ /[a-z]/) {
                if (!small) {
                    out   = out openTag;
                    small = 1;
                }
                ch = toupper(ch);
            } else if (small && ch ~ /[A-Z0-9]/) {
                out   = out closeTag;
                small = 0;
            }
            out = out ch;
        }
        if (small) out = out closeTag;

        # correct mangled accents (their command letters were uppercased above)
        gsub(/\\B{/, "\\b{", out);
        gsub(/\\C{/, "\\c{", out);
        gsub(/\\D{/, "\\d{", out);
        gsub(/\\U{/, "\\u{", out);
        gsub(/\\V{/, "\\v{", out);

        # recombine s
        s = before out after;
    }
    return s;
}
#@include matchBrace.awk
#--------------------------------------------------------------------------
# matchBrace.awk - library containing single function matchBrace()
# matchBrace(s, i [, open])
# Find the brace that matches the one at index i of string s.
#   s     string to search
#   i     index of the opening brace (the search starts at i+1)
#   open  optional: brace character to assume at index i instead of reading
#         substr(s,i,1); lets the caller point at text that merely follows
#         an opening brace
# Known pairs are () [] {} <> and `' ; when 'open' is the second character of
# a pair, the first character is searched for.  Any other character is
# paired with itself.
# Returns the index of the matching brace, or zero if there is none.
#
# The scan compares single characters instead of using a regexp, so
# characters such as "|" between the braces are never mistaken for a brace.
function matchBrace(    s,i,                    # function parameters
                        open,                   # optional parameter
                        depth,left,right,pos,n,ch)  # local variables
{
    # error trap
    if (!i) return 0;

    # if open not specified then read from substr(s,i,1)
    if (!open) open = substr(s,i,1);

    # identify type of braces and put in left and right
    left  = "([{<`)]}>'";
    right = ")]}>'([{<`";
    pos = index(left,open);
    if (pos) {                  # found in list, set match
        left  = substr(left, pos,1);
        right = substr(right,pos,1);
    } else {                    # not in list of braces: pairs with itself
        left  = open;
        right = open;
    }

    # walk forward, tracking the nesting depth
    depth = 1;
    n = length(s);
    for (pos = i+1; pos <= n; pos++) {
        ch = substr(s,pos,1);
        if (ch == right) {      # checked first so self-paired characters close
            if (--depth == 0) return pos;
        } else if (ch == left) {
            depth++;
        }
    }
    return 0;                   # ran out of string before the braces balanced
}
#--------------------------------------------------------------------------