-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathxmlcheck.rex
349 lines (320 loc) · 16.5 KB
/
xmlcheck.rex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
/* Usage: XMLCHECK file */
/* Purpose: Check that the given file is well formed XML. */
/* The file is not modified. XMLCHECK finds tag */
/* nesting errors, syntactically invalid NMTOKENs, */
/* syntactically invalid attributes (assuming type */
/* CDATA for attribute values), erroneous numeric */
/* character references (NCRs), any "&" outside of */
/* entities, and similar issues. */
/* Caveats: HTML is not XML, do not use XMLCHECK for HTML. */
/* The handling of "<!DOCTYPE ... >" even without */
/* DTD subset is incomplete. "<![CDATA[ ... ]]>" */
/* and "<!-- ... -->" comments work as expected. */
/* For end tags "</xyz>" white space after "</" is */
/* not yet supported and could be in fact invalid. */
/* White space including line breaks after "<" or */
/* before ">" is ignored. This might be invalid */
/* within "<!", "<?", or "?>". */
/* Unsupported: "<![IGNORE[ ... ]]>" and "<![INCLUDE ... ]]>" */
/* are only used in DTDs or DTD subsets. XMLCHECK */
/* does not check the syntax of "<!ATTLIST ... >", */
/* "<!ELEMENT ... >", or "<!ENTITY ... >" in DTDs. */
/* Procedures: BOMB accepts u+FEFF at begin of 1st line */
/* CDATA checks all NCRs, '&', and '<' */
/* CONTROL accepts '09'x, '0A'x, and '0D'x */
/* NMTOKEN accepts letters, ':_', digits, '-.' */
/* PARAM checks DTD parameter entity names */
/* SUBSET checks "<! ... >" in DTD subsets */
/* UNWELL reports missing closing tag source */
/* GARBAGE reports unexpected input data */
/* FINDME matches wanted string (or next '>') */
/* NICE progress indicator up to 4 MB input */
/* See also: <URL:http://purl.net/xyzzy/kex/xmlcheck.kex> */
/* <URL:http://purl.net/xyzzy/src/xmlcheck.rex> */
/* Requires: Classic or object REXX (Frank Ellermann, 2008) */
signal on novalue ; signal on notready
FILE = strip( strip( strip( arg( 1 )),, '"' ))
if FILE <> '' then FILE = stream( FILE, 'c', 'q exists' )
if FILE = '' then do
if arg( 1 ) <> '' then say 'not found:' arg( 1 )
parse source . . X ; say
say 'Usage:' X 'file' ; say
say 'to check that the given file is well-formed XML.'
exit 1
end
XCTL = xrange( x2c( 0E ), x2c( 1F )) || x2c( 7F )
XCTL = xrange( x2c( 0B ), x2c( 0C )) || XCTL
XCTL = xrange( x2c( 00 ), x2c( 08 )) || XCTL
D.0 = 0 /* number of open D.N tags */
L.0 = 0 /* max. tag nesting level */
ETAG = 0 ; ROOT = ''
WANT = '<' ; NEXT = ''
EXPO = 'XCTL LINE' /* expose global variables */
do LINE = 1 while sign( chars( FILE ))
DATA = linein( FILE ) ; call NICE length( DATA )
N = words( NEXT ) ; MORE = ''
if N > 0 then do
N = wordindex( NEXT, N ) ; MORE = substr( NEXT, N )
NEXT = left( NEXT, N - 1 )
end
do until DATA = '' /* remove spaces after '<' */
parse var DATA WORD DATA /* '[', or before '>', ']' */
X = pos( right( MORE, 1 ), '<[' )
N = pos( left( WORD, 1 ), '>]' )
if sign( X + N ) then MORE = MORE || WORD
else MORE = strip( MORE WORD )
end
if NEXT <> '' then do /* get rid of old NEXT for */
if WANT = '<' then do /* comments or text nodes */
if D.0 > 0 then call CDATA NEXT
else if ROOT = '.DTD' then call PARAM NEXT
else if BOMB( NEXT ) then nop
else exit GARBAGE( NEXT 'garbage outside of element' )
NEXT = '' /* text node must be CDATA */
end
else if WANT = '-->' then do
if pos( '--', NEXT ) > 0 then exit GARBAGE( '--' )
NEXT = '' /* no '--' in XML comments */
end
else if WANT = ']]>' | WANT = '?>' then do
call CONTROL NEXT ; NEXT = ''
end /* preserve anything else */
end /* until WANT string found */
NEXT = NEXT || MORE
parse value FINDME( WANT, NEXT ) with STOP DOCT ',' WANT
do while sign( STOP ) /* found next WANT string: */
parse var NEXT DATA (WANT) NEXT
DATA = strip( DATA ) ; NEXT = strip( NEXT )
if DATA <> '' & WANT = '<' then do
if D.0 > 0 then call CDATA DATA
else if ROOT = '.DTD' then call PARAM DATA
else if BOMB( DATA ) then nop
else exit GARBAGE( DATA 'garbage outside of element' )
end
if WANT = '<' then select
when NEXT = '!' | NEXT = '![' then do
NEXT = '<' || NEXT ; leave
end /* very dubious line break */
when abbrev( NEXT, '![CDATA[' ) then WANT = ']]>'
when abbrev( NEXT, '!--' ) then do
NEXT = substr( NEXT, 4 ) ; WANT = '-->'
end
when abbrev( NEXT, '!DOCTYPE' ) then do
if substr( NEXT, 9, 1 ) <> ' '
then exit GARBAGE( '<' || NEXT )
if L.0 = 0 & ROOT <> '.DTD'
then L.0 = -1 /* if unexpected <!DOCTYPE */
else exit GARBAGE( '<' || NEXT '- dupe' )
NEXT = substr( NEXT, 9 ) ; WANT = ' ['
end
when abbrev( NEXT, '!' ) then do
if L.0 = 0 then ROOT = '.DTD' ; WANT = '>'
if L.0 > 0 then exit GARBAGE( '<' || NEXT )
N = word( NEXT, 1 ) ; D.1 = '<' || N
N = wordpos( N, '!ATTLIST !ELEMENT !ENTITY' )
if N = 0 then exit GARBAGE( '<' || NEXT )
end
when abbrev( NEXT, '?' ) then WANT = '?>'
when NEXT = '' then do
NEXT = '<' ; leave
end /* fetches the missing tag */
when ROOT = '.DTD' /* cannot mix DTD and XML: */
then exit GARBAGE( '<' || NEXT 'after' D.1 )
/* else expecting ordinary XML <tag>, <tag />, </tag> */
when D.0 = 0 & L.0 > 0 /* too many root elements: */
then exit GARBAGE( '<' || NEXT '- got already' D.1 )
when abbrev( NEXT, '/' ) = 0 then do
N = D.0 + 1 ; D.N = NMTOKEN( NEXT )
D.0 = N ; L.0 = max( D.0, L.0 )
L.N = LINE ; WANT = '='
NEXT = substr( NEXT, 1 + length( D.N ))
end /* got NMTOKEN of open tag */
when D.0 = 0 /* missing a root element: */
then exit GARBAGE( '<' || NEXT '- missing root' )
otherwise /* match the last open tag */
N = D.0 ; D.0 = N - 1
ETAG = 1 ; WANT = '>'
if abbrev( NEXT, '/' || D.N ) = 0
then exit UNWELL( D.N, L.N )
NEXT = substr( NEXT, 2 + length( D.N ))
end
else if WANT <> '>' then select
when WANT = '"' | WANT = "'" then do
call CDATA DATA ; WANT = '='
end
when WANT = '=' then do
if NMTOKEN( DATA ) <> DATA
then exit GARBAGE( DATA || '=' || NEXT )
if NEXT = '' then do
NEXT = DATA '=' ; leave
end /* fetches attribute value */
WANT = left( NEXT, 1 )
if WANT = '"' | WANT = "'"
then NEXT = substr( NEXT, 2 )
else exit GARBAGE( DATA || '=' || NEXT )
end
when WANT = ']>' then do
call SUBSET DATA ; WANT = '<'
end
when CONTROL( DATA ) then OOPS = 0 / 0
when WANT = ' [' then do
ROOT = word( DATA, 1 ) ; WANT = ']>'
end
when WANT = '-->' then do
if pos( '--', DATA ) = 0 then WANT = '<'
else exit GARBAGE( '--' )
end
when WANT = ']]>' then WANT = '<'
when WANT = '?>' then WANT = '<'
end
else do ; WANT = '<'
select /* after old WANT was '>' */
when ETAG then do
if DATA <> '' then exit GARBAGE( DATA || '>' )
ETAG = 0 /* end tag has to be empty */
end
when DOCT | ROOT = '.DTD' then do
if DOCT then ROOT = word( DATA, 1 )
call CDATA DATA
end
when DATA = '/' then D.0 = D.0 - 1
when DATA <> '' then exit GARBAGE( DATA || '>' )
otherwise nop
end
end
parse value FINDME( WANT, NEXT ) with STOP DOCT ',' WANT
end
end LINE
N = D.0 ; LINE = LINE '(EOF)'
select
when N > 0 then exit UNWELL( D.N. L.N )
when WANT <> '<' then exit GARBAGE( ': missing' WANT )
when L.0 = 0 then if ROOT = '.DTD' then nop
else exit GARBAGE( ': no XML elements' )
when ROOT = '' then ROOT = 'XML'
when L.0 < 0 then exit GARBAGE( ': found no' ROOT )
when ROOT <> D.1 then exit GARBAGE( D.1 '- expected' ROOT )
otherwise nop
end
if ROOT <> '.DTD' /* intentional dot in .DTD */
then N = 'max.' || right( L.0, 3 ) 'nested tags in'
else N = 'apparently well-formed'
say strip( N ROOT ) 'file' FILE ; exit lineout( FILE )
/* -------------------------------------------------------------- */
NOTREADY: say 'cannot open' FILE ; exit 1
NOVALUE: say 'no value trap near line' sigl || ':'
say sourceline( sigl ) ; exit 1
NICE: procedure expose (EXPO) NICE
if symbol( 'NICE' ) <> 'VAR' then NICE = 0
OLD = NICE % 40000 ; NICE = NICE + arg( 1 )
NEW = NICE % 40000 ; if OLD = NEW then return
OLD = x2c( 0D ) /* up to 4 MB % 79 = 39819 */
NEW = left( copies( '.', NEW // 80 ), 79 ) || OLD
signal on syntax name NICE.TRAP ; call SysSleep 0
NICE.TRAP: /* ignore missing SysSleep */
return charout( /**/, OLD || NEW ) /* show progress indicator */
/* -------------------------------------------------------------- */
BOMB: procedure expose (EXPO) /* accept BOM u+FEFF if in */
if LINE > 1 then return 0 /* 1st line */
if arg( 1 ) = x2c( 'EFBBBF' ) then return 1 /* if UTF-8 */
if arg( 1 ) = x2c( '849F9E9F9F' ) then return 1 /* if UTF-4 */
return 0 /* other UTFs fail anyway */
CDATA: procedure expose (EXPO) /* check entities and '<': */
parse arg DATA ; POS = pos( '&', DATA ) + 1
do while POS > 1
DATA = substr( DATA, POS ) ; POS = pos( ';', DATA ) + 1
if POS > 1 then ENT = left( DATA, POS - 2 )
else ENT = '' /* missing ';' fails below */
DATA = substr( DATA, POS ) ; POS = pos( '&', DATA ) + 1
select /* get number of hex. NCR: */
when abbrev( ENT, '#x' ) then do
T = translate( substr( ENT, 3 ), '.', ' ' )
if datatype( T, 'x' ) then T = x2d( T )
else T = 0
end /* get number of dec. NCR: */
when abbrev( ENT, '#' ) then do
T = translate( substr( ENT, 2 ), '..', '+-' )
if datatype( T, 'w' ) then T = T + 0
else T = 0
end /* otherwise test NMTOKEN: */
otherwise T = 10 * ( ENT = NMTOKEN( ENT ))
end /* 0: bad token, 10: valid */
if wordpos( T, '0 9 10 13 133' ) = 0 then select
when T < 32 then T = 0 /* 0000...001F */
when T < 127 then nop
when T < 160 then T = 0 /* 007F...009F */
when T < 55296 then nop
when T < 57344 then T = 0 /* D800...DFFF */
when T < 64976 then nop
when T < 65008 then T = 0 /* FDD0...FDFF */
when T // 65536 > 65533 then T = 0 /* FFFE...FFFF */
when T <= 1114111 then nop
otherwise T = 0 /* if > 10FFFF */
end
if T = 0 then exit GARBAGE( '&' || ENT )
end
DATA = arg( 1 ) ; POS = pos( '<', DATA )
if POS = 0 then return CONTROL( DATA )
else exit GARBAGE( DATA )
CONTROL: procedure expose (EXPO) /* reject US-ASCII control */
parse arg DATA
N = verify( DATA, XCTL, 'M' ) ; if N = 0 then return 0
N = c2x( substr( DATA, N, 1 )) ; exit GARBAGE( '0x' || N )
GARBAGE: procedure expose (EXPO) /* report any other error: */
say 'unexpected' arg( 1 ) 'near line' LINE
return 1
UNWELL: procedure expose (EXPO) /* report invalid nesting: */
X = 'unnmatched <' || arg( 1 ) || '> from line' arg( 2 )
say X 'near line' LINE ; return 1
SUBSET: procedure expose (EXPO) /* check given DTD subset: */
parse arg SRC ; POS = pos( '<!', SRC )
do while POS > 0
TOP = left( SRC, POS - 1 ) ; SRC = substr( SRC, POS + 2 )
if TOP <> '' then call PARAM TOP
if abbrev( SRC, '--' ) then do
parse var SRC '--' TOP '--' SRC
POS = pos( '>', SRC )
if POS = 0 then exit GARBAGE( '<!--' TOP '--' SRC )
call CONTROL TOP ; TOP = left( SRC, POS - 1 )
if TOP <> '' then exit GARBAGE( '--' TOP '>' )
SRC = substr( SRC, POS + 1 )
POS = pos( '<!', SRC ) ; iterate
end
parse var SRC TOP ' ' SRC
if wordpos( TOP, 'ATTLIST ELEMENT ENTITY' ) > 0 then do
POS = pos( '>', SRC )
if POS = 0 then exit GARBAGE( '<!' || TOP SRC )
TOP = left( SRC, POS - 1 ) ; SRC = substr( SRC, POS + 1 )
call CDATA TOP ; POS = pos( '<!', SRC )
end
else exit GARBAGE( '<!' || TOP '(not implemented)' )
end
if SRC = '' then return ; else return PARAM SRC
PARAM: procedure expose (EXPO) /* accept parameter entity */
parse arg DATA ; DATA = strip( DATA )
do forever
parse var DATA X 2 P DATA ; N = length( P )
if X <> '%' | N < 2 then leave
X = right( P, 1 ) ; P = left( P, N - 1 )
if X <> ';' | ( P <> NMTOKEN( P )) then leave
if DATA = '' then return
end
exit GARBAGE( arg( 1 ) 'is no parameter entity' )
NMTOKEN: procedure expose (EXPO) /* assume tags are tokens: */
WORD = translate( arg( 1 ), 'XX99', ':_-.' )
if datatype( left( WORD, 1 ), 'M' ) = 0
then exit GARBAGE( arg( 1 ) '- expected NMTOKEN' )
do N = 2 to length( WORD ) /* letters, digits, ':_-.' */
if datatype( substr( WORD, N, 1 ), 'A' ) = 0 then leave N
end N
return left( arg( 1 ), N - 1 )
FINDME: procedure expose (EXPO) /* find next wanted string */
parse arg WANT, TEXT ; WPOS = pos( WANT, TEXT )
DOCT = ( WANT = ' [' ) ; DPOS = 0
if WANT = '=' | DOCT then DPOS = pos( '>' , TEXT )
select
when DPOS = 0 then return WPOS DOCT || ',' || WANT
when WPOS = 0 then return DPOS DOCT || ',' || '>'
when WPOS < DPOS then return WPOS DOCT || ',' || WANT
otherwise return DPOS DOCT || ',' || '>'
end