Skip to content

Commit c0f97b5

Browse files
authored
Plain text extraction (#203)
* add text-extraction API endpoints to the XML service * tweak to JSON serialization of page text
1 parent 4833f46 commit c0f97b5

6 files changed

+192
-28
lines changed

xml/html/extract-text-by-page.html

+17
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
<html xmlns="http://www.w3.org/1999/xhtml">
2+
<body>
3+
<h1>Extract a list of the plain text content of each page of a TEI document</h1>
4+
<p>Upload a TEI document to extract just the indexable plain text content.</p>
5+
<p>Content is returned as an <code>application/json</code> document.
6+
The document consists of a JSON array of pages in the form of JSON objects with
7+
<code>page</code> and <code>text</code> properties, whose values are strings
8+
containing the page's identifier and its plain text content, respectively.</p>
9+
<form action="" method="post" enctype="multipart/form-data" accept-charset="utf-8"><!-- manual-testing upload form; the empty action posts back to this same URL -->
10+
<input type="hidden" name="_charset_" value="utf-8"/>
11+
<label for="source">TEI document</label>
12+
<input type="file" name="source" id="source" accept="application/xml" required="required"/><!-- required: without a selected file the browser would still post an empty "source" part; written with a value because this page is parsed as XML -->
13+
<button type="submit">Extract plain text</button><!-- explicit type rather than relying on the implicit default -->
14+
<input type="reset" value="Reset form"/>
15+
</form>
16+
</body>
17+
</html>

xml/html/extract-text.html

+14
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
<html xmlns="http://www.w3.org/1999/xhtml">
2+
<body>
3+
<h1>Extract plain text content from a TEI document</h1>
4+
<p>Upload a TEI document to extract just the indexable plain text content.</p>
5+
<p>Content is returned as a <code>text/plain</code> document.</p>
6+
<form action="" method="post" enctype="multipart/form-data" accept-charset="utf-8"><!-- manual-testing upload form; the empty action posts back to this same URL -->
7+
<input type="hidden" name="_charset_" value="utf-8"/>
8+
<label for="source">TEI document</label>
9+
<input type="file" name="source" id="source" accept="application/xml" required="required"/><!-- required: without a selected file the browser would still post an empty "source" part; written with a value because this page is parsed as XML -->
10+
<button type="submit">Extract plain text</button><!-- explicit type rather than relying on the implicit default -->
11+
<input type="reset" value="Reset form"/>
12+
</form>
13+
</body>
14+
</html>

xml/html/index.html

+2
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@ <h1>Nyingarn XML-processing web service</h1>
66
<li><a href="ingest-json">Ingest JSON (previously converted from CSV)</a></li>
77
<li><a href="reconstitute-tei">Reconstitute a TEI document from an RO-Crate metadata file and a set of TEI surface files</a></li>
88
<li><a href="validate-with-schematron">Validate one or more XML documents with a Schematron schema</a></li>
9+
<li><a href="extract-text">Extract plain text from TEI</a></li>
10+
<li><a href="extract-text-by-page">Extract plain text of each page of TEI</a></li>
911
</ul>
1012
</body>
1113
</html>

xml/xproc/nyingarn.xpl

+87-28
Original file line numberDiff line numberDiff line change
@@ -46,39 +46,55 @@
4646

4747
<!--
4848
Package the source XML into an HTTP response:
49-
If it's JSON-XML, regard it as an error: convert it to JSON and return it with an HTTP 400 error code.
50-
If it's other XML including XHTML, regard it as a success: return it as XML with a 200 OK
49+
If it's a JSON-XML map containing a 'name' and a 'code' property, regard it as an error: convert it to JSON and return it with an HTTP 400 error code.
50+
If it's a single JSON-XML string, return its value as plain text with a 200 OK status; if it's any other JSON-XML, return it as JSON with a 200 OK status
51+
If it's XML including XHTML, return it as XML with a 200 OK
5152
-->
52-
<p:declare-step name="make-http-response" type="nyingarn:make-http-response">
53+
<p:declare-step name="make-http-response" type="nyingarn:make-http-response" xmlns:fn="http://www.w3.org/2005/xpath-functions">
5354
<p:input port="source"/>
5455
<p:output port="result"/>
55-
<p:choose>
56-
<p:when test="/fn:map" xmlns:fn="http://www.w3.org/2005/xpath-functions"><!-- an error -->
57-
<p:template name="http-response">
58-
<p:input port="parameters"><p:empty/></p:input>
59-
<p:input port="template">
60-
<p:inline>
61-
<c:response status="400">
62-
<c:body content-type="application/json">{xml-to-json(/*)}</c:body>
63-
</c:response>
64-
</p:inline>
65-
</p:input>
66-
</p:template>
67-
</p:when>
68-
<p:otherwise><!-- TEI or XHTML -->
69-
<z:make-http-response>
70-
<p:with-option name="content-type" select="
71-
if (/xhtml:html) then
72-
'application/xhtml+xml; charset=utf-8'
73-
else
74-
'application/xml; charset=utf-8'
75-
"/>
76-
</z:make-http-response>
77-
</p:otherwise>
78-
</p:choose>
56+
<!-- if the result to be returned is a JSON object containing both a "name" and a "code" property, then it's assumed to represent an error -->
57+
<p:variable name="status" select="
58+
if (/fn:map[fn:string/@key='name'][fn:string/@key='code']) then
59+
'400'
60+
else
61+
'200'
62+
"/>
63+
<p:variable name="content-type" select="
64+
if (/xhtml:html) then
65+
'application/xhtml+xml; charset=utf-8'
66+
else if (/fn:string) then
67+
'text/plain; charset=utf-8'
68+
else if (/fn:*) then
69+
'application/json'
70+
else
71+
'application/xml; charset=utf-8'
72+
"/>
73+
<p:template name="http-response">
74+
<p:with-param name="status" select="$status"/>
75+
<p:with-param name="content-type" select="$content-type"/>
76+
<p:input port="template">
77+
<p:inline>
78+
<c:response status="{$status}">
79+
<c:body content-type="{$content-type}">{
80+
if ($content-type='text/plain; charset=utf-8') then
81+
(: convert a single JSON-XML string to plain text :)
82+
string(/*)
83+
else if ($content-type='application/json') then
84+
(: convert other JSON-XML to JSON :)
85+
xml-to-json(/*)
86+
else
87+
(: copy other XML unchanged :)
88+
/*
89+
}</c:body>
90+
</c:response>
91+
</p:inline>
92+
</p:input>
93+
</p:template>
7994
</p:declare-step>
8095

8196
<p:variable name="uri-path" select="replace(/c:request/@href, 'http://.*?/([^?]*).*', '$1')"/>
97+
<p:variable name="method" select="/c:request/@method"/>
8298

8399
<z:dump href="/tmp/request.xml"/>
84100

@@ -165,6 +181,47 @@
165181
</p:otherwise>
166182
</p:choose>
167183
</p:when>
184+
<p:when test="$uri-path='nyingarn/extract-text'">
185+
<p:choose>
186+
<p:when test="//c:body/@disposition[starts-with(., 'form-data; name=&quot;source&quot;')]">
187+
<!-- pass the document to the text-extraction stylesheet -->
188+
<p:xslt name="extract-plain-text">
189+
<p:input port="parameters"><p:empty/></p:input>
190+
<p:input port="source" select="//c:body[starts-with(@disposition, 'form-data; name=&quot;source&quot;')]/*">
191+
<p:pipe step="nyingarn" port="source"/>
192+
</p:input>
193+
<p:input port="stylesheet">
194+
<p:document href="../xslt/extract-plain-text.xsl"/>
195+
</p:input>
196+
</p:xslt>
197+
</p:when>
198+
<p:otherwise>
199+
<!-- file was not uploaded; display an upload form, for manual testing -->
200+
<nyingarn:html-page page="extract-text"/>
201+
</p:otherwise>
202+
</p:choose>
203+
</p:when>
204+
205+
<p:when test="$uri-path='nyingarn/extract-text-by-page'">
206+
<p:choose>
207+
<p:when test="//c:body/@disposition[starts-with(., 'form-data; name=&quot;source&quot;')]">
208+
<!-- pass the document to the page-by-page text-extraction stylesheet -->
209+
<p:xslt name="extract-plain-text">
210+
<p:input port="parameters"><p:empty/></p:input>
211+
<p:input port="source" select="//c:body[starts-with(@disposition, 'form-data; name=&quot;source&quot;')]/*">
212+
<p:pipe step="nyingarn" port="source"/>
213+
</p:input>
214+
<p:input port="stylesheet">
215+
<p:document href="../xslt/extract-plain-text-by-page.xsl"/>
216+
</p:input>
217+
</p:xslt>
218+
</p:when>
219+
<p:otherwise>
220+
<!-- file was not uploaded; display an upload form, for manual testing -->
221+
<nyingarn:html-page page="extract-text-by-page"/>
222+
</p:otherwise>
223+
</p:choose>
224+
</p:when>
168225
<p:when test="$uri-path='nyingarn/ingest-tei'">
169226
<p:choose>
170227
<p:when test="//c:body/@disposition[starts-with(., 'form-data; name=&quot;source&quot;')]">
@@ -257,7 +314,9 @@
257314
<cx:message name="log-request">
258315
<p:with-option name="message" select="
259316
concat(
260-
'Request URI [',
317+
'Method [',
318+
$method,
319+
'] URI [',
261320
$uri-path,
262321
'] returning status code [',
263322
/c:response/@status,
+40
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
<xsl:transform version="3.0"
2+
xpath-default-namespace="http://www.tei-c.org/ns/1.0"
3+
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
4+
xmlns:nyingarn="https://nyingarn.net/ns/functions"
5+
xmlns:err="http://www.w3.org/2005/xqt-errors">
6+
<!--
7+
Divides a TEI file into pages and produces an XML representation of a JSON array
8+
containing one JSON object per page, whose "page" property is the page identifier and whose "text" property is the plain text content of that page.
9+
-->
10+
<xsl:import href="error.xsl"/>
11+
<xsl:key name="text-by-page-id" match="text()" use="preceding::pb[1]/@xml:id"/><!-- indexes every text node by the xml:id of the nearest preceding pb element -->
12+
<xsl:template match="/">
13+
<xsl:try>
14+
<array xmlns="http://www.w3.org/2005/xpath-functions">
15+
<xsl:for-each select="/TEI/text//pb"><!-- one map per pb element found inside the TEI text -->
16+
<map>
17+
<string xsl:expand-text="yes" key="page">{@xml:id}</string>
18+
<string xsl:expand-text="yes" key="text">{key('text-by-page-id', @xml:id) => string-join() => normalize-space()}</string>
19+
</map>
20+
</xsl:for-each>
21+
</array>
22+
<xsl:catch>
23+
<!-- Return any error as a JSON-XML version of a JSON object -->
24+
<!-- The web service layer which invokes this stylesheet will recognise this JSON-XML response
25+
as an error, convert it to JSON, and return it to the client as the body of an HTTP 400 response -->
26+
<!-- the JSON will then be deserialized by the JavaScript client and rethrown as an error. -->
27+
<xsl:sequence select="
28+
nyingarn:error-to-json-xml(
29+
$err:code,
30+
$err:description,
31+
$err:value,
32+
$err:module,
33+
$err:line-number,
34+
$err:column-number
35+
)
36+
"/>
37+
</xsl:catch>
38+
</xsl:try>
39+
</xsl:template>
40+
</xsl:transform>

xml/xslt/extract-plain-text.xsl

+32
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
<xsl:transform version="3.0"
2+
xpath-default-namespace="http://www.tei-c.org/ns/1.0"
3+
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
4+
xmlns:nyingarn="https://nyingarn.net/ns/functions"
5+
xmlns:err="http://www.w3.org/2005/xqt-errors">
6+
<!--
7+
Extracts the whitespace-normalized full text content of the TEI file's text element, as a JSON string in JSON-XML format
8+
-->
9+
<xsl:import href="error.xsl"/>
11+
<xsl:template match="/">
12+
<xsl:try>
13+
<string xsl:expand-text="yes" xmlns="http://www.w3.org/2005/xpath-functions">{/TEI/text => normalize-space()}</string>
14+
<xsl:catch>
15+
<!-- Return any error as a JSON-XML version of a JSON object -->
16+
<!-- The web service layer which invokes this stylesheet will recognise this JSON-XML response
17+
as an error, convert it to JSON, and return it to the client as the body of an HTTP 400 response -->
18+
<!-- the JSON will then be deserialized by the JavaScript client and rethrown as an error. -->
19+
<xsl:sequence select="
20+
nyingarn:error-to-json-xml(
21+
$err:code,
22+
$err:description,
23+
$err:value,
24+
$err:module,
25+
$err:line-number,
26+
$err:column-number
27+
)
28+
"/>
29+
</xsl:catch>
30+
</xsl:try>
31+
</xsl:template>
32+
</xsl:transform>

0 commit comments

Comments
 (0)