-
Notifications
You must be signed in to change notification settings - Fork 31
/
Copy pathsearch.js
111 lines (106 loc) · 4.44 KB
/
search.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
/*
* Sorry - this is ugly, OLIP uses "opensearch" which is XML based,
* and to avoid including overweight XML libraries,
* I will output stuff via templates
*/
const debug = require('debug')('dweb-mirror:search');
const ArchiveItem = require('./ArchiveItemPatched');
// Page size for searches: used as the `rows` value of the query issued by searchExpress
// and reported to clients in the feed's <opensearch:itemsPerPage> element.
const ItemsPerPage = 75;
/**
 * Run the search held by `o`: fetch its metadata, then its query results, and optionally
 * decorate the response with crawl information.
 *
 * @param {object} o - Item exposing fetch_metadata, fetch_query and addCrawlInfo (an ArchiveItem in this file).
 * @param {object} opts - Passed whole to fetch_metadata; copyDirectory, noCache and wantCrawlInfo are read here.
 * @param {object} [config] - Forwarded to addCrawlInfo; only used when opts.wantCrawlInfo is truthy.
 * @param {function} cb - Node-style callback: cb(err) on failure, cb(null, resp) with the full query response.
 */
function doQuery(o, opts, config, cb) {
  // Second stage: handle the outcome of the query itself. resp.response.docs is [ArchiveMember*]
  const handleQueryResult = (queryErr, resp) => {
    if (queryErr) {
      debug('streamQuery for q="%s" failed with %s', o.query, queryErr.message);
      cb(queryErr);
      return;
    }
    if (!opts.wantCrawlInfo) {
      cb(null, resp);
      return;
    }
    // Original author's note: crawl info is added to o (the ArchiveItem), and resp.response.docs
    // points at the same member objects, so the docs in the response are updated as well.
    // Any error reported by addCrawlInfo is deliberately ignored (best effort).
    o.addCrawlInfo({ config, copyDirectory: opts.copyDirectory }, (unusedErr, unusedMembers) => {
      resp.response.downloaded = o.downloaded;
      resp.response.crawl = o.crawl;
      cb(null, resp);
    });
  };

  // First stage: metadata must be available before the query is issued.
  const handleMetadata = (metadataErr, unusedItem) => {
    if (metadataErr) {
      debug('streamQuery could not fetch metadata for %s', o.identifier);
      cb(metadataErr);
      return;
    }
    const queryOptions = {
      copyDirectory: opts.copyDirectory,
      wantFullResp: true,
      noCache: opts.noCache,
    };
    o.fetch_query(queryOptions, handleQueryResult);
  };

  // Not passing noCache as query usually after a fetch_metadata
  o.fetch_metadata(opts, handleMetadata);
}
/**
 * Escape a value for inclusion in XML element text or a double-quoted attribute.
 * The value is stringified first, so `undefined` still renders as "undefined",
 * exactly as plain template interpolation would.
 *
 * @param {*} value - Value about to be interpolated into the feed (search terms, titles, identifiers ...).
 * @returns {string} The value with & < > " ' replaced by the predefined XML entities.
 */
function escapeXml(value) {
  const entities = {
    '&': '&amp;',
    '<': '&lt;',
    '>': '&gt;',
    '"': '&quot;',
    "'": '&apos;',
  };
  return String(value).replace(/[&<>"']/g, (ch) => entities[ch]);
}

// https://github.com/dewitt/opensearch/blob/master/opensearch-1-1-draft-6.md#opensearch-response-elements
// https://validator.w3.org/feed/docs/atom.html
/**
 * Build the feed-level Atom / OpenSearch elements (title, links, totals, Query echo).
 *
 * @param {object} resp - Search response; reads resp.response.numFound and resp.response.start.
 * @param {object} req - Request-like object; reads req.query.q and req.opts.protoHost.
 * @returns {string} XML fragment, starting and ending with a newline.
 */
function atomFeedInfo(resp, {query, opts}) {
  // Skipping paging as OLIP not doing it
  const { protoHost } = opts;
  const queryString = query.q;
  // encodeURIComponent output contains no characters that need escaping inside a double-quoted attribute
  const encQuery = encodeURIComponent(queryString);
  // The search terms come straight from the request, so they must be escaped before being echoed as XML
  const safeQuery = escapeXml(queryString);
  const safeHost = escapeXml(protoHost);
  const now = new Date().toISOString();
  const startPage = Math.floor(resp.response.start / ItemsPerPage);
  return [
    '',
    `<title>Offline Internet Archive Search: ${safeQuery}</title>`,
    `<link href="${safeHost}/${encQuery}"/>`,
    `<updated>${now}</updated>`,
    '<author>',
    '<name>Offline Internet Archive</name>',
    '</author>',
    `<id>http://archive.org/search/${encQuery}</id>`,
    `<opensearch:totalResults>${resp.response.numFound}</opensearch:totalResults>`,
    `<opensearch:startIndex>${resp.response.start}</opensearch:startIndex>`,
    `<opensearch:itemsPerPage>${ItemsPerPage}</opensearch:itemsPerPage>`,
    // The attribute value must be quoted (and escaped) for the feed to be well-formed
    `<opensearch:Query role="request" searchTerms="${safeQuery}" startPage="${startPage}" />`,
    `<link rel="search" type="application/opensearchdescription+xml" href="${safeHost}/opensearch"/>`,
    '',
  ].join('\n');
}

/**
 * Render one search result as an Atom <entry>.
 *
 * @param {object} m - Search result; reads m.title, m.identifier and m.publicdate.
 * @param {object} opts - Reads opts.protoHost, the base URL used for the details link.
 * @returns {string} XML fragment, starting and ending with a newline.
 */
function atomEntry(m, {protoHost}) {
  // TODO note archive search results do not usually include description field as can be large so <content/> omitted
  const safeHost = escapeXml(protoHost);
  const safeIdentifier = escapeXml(m.identifier);
  return [
    '',
    '<entry>',
    // Titles routinely contain & and <, which would otherwise make the whole feed unparseable
    `<title>${escapeXml(m.title)}</title>`,
    `<link href="${safeHost}/details/${safeIdentifier}"/>`,
    `<id>https://archive.org/details/${safeIdentifier}</id>`,
    `<updated>${escapeXml(m.publicdate)}</updated>`,
    '</entry>',
    '',
  ].join('\n');
}

/**
 * Render a complete Atom feed (with OpenSearch extensions) for a search response.
 *
 * @param {object} resp - Search response; resp.response.docs is the array of results to render.
 * @param {object} req - Request-like object carrying query.q and opts.protoHost.
 * @returns {string} The XML document.
 */
function atomFrom(resp, req) {
  return [
    '<?xml version="1.0" encoding="UTF-8"?>',
    '<feed xmlns="http://www.w3.org/2005/Atom" xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">',
    atomFeedInfo(resp, req),
    resp.response.docs.map((m) => atomEntry(m, req.opts)).join(''),
    '</feed>',
    '',
  ].join('\n');
}
/**
 * Produce the OpenSearch description document that tells clients how to query this server.
 *
 * @param {object} [opts] - Options object; only protoHost is read.
 * @param {string} [opts.protoHost] - Base URL placed in front of /opensearch in the search template.
 *   When missing, the literal text "undefined" is interpolated.
 * @returns {string} The XML description document, terminated by a newline.
 */
function XMLdescriptor({ protoHost } = {}) {
  const searchTemplate = `${protoHost}/opensearch?q={searchTerms}`;
  const documentLines = [
    '<?xml version="1.0" encoding="UTF-8"?>',
    '<OpenSearchDescription xmlns="http://a9.com/-/spec/opensearch/1.1/">',
    '<ShortName>Offline Internet Archive</ShortName>',
    '<Description>Offline Internet Archive search engine</Description>',
    `<Url type="application/atom+xml" template="${searchTemplate}"/>`,
    '</OpenSearchDescription>',
    '', // keeps the trailing newline
  ];
  return documentLines.join('\n');
}
/**
 * Express handler for OpenSearch requests.
 *
 * Without a `q` query parameter it replies with the OpenSearch description document;
 * with one it runs the search (sorted by -downloads, first page of ItemsPerPage rows)
 * and replies with an Atom feed. Failures from doQuery are passed to next(err).
 *
 * NOTE(review): no Content-Type is set here - confirm whether something upstream sets an XML type.
 *
 * @param {object} req - Express request; reads req.query.q and req.opts (which carries protoHost).
 * @param {object} res - Express response.
 * @param {function} next - Express next(), called only on error.
 */
function searchExpress(req, res, next) {
  // req should have proto host set to e.g. http://localhost:4244 or https://www-dweb-mirror.dev.archive.org
  if (!req.query.q) {
    res.status(200).send(XMLdescriptor(req.opts));
    return;
  }
  const sort = '-downloads'; // Opensearch doesnt specify any other metric
  const o = Object.assign(new ArchiveItem({ sort, query: req.query.q }), { rows: ItemsPerPage, page: 1 });
  // Merge into a fresh object so the caller-owned req.opts is not modified as a side effect
  const queryOpts = Object.assign({}, req.opts, { wantCrawlInfo: false });
  doQuery(
    o,
    queryOpts,
    undefined, // config - used with crawl info - since have no way to return in Atom, for now its undefined
    (err, searchResult) => { // Looks like Archive search result with downloaded and crawl fields added
      if (err) {
        next(err); // doQuery sends to debug
      } else {
        res.status(200).send(atomFrom(searchResult, req));
      }
    }
  );
}
// Public API of this module: the Express handler (searchExpress) and the query helper it is built on (doQuery).
exports = module.exports = {searchExpress, doQuery};