-
Notifications
You must be signed in to change notification settings - Fork 31
/
mirrorHttp.js
executable file
·688 lines (647 loc) · 32.5 KB
/
mirrorHttp.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
/* global DwebTransports */
/* Serve the mirrored files via HTTP XXX
This is intended as a fairly generic server for a number of cases, with some configuration to allow for different situations,
See: https://github.com/mitra42/dweb-universal/blob/master/uri%20structure%20for%20http%20server.md (TODO which may be out of date)
See URL_MAPPING.md (TODO which may be out of date) for summary of below rules plus what they call
//TODO remove /archive/ forwards, work directly with those files as on archive.org and www-dweb.dev.archive.org
*/
// External packages
// Not debugging: express:*
// noinspection JSUnresolvedVariable
/* To add a new special page
Search on SEE-OTHER-ADD-SPECIAL-PAGE
Document in README.md and USING.md
*/
// TODO-URI add compatibility with archive.org standard URLs - scan this file first (this should be tracked as a git issue)
const debug = require('debug')('dweb-mirror:mirrorHttp');
const url = require('url');
const express = require('express'); // http://expressjs.com/
const morgan = require('morgan'); // https://www.npmjs.com/package/morgan
const path = require('path');
const fs = require('fs'); // See https://nodejs.org/api/fs.html
const waterfall = require('async/waterfall');
const parallel = require('async/parallel');
const sharp = require('sharp');
// IA packages
const { RawBookReaderResponse, RawBookReaderJSONResponse, specialidentifiers, homeQuery, routed } = require('@internetarchive/dweb-archivecontroller');
// Local files
const MirrorFS = require('./MirrorFS');
const CrawlManager = require('./CrawlManager');
const ArchiveFile = require('./ArchiveFilePatched');
const ArchiveItem = require('./ArchiveItemPatched'); // Needed for fetch_metadata patch to use cache
const ArchiveMember = require('./ArchiveMemberPatched');
const { searchExpress, doQuery } = require('./search');
const httpOrHttps = 'http'; // This server is running on http, not https (at least currently)
const app = express();
// SEE-IDENTICAL-CODE-CANONICALIZETASKS in dweb-mirror.mirrorHttp and dweb-archive.LocalComponent
function canonicalizeTasks(tasks) {
  /*
   Canonicalize crawl tasks so that each task carries exactly one identifier.
   tasks: array of task objects, where task.identifier may be a single value or an array
   returns: flat array - a task with an array of identifiers becomes one shallow copy per identifier,
            any other task is passed through as the same object (not copied)
  */
  const groups = [];
  for (const task of tasks) {
    if (Array.isArray(task.identifier)) {
      // One shallow copy of the task per identifier, the copy overrides only .identifier
      groups.push(task.identifier.map((identifier) => Object.assign({}, task, { identifier })));
    } else {
      groups.push(task);
    }
  }
  // concat flattens exactly one level: arrays of copies are spliced in, single tasks are appended
  return [].concat(...groups);
}
function mirrorHttp(config, cb0) {
debug('Starting HTTP server on %d, Caching in %o', config.apps.http.port, config.directories);
// Request logging, the format comes from config
// TODO write to a file then recycle that log file (see https://www.npmjs.com/package/morgan )
// noinspection JSUnresolvedVariable
app.use(morgan(config.apps.http.morgan));
app.use(express.json());
// app.get('*/', (req, res, next) => { req.url = req.params[0]; next(); } // Strip trailing '/'
/*
 Pre Munging - applies to all queries.
 Builds req.opts, which the handlers below rely on:
   start, end     only set when the request has a 'bytes' Range header (first range only)
   noCache        truthy when the Cache-Control header is exactly 'no-cache' or 'max-age=0'
   copyDirectory  from ?copyDirectory= (usually undefined)
   protoHost      protocol://host the client should use to reach this server
*/
app.use((req, res, next) => {
  const opts = {};
  req.opts = opts;

  // Turn the range headers on a req into an options parameter that can be used in streams
  const ranges = req.range(Infinity);
  if (ranges && ranges[0] && ranges.type === 'bytes') {
    const firstRange = ranges[0];
    opts.start = firstRange.start;
    opts.end = firstRange.end;
    debug('Range request = %O', ranges);
  }

  // Detect if want server to skip cache
  const cacheControl = req.headers['cache-control'];
  opts.noCache = cacheControl && ['no-cache', 'max-age=0'].includes(cacheControl);
  opts.copyDirectory = req.query.copyDirectory; // Usually undefined

  const rangeText = (typeof opts.start !== 'undefined') ? `bytes ${opts.start}-${opts.end}` : '';
  const cacheText = opts.noCache ? 'NOCACHE' : '';
  const copyText = opts.copyDirectory ? opts.copyDirectory : '';
  debug('STARTING: %s %s %s %s', req.url, rangeText, cacheText, copyText);

  if (process.env.APPLICATION_ROOT) { // OLIP
    opts.protoHost = process.env.APPLICATION_ROOT;
  } else if (req.headers['x-forwarded-for']) {
    // NOTE(review): host and port are concatenated with no ':' between them, and a missing
    // x-forwarded-port header would append the text 'undefined' - confirm what the proxy in front sends
    opts.protoHost = `${req.headers['x-forwarded-proto']}://${req.headers['x-forwarded-host']}${req.headers['x-forwarded-port']}`;
  } else {
    opts.protoHost = `${req.protocol}://${req.headers.host}`;
  }
  next();
});
function _newArchiveItem(identifier, config1, opts, cb) {
  /*
   Enhanced version of new ArchiveItem + fetch_metadata + handling of special identifiers.
   identifier  item to build
   config1     configuration, only config1.apps.crawl.tasks is read (and only for 'local')
   opts        passed to fetch_metadata (with darkOk: true added as a default) for ordinary items
   cb(err, ai) called with the ArchiveItem
  */
  const item = new ArchiveItem({ identifier });
  const isSpecial = Object.keys(specialidentifiers).includes(identifier);

  if (!isSpecial) {
    // Ordinary item - metadata comes from cache or upstream
    item.fetch_metadata(Object.assign({ darkOk: true }, opts), cb);
    return;
  }

  // Special identifier - metadata is the static definition, copied over field by field
  item.metadata = {};
  for (const [field, value] of Object.entries(specialidentifiers[identifier])) {
    item.metadata[field] = value;
  }

  if (item.identifier !== 'local') {
    cb(null, item);
    return;
  }

  // 'local' - its members are the configured crawl tasks, one member per identifier or query
  const unexpandedMembers = canonicalizeTasks(config1.apps.crawl.tasks)
    .map((task) => new ArchiveMember({
      identifier: task.identifier,
      query: task.query,
      sort: task.sort, // Maybe undefined in which case specified in ArchiveItem.defaultSortStr
      mediatype: task.query ? 'search' : undefined
    }, { unexpanded: true }));
  ArchiveMember.expandMembers(unexpandedMembers, (err, expanded) => {
    item.membersFav = expanded;
    cb(err, item);
  });
}
/**
 * Build an Express handler that redirects to /archive.html, carrying parameters in the query string.
 * Query parameters in order of precedence (highest first):
 *   the identifier / wildcard taken from this request's URL; queryParms; those in the request's query; mirror & transport defaults
 * @param queryParms { parameters to pass in query, can override {mirror, transport} and those in the request's query;
 *                     the special form {0: "page"} means take the * in the url and pass it as page= }
 * @returns {function(req, res)}
 * queryParms is shared by every request served by the returned handler, so it is never modified:
 * each request works on its own copy, otherwise one request's identifier would leak into the next.
 * Identical code in dweb and dweb-mirror
 */
function redirectWithQuery(queryParms = {}) {
  /* eslint-disable-next-line func-names */
  return function (req, res) {
    const perRequest = Object.assign({}, queryParms); // Private copy, see note above
    // For urls that include identifier pass to the query
    if (req.params.identifier) { perRequest.identifier = req.params.identifier; }
    const wildcardName = perRequest[0];
    if (wildcardName) {
      // Express exposes the part of the url matched by * as req.params[0]; fall back to a named parameter
      perRequest[wildcardName] = (typeof req.params[0] !== 'undefined') ? req.params[0] : req.params[wildcardName];
      delete perRequest[0]; // The marker itself must not appear in the redirect
    }
    const redirUrl = url.format({
      pathname: '/archive.html',
      // New query parameters have defaults for mirror and transport which can be overridden and ... more which can't
      query: Object.assign({ mirror: req.headers.host, transport: 'HTTP' }, req.query, perRequest)
    });
    debug('redirecting to: %s', redirUrl);
    res.redirect(redirUrl);
  };
}
function errAndNext(req, res, next, err) {
  /*
   Remember an error (if any) on req.errs and carry on to the next matching route.
   Used for something that could fail where other routes should still get a try;
   the final middleware checks req.errs and reports them instead of a generic 404.
  */
  if (err) {
    req.errs = req.errs || [];
    req.errs.push(err);
  }
  // next(err) would give an immediate error, so always continue without it
  next();
}
// Serving static (e.g. UI) files
// app.use('/download/', express.static(dir)); // Simplistic, better ...
function _sendFileOrError(req, res, next, filepath) {
  /*
   Send the file at filepath, or if that fails fall through to the next matching route.
   Note filepaths are going to be unix/OSX style TODO-WINDOWS will need to split and re-join params[0]
  */
  res.sendFile(filepath, (err) => {
    if (!err) {
      debug('sent file %s', filepath);
      return;
    }
    debug('No file in: %s %s', filepath, err.message);
    next(); // Drop through to next attempt - will probably fail
  });
}
function _sendFileOrErrorPath(req, res, next) {
  // Serve the UI file whose location under the archiveui directory mirrors the request path
  const filepath = `${config.archiveui.directory}${req.path}`; // Note req.path starts with /
  _sendFileOrError(req, res, next, filepath);
}
// Join a URL-supplied relative path onto root.
// Returns the joined path, or undefined when it would resolve outside root (e.g. '../../etc/passwd'):
// path.join silently collapses '..' segments, so the result has to be checked before it is served.
function _joinInsideRoot(root, relative) {
  const joined = path.join(root, relative);
  const fromRoot = path.relative(root, joined);
  const isOutside = fromRoot === '..' || fromRoot.startsWith(`..${path.sep}`) || path.isAbsolute(fromRoot);
  return isOutside ? undefined : joined;
}
// Send the file at `relative` below `root`; anything that would escape root is refused and falls
// through to the next route, exactly as a missing file does.
function _sendFileBelow(req, res, next, root, relative) {
  const filepath = _joinInsideRoot(root, relative);
  if (typeof filepath === 'undefined') {
    debug('Refusing path outside %s: %s', root, relative);
    next();
  } else {
    _sendFileOrError(req, res, next, filepath);
  }
}
function _sendFileUrlArchive(req, res, next) {
  // Urls like /archive/* and /* - the part matched by * is looked up in the archiveui directory
  _sendFileBelow(req, res, next, config.archiveui.directory, req.params[0]);
}
function _sendFileFromBookreader(req, res, next) {
  // Urls like /bookreader/BookReader/*
  _sendFileBelow(req, res, next, config.bookreader.directory, req.params[0]);
}
function _sendFileFromEpubreader(req, res, next) {
  // Urls like /epubreader/*
  _sendFileBelow(req, res, next, config.epubreader.directory, req.params[0]);
}
function sendRelated(req, res, unusedNext) {
  /*
   Respond with the JSON list of items related to the identifier matched by * in the url.
   req.opts = { noCache, copyDirectory }
   Any failure (including asking for a special identifier, or none) is reported as a 404 with the error message.
  */
  const identifier = req.params[0];
  const item = new ArchiveItem({ identifier });
  const steps = [
    // Special identifiers (and a missing identifier) have no related info
    (cb) => {
      const hasRelated = identifier && !Object.keys(specialidentifiers).includes(identifier);
      cb(hasRelated ? null : new Error(`ERROR There is no related info for special identifier ${identifier}`));
    },
    (cb) => item.fetch_metadata(req.opts, cb),
    (fetched, cb) => fetched.relatedItems({ copyDirectory: req.opts.copyDirectory, wantMembers: false, noCache: req.opts.noCache }, cb),
    // Annotate with crawl info, but pass on the related items themselves
    (rels, cb) => ArchiveItem.addCrawlInfoRelated(rels, { config, copyDirectory: req.opts.copyDirectory }, (err) => cb(err, rels)),
  ];
  waterfall(steps, (err, rels) => {
    if (err) {
      // next(err);
      res.status(404).send(err.message);
      return;
    }
    res.json(rels);
  });
}
function sendPlaylist(req, res, next) {
  /*
   Respond with the JSON playlist of req.params.identifier.
   req.opts = { noCache }
  */
  const item = new ArchiveItem({ identifier: req.params.identifier });
  // Note fetch_metadata will get the playlist, (fetch_playlist requires this first anyway)
  item.fetch_metadata(req.opts, (err, fetched) => {
    if (err) {
      next(err); // Will 500 error
      return;
    }
    res.json(fetched.playlist); // Will be a cooked playlist, but all cooking of playlists is additive.
  });
}
// Pass a stream to the result
function _proxy(req, res, next, err, s, headers) {
  /*
   Finish a proxied request: given the outcome (err, s) of opening a stream, either record the
   failure and try the next route, or send the stream to the client with status 200 and headers.
  */
  if (err) {
    debug('Failed to proxy', err.message);
    errAndNext(req, res, next, err);
    return;
  }
  res.status(200); // Assume error if do not get here
  res.set(headers);
  s.pipe(res);
  // Make sure to catch error though too late to do anything useful with it.
  s.on('error', (streamErr) => {
    debug('Stream had error, %o', streamErr.message, streamErr);
    next(streamErr); // Will generate immediate 400
    // res.destroy(err); // Doesnt work - "Empty reply from server" no headers get sent
    // Doesnt work as already sent headers ... errAndNext(req, res, next, err);
  });
}
// There are a couple of proxies e.g. proxy-http-express but it disables streaming when headers are modified.
// Note req.url will start with "/"
// proxyUrl goes through DTS name mapping, so normally can be a raw URL to archive.org
// req.opts = { start, end, noCache}
// noinspection JSUnresolvedVariable
function proxyUrl(prefix, headers = {}) {
  /*
   Build an Express handler that proxies a request to somewhere under prefix, which should NOT end with /
   prefix   prepended to req.url (which starts with "/") and then passed through routed()
   headers  response headers to set when the stream is opened successfully
   req.opts = { start, end, noCache} is passed on to the stream, plus the configured preferred stream transports
  */
  return (req, res, next) => {
    const target = routed(prefix + req.url);
    debug('Proxying from %s', target);
    const streamOpts = Object.assign({}, req.opts, { preferredTransports: config.connect.preferredStreamTransports });
    DwebTransports.createReadStream(target, streamOpts, (err, s) => {
      _proxy(req, res, next, err, s, headers);
    });
  };
}
// Same as proxyUrl, but the proxy function it returns sets a JSON content-type header on the response
function proxyJson(prefix) {
  const jsonHeaders = { 'Content-Type': 'application/json' };
  return proxyUrl(prefix, jsonHeaders);
}
function streamArchiveFile(req, res, next) {
  /*
   Stream one file of an item back to the client, caching it on the way where possible.
   req.params.identifier  the item
   req.params[0]          the filename (use this form since filename may contain '/' so can't use :filename)
   req.opts { start, end, noCache, copyDirectory }  start & end are only present for a range request,
                          where they are the inclusive byte positions parsed from the Range header
   Responds 206 + Content-Range for a range request, 200 otherwise; 404 with the message if the file can't be opened.
   Unexpected synchronous errors are passed to next(err).
  */
  // TODO-CACHE-AGING Look at cacheControl in options https://expressjs.com/en/4x/api.html#res.sendFile (maxAge, immutable)
  try {
    const filename = req.params[0];
    const identifier = req.params.identifier;
    const opts = Object.assign({}, req.opts, { wantStream: true });
    // Decide on presence, not truthiness: `bytes=0-0` has end === 0 but is still a range request
    const hasRange = (typeof req.opts.start !== 'undefined') || (typeof req.opts.end !== 'undefined');
    let af; // Passed out from waterfall to end
    debug('Sending ArchiveFile %s/%s', identifier, filename);
    const ai = new ArchiveItem({ identifier });
    waterfall([
      (cb) => ai.fetch_metadata({ copyDirectory: req.opts.copyDirectory }, cb), // Do not pass on noCache, we'll be streaming after already fetched
      (archiveitem, cb) => ArchiveFile.new({ archiveitem, filename, copyDirectory: req.opts.copyDirectory }, cb),
      // Note will *not* cache if pass opts other than start:0 end:undefined|Infinity
      (archivefile, cb) => {
        af = archivefile;
        archivefile.cacheAndOrStream(opts, cb);
      },
    ],
    (err, s) => { // Have stream of file or error
      if (err) {
        // Failed - report
        debug('ERROR: streamArchiveFile failed for %s/%s: %s', identifier, filename, err.message);
        res.status(404).send(err.message);
      } else {
        // Succeeded - pipe back to user with headers
        res.status(hasRange ? 206 : 200);
        res.set('Accept-ranges', 'bytes');
        if (hasRange) {
          const size = af.metadata.size;
          const firstByte = (typeof req.opts.start === 'undefined') ? 0 : req.opts.start;
          const requestedLast = (typeof req.opts.end === 'undefined') ? Infinity : req.opts.end;
          // Both the parsed range end and Content-Range's last byte are inclusive, so only clamp to
          // the last byte of the file (size - 1); an open-ended range arrives with end === Infinity
          const lastByte = Math.min(requestedLast, size - 1);
          res.set('Content-Range', `bytes ${firstByte}-${lastByte}/${size}`);
        }
        // noinspection JSUnresolvedVariable
        res.set('Content-Type', af.mimetype()); // Not sure what happens if doesn't find it.
        s.pipe(res);
      }
    });
  } catch (err) {
    debug('ERROR caught unhandled error in streamArchiveFile for %s: %s', req.url, err.message);
    next(err);
  }
}
function streamQuery(req, res, next) {
  /*
   Answer an advancedsearch style query with JSON.
   req.opts = { noCache}
   especially: `/advancedsearch}?output=json&q=${encodeURIComponent(this.query)}&rows=${this.rows}&page=${this.page}&sort[]=${sort}&and[]=${this.and}&save=yes`;
   Builds an ArchiveItem describing the query (special casing a few standard shapes of q) and hands it to doQuery.
  */
  const { q, sort } = req.query;
  let item;
  let wantCrawlInfo = true; // Only the identifier-list case below turns this off

  if (q && q.startsWith('collection:') && q.includes('simplelists__items:')) {
    // Special case: query just looking for members of a collection - only interested in standardised q=collection:IDENTIFIER..
    // e.g. collection%3Amitratest%20OR%20simplelists__items%3Amitratest%20OR%20simplelists__holdings%3Amitratest%20OR%20simplelists__items%3Amitratest
    // TODO when Aaron has built entry point e.g. members/COLLECTION then rebuild this and dweb-archivecontroller.ArchiveItem._fetch_query to use it
    const identifier = q.split(' OR ')[0].split(':')[1];
    item = new ArchiveItem({ sort, identifier }); // Do not set query, allow _fetch_query to build default
  } else if (q && q.startsWith('identifier:')
    && !q.includes('*') // exclude eg identifier:electricsheep-flock*
    && (q.lastIndexOf(':(') === 10)) {
    // Special case: query just looking for fields on a list of identifiers, i.e. identifier:(foo OR bar)
    const ids = q.slice(12, -1).split(' OR '); // ["foo","bar"]
    item = new ArchiveItem();
    // The members will be expanded by fetch_query either from local cache or by querying upstream
    item.membersFav = ids.map((identifier) => ArchiveMember.fromIdentifier(identifier));
    wantCrawlInfo = false;
  } else if (q === homeQuery) {
    item = new ArchiveItem({ identifier: 'home', sort, query: q });
  } else {
    item = new ArchiveItem({ sort, query: q });
  }

  // By this point via any route above, item has either a .query or .membersFav || .membersSearch
  // as array of unexpanded members (which fetch_query|_fetch_query will get)
  item.rows = parseInt(req.query.rows, 10) || 75;
  item.page = parseInt(req.query.page, 10) || 1; // Page incrementing is done by anything iterating over pages, not at this point
  item.and = req.query.and; // I do not believe this is used anywhere
  req.opts.wantCrawlInfo = wantCrawlInfo;
  doQuery(item, req.opts, config, (err, resp) => {
    if (err) {
      next(err); // doQuery will have reported the error
      return;
    }
    res.json(resp);
  });
}
function streamThumbnail(req, res, next) {
  /*
   Stream back the icon of req.params.identifier.
   In many cases we will have the icon, but nothing else for the item (eg its a tile on a collection), in that case just send the icon
   Otherwise fetch metadata, and let the item save & stream its thumbnail.
   Special identifiers are redirected to their configured thumbnail.
   req.opts = {noCache, copyDirectory}
  */
  const identifier = req.params.identifier;
  debug('Sending Thumbnail for %s', identifier);
  const noCache = req.opts.noCache;

  // Stream back with appropriate status and Content-type
  const sendJpegStream = (s) => {
    res.status(200); // Assume error if do not get here
    res.set({ 'Content-Type': 'image/jpeg; charset=UTF-8' });
    s.pipe(res);
  };

  // See SEE-OTHER-ADD-SPECIAL-PAGE (this should be automatic once added to specialidentifiers)
  if (Object.keys(specialidentifiers).includes(identifier)) {
    const pathname = specialidentifiers[identifier].thumbnaillinks;
    res.redirect(url.format({ pathname }));
    return;
  }

  const thumbnailPath = identifier + '/__ia_thumb.jpg';
  MirrorFS.checkWhereValidFile(thumbnailPath, { noCache, copyDirectory: req.opts.copyDirectory }, (err, existingFilePath) => {
    if (!err) {
      // Already have the file locally
      sendJpegStream(fs.createReadStream(existingFilePath));
      return;
    }
    // We do not already have the file
    const item = new ArchiveItem({ identifier });
    const steps = [
      (cb) => item.fetch_metadata({ noCache, copyDirectory: req.opts.copyDirectory }, cb),
      (archiveitem, cb2) => archiveitem.saveThumbnail({ noCache, copyDirectory: req.opts.copyDirectory, wantStream: true, }, cb2)
    ];
    waterfall(steps, (err1, s) => {
      if (err1) {
        debug('Failed to stream Thumbnail for %s: %s', identifier, err1.message);
        next(err1);
        return;
      }
      sendJpegStream(s);
    });
  });
}
function sendInfo(req, res) {
  // Respond with JSON describing this server: its config options, the status of each transport and the cache directories
  // Note the err from p_statuses is not looked at, the response is sent regardless
  DwebTransports.p_statuses((err, transportStatuses) => {
    res.status(200);
    res.set('Accept-Ranges', 'bytes');
    res.json({
      'config': config.configOpts,
      transportStatuses,
      directories: config.directories
    });
  });
}
function sendBookReaderJSIA(req, res, unusedNext) {
  /*
   Answer BookReaderJSIA.php: JSON { data: ... } describing the book ?id=
   Responds 404 with the error message if the item or its bookreader info can't be fetched.
  */
  const fetchItem = (cb) => new ArchiveItem({ identifier: req.query.id }).fetch_metadata(req.opts, cb);
  const fetchBookreader = (ai, cb) => ai.fetch_bookreader(req.opts, cb);
  waterfall([fetchItem, fetchBookreader], (err, ai) => {
    if (err) {
      res.status(404).send(err.message); // Its neither local, nor from server
      return;
    }
    const data = RawBookReaderResponse.fromArchiveItem(ai).cooked({
      server: req.query.server,
      // On www-dweb-mirror we are running thru ingress, so server is http but browser needs to use https - can use x-forwarded-proto to distinguish
      protocol: req.headers['x-forwarded-proto'] || httpOrHttps
    });
    res.json({ data });
  });
}
function sendBookReaderJSON(req, res, unusedNext) {
  /*
   Answer BookReaderJSON.php?identifier= and /books/:identifier/ia_manifest with the book's JSON manifest.
   Responds 404 with the error message if the item or its bookreader info can't be fetched.
  */
  const identifier = req.params.identifier || req.query.identifier;
  const fetchItem = (cb) => new ArchiveItem({ identifier }).fetch_metadata(req.opts, cb);
  const fetchBookreader = (ai, cb) => ai.fetch_bookreader(req.opts, cb);
  waterfall([fetchItem, fetchBookreader], (err, ai) => {
    if (err) {
      res.status(404).send(err.message); // Its neither local, nor from server
      return;
    }
    // TODO need server from req I think
    const manifest = RawBookReaderJSONResponse.fromArchiveItem(ai, {
      server: req.headers.host,
      // On www-dweb-mirror we are running thru ingress, so server is http but browser needs to use https - can use x-forwarded-proto to distinguish
      protocol: req.headers['x-forwarded-proto'] || httpOrHttps
    });
    res.json(manifest);
  });
}
function sendBookReaderImages(req, res, next) {
  /*
   Stream a page image of a book as image/jpeg.
   eg /BookReader/BookReaderImages.php?zip=/27/items/IDENTIFIER/unitednov65unit_jp2.zip&file=unitednov65unit_jp2/unitednov65unit_0006.jp2&scale=4&rotate=0
   or /download/IDENTIFIER/page/cover_t.jpg
   or /tutur-smara-bhuwana/page/leaf1_w2000.jpg from mediawiki/ArchiveLeaf
   or (book preview as e.g. already lent out): /BookReader/BookReaderPreview.php?id=bdrc-W1KG14545&subPrefix=bdrc-W1KG14545&itemPath=/34/items/bdrc-W1KG14545&server=ia803001.us.archive.org&page=leaf4&fail=preview&&scale=11.652542372881356&rotate=0
   Note the urls for Image and Preview are gratuitously different ! so need to capture both sets of parameters
   req.opts = { noCache, copyDirectory }
   The identifier comes from the url, else ?id=, else the 4th '/' separated field of ?zip=
   The outcome is handed to _proxy, which streams the image or records the error and tries the next route.
  */
  // debug("sendBookReaderImages: item %s file %s scale %s rotate %s", req.query.zip.split('/')[3], req.query.file, req.query.scale, req.query.rotate)
  const identifier = req.params.identifier || req.query.id || (req.query.zip ? req.query.zip.split('/')[3] : undefined);
  new ArchiveItem({ identifier })
    .fetch_page({
      copyDirectory: req.opts.copyDirectory,
      wantStream: true,
      zip: req.query.zip, // Images only
      page: req.params.page || req.query.page,
      file: req.query.file, // Images only
      scale: req.query.scale, // Note this will be quantized
      rotate: req.query.rotate,
      itemPath: req.query.itemPath, // Preview only
      subPrefix: req.query.subPrefix, // Preview only
      id: req.query.id, // Seems to be required by datanode, and supplied in req by client, as of Dec2022 (see issue#372)
      noCache: req.opts.noCache // The middleware sets req.opts.noCache; there is no req.opts.cache
    },
    (err, s) => _proxy(req, res, next, err, s, { 'Content-Type': 'image/jpeg' }));
}
// Keep these lines in alphabetical order
// unless there is a reason not to (e.g. because capture specific before generic) in which case document in order!
// The root url is handled by two routes registered for the same path, tried in this order:
// 1: send redir.html from the archiveui directory if that file can be sent
app.get('/', (req, res, next) => {
res.sendFile(config.archiveui.directory + '/redir.html', (unusedErr) => { // Try standard redir.html
// Despite its name the error IS used: any failure to send redir.html falls through to the route below
if (unusedErr) next(); // if no redir.html file then do a standard redirect.
});
});
// 2: otherwise redirect to /archive.html with identifier=local in the query
app.get('/', redirectWithQuery({ identifier: 'local' }));
// Note app.get('/*'... is at the end after catch everything else
// Not currently used, but might be soon, ConfigDetailsComponent now uses admin/setconfig/IDENTIFIER/LEVEL
/*
app.post('/admin/setconfig', function (req, res, next) {
config.setAndWriteUser(req.body, err => {
if (err) {
next(err);
} else {
sendInfo(req, res); // Send info again, as UI will need to display this
}
});
});
*/
// Set the crawl level of an identifier (or, with identifier '_', of the query in ?q=) in the user's config,
// reply with the updated info, then ask the default crawl to reconsider.
app.get('/admin/setconfig/:identifier/:level', (req, res, next) => {
  const delayTillReconsider = 3000; // ms time to wait for another key press before running crawl
  const crawlid = 0; // Always setting on default crawl which will be '0'
  const { level } = req.params;
  const identifier = (req.params.identifier === '_') ? undefined : req.params.identifier; // '_' is a placeholder for "no identifier"
  // req.query.q is either "string to find" or structured like "collection:foo AND name:bar" it is NOT urlencoded by here
  config.writeUserTaskLevel({ identifier, query: req.query.q, level }, (err) => {
    if (err) {
      next(err);
      return;
    }
    sendInfo(req, res); // Send info again, as UI will need to display this
    if (CrawlManager.crawls.length) {
      CrawlManager.crawls[crawlid].suspendAndReconsider({ identifier, delayTillReconsider, config });
    }
  });
});
function crawlManager(f) {
  /*
   Build an Express handler that runs the method named f (e.g. 'pause') on the crawl req.params.crawlid
   and responds with that crawl's status as JSON.
   Responds 404 when there is no such crawl (or it has no such method), rather than throwing on undefined.
  */
  /* eslint-disable-next-line func-names */
  return function (req, res) {
    const crawl = CrawlManager.crawls[req.params.crawlid];
    if (!crawl || typeof crawl[f] !== 'function') {
      res.status(404).send('No such crawl'); // Fixed text, do not echo the url back to the client
      return;
    }
    crawl[f]();
    res.json(crawl.status());
  };
}
// TODO-CRAWLCTL - see https://github.com/internetarchive/dweb-mirror/issues/132
// TODO refactor this to be a single service CrawlManager.app(req,res,next) which takes /admin/crawl/:cmd/:crawlid
// Each of these runs the named method on the crawl :crawlid and replies with that crawl's status (see crawlManager)
app.get('/admin/crawl/restart/:crawlid', crawlManager('restart'));
app.get('/admin/crawl/pause/:crawlid', crawlManager('pause'));
app.get('/admin/crawl/resume/:crawlid', crawlManager('resume'));
app.get('/admin/crawl/empty/:crawlid', crawlManager('empty'));
// Status as reported by CrawlManager.status(), takes no crawlid
app.get('/admin/crawl/status', (req, res) => res.json(CrawlManager.status()));
// Add something to crawl, described by the query parameters; replies with the server info
app.get('/admin/crawl/add', (req, res, next) => {
  // Expect opts identifier, query, rows, level, copyDirectory, but could be adding search, related, in future
  // Order is significant, config should NOT be overridable by query parameters.
  const crawlOpts = { ...req.query, config };
  CrawlManager.add(crawlOpts, (err) => {
    if (err) { // No errors expected
      next(err);
      return;
    }
    sendInfo(req, res); // Send info again for UI
  });
});
// Add a single identifier to crawl; replies with the server info
app.get('/admin/crawl/add/:identifier', (req, res, next) => {
  const crawlOpts = {
    config,
    identifier: req.params.identifier,
    copyDirectory: req.opts.copyDirectory
  };
  CrawlManager.add(crawlOpts, (err) => {
    if (err) { // No errors expected
      next(err);
      return;
    }
    sendInfo(req, res); // Send info again for UI
  });
});
app.get([
'/arc/archive.org',
'/details',
'/details/:identifier',
'/search',
'/search.php',
'/stream/:identifier/:unusedPrefix'
], redirectWithQuery());
app.get('/arc/archive.org/*', (req, res) => { res.redirect(req.originalUrl.slice(16)); }); // Moved to new pattern
app.get('/advancedsearch', streamQuery);
app.get('/advancedsearch.php', streamQuery);
// Bookreader passes page in a strange place in the URL e.g. page/n1/mode/2up
app.get('/details/:identifier/page/*', redirectWithQuery({ 0: 'page' }));
app.get('/download/:identifier/page/:page', sendBookReaderImages);
app.get('/download/:identifier', redirectWithQuery({ download: 1 }));
app.get([
'/download/:identifier/*',
'/serve/:identifier/*'], streamArchiveFile);
app.get(['/epubreader/*', '/archive/epubreader/*'], _sendFileFromEpubreader);
app.get(['/images/*',
'/includes/*', // matches archive.org & dweb.archive.org but not dweb.me
'/jw/*', // matches archive.org but not dweb.me
'/components/*', // Web components - linked from places we have no control over
'/languages/*'], _sendFileOrErrorPath);
// metadata handles two cases - either the metadata exists in the cache, or if not is fetched and stored.
// The item is annotated with crawl info and a magnet link before being exported in metadata API form.
// TODO complete as part of https://github.com/internetarchive/dweb-mirror/issues/211
app.get('/metadata/:identifier', (req, res, unusedNext) => {
  const { identifier } = req.params;
  _newArchiveItem(identifier, config, req.opts, (err, ai) => {
    if (err) {
      res.status(404).send(err.message); // Its neither local, nor from server nor special
      return;
    }
    const annotations = [
      (cb) => ai.addCrawlInfo({ config, copyDirectory: req.opts.copyDirectory }, cb),
      (cb) => ai.addMagnetLink({ copyDirectory: req.opts.copyDirectory, config }, cb)
    ];
    parallel(annotations, (err1, unusedArr) => {
      if (err1) {
        res.status(500).send(err1.message);
        return;
      }
      res.json(ai.exportMetadataAPI());
    });
  });
});
// Note this is metadata/IDENTIFIER/FILENAME because metadata/IDENTIFIER is caught above
// Note wont work as while goes explicitly to dweb.archive.org since pattern metadata/IDENTIFIER/FILE not handled by dweb-archivecontroller/Routing yet
// this will be diverted to dweb-metadata which cant handle this pattern yet - see https://github.com/internetarchive/dweb-archivecontroller/issues/11
// TODO should be retrieving. patching into main metadata and saving but note, not using on dweb-mirror when IPFS off
// app.get('/metadata/*', proxyJson("https://archive.org"));
app.get('/mds/v1/get_related/all/*', sendRelated);
// noinspection JSUnresolvedFunction
app.get('/mds/*', proxyJson('https://be-api.us.archive.org/'));
// Playlists are on a weird URL distinguished only by output=json, any other /embed/ request is left for later routes
app.get('/embed/:identifier', (req, res, next) => {
  const wantsPlaylist = req.query.output === 'json';
  if (!wantsPlaylist) {
    next();
    return;
  }
  sendPlaylist(req, res, next);
});
app.get('/playlist/:identifier', sendPlaylist);
// Special URL from mediawiki e.g. https://archive.org/stream/bdrc-W1FPL497/bdrc-W1FPL497#page/n2/mode/1up
// see https://github.com/internetarchive/dweb-mirror/issues/289 as this might be temporary
// NOTE(review): the two comment lines above describe a /stream/ url, but no /stream/ route is registered here - they look misplaced
// Thumbnails
// NOTE(review): '/download/:identifier/*' is registered earlier and its handler (streamArchiveFile) responds itself
// rather than calling next(), so the '/download/:identifier/__ia_thumb.jpg' pattern below looks unreachable - confirm
app.get([
'/download/:identifier/__ia_thumb.jpg',
'/services/img/:identifier',
'/thumbnail/:identifier' // Deprecated in favor of services/img)
], streamThumbnail); // streamThumbnail will try archive.org/services/img/identifier if all else fails
// Static files of the bookreader, served from the bookreader directory
app.get(['/bookreader/BookReader/*', '/archive/bookreader/BookReader/*'], _sendFileFromBookreader);
// e.g. '/BookReader/BookReaderJSIA.php?id=unitednov65unit&itemPath=undefined&server=undefined&format=jsonp&subPrefix=unitednov65unit&requestUri=/details/unitednov65unit')
app.get('/BookReader/BookReaderJSIA.php', sendBookReaderJSIA);
// e.g. http://ia802902.us.archive.org/BookReader/BookReaderJSON.php?itemPath=%2F28%2Fitems%2FArtOfCommunitySecondEdition&identifier=ArtOfCommunitySecondEdition&server=ia802902.us.archive.org
app.get(['/BookReader/BookReaderJSON.php',
'/books/:identifier/ia_manifest' // e.g. https://api.archivelab.org/books/ArtOfCommunitySecondEdition/ia_manifest
], sendBookReaderJSON);
// Page images and previews share one handler, which copes with both sets of parameters
app.get('/BookReader/BookReaderImages.php', sendBookReaderImages);
// e.g. /BookReader/BookReaderPreview.php?id=bdrc-W1KG14545&subPrefix=bdrc-W1KG14545&itemPath=/34/items/bdrc-W1KG14545&server=ia803001.us.archive.org&page=leaf4&fail=preview&&scale=11.652542372881356&rotate=0
app.get('/BookReader/BookReaderPreview.php', sendBookReaderImages);
// /ipfs/ is tried in two steps, in this order: first as an ipfs: url, and if opening that stream fails
// (the error is recorded and the next route tried) via the https://ipfs.io gateway
app.get('/ipfs/*', proxyUrl('ipfs:')); // Will go to next if IPFS transport not running
// app.get('/ipfs/*', proxyUpstream); // TODO dweb.me doesnt support /ipfs see https://github.com/internetarchive/dweb-mirror/issues/101
app.get('/ipfs/*', proxyUrl('https://ipfs.io')); // Will go to next if IPFS transport not running
// Recognize unmodified archive URLs
// favicon.ico is sent from the archiveui directory, the outcome is only logged
// noinspection JSUnresolvedVariable
app.get('/favicon.ico', (req, res, unusedNext) => {
  const sendOpts = {
    maxAge: '86400000',
    immutable: true
  };
  res.sendFile(`${config.archiveui.directory}/favicon.ico`, sendOpts, (err) => {
    // Do not go to Error, favicons often aborted
    if (err) {
      debug('favicon.ico %s', err.message);
    } else {
      debug('sent /favicon.ico');
    }
  });
});
app.get('/info', sendInfo);
// IIIF support e.g. https://iiif.archivelab.org/iiif/mantra-pangwisesan%240/1026,1245,2316,617/full/0/default.jpg
// :identifierindex is IDENTIFIER$INDEX where INDEX selects one of the item's page manifests
// :ltwh is left,top,width,height - the region of that page to extract
// Responds 400 for a malformed region, 404 with the error message if the page can't be fetched,
// and only commits to a 200 once there is a page stream to crop.
// TODO-IIIF move to a function, probably on ArchiveItem
app.get('/iiif/:identifierindex/:ltwh/full/0/default.jpg', (req, res, unusedNext) => {
  const [left, top, width, height] = req.params.ltwh.split(',').map((part) => parseInt(part, 10));
  if (![left, top, width, height].every((n) => Number.isInteger(n))) {
    res.status(400).send('Invalid IIIF region, expected left,top,width,height');
    return;
  }
  const [identifier, index] = req.params.identifierindex.split('$');
  const ai = new ArchiveItem({ identifier });
  waterfall([
    (cb) => ai.fetch_metadata(req.opts, cb),
    (ai1, cb) => ai1.fetch_bookreader(req.opts, cb),
    (ai1, cb) => {
      const manifest = ai1.pageManifests()[index];
      ai1.fetch_page(ai1.pageParms(manifest, {
        copyDirectory: req.opts.copyDirectory,
        wantStream: true,
        scale: 1,
        noCache: req.opts.noCache // The middleware sets req.opts.noCache; there is no req.opts.cache
      }), cb);
    },
  ],
  (err, s) => {
    if (err) {
      // Without this check a failed fetch would reach s.pipe with s undefined and throw outside any handler
      debug('IIIF failed for %s: %s', req.params.identifierindex, err.message);
      res.status(404).send(err.message);
      return;
    }
    const sh = sharp();
    let cropped;
    try {
      cropped = sh.extract({ left, top, width, height });
    } catch (errExtract) {
      // The region is rejected when the crop is configured (e.g. negative values)
      debug('IIIF invalid region %s: %s', req.params.ltwh, errExtract.message);
      res.status(400).send('Invalid IIIF region, expected left,top,width,height');
      return;
    }
    // An 'error' event with no listener would be thrown as an uncaught exception
    const onStreamError = (errStream) => {
      debug('IIIF stream error for %s: %s', req.params.identifierindex, errStream.message);
      if (res.headersSent) {
        res.end(); // Too late to report, just finish the response
      } else {
        res.status(500).send(errStream.message);
      }
    };
    s.on('error', onStreamError);
    sh.on('error', onStreamError);
    res.status(200);
    s.pipe(sh);
    cropped.pipe(res);
  });
});
app.get('/opensearch', searchExpress);
// For debugging - echo back the headers of the request as JSON
app.get('/echo', (req, res, unusedNext) => {
  res.status(200).json(req.headers);
});
/*
// For debugging routing
app.use('/xyzzy', (req, res, next) => {
// Expect for .get /xyzzy/*: /xyzzy/foo url=original=path=/xyzzy/foo baseUrl=undefined
// Expect for .use '/xyzzy : /xyzzy/foo url=path=/foo/bar baseUrl=/xyzzy original=/xyzzy/foo/bar
debug('xyzzy: .url=%s, .baseUrl=%s, .originalUrl=%s .path=%s .route=%s', req.url, req.baseUrl, req.originalUrl, req.path, req.route);
next();
});
*/
// ----------- Below here are intentionally at the end -------
// Lastly try a file - this will get archive.html, dweb-archive-bundle.js, favicon.ico, dweb-archive-styles.css
// Both routes look up the part of the url matched by * in the archiveui directory,
// and fall through to the error reporting below if the file can't be sent
app.get('/archive/*', _sendFileUrlArchive); // Must be after /archive/bookreader etc and before '/*'
app.get('/*', _sendFileUrlArchive);
// TODO add generic fallback to use Transports.js to lookup name and forward - but might auto-fix things that should really be caught and thought about
// Last resort, nothing above answered the request: report whatever errors were collected on the way
app.use((req, res, next) => {
  // See errAndNext() above which builds req.errs
  debug('FAILING: %s', req.url);
  const errs = req.errs || [];
  if (errs.length === 1) {
    next(errs[0]); // Just one error, use it
  } else if (errs.length > 1) {
    const combined = errs.map((err) => err.message).join('\n');
    next(new Error(combined)); // return them all
  } else {
    next(); // Generic 404
  }
});
// Start listening, and hand the server back to the caller straight away (listen errors arrive later, as events)
// noinspection JSUnresolvedVariable
const server = app.listen(config.apps.http.port); // Intentionally same port as Python gateway defaults to, api should converge
server.on('error', (err) => {
  if (err.code !== 'EADDRINUSE') {
    debug('Server hit error %o', err);
    throw (err); // Will be uncaught exception
  }
  // Port already taken: only logged, not thrown
  debug('A server, probably another copy of internetarchive is already listening on port %s', config.apps.http.port);
});
cb0(null, server); // Just in case this becomes async
}
exports = module.exports = mirrorHttp;