-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfileDownload.js
207 lines (188 loc) · 6.2 KB
/
fileDownload.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
const fs = require('fs')
const path = require('path')
const Crawler = require("crawler")
/**
 * Builds a reusable fill-in function from literal text segments and the
 * placeholder keys that sit between them.
 *
 * Integer keys are looked up positionally in the arguments of the returned
 * function; any other key is looked up on its last argument, which is treated
 * as a dictionary. Missing values render as an empty string because the pieces
 * are combined with `Array.prototype.join`.
 *
 * @param {string[]} strings - Literal segments; `strings[i + 1]` follows `keys[i]`.
 * @param {...(number|string)} keys - Placeholder keys, in order of appearance.
 * @returns {function(...*): string} Function that produces the filled-in string.
 */
function template(strings, ...keys) {
  function fill(...values) {
    // The trailing argument doubles as the lookup table for named keys.
    const lookup = values[values.length - 1] || {};
    const pieces = [strings[0]];
    for (let i = 0; i < keys.length; i += 1) {
      const key = keys[i];
      const replacement = Number.isInteger(key) ? values[key] : lookup[key];
      pieces.push(replacement, strings[i + 1]);
    }
    return pieces.join('');
  }
  return fill;
}
/**
 * Creates a downloader that crawls a novel site and stores each chapter as a
 * standalone HTML file inside `dir`.
 *
 * First argument (site rules):
 * @param {object} siteConfig
 * @param {object} siteConfig.novelHome - Reads `template` (URL pattern whose
 *   literal `data` token is replaced by the novel id), `titleSel` and `descSel`.
 * @param {object} siteConfig.ChapterHome - Reads `template`, optional
 *   `templateOfAll`, `chapterSel`, and an optional
 *   `getArticleUrls($, 1, [], getPageAsync)` hook that returns the chapter list.
 * @param {object} siteConfig.articleHome - Reads `contentSel`, `hasNextPage`,
 *   `totalReg` and `writeSinglePage`.
 * @param {string} [siteConfig.css] - Stylesheet href linked from every page.
 * @param {string[]} [siteConfig.contentClassName=[]] - Class names of nested
 *   wrapper `<div>`s placed around the chapter body, outermost first.
 * @param {{repReg: (RegExp|string), repStr: string}[]} [siteConfig.replacers=[]]
 *   - Replacements applied, in order, to the chapter HTML.
 *
 * Second argument (run options):
 * @param {object} runOptions
 * @param {number} runOptions.maxConnections - Passed to the crawler and used as
 *   the chapter batch size (falls back to 1 when not a positive integer).
 * @param {number} runOptions.rateLimit - Passed to the crawler.
 * @param {string} runOptions.dir - Output directory.
 * @param {string} runOptions.host - Prefix for URLs that do not contain `http`.
 * @param {string|number} runOptions.novelId - Value substituted into the templates.
 * @param {string|number} [runOptions.staticId] - Overrides `novelId` for the
 *   chapter list URL when set.
 * @param {string} runOptions.mod - `'index'` names files by zero-padded chapter
 *   index; any other value names them by chapter title.
 * @returns {{getPageAsync: Function, writeFileAsync: Function, writeContent: Function,
 *   getAllChapters: Function, getAll: Function}} The downloader API.
 */
const fileDownload = ({ novelHome: novelHomeConfig, ChapterHome: chapterHomeConfig, articleHome: articleHomeConfig, css, contentClassName = [], replacers = [] }, {
  maxConnections,
  rateLimit,
  dir,
  host,
  novelId,
  staticId,
  mod
}) => {
  // Literal token inside the URL templates that gets replaced by an id.
  const PLACEHOLDER = 'data';
  const novelObj = {
    title: '',
    desc: ''
  };
  const c = new Crawler({
    rateLimit, // minimum interval between two tasks
    maxConnections // maximum number of concurrent requests
  });
  // A missing or invalid maxConnections used to turn the batching arithmetic
  // into NaN and silently download nothing; fall back to one page at a time.
  const batchSize = Number.isInteger(maxConnections) && maxConnections > 0 ? maxConnections : 1;

  // Prefixes `host` unless the URL already contains "http".
  const toAbsolute = (url) => (url.indexOf('http') > -1 ? url : host + url);

  // Substitutes `value` for the placeholder token in a URL pattern.
  const fillTemplate = (pattern, value) => template(pattern.split(PLACEHOLDER), PLACEHOLDER)({ data: value });

  // Queues one URL and settles with its parsed page (`res.$`) or an Error.
  const fetchPage = (url) => new Promise((resolve, reject) => {
    c.queue([{
      uri: url,
      callback(err, res, done) {
        try {
          if (err) {
            reject(err instanceof Error ? err : new Error(String(err)));
          } else if (!res || res.statusCode !== 200) {
            reject(new Error(`Request to ${url} failed with status ${res ? res.statusCode : 'unknown'}`));
          } else {
            resolve(res.$);
          }
        } finally {
          // Always hand the slot back to the crawler, even on failure.
          done();
        }
      }
    }]);
  });

  return {
    /**
     * Fetches one or more pages through the shared crawler.
     *
     * Falsy entries are dropped, so the result only lines up with the input
     * when every entry is a usable URL. Does not depend on `this`, which lets
     * it be handed to config hooks as a bare function.
     *
     * @param {string|string[]} urls - A single URL or a list of URLs.
     * @returns {Promise<Array>} The `res.$` object of each fetched page, in
     *   request order; rejects with an Error on the first failed request.
     */
    getPageAsync(urls) {
      const list = typeof urls === 'string' ? [urls] : urls;
      return Promise.all(list.filter(Boolean).map((url) => fetchPage(url)));
    },

    /**
     * Writes `data` as UTF-8 to `fileName` inside the output directory.
     *
     * @param {string} fileName - File name relative to `dir`.
     * @param {string} data - File contents.
     * @returns {Promise<string>} Resolves with `data` once written.
     */
    writeFileAsync(fileName, data) {
      return new Promise((resolve, reject) => {
        fs.writeFile(path.join(dir, fileName), data, 'utf-8', (err) => {
          if (err) {
            reject(err);
          } else {
            resolve(data);
          }
        });
      });
    },

    /**
     * Turns one fetched chapter page into an HTML document and saves it.
     *
     * When the `hasNextPage` element has text, the chapter is treated as
     * paginated: this page is saved with its page number as suffix and the
     * remaining pages are delegated to `articleHome.writeSinglePage`.
     *
     * @param {Function} pageDom - Selector function of the fetched page.
     * @param {number} index - Zero-based chapter position.
     * @param {string} url - Chapter URL (used for logging and pagination).
     * @param {string} title - Chapter title.
     * @returns {Promise<void>}
     */
    async writeContent(pageDom, index, url, title) {
      console.log(`fetching url:${url}`);
      const { contentSel, hasNextPage, totalReg, writeSinglePage } = articleHomeConfig;
      const pagerText = pageDom(hasNextPage).text();
      // `.html()` yields null when the selector matches nothing.
      const rawContent = pageDom(contentSel).html() || '';
      // Apply the configured replacers in order.
      const cleaned = replacers.reduce(
        (html, { repReg, repStr }) => html.replace(repReg, repStr),
        rawContent
      );
      // Main content.
      const mainContent = [`<h1>${title}</h1>`, '<br><br>', cleaned].join('\n');
      // Wrapper layers: first class name is the outermost <div>. The class
      // attribute is quoted so multi-word class lists stay intact.
      const body = contentClassName.reduceRight(
        (inner, className) => `<div class="${className}">${inner}</div>`,
        mainContent
      );
      const content = [
        '',
        '<html>',
        '<head>',
        '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">',
        css ? `<link rel="stylesheet" type="text/css" href="${css}">` : '',
        `<title>${novelObj.title}</title>`,
        '</head>',
        '<body>',
        body,
        '</body>',
        '</html>',
        ''
      ].join('\n');
      const fileStem = mod === 'index' ? String(index + 1).padStart(8, '0') : title;

      if (pagerText.trim()) { // the page shows a current page number
        // Take the whole number, not just its first digit.
        const pageMatch = pagerText.match(/\d+/);
        const num = parseInt(pageMatch && pageMatch[0], 10); // current page number
        // A page number is present on every page except the last one.
        const totalMatch = content.match(totalReg);
        const total = parseInt(totalMatch && totalMatch[0], 10);
        await this.writeFileAsync(`${fileStem}_${num}.html`, content);
        // Awaited so that failures surface here and the chapter is complete
        // before the caller moves on (no-op if the hook returns nothing).
        await writeSinglePage(pageDom, index, url, total, num + 1, contentSel, title, async (indexName, titleName, data) => {
          await this.writeFileAsync(mod === 'index' ? indexName : titleName, data);
        });
        // FIXME: there may be other pagination cases that are not handled.
      } else {
        await this.writeFileAsync(`${mod === 'index' ? `${fileStem}_0` : fileStem}.html`, content);
      }
    },

    /**
     * Downloads and saves every chapter, fetching in batches and writing the
     * pages of each batch in order.
     *
     * Entries without an `href` are skipped up front so that fetched pages
     * always line up with their chapter metadata.
     *
     * @param {{href: string, title: string, index: number}[]} chapterArr
     * @returns {Promise<void>}
     */
    async getAllChapters(chapterArr) {
      const chapters = chapterArr.filter((chapter) => chapter && chapter.href);
      for (let start = 0; start < chapters.length; start += batchSize) {
        const batch = chapters.slice(start, start + batchSize);
        const pages = await this.getPageAsync(batch.map((chapter) => chapter.href));
        for (let i = 0; i < pages.length; i += 1) {
          const { index, href, title } = batch[i];
          console.log(index, href, title);
          await this.writeContent(pages[i], index, href, title);
        }
      }
    },

    /**
     * Entry point: resolves the novel and chapter-list URLs, collects the
     * chapter list and downloads every chapter.
     *
     * When `ChapterHome.getArticleUrls` is provided it supplies the chapter
     * list (the novel home page is not fetched in that case); otherwise the
     * list is scraped with `ChapterHome.chapterSel`.
     *
     * @returns {Promise<void>} Settles when all chapters are written, so
     *   callers can await completion or handle failures.
     */
    getAll() {
      const novelPath = fillTemplate(novelHomeConfig.template, novelId);
      const chapterPattern = chapterHomeConfig.templateOfAll || chapterHomeConfig.template;
      const chapterPath = fillTemplate(chapterPattern, staticId || novelId);
      const novelHome = toAbsolute(novelPath);
      const ChapterHome = toAbsolute(chapterPath);
      console.log('\n书籍首页:', novelHome);
      console.log('章节首页:', ChapterHome, '\n');

      const { getArticleUrls } = chapterHomeConfig;
      if (getArticleUrls) {
        return this.getPageAsync(ChapterHome).then(async ([$]) => {
          const chapterArr = await getArticleUrls($, 1, [], this.getPageAsync);
          return this.getAllChapters(chapterArr);
        });
      }

      return this.getPageAsync([novelHome, ChapterHome]).then(([novelHomePage, $]) => {
        novelObj.title = novelHomePage(novelHomeConfig.titleSel).text();
        novelObj.desc = novelHomePage(novelHomeConfig.descSel).text();
        const chapterArr = Array.from($(chapterHomeConfig.chapterSel))
          .map((el, index) => {
            const anchor = $(el);
            const rawHref = anchor.attr('href');
            if (!rawHref) {
              // No link to follow; `index` still reflects the DOM position.
              return null;
            }
            let href = rawHref;
            if (chapterHomeConfig.templateOfAll) {
              // Rebuild the chapter URL from the first number in the link.
              const idMatch = rawHref.match(/\d+/);
              href = host + fillTemplate(chapterHomeConfig.template, idMatch ? idMatch[0] : '');
            }
            return {
              href: toAbsolute(href),
              title: anchor.text(),
              index
            };
          })
          .filter(Boolean);
        return this.getAllChapters(chapterArr);
      });
    }
  };
};
// The downloader factory is this module's only export.
module.exports = fileDownload