-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmanga_crawler.py
65 lines (58 loc) · 2.43 KB
/
manga_crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
from mangafox.spiders.image_spider import MainSpider
# from scrapy.crawler import CrawlerProcess
from scrapy.crawler import CrawlerProcess
# from config_parser import change_to_manga_dir
from twisted.internet import reactor, defer
from scrapy.settings import Settings
class Crawler:
    """Run the ``MainSpider`` Scrapy spider for one or several chapters.

    Each instance owns a single ``CrawlerProcess`` configured with a fixed
    set of project-level settings.

    NOTE: a Twisted reactor cannot be started again after it has stopped,
    so the blocking entry points (``crawl_image_from_chapter`` and
    ``crawl_image_from_chapters``) are effectively one-shot per Python
    process.
    """

    def __init__(self):
        """Build the Scrapy settings and the ``CrawlerProcess`` that uses them."""
        settings = Settings()
        settings.setdict({
            'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
            'FEED_FORMAT': 'json',
            'DOWNLOAD_DELAY': 2,
            'REACTOR_THREADPOOL_MAXSIZE': 2,
            'LOG_LEVEL': 'WARNING'
        }, priority='project')
        self.process = CrawlerProcess(settings=settings)

    @staticmethod
    def _stop_reactor():
        """Stop the reactor, but only if it is actually running.

        Calling ``reactor.stop()`` on a reactor that has not been started
        raises ``ReactorNotRunning``; that happens when a crawl sequence
        finishes (or fails) before ``reactor.run()`` was ever reached.
        """
        if reactor.running:
            reactor.stop()

    def crawl_image_from_chapter(self, manga_link, chapter, path=None):
        """Crawl a single chapter and block until the crawl has finished.

        Args:
            manga_link: Forwarded to ``MainSpider`` as first positional
                argument.
            chapter: Forwarded to ``MainSpider`` as second positional
                argument.
            path: Forwarded to ``MainSpider`` as ``root_path``.
        """
        self.process.crawl(MainSpider, manga_link,
                           chapter, root_path=path)
        # start() runs the reactor and returns once the crawl is done.
        self.process.start()

    @defer.inlineCallbacks
    def crawl_multiple_str(self,
                           manga_link,
                           list_chapter_range,
                           path=None):
        """Crawl the chapters named by an iterable of strings, one at a time.

        Each entry is normalised before being handed to the spider: entries
        containing a ``'.'`` go through ``float`` (``'10.50'`` -> ``'10.5'``),
        all others through ``int`` (``'007'`` -> ``'7'``).

        The reactor is stopped when the sequence ends, whether it completed
        or was aborted by an error; the error itself stays in the returned
        Deferred.

        Args:
            manga_link: Forwarded to ``MainSpider``.
            list_chapter_range: Iterable of chapter identifiers as strings.
            path: Forwarded to ``MainSpider`` as ``root_path``.

        Returns:
            A Deferred that fires once every chapter has been crawled.
        """
        try:
            for mng in list_chapter_range:
                if '.' in mng:
                    yield self.process.crawl(MainSpider, manga_link,
                                             str(float(mng)), root_path=path)
                else:
                    # Progress output kept from the original implementation.
                    print(manga_link)
                    print(mng)
                    yield self.process.crawl(MainSpider, manga_link,
                                             str(int(mng)), root_path=path)
        finally:
            # Always release a caller blocked in reactor.run(), even when a
            # crawl failed or a chapter string could not be parsed.
            self._stop_reactor()

    @defer.inlineCallbacks
    def crawl_multiple(self,
                       manga_link,
                       list_chapter_range,
                       path=None):
        """Crawl an inclusive numeric chapter range, one chapter at a time.

        The reactor is stopped when the sequence ends, whether it completed
        or was aborted by an error; the error itself stays in the returned
        Deferred.

        Args:
            manga_link: Forwarded to ``MainSpider``.
            list_chapter_range: Sequence whose first two items are the first
                and last chapter numbers (both included).
            path: Forwarded to ``MainSpider`` as ``root_path``.

        Returns:
            A Deferred that fires once every chapter has been crawled.
        """
        try:
            for mng in range(list_chapter_range[0], list_chapter_range[1]+1):
                yield self.process.crawl(MainSpider, manga_link,
                                         str(mng), root_path=path)
        finally:
            # Always release a caller blocked in reactor.run(), even when a
            # crawl failed.
            self._stop_reactor()

    def crawl_image_from_chapters(self,
                                  manga_link,
                                  list_chapter_range,
                                  path=None):
        """Crawl several chapters sequentially and block until all are done.

        Dispatches on the type of the first element: a ``str`` selects
        ``crawl_multiple_str`` (explicit list of chapters), anything else
        selects ``crawl_multiple`` (``[first, last]`` inclusive range).
        An empty ``list_chapter_range`` is treated as "nothing to crawl".

        Args:
            manga_link: Forwarded to ``MainSpider``.
            list_chapter_range: Either a list of chapter strings or a
                two-item numeric range.
            path: Forwarded to ``MainSpider`` as ``root_path``.
        """
        if not list_chapter_range:
            return

        if isinstance(list_chapter_range[0], str):
            finished = self.crawl_multiple_str(manga_link, list_chapter_range,
                                               path=path)
        else:
            finished = self.crawl_multiple(manga_link, list_chapter_range,
                                           path=path)

        # If the sequence already ended synchronously (empty range, or an
        # error before the first crawl was scheduled) nobody is left to stop
        # the reactor, so starting it would block forever.
        if not finished.called:
            reactor.run()
# self.process.start()
# name = manga_link.split('/')[4]