From 15111f1cd2fffcb00b20d468212be4779ef6c5cb Mon Sep 17 00:00:00 2001
From: Indrajith Indraprastham
Date: Tue, 27 Aug 2024 21:42:09 +0530
Subject: [PATCH] feat: async crawl in parallel

---
 src/datacrawl/core/crawler.py | 18 +++++++++++-------
 tests/core/test_crawller.py   |  2 ++
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/src/datacrawl/core/crawler.py b/src/datacrawl/core/crawler.py
index 52873bb..010ae5e 100644
--- a/src/datacrawl/core/crawler.py
+++ b/src/datacrawl/core/crawler.py
@@ -178,14 +178,18 @@ async def start(self) -> Dict[str, Dict[str, List[str]]]:
             await self.crawl(session, self.settings.root_url)
 
             while self.link_count < self.settings.max_links and self.crawl_set:
-                url = self.crawl_set.pop()  # Pop the URL from crawl_set
-                tasks = [self.crawl(session, url)]
-                await asyncio.gather(*tasks)  # Use asyncio.gather to run the tasks
+                tasks: List[asyncio.Task] = []
+
+                while self.crawl_set and len(tasks) < self.settings.max_workers:
+                    url = self.crawl_set.pop()
+                    tasks.append(asyncio.create_task(self.crawl(session, url)))
+
+                await asyncio.gather(*tasks)
                 await asyncio.sleep(self.settings.delay)
 
-                if self.settings.save_to_file:
-                    await self.save_results()
+            if self.settings.save_to_file:
+                await self.save_results()
 
-                logger.debug("Exiting....")
-                return self.crawl_result
+            logger.debug("Exiting....")
+            return self.crawl_result
 
diff --git a/tests/core/test_crawller.py b/tests/core/test_crawller.py
index 22e2814..3e1ec1d 100644
--- a/tests/core/test_crawller.py
+++ b/tests/core/test_crawller.py
@@ -28,6 +28,8 @@ def crawler(crawl_settings: CrawlSettings) -> Datacrawl:
 
 @pytest.mark.asyncio
 async def test_crawl(crawler: Datacrawl) -> None:
+    crawler.settings.respect_robots_txt = False
+
     with aioresponses() as m:
         m.get(
             root_url,
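
Note for reviewers: the sketch below is a minimal, self-contained model of the
bounded-batch pattern this patch introduces, not datacrawl code. The fetch()
coroutine and the max_workers/max_links/delay values are hypothetical stand-ins;
only the drain-then-gather loop shape mirrors the patch.

import asyncio
from typing import List, Set


async def fetch(url: str) -> None:
    # Hypothetical stand-in for Datacrawl.crawl(session, url):
    # pretend to download and parse one page.
    await asyncio.sleep(0.1)
    print(f"crawled {url}")


async def run(crawl_set: Set[str], max_workers: int = 4,
              max_links: int = 10, delay: float = 0.5) -> None:
    link_count = 0
    while link_count < max_links and crawl_set:
        tasks: List[asyncio.Task] = []

        # Drain up to max_workers URLs into one batch of concurrent tasks.
        while crawl_set and len(tasks) < max_workers:
            tasks.append(asyncio.create_task(fetch(crawl_set.pop())))

        # The whole batch runs concurrently; the outer loop resumes only
        # once every task in the batch has finished.
        await asyncio.gather(*tasks)
        link_count += len(tasks)
        await asyncio.sleep(delay)  # politeness delay between batches


asyncio.run(run({f"https://example.com/page/{i}" for i in range(12)}))

One trade-off worth noting: asyncio.gather() waits for the slowest task in each
batch, so some workers can sit idle near the end of a batch. A per-URL task pool
bounded by an asyncio.Semaphore would keep all workers saturated, at the cost of
a less natural place for the per-batch delay.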