feat: crawl async without threading
indrajithi committed Aug 27, 2024
1 parent c0908c4 commit 0fc8d4a
Showing 7 changed files with 36 additions and 17 deletions.
13 changes: 12 additions & 1 deletion poetry.lock

Some generated files are not rendered by default.

3 changes: 2 additions & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "datacrawl"
version = "0.6.0"
version = "0.6.1"
description = "A simple and efficient web crawler in Python."
authors = ["Indrajith Indraprastham <indr4jith@gmail.com>"]
license = "GPL-3.0-or-later"
@@ -27,6 +27,7 @@ types-requests = "^2.32.0.20240712"
types-setuptools = "^71.1.0.20240726"
pytest-asyncio = "^0.23.8"
aioresponses = "^0.7.6"
+aiofiles = "^24.1.0"

[tool.poetry.group.dev.dependencies]
responses = "^0.13.4"
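The new aiofiles dependency suggests crawl results are now written without blocking the event loop. A minimal sketch of what such an async save could look like, assuming a crawl_result dict like the one in the crawler diff below; the file name and function body are illustrative, not the repository's actual save_results implementation.

import json

import aiofiles


async def save_results(crawl_result: dict, path: str = "crawl_results.json") -> None:
    # aiofiles.open yields an async context manager; the write is awaited,
    # so the event loop stays free to keep crawling while the file is written.
    async with aiofiles.open(path, "w") as f:
        await f.write(json.dumps(crawl_result, indent=4))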
2 changes: 1 addition & 1 deletion src/datacrawl/core/crawl_settings.py
@@ -52,7 +52,7 @@ class DataCrawlSettings:
internal_links_only: bool = False
external_links_only: bool = False
respect_robots_txt: bool = True
-max_retry_attempts: int = 5
+max_retry_attempts: int = 2


@dataclass
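With the default lowered from 5 to 2, callers that still want the old behaviour have to ask for it explicitly, as the updated test_spider.py does further down. A minimal sketch, assuming the CrawlSettings keyword names shown in the test diffs below (root_url, respect_robots_txt, max_retry_attempts); the URL is illustrative.

from datacrawl.core.crawl_settings import CrawlSettings

# Illustrative only: restores the previous retry behaviour for one crawl.
settings = CrawlSettings(
    root_url="https://example.com",
    respect_robots_txt=False,
    max_retry_attempts=5,
)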
20 changes: 10 additions & 10 deletions src/datacrawl/core/crawler.py
@@ -38,7 +38,10 @@ class Datacrawl:
settings: CrawlSettings

crawl_result: Dict[str, Dict[str, Any]] = field(default_factory=dict)
-update_callback: Optional[Callable[[str, Dict[str, Dict[str, Any]]], None]] = None
+update_callback: Optional[Callable[[str, Dict[str, Dict[str, Any]], Optional[str]], None]] = (
+None
+)
+job_id: str = ""
crawl_set: Set[str] = field(default_factory=set)
link_count: int = 0

@@ -115,7 +118,7 @@ async def crawl(self, session: aiohttp.ClientSession, url: str) -> None:
logger.debug("Links crawled: %s", self.link_count)

if self.update_callback:
-self.update_callback(url, self.crawl_result[url])
+self.update_callback(url, self.crawl_result[url], self.job_id)

def _should_skip_link(self, pretty_url: str, url: str) -> bool:
if not is_valid_url(pretty_url):
@@ -175,14 +178,11 @@ async def start(self) -> Dict[str, Dict[str, List[str]]]:
await self.crawl(session, self.settings.root_url)

while self.link_count < self.settings.max_links and self.crawl_set:
-tasks = [
-self.crawl(session, url)
-for url in list(self.crawl_set)[: self.settings.max_workers]
-]
-
-for task in asyncio.as_completed(tasks):
-await task
-await asyncio.sleep(self.settings.delay)
+url = self.crawl_set.pop()  # Pop the URL from crawl_set
+tasks = [self.crawl(session, url)]
+await asyncio.gather(*tasks)  # Use asyncio.gather to run the tasks
+
+await asyncio.sleep(self.settings.delay)

if self.settings.save_to_file:
await self.save_results()
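The rewritten loop pops a single URL from crawl_set and awaits it, so pages are now fetched one at a time with a delay between iterations instead of in max_workers-sized batches; asyncio.gather over a one-element list is equivalent to awaiting the coroutine directly. A minimal driver sketch, assuming the module path matches the file layout above and that Datacrawl(CrawlSettings(...)).start() is the public entry point shown in the diff; the root URL is illustrative.

import asyncio

from datacrawl.core.crawl_settings import CrawlSettings
from datacrawl.core.crawler import Datacrawl


async def main() -> None:
    # One URL is popped and awaited per pass of the while loop in start(),
    # so the crawl proceeds sequentially without any threads.
    crawler = Datacrawl(CrawlSettings(root_url="https://example.com"))
    results = await crawler.start()
    print(f"Crawled {len(results)} pages")


if __name__ == "__main__":
    asyncio.run(main())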
8 changes: 6 additions & 2 deletions src/datacrawl/networking/fetcher.py
@@ -4,6 +4,7 @@

import aiohttp
import requests
+from aiohttp import ClientSession, ClientTimeout
from bs4 import BeautifulSoup

from datacrawl.logger import get_logger
@@ -40,11 +41,12 @@ def fetch_url(url: str, retries: int, attempts: int = 0) -> Optional[BeautifulSo
return None


-async def fetch_url_async(session: aiohttp.ClientSession, url: str, retries: int) -> BeautifulSoup:
+async def fetch_url_async(session: ClientSession, url: str, retries: int) -> BeautifulSoup:
attempts = 0
while attempts <= retries:
try:
-async with session.get(url, timeout=10) as response:
+timeout = ClientTimeout(total=10)
+async with session.get(url, timeout=timeout) as response:
if response.status in TRANSIENT_ERRORS:
logger.error(
"Transient HTTP error occurred: %s. Retrying...",
@@ -64,6 +66,8 @@ async def fetch_url_async(session: aiohttp.ClientSession, url: str, retries: int
logger.error("Timeout error occurred: %s", timeout_err)
except aiohttp.ClientError as req_err:
logger.error("Request error occurred: %s", req_err)
+except Exception as err:
+logger.error("An unexpected error occurred: %s", err)
attempts += 1
await asyncio.sleep(attempts)
return None
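Two things change here: the timeout is now an explicit aiohttp.ClientTimeout object rather than a bare integer, and a catch-all except Exception guards against errors outside aiohttp's own hierarchy, with a linear back-off (await asyncio.sleep(attempts)) between retries. A usage sketch, assuming the module path datacrawl.networking.fetcher from the file header above; the URL and retry count are illustrative.

import asyncio

import aiohttp

from datacrawl.networking.fetcher import fetch_url_async


async def main() -> None:
    # Reuse one ClientSession for all requests; the context manager closes it.
    async with aiohttp.ClientSession() as session:
        soup = await fetch_url_async(session, "https://example.com", retries=2)
        if soup is not None:
            print(soup.title)


asyncio.run(main())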
3 changes: 2 additions & 1 deletion tests/core/test_crawller.py
@@ -1,3 +1,4 @@
+import aiohttp
import pytest
from aioresponses import aioresponses
from datacrawl.core.crawl_settings import CrawlSettings
@@ -50,5 +51,5 @@ async def test_crawl_invalid_url(crawler: Datacrawl) -> None:
with aioresponses() as m:
m.get("http://invalid-url", status=404)

-await crawler.crawl(None, "http://invalid-url")
+await crawler.crawl(aiohttp.ClientSession(), "http://invalid-url")
assert "http://invalid-url" not in crawler.crawl_result
4 changes: 3 additions & 1 deletion tests/core/test_spider.py
@@ -467,7 +467,9 @@ def test_crawl_url_transient_retry(
503,
)

-spider = Datacrawl(CrawlSettings(root_url=root_url, respect_robots_txt=False))
+spider = Datacrawl(
+CrawlSettings(root_url=root_url, respect_robots_txt=False, max_retry_attempts=5)
+)

with caplog.at_level(ERROR):
spider.crawl(root_url)
