feat: crawl async without threading
indrajithi committed Aug 27, 2024
1 parent c0908c4 commit 0fc8d4a
Showing 7 changed files with 36 additions and 17 deletions.
13 changes: 12 additions & 1 deletion poetry.lock

Some generated files are not rendered by default.

3 changes: 2 additions & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "datacrawl"
version = "0.6.0"
version = "0.6.1"
description = "A simple and efficient web crawler in Python."
authors = ["Indrajith Indraprastham <indr4jith@gmail.com>"]
license = "GPL-3.0-or-later"
@@ -27,6 +27,7 @@ types-requests = "^2.32.0.20240712"
types-setuptools = "^71.1.0.20240726"
pytest-asyncio = "^0.23.8"
aioresponses = "^0.7.6"
+aiofiles = "^24.1.0"

[tool.poetry.group.dev.dependencies]
responses = "^0.13.4"
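The new aiofiles dependency suggests crawl results are now written without blocking the event loop. A minimal sketch of what such an async save could look like, assuming a crawl_result dict like the one in the crawler diff below; the file name and function body are illustrative, not the repository's actual save_results implementation.

import json

import aiofiles


async def save_results(crawl_result: dict, path: str = "crawl_results.json") -> None:
    # aiofiles.open yields an async context manager; the write is awaited,
    # so the event loop stays free to keep crawling while the file is written.
    async with aiofiles.open(path, "w") as f:
        await f.write(json.dumps(crawl_result, indent=4))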
2 changes: 1 addition & 1 deletion src/datacrawl/core/crawl_settings.py
@@ -52,7 +52,7 @@ class DataCrawlSettings:
internal_links_only: bool = False
external_links_only: bool = False
respect_robots_txt: bool = True
-max_retry_attempts: int = 5
+max_retry_attempts: int = 2


@dataclass
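With the default lowered from 5 to 2, callers that still want the old behaviour have to ask for it explicitly, as the updated test_spider.py does further down. A minimal sketch, assuming the CrawlSettings keyword names shown in the test diffs below (root_url, respect_robots_txt, max_retry_attempts); the URL is illustrative.

from datacrawl.core.crawl_settings import CrawlSettings

# Illustrative only: restores the previous retry behaviour for one crawl.
settings = CrawlSettings(
    root_url="https://example.com",
    respect_robots_txt=False,
    max_retry_attempts=5,
)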
20 changes: 10 additions & 10 deletions src/datacrawl/core/crawler.py
@@ -38,7 +38,10 @@ class Datacrawl:
settings: CrawlSettings

crawl_result: Dict[str, Dict[str, Any]] = field(default_factory=dict)
-update_callback: Optional[Callable[[str, Dict[str, Dict[str, Any]]], None]] = None
+update_callback: Optional[Callable[[str, Dict[str, Dict[str, Any]], Optional[str]], None]] = (
+None
+)
+job_id: str = ""
crawl_set: Set[str] = field(default_factory=set)
link_count: int = 0

@@ -115,7 +118,7 @@ async def crawl(self, session: aiohttp.ClientSession, url: str) -> None:
logger.debug("Links crawled: %s", self.link_count)

if self.update_callback:
-self.update_callback(url, self.crawl_result[url])
+self.update_callback(url, self.crawl_result[url], self.job_id)

def _should_skip_link(self, pretty_url: str, url: str) -> bool:
if not is_valid_url(pretty_url):
@@ -175,14 +178,11 @@ async def start(self) -> Dict[str, Dict[str, List[str]]]:
await self.crawl(session, self.settings.root_url)

while self.link_count < self.settings.max_links and self.crawl_set:
-tasks = [
-self.crawl(session, url)
-for url in list(self.crawl_set)[: self.settings.max_workers]
-]
-
-for task in asyncio.as_completed(tasks):
-await task
-await asyncio.sleep(self.settings.delay)
+url = self.crawl_set.pop()  # Pop the URL from crawl_set
+tasks = [self.crawl(session, url)]
+await asyncio.gather(*tasks)  # Use asyncio.gather to run the tasks
+
+await asyncio.sleep(self.settings.delay)

if self.settings.save_to_file:
await self.save_results()
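The rewritten loop pops a single URL from crawl_set and awaits it, so pages are now fetched one at a time with a delay between iterations instead of in max_workers-sized batches; asyncio.gather over a one-element list is equivalent to awaiting the coroutine directly. A minimal driver sketch, assuming the module path matches the file layout above and that Datacrawl(CrawlSettings(...)).start() is the public entry point shown in the diff; the root URL is illustrative.

import asyncio

from datacrawl.core.crawl_settings import CrawlSettings
from datacrawl.core.crawler import Datacrawl


async def main() -> None:
    # One URL is popped and awaited per pass of the while loop in start(),
    # so the crawl proceeds sequentially without any threads.
    crawler = Datacrawl(CrawlSettings(root_url="https://example.com"))
    results = await crawler.start()
    print(f"Crawled {len(results)} pages")


if __name__ == "__main__":
    asyncio.run(main())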
8 changes: 6 additions & 2 deletions src/datacrawl/networking/fetcher.py
@@ -4,6 +4,7 @@

import aiohttp
import requests
+from aiohttp import ClientSession, ClientTimeout
from bs4 import BeautifulSoup

from datacrawl.logger import get_logger
@@ -40,11 +41,12 @@ def fetch_url(url: str, retries: int, attempts: int = 0) -> Optional[BeautifulSo
return None


-async def fetch_url_async(session: aiohttp.ClientSession, url: str, retries: int) -> BeautifulSoup:
+async def fetch_url_async(session: ClientSession, url: str, retries: int) -> BeautifulSoup:
attempts = 0
while attempts <= retries:
try:
-async with session.get(url, timeout=10) as response:
+timeout = ClientTimeout(total=10)
+async with session.get(url, timeout=timeout) as response:
if response.status in TRANSIENT_ERRORS:
logger.error(
"Transient HTTP error occurred: %s. Retrying...",
@@ -64,6 +66,8 @@ async def fetch_url_async(session: aiohttp.ClientSession, url: str, retries: int
logger.error("Timeout error occurred: %s", timeout_err)
except aiohttp.ClientError as req_err:
logger.error("Request error occurred: %s", req_err)
+except Exception as err:
+logger.error("An unexpected error occurred: %s", err)
attempts += 1
await asyncio.sleep(attempts)
return None
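Two things change here: the timeout is now an explicit aiohttp.ClientTimeout object rather than a bare integer, and a catch-all except Exception guards against errors outside aiohttp's own hierarchy, with a linear back-off (await asyncio.sleep(attempts)) between retries. A usage sketch, assuming the module path datacrawl.networking.fetcher from the file header above; the URL and retry count are illustrative.

import asyncio

import aiohttp

from datacrawl.networking.fetcher import fetch_url_async


async def main() -> None:
    # Reuse one ClientSession for all requests; the context manager closes it.
    async with aiohttp.ClientSession() as session:
        soup = await fetch_url_async(session, "https://example.com", retries=2)
        if soup is not None:
            print(soup.title)


asyncio.run(main())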
3 changes: 2 additions & 1 deletion tests/core/test_crawller.py
@@ -1,3 +1,4 @@
+import aiohttp
import pytest
from aioresponses import aioresponses
from datacrawl.core.crawl_settings import CrawlSettings
@@ -50,5 +51,5 @@ async def test_crawl_invalid_url(crawler: Datacrawl) -> None:
with aioresponses() as m:
m.get("http://invalid-url", status=404)

-await crawler.crawl(None, "http://invalid-url")
+await crawler.crawl(aiohttp.ClientSession(), "http://invalid-url")
assert "http://invalid-url" not in crawler.crawl_result
4 changes: 3 additions & 1 deletion tests/core/test_spider.py
@@ -467,7 +467,9 @@ def test_crawl_url_transient_retry(
503,
)

-spider = Datacrawl(CrawlSettings(root_url=root_url, respect_robots_txt=False))
+spider = Datacrawl(
+CrawlSettings(root_url=root_url, respect_robots_txt=False, max_retry_attempts=5)
+)

with caplog.at_level(ERROR):
spider.crawl(root_url)
