Merge pull request #1817 from dipu-bd/dev
Dev
dipu-bd authored Dec 11, 2022
2 parents 9130a0d + 66ba3a1 commit f916c65
Showing 15 changed files with 419 additions and 384 deletions.
650 changes: 325 additions & 325 deletions README.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion lncrawl/VERSION
@@ -1 +1 @@
-3.2.1
+3.2.2
1 change: 1 addition & 0 deletions lncrawl/bots/console/__init__.py
@@ -13,6 +13,7 @@ def __init__(self) -> None:

         from .get_crawler import (
             choose_a_novel,
+            confirm_guessed_novel,
             confirm_retry,
             get_crawlers_to_search,
             get_novel_url,
23 changes: 21 additions & 2 deletions lncrawl/bots/console/get_crawler.py
@@ -40,9 +40,28 @@ def get_novel_url(self):
     raise LNException("Novel page url or query was not given")


-def get_crawlers_to_search(self) -> List[str]:
+def confirm_guessed_novel(self, guessed_title: str):
+    """Returns a novel page url from a novelupdates query"""
+    args = get_args()
+    if args.suppress:
+        return guessed_title
+
+    answer = prompt(
+        [
+            {
+                "type": "input",
+                "name": "novel",
+                "message": "Enter novelupdates query:",
+                "default": guessed_title,
+                "validate": lambda a: True if a else "Input should not be empty",
+            },
+        ]
+    )
+    return answer["novel"].strip()
+
+
+def get_crawlers_to_search(self, links: List[str]) -> List[str]:
     """Returns user choice to search the choosen sites for a novel"""
-    links = self.app.crawler_links
     if not links:
         return []

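Note: confirm_guessed_novel uses a PyInquirer-style question dict. A minimal standalone sketch of the same confirm step, assuming questionary's prompt as the prompt provider and a hypothetical guessed title (neither is part of this commit):

    from questionary import prompt  # assumption: a PyInquirer-compatible prompt

    guessed = "Lord of the Mysteries"  # hypothetical title guessed from a URL
    answer = prompt(
        [
            {
                "type": "input",
                "name": "novel",
                "message": "Enter novelupdates query:",
                "default": guessed,  # user can accept or edit the guess
                "validate": lambda a: True if a else "Input should not be empty",
            },
        ]
    )
    print(answer["novel"].strip())  # the query to search on novelupdates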
46 changes: 29 additions & 17 deletions lncrawl/bots/console/integration.py
@@ -6,8 +6,9 @@
 from ...core import display
 from ...core.app import App
 from ...core.arguments import get_args
+from ...core.crawler import Crawler
 from ...core.exeptions import LNException
-from ...core.sources import prepare_crawler, rejected_sources
+from ...core.sources import crawler_list, prepare_crawler, rejected_sources
 from .open_folder_prompt import display_open_folder
 from .resume_download import resume_session

@@ -37,25 +38,36 @@ def start(self):

     # Process user input
     self.app.user_input = self.get_novel_url()
-    try:
-        self.app.prepare_search()
-        self.search_mode = not self.app.crawler
-    except LNException as e:
-        raise e
-    except Exception as e:
-        if self.app.user_input.startswith("http"):
-            url = urlparse(self.app.user_input)
-            url = "%s://%s/" % (url.scheme, url.hostname)
-            if url in rejected_sources:
-                display.url_rejected(rejected_sources[url])
-        else:
-            display.url_not_recognized()
-
-        raise LNException(f"Fail to init crawler. Error: {e}")
+    if not self.app.user_input.startswith("http"):
+        logger.info("Detected query input")
+        search_links = [
+            str(link)
+            for link, crawler in crawler_list.items()
+            if crawler.search_novel != Crawler.search_novel
+        ]
+        self.search_mode = True
+    else:
+        url = urlparse(self.app.user_input)
+        url = "%s://%s/" % (url.scheme, url.hostname)
+        if url in rejected_sources:
+            display.url_rejected(rejected_sources[url])
+            raise LNException("Fail to init crawler: %s is rejected", url)
+        try:
+            logger.info("Detected URL input")
+            self.app.crawler = prepare_crawler(self.app.user_input)
+            self.search_mode = False
+        except Exception as e:
+            display.url_not_recognized()
+            logger.debug("Trying to find it in novelupdates", e)
+            guess = self.app.guess_novel_title(self.app.user_input)
+            display.guessed_url_for_novelupdates()
+            self.app.user_input = self.confirm_guessed_novel(guess)
+            search_links = ["https://www.novelupdates.com/"]
+            self.search_mode = True

     # Search for novels
     if self.search_mode:
-        self.app.crawler_links = self.get_crawlers_to_search()
+        self.app.crawler_links = self.get_crawlers_to_search(search_links)
         self.app.search_novel()

     def _download_novel():
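The net effect of this change: plain-text input searches all searchable sources, a recognized URL goes straight to its crawler, and an unrecognized URL now falls back to a novelupdates title search instead of failing outright. A condensed sketch of that routing, paraphrased from the hunk above (route_user_input and its return values are illustrative; app stands for the App instance):

    from urllib.parse import urlparse

    from lncrawl.core.sources import prepare_crawler, rejected_sources

    def route_user_input(app, user_input: str):
        if not user_input.startswith("http"):
            # Plain text: search every source that implements search_novel.
            return "search-all-sources", user_input
        url = urlparse(user_input)
        base = "%s://%s/" % (url.scheme, url.hostname)
        if base in rejected_sources:
            # The bot raises LNException here and stops.
            raise ValueError("rejected source: " + base)
        try:
            # Known source: crawl the URL directly.
            return "direct-crawl", prepare_crawler(user_input)
        except Exception:
            # Unknown source: guess the title, then search novelupdates only.
            return "search-novelupdates", app.guess_novel_title(user_input)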
2 changes: 2 additions & 0 deletions lncrawl/bots/discord/message_handler.py
@@ -145,6 +145,8 @@ def handle_novel_url(self):
                 "Sorry! I do not recognize this sources yet.",
                 "See list of supported sources here:",
                 "https://github.com/dipu-bd/lightnovel-crawler#c3-supported-sources",
+                "",
+                "You can send the novelupdates link of the novel too.",
             ]
         )
     )
3 changes: 3 additions & 0 deletions lncrawl/bots/telegram/__init__.py
@@ -211,6 +211,9 @@ def handle_novel_url(self, bot, update, user_data):
         update.message.reply_text(
             "Enter something again or send /cancel to stop."
         )
+        update.message.reply_text(
+            "You can send the novelupdates link of the novel too.",
+        )
         return "handle_novel_url"

     if app.crawler:
18 changes: 18 additions & 0 deletions lncrawl/core/app.py
@@ -6,17 +6,21 @@
 from typing import Dict, List, Optional, Tuple
 from urllib.parse import urlparse

+from readability import Document
 from slugify import slugify

 from .. import constants as C
 from ..binders import available_formats, generate_books
 from ..core.exeptions import LNException
 from ..core.sources import crawler_list, prepare_crawler
 from ..models import Chapter, CombinedSearchResult, OutputFormat
+from .browser import Browser
 from .crawler import Crawler
 from .downloader import fetch_chapter_body, fetch_chapter_images
+from .exeptions import ScraperErrorGroup
 from .novel_info import format_novel, save_metadata
 from .novel_search import search_novels
+from .scraper import Scraper

 logger = logging.getLogger(__name__)

@@ -77,6 +81,20 @@ def prepare_search(self):
             if crawler.search_novel != Crawler.search_novel
         ]

+    def guess_novel_title(self, url: str) -> str:
+        try:
+            scraper = Scraper(url)
+            response = scraper.get_response(url)
+            reader = Document(response.text)
+        except ScraperErrorGroup as e:
+            if logger.isEnabledFor(logging.DEBUG):
+                logger.exception("Failed to get response: %s", e)
+            with Browser() as browser:
+                browser.visit(url)
+                browser.wait("body")
+                reader = Document(browser.html)
+        return reader.short_title()
+
     def search_novel(self):
         """Requires: user_input, crawler_links"""
         """Produces: search_results"""
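The new App.guess_novel_title falls back from a plain HTTP fetch to a headless browser, but the core idea is readability's Document.short_title(). A standalone sketch of that idea, assuming the requests and readability-lxml packages are installed (the URL is a placeholder):

    import requests
    from readability import Document  # provided by readability-lxml

    html = requests.get("https://example.com/some-novel/").text
    print(Document(html).short_title())  # page title with site-name noise trimmed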
13 changes: 12 additions & 1 deletion lncrawl/core/display.py
@@ -6,10 +6,10 @@
 from colorama import Fore, Style

 from ..assets.chars import Chars
-from ..utils.platforms import Platform
 from ..core.exeptions import LNException
 from ..models import CombinedSearchResult, SearchResult
 from ..models.meta import MetaInfo
+from ..utils.platforms import Platform

 LINE_SIZE = 80
 ENABLE_BANNER = not Platform.windows or Platform.java

@@ -163,6 +163,17 @@ def url_not_recognized():
     # 'https://github.com/dipu-bd/lightnovel-crawler/issues', Fore.RESET)


+def guessed_url_for_novelupdates():
+    print()
+    print(
+        Fore.GREEN,
+        Chars.CLOVER,
+        "You can search novelupdates to find this novel!",
+        Fore.RESET,
+    )
+    print()
+
+
 def url_rejected(reason):
     print()
     print(Fore.RED, Chars.ERROR, "Sorry! I do not support this website.", Fore.RESET)
7 changes: 1 addition & 6 deletions lncrawl/core/sources.py
@@ -14,7 +14,6 @@
 from packaging import version

 from ..assets.version import get_version
-from ..templates.novelupdates import NovelupdatesTemplate
 from ..utils.platforms import Platform
 from .arguments import get_args
 from .crawler import Crawler

@@ -348,11 +347,7 @@ def prepare_crawler(url: str) -> Optional[Crawler]:

     CrawlerType = crawler_list.get(base_url)
     if not CrawlerType:
-        # raise LNException("No crawler found for " + base_url)
-        logger.info(f"No crawler for {base_url} |> Trying with www.novelupdates.com")
-        base_url = "https://www.novelupdates.com/"
-        CrawlerType = NovelupdatesTemplate
-        CrawlerType.base_url = [base_url]
+        raise LNException("No crawler found for " + base_url)

     logger.info(
         "Initializing crawler for: %s [%s]",
2 changes: 1 addition & 1 deletion lncrawl/core/taskman.py
@@ -11,7 +11,7 @@

 T = TypeVar("T")

-MAX_WORKER_COUNT = 10
+MAX_WORKER_COUNT = 5
 MAX_REQUESTS_PER_DOMAIN = 25

 _resolver = Semaphore(1)
1 change: 1 addition & 0 deletions lncrawl/templates/novelpub.py
@@ -17,6 +17,7 @@

 class NovelPubTemplate(SearchableBrowserTemplate, ChapterOnlyBrowserTemplate):
     def initialize(self) -> None:
+        self.init_executor(3)
         self.cleaner.bad_tags.update(["div"])
         self.cleaner.bad_css.update(
             [
29 changes: 1 addition & 28 deletions lncrawl/templates/novelupdates.py
@@ -9,7 +9,6 @@

 from lncrawl.core.browser import EC
 from lncrawl.core.crawler import Crawler
-from lncrawl.core.exeptions import LNException
 from lncrawl.models import Chapter, SearchResult
 from lncrawl.templates.browser.chapter_only import ChapterOnlyBrowserTemplate
 from lncrawl.templates.browser.searchable import SearchableBrowserTemplate

@@ -57,7 +56,7 @@ def select_search_items(self, query: str):
     def select_search_items_in_browser(self, query: str):
         query = dict(sf=1, sh=query, sort="srank", order="desc")
         self.visit(f"https://www.novelupdates.com/series-finder/?{urlencode(query)}")
-        self.browser.wait(".search_main_box_nu")
+        self.browser.wait(".l-main .search_main_box_nu")
         yield from self.browser.soup.select(".l-main .search_main_box_nu")

     def parse_search_item(self, tag: Tag) -> SearchResult:

@@ -86,32 +85,6 @@ def parse_search_item(self, tag: Tag) -> SearchResult:
             url=self.absolute_url(a["href"]),
         )

-    def get_novel_soup(self) -> BeautifulSoup:
-        if self.novel_url.startswith("https://www.novelupdates.com"):
-            return self.get_soup(self.novel_url)
-        else:
-            return self.guess_novelupdates_link(self.novel_url)
-
-    def guess_novelupdates_link(self, url: str) -> str:
-        # Guess novel title
-        response = self.get_response(url)
-        reader = Document(response.text)
-        title = reader.short_title()
-        logger.info("Original title = %s", title)
-
-        title = title.rsplit("-", 1)[0].strip() or title
-        title = re.sub(r"[^\w\d ]+", " ", title.lower())
-        title = " ".join(title.split(" ")[:10])
-        logger.info("Guessed title = %s", title)
-
-        # Search by guessed title in novelupdates
-        novels = self.search_novel(title)
-        if len(novels) != 1:
-            raise LNException("Not supported for " + self.novel_url)
-
-        self.novel_url = novels[0].url
-        return self.get_soup(self.novel_url)
-
     def parse_title(self, soup: BeautifulSoup) -> str:
         return soup.select_one(".seriestitlenu").text
4 changes: 2 additions & 2 deletions lncrawl/webdriver/local.py
@@ -13,7 +13,7 @@
 from webdriver_manager.chrome import ChromeDriverManager

 from ..core.soup import SoupMaker
-from ..utils.platforms import Platform, Screen
+from ..utils.platforms import Screen, has_display
 from .elements import WebElement
 from .job_queue import _acquire_queue, _release_queue

@@ -49,7 +49,7 @@ def create_local(
     if not options:
         options = ChromeOptions()

-    if not Platform.display:
+    if not has_display():
         headless = True

     # Set default language
2 changes: 1 addition & 1 deletion sources/_index.json

Large diffs are not rendered by default.
