Merge pull request #1817 from dipu-bd/dev
Dev
dipu-bd authored Dec 11, 2022
2 parents 9130a0d + 66ba3a1 commit f916c65
Showing 15 changed files with 419 additions and 384 deletions.
650 changes: 325 additions & 325 deletions README.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion lncrawl/VERSION
@@ -1 +1 @@
-3.2.1
+3.2.2
1 change: 1 addition & 0 deletions lncrawl/bots/console/__init__.py
@@ -13,6 +13,7 @@ def __init__(self) -> None:

         from .get_crawler import (
             choose_a_novel,
+            confirm_guessed_novel,
             confirm_retry,
             get_crawlers_to_search,
             get_novel_url,
23 changes: 21 additions & 2 deletions lncrawl/bots/console/get_crawler.py
@@ -40,9 +40,28 @@ def get_novel_url(self):
     raise LNException("Novel page url or query was not given")


-def get_crawlers_to_search(self) -> List[str]:
+def confirm_guessed_novel(self, guessed_title: str):
+    """Returns a novel page url from a novelupdates query"""
+    args = get_args()
+    if args.suppress:
+        return guessed_title
+
+    answer = prompt(
+        [
+            {
+                "type": "input",
+                "name": "novel",
+                "message": "Enter novelupdates query:",
+                "default": guessed_title,
+                "validate": lambda a: True if a else "Input should not be empty",
+            },
+        ]
+    )
+    return answer["novel"].strip()
+
+
+def get_crawlers_to_search(self, links: List[str]) -> List[str]:
     """Returns user choice to search the choosen sites for a novel"""
-    links = self.app.crawler_links
     if not links:
         return []

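Note: confirm_guessed_novel uses a PyInquirer-style question dict. A minimal standalone sketch of the same confirm step, assuming questionary's prompt as the prompt provider and a hypothetical guessed title (neither is part of this commit):

    from questionary import prompt  # assumption: a PyInquirer-compatible prompt

    guessed = "Lord of the Mysteries"  # hypothetical title guessed from a URL
    answer = prompt(
        [
            {
                "type": "input",
                "name": "novel",
                "message": "Enter novelupdates query:",
                "default": guessed,  # user can accept or edit the guess
                "validate": lambda a: True if a else "Input should not be empty",
            },
        ]
    )
    print(answer["novel"].strip())  # the query to search on novelupdates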
46 changes: 29 additions & 17 deletions lncrawl/bots/console/integration.py
@@ -6,8 +6,9 @@
 from ...core import display
 from ...core.app import App
 from ...core.arguments import get_args
+from ...core.crawler import Crawler
 from ...core.exeptions import LNException
-from ...core.sources import prepare_crawler, rejected_sources
+from ...core.sources import crawler_list, prepare_crawler, rejected_sources
 from .open_folder_prompt import display_open_folder
 from .resume_download import resume_session

@@ -37,25 +38,36 @@ def start(self):

     # Process user input
     self.app.user_input = self.get_novel_url()
-    try:
-        self.app.prepare_search()
-        self.search_mode = not self.app.crawler
-    except LNException as e:
-        raise e
-    except Exception as e:
-        if self.app.user_input.startswith("http"):
-            url = urlparse(self.app.user_input)
-            url = "%s://%s/" % (url.scheme, url.hostname)
-            if url in rejected_sources:
-                display.url_rejected(rejected_sources[url])
-        else:
-            display.url_not_recognized()
-
-        raise LNException(f"Fail to init crawler. Error: {e}")
+    if not self.app.user_input.startswith("http"):
+        logger.info("Detected query input")
+        search_links = [
+            str(link)
+            for link, crawler in crawler_list.items()
+            if crawler.search_novel != Crawler.search_novel
+        ]
+        self.search_mode = True
+    else:
+        url = urlparse(self.app.user_input)
+        url = "%s://%s/" % (url.scheme, url.hostname)
+        if url in rejected_sources:
+            display.url_rejected(rejected_sources[url])
+            raise LNException("Fail to init crawler: %s is rejected", url)
+        try:
+            logger.info("Detected URL input")
+            self.app.crawler = prepare_crawler(self.app.user_input)
+            self.search_mode = False
+        except Exception as e:
+            display.url_not_recognized()
+            logger.debug("Trying to find it in novelupdates", e)
+            guess = self.app.guess_novel_title(self.app.user_input)
+            display.guessed_url_for_novelupdates()
+            self.app.user_input = self.confirm_guessed_novel(guess)
+            search_links = ["https://www.novelupdates.com/"]
+            self.search_mode = True

     # Search for novels
     if self.search_mode:
-        self.app.crawler_links = self.get_crawlers_to_search()
+        self.app.crawler_links = self.get_crawlers_to_search(search_links)
         self.app.search_novel()

     def _download_novel():
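The net effect of this change: plain-text input searches all searchable sources, a recognized URL goes straight to its crawler, and an unrecognized URL now falls back to a novelupdates title search instead of failing outright. A condensed sketch of that routing, paraphrased from the hunk above (route_user_input and its return values are illustrative; app stands for the App instance):

    from urllib.parse import urlparse

    from lncrawl.core.sources import prepare_crawler, rejected_sources

    def route_user_input(app, user_input: str):
        if not user_input.startswith("http"):
            # Plain text: search every source that implements search_novel.
            return "search-all-sources", user_input
        url = urlparse(user_input)
        base = "%s://%s/" % (url.scheme, url.hostname)
        if base in rejected_sources:
            # The bot raises LNException here and stops.
            raise ValueError("rejected source: " + base)
        try:
            # Known source: crawl the URL directly.
            return "direct-crawl", prepare_crawler(user_input)
        except Exception:
            # Unknown source: guess the title, then search novelupdates only.
            return "search-novelupdates", app.guess_novel_title(user_input)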
2 changes: 2 additions & 0 deletions lncrawl/bots/discord/message_handler.py
@@ -145,6 +145,8 @@ def handle_novel_url(self):
                 "Sorry! I do not recognize this sources yet.",
                 "See list of supported sources here:",
                 "https://github.com/dipu-bd/lightnovel-crawler#c3-supported-sources",
+                "",
+                "You can send the novelupdates link of the novel too.",
             ]
         )
     )
3 changes: 3 additions & 0 deletions lncrawl/bots/telegram/__init__.py
@@ -211,6 +211,9 @@ def handle_novel_url(self, bot, update, user_data):
         update.message.reply_text(
             "Enter something again or send /cancel to stop."
         )
+        update.message.reply_text(
+            "You can send the novelupdates link of the novel too.",
+        )
         return "handle_novel_url"

     if app.crawler:
18 changes: 18 additions & 0 deletions lncrawl/core/app.py
@@ -6,17 +6,21 @@
 from typing import Dict, List, Optional, Tuple
 from urllib.parse import urlparse

+from readability import Document
 from slugify import slugify

 from .. import constants as C
 from ..binders import available_formats, generate_books
 from ..core.exeptions import LNException
 from ..core.sources import crawler_list, prepare_crawler
 from ..models import Chapter, CombinedSearchResult, OutputFormat
+from .browser import Browser
 from .crawler import Crawler
 from .downloader import fetch_chapter_body, fetch_chapter_images
+from .exeptions import ScraperErrorGroup
 from .novel_info import format_novel, save_metadata
 from .novel_search import search_novels
+from .scraper import Scraper

 logger = logging.getLogger(__name__)

@@ -77,6 +81,20 @@ def prepare_search(self):
             if crawler.search_novel != Crawler.search_novel
         ]

+    def guess_novel_title(self, url: str) -> str:
+        try:
+            scraper = Scraper(url)
+            response = scraper.get_response(url)
+            reader = Document(response.text)
+        except ScraperErrorGroup as e:
+            if logger.isEnabledFor(logging.DEBUG):
+                logger.exception("Failed to get response: %s", e)
+            with Browser() as browser:
+                browser.visit(url)
+                browser.wait("body")
+                reader = Document(browser.html)
+        return reader.short_title()
+
     def search_novel(self):
         """Requires: user_input, crawler_links"""
         """Produces: search_results"""
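The new App.guess_novel_title falls back from a plain HTTP fetch to a headless browser, but the core idea is readability's Document.short_title(). A standalone sketch of that idea, assuming the requests and readability-lxml packages are installed (the URL is a placeholder):

    import requests
    from readability import Document  # provided by readability-lxml

    html = requests.get("https://example.com/some-novel/").text
    print(Document(html).short_title())  # page title with site-name noise trimmed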
13 changes: 12 additions & 1 deletion lncrawl/core/display.py
@@ -6,10 +6,10 @@
 from colorama import Fore, Style

 from ..assets.chars import Chars
-from ..utils.platforms import Platform
 from ..core.exeptions import LNException
 from ..models import CombinedSearchResult, SearchResult
 from ..models.meta import MetaInfo
+from ..utils.platforms import Platform

 LINE_SIZE = 80
 ENABLE_BANNER = not Platform.windows or Platform.java

@@ -163,6 +163,17 @@ def url_not_recognized():
     # 'https://github.com/dipu-bd/lightnovel-crawler/issues', Fore.RESET)


+def guessed_url_for_novelupdates():
+    print()
+    print(
+        Fore.GREEN,
+        Chars.CLOVER,
+        "You can search novelupdates to find this novel!",
+        Fore.RESET,
+    )
+    print()
+
+
 def url_rejected(reason):
     print()
     print(Fore.RED, Chars.ERROR, "Sorry! I do not support this website.", Fore.RESET)
7 changes: 1 addition & 6 deletions lncrawl/core/sources.py
@@ -14,7 +14,6 @@
 from packaging import version

 from ..assets.version import get_version
-from ..templates.novelupdates import NovelupdatesTemplate
 from ..utils.platforms import Platform
 from .arguments import get_args
 from .crawler import Crawler

@@ -348,11 +347,7 @@ def prepare_crawler(url: str) -> Optional[Crawler]:

     CrawlerType = crawler_list.get(base_url)
     if not CrawlerType:
-        # raise LNException("No crawler found for " + base_url)
-        logger.info(f"No crawler for {base_url} |> Trying with www.novelupdates.com")
-        base_url = "https://www.novelupdates.com/"
-        CrawlerType = NovelupdatesTemplate
-        CrawlerType.base_url = [base_url]
+        raise LNException("No crawler found for " + base_url)

     logger.info(
         "Initializing crawler for: %s [%s]",
2 changes: 1 addition & 1 deletion lncrawl/core/taskman.py
@@ -11,7 +11,7 @@

 T = TypeVar("T")

-MAX_WORKER_COUNT = 10
+MAX_WORKER_COUNT = 5
 MAX_REQUESTS_PER_DOMAIN = 25

 _resolver = Semaphore(1)
1 change: 1 addition & 0 deletions lncrawl/templates/novelpub.py
@@ -17,6 +17,7 @@

 class NovelPubTemplate(SearchableBrowserTemplate, ChapterOnlyBrowserTemplate):
     def initialize(self) -> None:
+        self.init_executor(3)
         self.cleaner.bad_tags.update(["div"])
         self.cleaner.bad_css.update(
             [
29 changes: 1 addition & 28 deletions lncrawl/templates/novelupdates.py
@@ -9,7 +9,6 @@

 from lncrawl.core.browser import EC
 from lncrawl.core.crawler import Crawler
-from lncrawl.core.exeptions import LNException
 from lncrawl.models import Chapter, SearchResult
 from lncrawl.templates.browser.chapter_only import ChapterOnlyBrowserTemplate
 from lncrawl.templates.browser.searchable import SearchableBrowserTemplate

@@ -57,7 +56,7 @@ def select_search_items(self, query: str):
     def select_search_items_in_browser(self, query: str):
         query = dict(sf=1, sh=query, sort="srank", order="desc")
         self.visit(f"https://www.novelupdates.com/series-finder/?{urlencode(query)}")
-        self.browser.wait(".search_main_box_nu")
+        self.browser.wait(".l-main .search_main_box_nu")
         yield from self.browser.soup.select(".l-main .search_main_box_nu")

     def parse_search_item(self, tag: Tag) -> SearchResult:

@@ -86,32 +85,6 @@ def parse_search_item(self, tag: Tag) -> SearchResult:
             url=self.absolute_url(a["href"]),
         )

-    def get_novel_soup(self) -> BeautifulSoup:
-        if self.novel_url.startswith("https://www.novelupdates.com"):
-            return self.get_soup(self.novel_url)
-        else:
-            return self.guess_novelupdates_link(self.novel_url)
-
-    def guess_novelupdates_link(self, url: str) -> str:
-        # Guess novel title
-        response = self.get_response(url)
-        reader = Document(response.text)
-        title = reader.short_title()
-        logger.info("Original title = %s", title)
-
-        title = title.rsplit("-", 1)[0].strip() or title
-        title = re.sub(r"[^\w\d ]+", " ", title.lower())
-        title = " ".join(title.split(" ")[:10])
-        logger.info("Guessed title = %s", title)
-
-        # Search by guessed title in novelupdates
-        novels = self.search_novel(title)
-        if len(novels) != 1:
-            raise LNException("Not supported for " + self.novel_url)
-
-        self.novel_url = novels[0].url
-        return self.get_soup(self.novel_url)
-
     def parse_title(self, soup: BeautifulSoup) -> str:
         return soup.select_one(".seriestitlenu").text
4 changes: 2 additions & 2 deletions lncrawl/webdriver/local.py
@@ -13,7 +13,7 @@
 from webdriver_manager.chrome import ChromeDriverManager

 from ..core.soup import SoupMaker
-from ..utils.platforms import Platform, Screen
+from ..utils.platforms import Screen, has_display
 from .elements import WebElement
 from .job_queue import _acquire_queue, _release_queue

@@ -49,7 +49,7 @@ def create_local(
     if not options:
         options = ChromeOptions()

-    if not Platform.display:
+    if not has_display():
         headless = True

     # Set default language
2 changes: 1 addition & 1 deletion sources/_index.json

Large diffs are not rendered by default.
