From 77413287902a3c2319aaae7ec315e394c28a15fd Mon Sep 17 00:00:00 2001 From: Yomguithereal Date: Wed, 20 Dec 2023 14:43:08 +0100 Subject: [PATCH] Better yt channel-links command --- minet/cli/youtube/channel_links.py | 6 ++--- minet/youtube/scraper.py | 43 +++++++++++++++++++++++------- test/scraper_test.py | 4 ++- 3 files changed, 40 insertions(+), 13 deletions(-) diff --git a/minet/cli/youtube/channel_links.py b/minet/cli/youtube/channel_links.py index 154c5d657e..5b2a3fd624 100644 --- a/minet/cli/youtube/channel_links.py +++ b/minet/cli/youtube/channel_links.py @@ -3,7 +3,7 @@ @with_enricher_and_loading_bar( - headers=["url"], + headers=["title", "url"], title="Retrieving channel links", unit="channels", sub_unit="links", @@ -19,5 +19,5 @@ def action(cli_args, enricher, loading_bar): if links is None: continue - for link in links: - enricher.writerow(row, [link]) + for title, link in links: + enricher.writerow(row, [title, link]) diff --git a/minet/youtube/scraper.py b/minet/youtube/scraper.py index a49ce9f020..50a798a295 100644 --- a/minet/youtube/scraper.py +++ b/minet/youtube/scraper.py @@ -1,10 +1,11 @@ -from typing import List, Set, Tuple, Optional +from typing import List, Tuple, Optional, Iterator import re import json from html import unescape from urllib.parse import unquote from ural import infer_redirection +from ebbe import getpath from minet.scrape import WonderfulSoup from minet.web import ( @@ -22,26 +23,42 @@ CAPTION_TRACKS_RE = re.compile(r'"captionTracks":(\[.*?\])') INITIAL_DATA_RE = re.compile( - rb"(?:const|let|var)\s+ytInitialData\s*=\s*({.+});" + rb"(?:const|let|var)\s+ytInitialData\s*=\s*({.+})\s*;" ) -def gather_url_endpoints(data): +def gather_external_links(data) -> Iterator[Tuple[str, str]]: if isinstance(data, dict): for k, v in data.items(): - if k == "urlEndpoint": + if k == "channelExternalLinkViewModel": if not isinstance(v, dict): return - yield infer_redirection(v["url"]) + yield ( + getpath(v, ("title", "content")), + infer_redirection( + getpath( + v, + ( + "link", + "commandRuns", + 0, + "onTap", + "innertubeCommand", + "urlEndpoint", + "url", + ), + ) + ), + ) return - yield from gather_url_endpoints(v) + yield from gather_external_links(v) elif isinstance(data, list): for v in data: - yield from gather_url_endpoints(v) + yield from gather_external_links(v) else: return @@ -152,7 +169,12 @@ def get_channel_id(self, channel_url: str) -> Optional[str]: return None - def get_channel_links(self, channel_url: str) -> Optional[Set[str]]: + def get_channel_links(self, channel_url: str) -> Optional[List[Tuple[str, str]]]: + # NOTE: for some weird reason, the /about page has more info in + # the ytInitialData global variable even if visual content is + # strictly identical. + channel_url = channel_url.split("?", 1)[0].split("#")[0].rstrip("/") + "/about" + response = self.request(channel_url, spoof_ua=True) match = INITIAL_DATA_RE.search(response.body) @@ -165,4 +187,7 @@ def get_channel_links(self, channel_url: str) -> Optional[Set[str]]: except json.JSONDecodeError: return None - return set(gather_url_endpoints(data)) + # with open("./dump.json", "w") as f: + # json.dump(data, f, ensure_ascii=False, indent=2) + + return list(gather_external_links(data)) diff --git a/test/scraper_test.py b/test/scraper_test.py index 7b354d1922..0e39b081dc 100644 --- a/test/scraper_test.py +++ b/test/scraper_test.py @@ -1085,4 +1085,6 @@ def basic_optional_scalar() -> Optional[str]: assert infer_fieldnames_from_function_return_type(basic_float) == ["value"] assert infer_fieldnames_from_function_return_type(basic_bool) == ["value"] assert infer_fieldnames_from_function_return_type(basic_void) == ["value"] - assert infer_fieldnames_from_function_return_type(basic_optional_scalar) == ["value"] \ No newline at end of file + assert infer_fieldnames_from_function_return_type(basic_optional_scalar) == [ + "value" + ]