From efd4b7b9d5e6c1607ad8de997959bcfc270bb1c5 Mon Sep 17 00:00:00 2001 From: Aditya Date: Fri, 31 Jul 2020 17:52:00 +0530 Subject: [PATCH] support a class interface and add various features --- examples.py | 8 +- readme.md | 5 +- youtube_searcher/__init__.py | 796 +++++++++++++++++++++++------------ 3 files changed, 526 insertions(+), 283 deletions(-) diff --git a/examples.py b/examples.py index 93c9972..99caaaa 100644 --- a/examples.py +++ b/examples.py @@ -1,7 +1,9 @@ -from youtube_searcher import search_youtube +from youtube_searcher import YoutubeSearcher from pprint import pprint + +yts = YoutubeSearcher() query = "Ask a spaceman" -data = search_youtube(query) +data = yts.search_youtube(query) pprint(data) """ @@ -1183,4 +1185,4 @@ 'url': 'https://www.youtube.com/watch?v=1Ea4zENTZlQ', 'videoId': '1Ea4zENTZlQ'}]} -""" \ No newline at end of file +""" diff --git a/readme.md b/readme.md index 2bbfd03..2f24119 100644 --- a/readme.md +++ b/readme.md @@ -13,10 +13,11 @@ pip install youtube_searcher ```python -from youtube_searcher import search_youtube +from youtube_searcher import YoutubeSearcher +yts = YoutubeSearcher() query = "Rob Zombie" -data = search_youtube(query) +data = yts.search_youtube(query) """ {'featured_channel': {'title': 'robzombie', diff --git a/youtube_searcher/__init__.py b/youtube_searcher/__init__.py index f80c108..f010da3 100644 --- a/youtube_searcher/__init__.py +++ b/youtube_searcher/__init__.py @@ -3,308 +3,548 @@ import json from youtube_searcher.session import session +class YoutubeSearcher: + def __init__(self, location_code=None, user_agent=None): + if location_code: + self.location_code = location_code + else: + self.location_code = "US" + + # TODO make compatibile with mobile user_agents + if user_agent: + self.user_agent = user_agent + else: + self.user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36" + + self.base_url = "https://www.youtube.com" + self.headers = { + 'User-Agent': self.user_agent + } + self.featured_channel = {"videos": [], "playlists": []} + self.data = {} + self.videos = [] + self.playlists = [] + self.related_to_search = [] + self.related_queries = [] + self.radio = [] + self.movies = [] + self.promoted = [] + self.videos_on_page = [] + self.corrected_query = None + self.contents = None + self.primary_contents = None + self.secondary_contents = None + self.primary_contents_page = None + + def search_youtube(self, query, render="all"): + params = {"search_query": query, + "gl": self.location_code} + + # TODO dont cache if no results found + html = session.get(self.base_url + "/results", + headers=self.headers, params=params).text + soup = bs4.BeautifulSoup(html, 'html.parser') + results = self.santize_soup_result(soup) + data = {"query": query, "corrected_query": query} + + contents = results['contents']['twoColumnSearchResultsRenderer'] + self.primary_contents = contents["primaryContents"]["sectionListRenderer"][ + "contents"][0]['itemSectionRenderer']['contents'] -def search_youtube(query, location_code="US", - user_agent='Mozilla/5.0 (X11; Linux x86_64) ' - 'AppleWebKit/537.36 (KHTML, like Gecko) ' - 'Chrome/57.0.2987.110 ' - 'Safari/537.36'): - base_url = "https://www.youtube.com" - headers = { - 'User-Agent': user_agent - } - params = {"search_query": query, - "gl": location_code} - - # TODO dont cache if no results found - html = session.get(base_url + "/results", - headers=headers, params=params).text - - soup = bs4.BeautifulSoup(html, 'html.parser') - blob = str(soup.find('script', string=re.compile('ytInitialData'))) - s = """window["ytInitialData"] = """ - e = """; - window["ytInitialPlayerResponse"] = null;""" - json_text = blob.split(s)[1].split(e)[0] - - results = json.loads(json_text) - - data = {"query": query, "corrected_query": query} - - videos = [] - playlists = [] - related_to_search = [] - related_queries = [] - radio = [] - movies = [] - promoted = [] - - contents = results['contents']['twoColumnSearchResultsRenderer'] - primary = contents["primaryContents"]["sectionListRenderer"][ - "contents"][0]['itemSectionRenderer']['contents'] - - featured_channel = {"videos": [], "playlists": []} + self.contents = contents + + if render == "all": + self.prepare_feature_channel_info() + self.prepare_videos_info() + self.prepare_playlistRender_info() + self.prepare_horizontalCardList_info() + self.prepare_radioRenderer_info() + self.prepare_movieRenderer_info() + self.prepare_carouselAdRenderer_info() + self.prepare_autoCorrectedQuery_info() + self.prepare_searchPyRenderer_info() + self.filter_for_secondaryContents() + + self.data["videos"] = self.videos + self.data["playlists"] = self.playlists + self.data["featured_channel"] = self.featured_channel + self.data["related_videos"] = self.related_to_search + self.data["related_queries"] = self.related_queries + self.data["full_movies"] = self.movies + self.data["promoted"] = self.promoted + + if render == "featured": + self.prepare_feature_channel_info() + self.prepare_videos_info() + self.filter_for_secondaryContents() + self.data["featured_channel"] = self.featured_channel + + if render == "videos": + self.prepare_videos_info() + self.data["videos"] = self.videos + + if render == "related": + self.prepare_videos_info() + self.prepare_horizontalCardList_info() + self.data["related_videos"] = self.related_to_search + self.data["related_queries"] = self.related_queries + + return self.primary_contents + + def page_search(self, page_type="trending"): + params = {"gl": self.location_code} + + # TODO dont cache if no results found + if page_type == "news": + page = "news" + elif page_type == "music": + page = "music" + elif page_type == "entertainment": + page = "entertainment" + else: + page = "feed/trending" + + html = session.get(self.base_url + "/" + page, + headers=self.headers, params=params).text + soup = bs4.BeautifulSoup(html, 'html.parser') + results = self.santize_soup_result(soup) + + contents = results['contents']['twoColumnBrowseResultsRenderer'] + self.primary_contents_page = contents['tabs'][0]['tabRenderer']['content'][ + 'sectionListRenderer']['contents'] + + if page == "feed/trending": + self.prepare_pageTrending_info() + else: + self.prepare_pageRequested_info() + + self.data["page_videos"] = self.videos_on_page + + return self.data + + def santize_soup_result(self, soup_blob): + # Make sure we always get the correct blob and santize it + blob = soup_blob.find('script', text=re.compile("ytInitialData")) + json_data = str(blob)[str(blob).find('{\"responseContext\"'):str(blob).find('module={}')] + json_data = re.split(r"\};", json_data)[0] + results = json.loads(json_data+"}") + return results - # because order is not assured we need to make 2 passes over the data - for vid in primary: - if 'channelRenderer' in vid: - vid = vid['channelRenderer'] - user = \ - vid['navigationEndpoint']['commandMetadata']['webCommandMetadata'][ + def prepare_feature_channel_info(self): + # because order is not assured we need to make 2 passes over the data + for vid in self.primary_contents: + if 'channelRenderer' in vid: + vid = vid['channelRenderer'] + user = \ + vid['navigationEndpoint']['commandMetadata']['webCommandMetadata'][ 'url'] - featured_channel["title"] = vid["title"]["simpleText"] - if 'descriptionSnippet' in vid: - d = [r["text"] for r in vid['descriptionSnippet']["runs"]] - else: # ocasionally happens? - d = vid["title"]["simpleText"].split(" ") - - featured_channel["description"] = " ".join(d) - featured_channel["user_url"] = base_url + user - break + + self.featured_channel["title"] = vid["title"]["simpleText"] + + if 'descriptionSnippet' in vid: + d = [r["text"] for r in vid['descriptionSnippet']["runs"]] + else: + d = vid["title"]["simpleText"].split(" ") + + self.featured_channel["description"] = " ".join(d) + self.featured_channel["user_url"] = self.base_url + user + + def prepare_videos_info(self): + for vid in self.primary_contents: + if 'videoRenderer' in vid: + vid = vid['videoRenderer'] + thumb = vid["thumbnail"]['thumbnails'] + + #Get video view count or live watch count + if "simpleText" in vid["shortViewCountText"]: + views = vid["shortViewCountText"]["simpleText"] + else: + views = vid["shortViewCountText"]["runs"][0]["text"] + " " + vid["shortViewCountText"]["runs"][1]["text"] + + #Get video published_time assume if not available video is Live + if "publishedTimeText" in vid: + published_time = vid["publishedTimeText"]["simpleText"] + else: + published_time = "Live" + + title = " ".join([r["text"] for r in vid['title']["runs"]]) + if 'descriptionSnippet' in vid: + desc = " ".join([ + r["text"] for r in vid['descriptionSnippet']["runs"]]) + else: # ocasionally happens + desc = title + + #Length filter for live video + if "lengthText" in vid: + length_caption = \ + vid["lengthText"]['accessibility']["accessibilityData"][ + "label"] + length_txt = vid["lengthText"]['simpleText'] + else: + length_caption = "Live" + length_txt = "Live" - for vid in primary: - if 'videoRenderer' in vid: - vid = vid['videoRenderer'] - thumb = vid["thumbnail"]['thumbnails'] - title = " ".join([r["text"] for r in vid['title']["runs"]]) + videoId = vid['videoId'] + url = \ + vid['navigationEndpoint']['commandMetadata'][ + 'webCommandMetadata']['url'] + self.videos.append( + { + "url": self.base_url + url, + "title": title, + "length": length_txt, + "length_human": length_caption, + "views": views, + "published_time": published_time, + "videoId": videoId, + "thumbnails": thumb, + "description": desc + } + ) + elif 'shelfRenderer' in vid: + entries = vid['shelfRenderer'] + #most recent from channel {title_from_step_above} + #related to your search + + if "simpleText" in entries["title"]: + category = entries["title"]["simpleText"] + else: + category = entries["title"]["runs"][0]["text"] + + #TODO category localization + #this comes in lang from your ip address + #not good to use as dict keys, can assumptions be made about + #ordering and num of results? last item always seems to be + #related artists and first (if any) featured channel + ch = self.featured_channel.get("title", "") + + for vid in entries["content"]["verticalListRenderer"]['items']: + vid = vid['videoRenderer'] + thumb = vid["thumbnail"]['thumbnails'] + d = [r["text"] for r in vid['title']["runs"]] + title = " ".join(d) + + #Get video view count or live watch count + if "simpleText" in vid["shortViewCountText"]: + views = vid["viewCountText"]["simpleText"] + else: + views = vid["shortViewCountText"]["runs"][0]["text"] + " " + vid["shortViewCountText"]["runs"][1]["text"] + + if "publishedTimeText" in vid: + published_time = vid["publishedTimeText"]["simpleText"] + else: + published_time = "Live" + + #Length filter for live video + if "lengthText" in vid: + length_caption = \ + vid["lengthText"]['accessibility']["accessibilityData"][ + "label"] + length_txt = vid["lengthText"]['simpleText'] + else: + length_caption = "Live" + length_txt = "Live" - if 'descriptionSnippet' in vid: - desc = " ".join([ - r["text"] for r in vid['descriptionSnippet']["runs"]]) - else: # ocasionally happens - desc = title + videoId = vid['videoId'] + url = vid['navigationEndpoint']['commandMetadata'][ + 'webCommandMetadata']['url'] + if ch and category.endswith(ch): + self.featured_channel["videos"].append( + { + "url": self.base_url + url, + "title": title, + "length": length_txt, + "length_human": length_caption, + "views": views, + "published_time": published_time, + "videoId": videoId, + "thumbnails": thumb + } + ) + else: + self.related_to_search.append( + { + "url": self.base_url + url, + "title": title, + "length": length_txt, + "length_human": length_caption, + "views": views, + "published_time": published_time, + "videoId": videoId, + "thumbnails": thumb, + "reason": category + } + ) - length_caption = \ - vid["lengthText"]['accessibility']["accessibilityData"][ - "label"] - length_txt = vid["lengthText"]['simpleText'] - videoId = vid['videoId'] - url = \ - vid['navigationEndpoint']['commandMetadata'][ - 'webCommandMetadata'][ - 'url'] + def prepare_playlistRender_info(self): + for vid in self.primary_contents: + if 'playlistRenderer' in vid: + vid = vid['playlistRenderer'] + playlist = { + "title": vid["title"]["simpleText"] + } + vid = vid['navigationEndpoint'] + playlist["url"] = \ + self.base_url + vid['commandMetadata']['webCommandMetadata']['url'] + playlist["videoId"] = vid['watchEndpoint']['videoId'] + playlist["playlistId"] = vid['watchEndpoint']['playlistId'] + self.playlists.append(playlist) - videos.append( - { - "url": base_url + url, + def prepare_horizontalCardList_info(self): + for vid in self.primary_contents: + if 'horizontalCardListRenderer' in vid: + for vid in vid['horizontalCardListRenderer']['cards']: + vid = vid['searchRefinementCardRenderer'] + url = \ + vid['searchEndpoint']['commandMetadata'][ + "webCommandMetadata"]["url"] + self.related_queries.append({ + "title": vid['searchEndpoint']['searchEndpoint']["query"], + "url": self.base_url + url, + "thumbnails": vid["thumbnail"]['thumbnails'] + }) + + def prepare_radioRenderer_info(self): + for vid in self.primary_contents: + if 'radioRenderer' in vid: + vid = vid['radioRenderer'] + title = vid["title"]["simpleText"] + thumb = vid["thumbnail"]['thumbnails'] + vid = vid['navigationEndpoint'] + url = vid['commandMetadata']['webCommandMetadata']['url'] + videoId = vid['watchEndpoint']['videoId'] + playlistId = vid['watchEndpoint']['playlistId'] + self.radio.append({ "title": title, - "length": length_txt, - "length_human": length_caption, - "videoId": videoId, "thumbnails": thumb, - "description": desc - } - ) - elif 'shelfRenderer' in vid: - entries = vid['shelfRenderer'] - # most recent from channel {title_from_step_above} - # related to your search - - category = entries["title"]["simpleText"] - # TODO category localization - # this comes in lang from your ip address - # not good to use as dict keys, can assumptions be made about - # ordering and num of results? last item always seems to be - # related artists and first (if any) featured channel - ch = featured_channel.get("title", "") + "url": self.base_url + url, + "videoId": videoId, + "playlistId": playlistId + }) - for vid in entries["content"]["verticalListRenderer"]['items']: - vid = vid['videoRenderer'] + def prepare_movieRenderer_info(self): + for vid in self.primary_contents: + if 'movieRenderer' in vid: + vid = vid['movieRenderer'] + title = " ".join([r["text"] for r in vid['title']["runs"]]) thumb = vid["thumbnail"]['thumbnails'] - d = [r["text"] for r in vid['title']["runs"]] - title = " ".join(d) - - length_caption = \ - vid["lengthText"]['accessibility']["accessibilityData"][ - "label"] - length_txt = vid["lengthText"]['simpleText'] videoId = vid['videoId'] - url = vid['navigationEndpoint']['commandMetadata'][ - 'webCommandMetadata']['url'] - - if ch and category.endswith(ch): - featured_channel["videos"].append( - { - "url": base_url + url, - "title": title, - "length": length_txt, - "length_human": length_caption, - "videoId": videoId, - "thumbnails": thumb - } - ) - else: - related_to_search.append( - { - "url": base_url + url, - "title": title, - "length": length_txt, - "length_human": length_caption, - "videoId": videoId, - "thumbnails": thumb, - "reason": category - } - ) - - elif 'playlistRenderer' in vid: - # playlist - vid = vid['playlistRenderer'] - playlist = { - "title": vid["title"]["simpleText"] - } - vid = vid['navigationEndpoint'] - playlist["url"] = \ - base_url + vid['commandMetadata']['webCommandMetadata']['url'] - playlist["videoId"] = vid['watchEndpoint']['videoId'] - playlist["playlistId"] = vid['watchEndpoint']['playlistId'] - playlists.append(playlist) - - elif 'horizontalCardListRenderer' in vid: - # alternative search (related artists) - for vid in vid['horizontalCardListRenderer']['cards']: - vid = vid['searchRefinementCardRenderer'] - url = \ - vid['searchEndpoint']['commandMetadata'][ - "webCommandMetadata"][ - "url"] - related_queries.append({ - "title": vid['searchEndpoint']['searchEndpoint']["query"], - "url": base_url + url, - "thumbnails": vid["thumbnail"]['thumbnails'] + meta = vid['bottomMetadataItems'] + meta = [m["simpleText"] for m in meta] + desc = " ".join([r["text"] for r in vid['descriptionSnippet']["runs"]]) + url = vid['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'] + + movies.append({ + "title": title, + "thumbnails": thumb, + "url": self.base_url + url, + "videoId": videoId, + "metadata": meta, + "description": desc }) - elif 'radioRenderer' in vid: - # playlist data - vid = vid['radioRenderer'] - title = vid["title"]["simpleText"] - thumb = vid["thumbnail"]['thumbnails'] - vid = vid['navigationEndpoint'] - url = vid['commandMetadata']['webCommandMetadata']['url'] - videoId = vid['watchEndpoint']['videoId'] - playlistId = vid['watchEndpoint']['playlistId'] - radio.append({ - "title": title, - "thumbnails": thumb, - "url": base_url + url, - "videoId": videoId, - "playlistId": playlistId - }) - elif 'movieRenderer' in vid: - # full movies - vid = vid['movieRenderer'] - title = " ".join([r["text"] for r in vid['title']["runs"]]) - thumb = vid["thumbnail"]['thumbnails'] - videoId = vid['videoId'] - meta = vid['bottomMetadataItems'] - meta = [m["simpleText"] for m in meta] - desc = " ".join([r["text"] for r in vid['descriptionSnippet']["runs"]]) - url = vid['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'] + def prepare_carouselAdRenderer_info(self): + for vid in self.primary_contents: + if 'carouselAdRenderer' in vid: + vid = vid["carouselAdRenderer"] + # skip ads + + def prepare_autoCorrectedQuery_info(self): + for vid in self.primary_contents: + if 'showingResultsForRenderer' in vid: + q = vid['showingResultsForRenderer']['correctedQuery'] + self.corrected_query = " ".join([r["text"] for r in q["runs"]]) - movies.append({ - "title": title, - "thumbnails": thumb, - "url": base_url + url, - "videoId": videoId, - "metadata": meta, - "description": desc - }) - elif 'carouselAdRenderer' in vid: - vid = vid["carouselAdRenderer"] - # skip ads - elif 'showingResultsForRenderer' in vid: - # auto correct for query - q = vid['showingResultsForRenderer']['correctedQuery'] - data["corrected_query"] = " ".join([r["text"] for r in q["runs"]]) - elif 'searchPyvRenderer' in vid: - for entry in vid['searchPyvRenderer']['ads']: - entry = entry['promotedVideoRenderer'] - desc = entry["description"]['simpleText'] - title = entry['longBylineText']['runs'][0]["text"] - url = base_url + entry['longBylineText']['runs'][0][ - 'navigationEndpoint']['browseEndpoint']['canonicalBaseUrl'] - promoted.append( - {"title": title, - "description": desc, - "url": url}) - elif 'channelRenderer' in vid: - continue # handled in first pass - else: - continue - # Debug, never reached this point - print(1) - print(vid) + def prepare_searchPyRenderer_info(self): + for vid in self.primary_contents: + if 'searchPyvRenderer' in vid: + for entry in vid['searchPyvRenderer']['ads']: + entry = entry['promotedVideoRenderer'] + desc = entry["description"]['simpleText'] + title = entry['longBylineText']['runs'][0]["text"] + url = self.base_url + entry['longBylineText']['runs'][0][ + 'navigationEndpoint']['browseEndpoint']['canonicalBaseUrl'] + self.promoted.append({ + "title": title, + "description": desc, + "url": url + }) - if contents.get("secondaryContents"): - secondary = \ - contents["secondaryContents"]["secondarySearchContainerRenderer"][ - "contents"][0]["universalWatchCardRenderer"] - for vid in secondary["sections"]: - entries = vid['watchCardSectionSequenceRenderer'] - for entry in entries['lists']: - if 'verticalWatchCardListRenderer' in entry: - for vid in entry['verticalWatchCardListRenderer']["items"]: - vid = vid['watchCardCompactVideoRenderer'] - thumbs = vid['thumbnail']['thumbnails'] + def filter_for_secondaryContents(self): + if self.contents.get("secondaryContents"): + self.secondary_contents = \ + self.contents["secondaryContents"]["secondarySearchContainerRenderer"][ + "contents"][0]["universalWatchCardRenderer"] + self.prepare_secondaryContentsRender() + - d = [r["text"] for r in vid['title']["runs"]] - title = " ".join(d) + def prepare_secondaryContentsRender(self): + for vid in self.secondary_contents["sections"]: + entries = vid['watchCardSectionSequenceRenderer'] + for entry in entries['lists']: + if 'verticalWatchCardListRenderer' in entry: + for vid in entry['verticalWatchCardListRenderer']["items"]: + vid = vid['watchCardCompactVideoRenderer'] + thumbs = vid['thumbnail']['thumbnails'] + + d = [r["text"] for r in vid['title']["runs"]] + title = " ".join(d) + url = vid['navigationEndpoint']['commandMetadata'][ + 'webCommandMetadata']['url'] + videoId = vid['navigationEndpoint']['watchEndpoint'][ + 'videoId'] + playlistId = \ + vid['navigationEndpoint']['watchEndpoint']['playlistId'] + length_caption = \ + vid["lengthText"]['accessibility'][ + "accessibilityData"]["label"] + length_txt = vid["lengthText"]['simpleText'] - url = vid['navigationEndpoint']['commandMetadata'][ - 'webCommandMetadata']['url'] - videoId = vid['navigationEndpoint']['watchEndpoint'][ - 'videoId'] - playlistId = \ - vid['navigationEndpoint']['watchEndpoint'][ - 'playlistId'] - length_caption = \ - vid["lengthText"]['accessibility'][ - "accessibilityData"][ - "label"] - length_txt = vid["lengthText"]['simpleText'] - - # TODO investigate - # These seem to always be from featured channel - # playlistId doesnt match any extracted playlist - featured_channel["videos"].append( - { - "url": base_url + url, + #TODO investigate + #These seem to always be from featured channel + #playlistId doesnt match any extracted playlist + self.featured_channel["videos"].append({ + "url": self.base_url + url, "title": title, "length": length_txt, "length_human": length_caption, "videoId": videoId, "playlistId": playlistId, "thumbnails": thumbs + }) + elif 'horizontalCardListRenderer' in entry: + for vid in entry['horizontalCardListRenderer']['cards']: + vid = vid['searchRefinementCardRenderer'] + playlistId = \ + vid['searchEndpoint']['watchPlaylistEndpoint'][ + 'playlistId'] + thumbs = vid['thumbnail']['thumbnails'] + url = vid['searchEndpoint']['commandMetadata'][ + 'webCommandMetadata']['url'] + d = [r["text"] for r in vid['query']["runs"]] + title = " ".join(d) + self.featured_channel["playlists"].append({ + "url": self.base_url + url, + "title": title, + "thumbnails": thumbs, + "playlistId": playlistId + }) + + def prepare_pageTrending_info(self): + for items in self.primary_contents_page: + if 'itemSectionRenderer' in items: + page_items = items['itemSectionRenderer']['contents'][0]['shelfRenderer']['content']['expandedShelfContentsRenderer']['items'] + for x in range(len(page_items)): + if 'videoRenderer' in page_items[x]: + vid = page_items[x]['videoRenderer'] + thumb = vid["thumbnail"]['thumbnails'] + + #Get video view count or live watch count + if "simpleText" in vid["shortViewCountText"]: + views = vid["shortViewCountText"]["simpleText"] + else: + views = vid["shortViewCountText"]["runs"][0]["text"] + " " + vid["shortViewCountText"]["runs"][1]["text"] + + #Get video published_time assume if not available video is Live + if "publishedTimeText" in vid: + published_time = vid["publishedTimeText"]["simpleText"] + else: + published_time = "Live" + + title = " ".join([r["text"] for r in vid['title']["runs"]]) + + if 'descriptionSnippet' in vid: + desc = " ".join([ + r["text"] for r in vid['descriptionSnippet']["runs"]]) + else: # ocasionally happens + desc = title + + #Length filter for live video + if "lengthText" in vid: + length_caption = \ + vid["lengthText"]['accessibility']["accessibilityData"][ + "label"] + length_txt = vid["lengthText"]['simpleText'] + else: + length_caption = "Live" + length_txt = "Live" + + videoId = vid['videoId'] + url = \ + vid['navigationEndpoint']['commandMetadata'][ + 'webCommandMetadata']['url'] + self.videos_on_page.append( + { + "url": self.base_url + url, + "title": title, + "length": length_txt, + "length_human": length_caption, + "views": views, + "published_time": published_time, + "videoId": videoId, + "thumbnails": thumb, + "description": desc } ) - elif 'horizontalCardListRenderer' in entry: - for vid in entry['horizontalCardListRenderer']['cards']: - vid = vid['searchRefinementCardRenderer'] - playlistId = \ - vid['searchEndpoint']['watchPlaylistEndpoint'][ - 'playlistId'] - thumbs = vid['thumbnail']['thumbnails'] - url = vid['searchEndpoint']['commandMetadata'][ - 'webCommandMetadata']['url'] - d = [r["text"] for r in vid['query']["runs"]] - title = " ".join(d) - featured_channel["playlists"].append({ - "url": base_url + url, - "title": title, - "thumbnails": thumbs, - "playlistId": playlistId - }) - else: - continue - # Debug, never reached this point - print(2 ) - print(entry) - - data["videos"] = videos - data["playlists"] = playlists - data["featured_channel"] = featured_channel - data["related_videos"] = related_to_search - data["related_queries"] = related_queries - data["full_movies"] = movies - data["promoted"] = promoted - return data + def prepare_pageRequested_info(self): + for items in self.primary_contents_page: + if 'itemSectionRenderer' in items: + page_items = items['itemSectionRenderer']['contents'][0]['shelfRenderer']['content']['horizontalListRenderer']['items'] + for x in range(len(page_items)): + if 'gridVideoRenderer' in page_items[x]: + vid = page_items[x]['gridVideoRenderer'] + thumb = vid["thumbnail"]['thumbnails'] + + #Get video view count or live watch count + if "shortViewCountText" in vid: + if "simpleText" in vid["shortViewCountText"]: + views = vid["shortViewCountText"]["simpleText"] + else: + views = vid["shortViewCountText"]["runs"][0]["text"] + " " + vid["shortViewCountText"]["runs"][1]["text"] + else: + views = "unavailable" + + #Get video published_time assume if not available video is Live + if "publishedTimeText" in vid: + published_time = vid["publishedTimeText"]["simpleText"] + else: + published_time = "Live" + + #title = " ".join([r["text"] for r in vid['title']["runs"]]) + title = vid['title']['simpleText'] + + if 'descriptionSnippet' in vid: + desc = " ".join([ + r["text"] for r in vid['descriptionSnippet']["runs"]]) + else: # ocasionally happens + desc = title + + #Length filter for live video + overlayInformation = vid['thumbnailOverlays'][0] + if "thumbnailOverlayTimeStatusRenderer" in overlayInformation: + length_caption = \ + overlayInformation['thumbnailOverlayTimeStatusRenderer']['text']['accessibility']["accessibilityData"][ + "label"] + length_txt = overlayInformation['thumbnailOverlayTimeStatusRenderer']['text']['simpleText'] + else: + length_caption = "Live" + length_txt = "Live" + + videoId = vid['videoId'] + url = \ + vid['navigationEndpoint']['commandMetadata'][ + 'webCommandMetadata']['url'] + self.videos_on_page.append( + { + "url": self.base_url + url, + "title": title, + "length": length_txt, + "length_human": length_caption, + "views": views, + "published_time": published_time, + "videoId": videoId, + "thumbnails": thumb, + "description": desc + } + )