From f8239dcee23ab7fb1db5114dbcd033feb3098519 Mon Sep 17 00:00:00 2001 From: Yomguithereal Date: Tue, 7 Nov 2023 14:23:16 +0100 Subject: [PATCH] Fixing yt captions when encountering inconsistent transcripts Sometimes the duration is not given, so we default it to 0 --- minet/scrape/soup.py | 14 +++++++++++--- minet/youtube/scraper.py | 6 +++++- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/minet/scrape/soup.py b/minet/scrape/soup.py index dc2065c884..80d66ff2a4 100644 --- a/minet/scrape/soup.py +++ b/minet/scrape/soup.py @@ -1,5 +1,5 @@ import re -from typing import List, Optional, cast +from typing import List, Optional, cast, overload import warnings from contextlib import contextmanager @@ -94,8 +94,16 @@ def get_outer_html(self) -> str: def __getitem__(self, name: str) -> str: return cast(str, super().__getitem__(name)) - def get(self, name: str) -> Optional[str]: - return cast(Optional[str], super().get(name)) + @overload + def get(self, name: str, default: str = ...) -> str: + ... + + @overload + def get(self, name: str, default: None = ...) -> Optional[str]: + ... + + def get(self, name: str, default: Optional[str] = None) -> Optional[str]: + return cast(Optional[str], super().get(name, default)) def get_list(self, name: str) -> List[str]: value = super().get(name) diff --git a/minet/youtube/scraper.py b/minet/youtube/scraper.py index c21bc66a6a..fbc3e64f44 100644 --- a/minet/youtube/scraper.py +++ b/minet/youtube/scraper.py @@ -101,10 +101,14 @@ def get_video_captions( captions = [] for item in soup.select("text"): + + # NOTE: sometimes duration is absent. I don't really + # know what is the best solution there (merging with + # previous item?). So for now, we default duration to 0. captions.append( YouTubeCaptionLine( float(item["start"]), - float(item["dur"]), + float(item.get("dur", "0")), unescape(item.get_text().strip()), ) )