Skip to content

Commit

Permalink
Fixing yt captions when encountering inconsistent transcripts
Browse files Browse the repository at this point in the history
Sometimes the duration is not given; in that case we default it to 0.
  • Loading branch information
Yomguithereal committed Nov 7, 2023
1 parent 20c4dd8 commit f8239dc
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 4 deletions.
14 changes: 11 additions & 3 deletions minet/scrape/soup.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import re
from typing import List, Optional, cast
from typing import List, Optional, cast, overload

import warnings
from contextlib import contextmanager
Expand Down Expand Up @@ -94,8 +94,16 @@ def get_outer_html(self) -> str:
def __getitem__(self, name: str) -> str:
    """Return the parent class's item for ``name``, typed as ``str``.

    The lookup is delegated to ``super().__getitem__``. ``cast`` only
    narrows the static type; it does no runtime conversion or check.
    """
    value = super().__getitem__(name)
    return cast(str, value)

def get(self, name: str) -> Optional[str]:
    """Return the parent class's ``get`` result for ``name``.

    The lookup is delegated to ``super().get``. ``cast`` only narrows the
    static type to ``Optional[str]``; it does no runtime conversion.
    """
    value = super().get(name)
    return cast(Optional[str], value)
# NOTE: overload order matters. The variant that applies when `default` is
# omitted must be the one returning Optional[str]; otherwise `get(name)`
# would be typed as `str` even though it can return None at runtime.
@overload
def get(self, name: str, default: None = ...) -> Optional[str]:
    ...

# `default` is required here so that this overload is only selected when
# the caller actually supplies a string fallback.
@overload
def get(self, name: str, default: str) -> str:
    ...

def get(self, name: str, default: Optional[str] = None) -> Optional[str]:
    """Return the parent class's ``get`` result for ``name``.

    Args:
        name: key forwarded to ``super().get``.
        default: fallback forwarded to ``super().get``; ``None`` when
            omitted.

    Returns:
        Whatever ``super().get(name, default)`` returns, statically typed
        as ``Optional[str]`` (``cast`` does no runtime conversion). Per
        the overloads above, type checkers see ``str`` when a string
        ``default`` is passed and ``Optional[str]`` otherwise.
    """
    return cast(Optional[str], super().get(name, default))

def get_list(self, name: str) -> List[str]:
value = super().get(name)
Expand Down
6 changes: 5 additions & 1 deletion minet/youtube/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,10 +101,14 @@ def get_video_captions(
captions = []

for item in soup.select("text"):

# NOTE: sometimes the duration is absent. I don't really
# know what the best solution is here (merging with the
# previous item?). So for now, we default the duration to 0.
captions.append(
YouTubeCaptionLine(
float(item["start"]),
float(item["dur"]),
float(item.get("dur", "0")),
unescape(item.get_text().strip()),
)
)
Expand Down

0 comments on commit f8239dc

Please sign in to comment.