From 7df2da29ada7ba229d4677ed4245fb83a87d6f0d Mon Sep 17 00:00:00 2001
From: LutingWang <2457348692@qq.com>
Date: Wed, 11 Dec 2024 02:17:18 +0000
Subject: [PATCH] feat: whisper

---
 README.md                          |  1 +
 docs/source/data/v3det.rst         |  9 +++++++++
 docs/source/pretrained/whisper.py  | 30 ++++++++++++++++++++++++++++++
 docs/source/pretrained/whisper.rst | 17 +++++++++++++++++
 pyproject.toml                     |  2 ++
 todd/utils/networks.py             | 15 +++++++++++++--
 6 files changed, 72 insertions(+), 2 deletions(-)
 create mode 100644 docs/source/pretrained/whisper.py
 create mode 100644 docs/source/pretrained/whisper.rst

diff --git a/README.md b/README.md
index bd63fd2..611ec67 100644
--- a/README.md
+++ b/README.md
@@ -26,6 +26,7 @@ Prerequisites:
 
 - torch
 - torchvision
+- torchaudio
 
 ```bash
 pip install git+https://github.com/lvis-dataset/lvis-api.git@lvis_challenge_2021
diff --git a/docs/source/data/v3det.rst b/docs/source/data/v3det.rst
index 0b06ca6..248af72 100644
--- a/docs/source/data/v3det.rst
+++ b/docs/source/data/v3det.rst
@@ -29,3 +29,12 @@ https://v3det.openxlab.org.cn/
 .. code::
 
    data/v3det/
+   └── images
+       ├── a00000066
+       │   ├── 0_2530_11591900204_c1c10c1531_c.jpg
+       │   └── ...
+       └── ...
+
+The training split contains 183,354 images, the validation split contains
+29,821 images, and the test split contains 29,863 images.
+There is a total of 13,432 categories in the images folder.
diff --git a/docs/source/pretrained/whisper.py b/docs/source/pretrained/whisper.py
new file mode 100644
index 0000000..0e596ae
--- /dev/null
+++ b/docs/source/pretrained/whisper.py
@@ -0,0 +1,38 @@
+from transformers import AutomaticSpeechRecognitionPipeline, pipeline
+
+from todd.utils import get_audio
+
+audio, sampling_rate = get_audio(
+    'https://github.com/SWivid/F5-TTS/raw/refs/heads/main/'
+    'src/f5_tts/infer/examples/basic/basic_ref_zh.wav',
+)
+
+# Pass the waveform together with its sampling rate so the pipeline can
+# resample it; a bare array would be assumed to already match the model's
+# sampling rate. The pipeline may pop entries from the dict, so each call
+# gets its own copy.
+inputs = dict(raw=audio, sampling_rate=sampling_rate)
+
+pipe: AutomaticSpeechRecognitionPipeline = pipeline(
+    'automatic-speech-recognition',
+    model='pretrained/whisper/whisper-large-v3-turbo',
+    torch_dtype='auto',
+    device_map='auto',
+)
+
+result = pipe(inputs.copy())
+print(result)
+
+result = pipe(inputs.copy(), generate_kwargs=dict(language='zh'))
+print(result)
+
+result = pipe(
+    inputs.copy(),
+    generate_kwargs=dict(task='translate', language='zh'),
+)
+print(result)
+
+result = pipe(inputs.copy(), return_timestamps=True)
+print(result)
+
+result = pipe(inputs.copy(), return_timestamps='word')
+print(result)
diff --git a/docs/source/pretrained/whisper.rst b/docs/source/pretrained/whisper.rst
new file mode 100644
index 0000000..2866102
--- /dev/null
+++ b/docs/source/pretrained/whisper.rst
@@ -0,0 +1,17 @@
+Whisper
+=======
+
+.. code-block:: bash
+
+   root=pretrained/whisper
+   mkdir -p ${root} && cd ${root}
+   git clone git@hf.co:openai/whisper-large-v3-turbo
+
+.. code::
+
+   pretrained/whisper/
+   └── whisper-large-v3-turbo
+
+.. literalinclude:: whisper.py
+   :language: python
+   :linenos:
diff --git a/pyproject.toml b/pyproject.toml
index 5133203..0d4ddc5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -22,6 +22,7 @@ dependencies = [
     'opencv-python',
     'python-pptx',
     'pycocotools',
+    'soundfile',
     'tensorboard',
     'timm',
     'toml',
@@ -162,6 +163,7 @@ module = [
    'pptx.*',
    'scipy.*',
    'setuptools.*',
+    'soundfile.*',
    'torchvision.*',
    'transformers.*',
    'yapf.*',
diff --git a/todd/utils/networks.py b/todd/utils/networks.py
index ef33e46..b59703a 100644
--- a/todd/utils/networks.py
+++ b/todd/utils/networks.py
@@ -1,5 +1,7 @@
 __all__ = [
+    'get_bytes',
     'get_image',
+    'get_audio',
 ]
 
 from io import BytesIO
@@ -7,12 +9,21 @@
 import numpy as np
 import numpy.typing as npt
 import requests
+import soundfile as sf
 from PIL import Image
 
 
-def get_image(url: str) -> npt.NDArray[np.uint8]:
+def get_bytes(url: str) -> BytesIO:
     response = requests.get(url, timeout=5)
     response.raise_for_status()
-    with Image.open(BytesIO(response.content)) as image:
+    return BytesIO(response.content)
+
+
+def get_image(url: str) -> npt.NDArray[np.uint8]:
+    with Image.open(get_bytes(url)) as image:
         image = image.convert('RGB')
     return np.array(image)
+
+
+def get_audio(url: str) -> tuple[npt.NDArray[np.float64], int]:
+    return sf.read(get_bytes(url))