feat: whisper

LutingWang · Dec 11, 2024 · 7df2da2
1 parent df72baa
commit 7df2da2
Show file tree

Hide file tree

Showing 6 changed files with 72 additions and 2 deletions.
diff --git a/README.md b/README.md
@@ -26,6 +26,7 @@ Prerequisites:
 
 - torch
 - torchvision
+- torchaudio
 
 ```bash
 pip install git+https://github.com/lvis-dataset/lvis-api.git@lvis_challenge_2021

diff --git a/docs/source/data/v3det.rst b/docs/source/data/v3det.rst
@@ -29,3 +29,12 @@ https://v3det.openxlab.org.cn/
 .. code::
 
     data/v3det/
+    └── images
+        ├── a00000066
+        │   ├── 0_2530_11591900204_c1c10c1531_c.jpg
+        │   └── ...
+        └── ...
+
+The training split contains 183,354 images, the validation split contains
+29,821 images, and the test split contains 29,863 images.
+There is a total of 13,432 categories in the images folder.
diff --git a/docs/source/pretrained/whisper.py b/docs/source/pretrained/whisper.py
@@ -0,0 +1,30 @@
+from transformers import AutomaticSpeechRecognitionPipeline, pipeline
+
+from todd.utils import get_audio
+
+audio, _ = get_audio(
+    'https://github.com/SWivid/F5-TTS/raw/refs/heads/main/'
+    'src/f5_tts/infer/examples/basic/basic_ref_zh.wav',
+)
+
+pipe: AutomaticSpeechRecognitionPipeline = pipeline(
+    'automatic-speech-recognition',
+    model='pretrained/whisper/whisper-large-v3-turbo',
+    torch_dtype='auto',
+    device_map='auto',
+)
+
+result = pipe(audio)
+print(result)
+
+result = pipe(audio, generate_kwargs=dict(language='zh'))
+print(result)
+
+result = pipe(audio, generate_kwargs=dict(task='translate', language='en'))
+print(result)
+
+result = pipe(audio, return_timestamps=True)
+print(result)
+
+result = pipe(audio, return_timestamps='word')
+print(result)
diff --git a/docs/source/pretrained/whisper.rst b/docs/source/pretrained/whisper.rst
@@ -0,0 +1,17 @@
+Whisper
+=======
+
+.. code-block:: bash
+
+    root=pretrained/whisper
+    mkdir -p ${root} && cd ${root}
+    git clone git@hf.co:openai/whisper-large-v3-turbo
+
+.. code::
+
+    pretrained/whisper/
+    └── whisper-large-v3-turbo
+
+.. literalinclude:: whisper.py
+    :language: python
+    :linenos:
diff --git a/pyproject.toml b/pyproject.toml
@@ -22,6 +22,7 @@ dependencies = [
     'opencv-python',
     'python-pptx',
     'pycocotools',
+    'soundfile',
     'tensorboard',
     'timm',
     'toml',
@@ -162,6 +163,7 @@ module = [
     'pptx.*',
     'scipy.*',
     'setuptools.*',
+    'soundfile.*',
     'torchvision.*',
     'transformers.*',
     'yapf.*',

diff --git a/todd/utils/networks.py b/todd/utils/networks.py
@@ -1,18 +1,29 @@
 __all__ = [
+    'get_bytes',
     'get_image',
+    'get_audio',
 ]
 
 from io import BytesIO
 
 import numpy as np
 import numpy.typing as npt
 import requests
+import soundfile as sf
 from PIL import Image
 
 
-def get_image(url: str) -> npt.NDArray[np.uint8]:
+def get_bytes(url: str) -> BytesIO:
     response = requests.get(url, timeout=5)
     response.raise_for_status()
-    with Image.open(BytesIO(response.content)) as image:
+    return BytesIO(response.content)
+
+
+def get_image(url: str) -> npt.NDArray[np.uint8]:
+    with Image.open(get_bytes(url)) as image:
         image = image.convert('RGB')
         return np.array(image)
+
+
+def get_audio(url: str) -> tuple[npt.NDArray[np.float64], int]:
+    return sf.read(get_bytes(url))
diff --git a/README.md b/README.md
@@ -26,6 +26,7 @@ Prerequisites:
 
 - torch
 - torchvision
+- torchaudio
 
 ```bash
 pip install git+https://github.com/lvis-dataset/lvis-api.git@lvis_challenge_2021