From 7df2da29ada7ba229d4677ed4245fb83a87d6f0d Mon Sep 17 00:00:00 2001
From: LutingWang <2457348692@qq.com>
Date: Wed, 11 Dec 2024 02:17:18 +0000
Subject: [PATCH] feat: whisper

---
 README.md                          |  1 +
 docs/source/data/v3det.rst         |  9 +++++++++
 docs/source/pretrained/whisper.py  | 30 ++++++++++++++++++++++++++++++
 docs/source/pretrained/whisper.rst | 17 +++++++++++++++++
 pyproject.toml                     |  2 ++
 todd/utils/networks.py             | 15 +++++++++++++--
 6 files changed, 72 insertions(+), 2 deletions(-)
 create mode 100644 docs/source/pretrained/whisper.py
 create mode 100644 docs/source/pretrained/whisper.rst

diff --git a/README.md b/README.md
index bd63fd2..611ec67 100644
--- a/README.md
+++ b/README.md
@@ -26,6 +26,7 @@ Prerequisites:
 
 - torch
 - torchvision
+- torchaudio
 
 ```bash
 pip install git+https://github.com/lvis-dataset/lvis-api.git@lvis_challenge_2021
diff --git a/docs/source/data/v3det.rst b/docs/source/data/v3det.rst
index 0b06ca6..248af72 100644
--- a/docs/source/data/v3det.rst
+++ b/docs/source/data/v3det.rst
@@ -29,3 +29,12 @@ https://v3det.openxlab.org.cn/
 .. code::
 
    data/v3det/
+   └── images
+       ├── a00000066
+       │   ├── 0_2530_11591900204_c1c10c1531_c.jpg
+       │   └── ...
+       └── ...
+
+The training split contains 183,354 images, the validation split contains
+29,821 images, and the test split contains 29,863 images.
+There is a total of 13,432 categories in the images folder.
diff --git a/docs/source/pretrained/whisper.py b/docs/source/pretrained/whisper.py
new file mode 100644
index 0000000..0e596ae
--- /dev/null
+++ b/docs/source/pretrained/whisper.py
@@ -0,0 +1,38 @@
+from transformers import AutomaticSpeechRecognitionPipeline, pipeline
+
+from todd.utils import get_audio
+
+audio, sampling_rate = get_audio(
+    'https://github.com/SWivid/F5-TTS/raw/refs/heads/main/'
+    'src/f5_tts/infer/examples/basic/basic_ref_zh.wav',
+)
+
+# Pass the waveform together with its sampling rate so the pipeline can
+# resample it; a bare array would be assumed to already match the model's
+# sampling rate. The pipeline may pop entries from the dict, so each call
+# gets its own copy.
+inputs = dict(raw=audio, sampling_rate=sampling_rate)
+
+pipe: AutomaticSpeechRecognitionPipeline = pipeline(
+    'automatic-speech-recognition',
+    model='pretrained/whisper/whisper-large-v3-turbo',
+    torch_dtype='auto',
+    device_map='auto',
+)
+
+result = pipe(inputs.copy())
+print(result)
+
+result = pipe(inputs.copy(), generate_kwargs=dict(language='zh'))
+print(result)
+
+result = pipe(
+    inputs.copy(),
+    generate_kwargs=dict(task='translate', language='zh'),
+)
+print(result)
+
+result = pipe(inputs.copy(), return_timestamps=True)
+print(result)
+
+result = pipe(inputs.copy(), return_timestamps='word')
+print(result)
diff --git a/docs/source/pretrained/whisper.rst b/docs/source/pretrained/whisper.rst
new file mode 100644
index 0000000..2866102
--- /dev/null
+++ b/docs/source/pretrained/whisper.rst
@@ -0,0 +1,17 @@
+Whisper
+=======
+
+.. code-block:: bash
+
+   root=pretrained/whisper
+   mkdir -p ${root} && cd ${root}
+   git clone git@hf.co:openai/whisper-large-v3-turbo
+
+.. code::
+
+   pretrained/whisper/
+   └── whisper-large-v3-turbo
+
+.. literalinclude:: whisper.py
+   :language: python
+   :linenos:
diff --git a/pyproject.toml b/pyproject.toml
index 5133203..0d4ddc5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -22,6 +22,7 @@ dependencies = [
     'opencv-python',
     'python-pptx',
     'pycocotools',
+    'soundfile',
     'tensorboard',
     'timm',
     'toml',
@@ -162,6 +163,7 @@ module = [
    'pptx.*',
    'scipy.*',
    'setuptools.*',
+    'soundfile.*',
    'torchvision.*',
    'transformers.*',
    'yapf.*',
diff --git a/todd/utils/networks.py b/todd/utils/networks.py
index ef33e46..b59703a 100644
--- a/todd/utils/networks.py
+++ b/todd/utils/networks.py
@@ -1,5 +1,7 @@
 __all__ = [
+    'get_bytes',
     'get_image',
+    'get_audio',
 ]
 
 from io import BytesIO
@@ -7,12 +9,21 @@
 import numpy as np
 import numpy.typing as npt
 import requests
+import soundfile as sf
 from PIL import Image
 
 
-def get_image(url: str) -> npt.NDArray[np.uint8]:
+def get_bytes(url: str) -> BytesIO:
     response = requests.get(url, timeout=5)
     response.raise_for_status()
-    with Image.open(BytesIO(response.content)) as image:
+    return BytesIO(response.content)
+
+
+def get_image(url: str) -> npt.NDArray[np.uint8]:
+    with Image.open(get_bytes(url)) as image:
         image = image.convert('RGB')
     return np.array(image)
+
+
+def get_audio(url: str) -> tuple[npt.NDArray[np.float64], int]:
+    return sf.read(get_bytes(url))