Skip to content

Commit

Permalink
feat: whisper
Browse files Browse the repository at this point in the history
  • Loading branch information
LutingWang committed Dec 11, 2024
1 parent df72baa commit 7df2da2
Show file tree
Hide file tree
Showing 6 changed files with 72 additions and 2 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ Prerequisites:

- torch
- torchvision
- torchaudio

```bash
pip install git+https://github.com/lvis-dataset/lvis-api.git@lvis_challenge_2021
Expand Down
9 changes: 9 additions & 0 deletions docs/source/data/v3det.rst
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,12 @@ https://v3det.openxlab.org.cn/
.. code::

    data/v3det/
    └── images
        ├── a00000066
        │   ├── 0_2530_11591900204_c1c10c1531_c.jpg
        │   └── ...
        └── ...

The training split contains 183,354 images, the validation split contains
29,821 images, and the test split contains 29,863 images.
There is a total of 13,432 categories in the images folder.
30 changes: 30 additions & 0 deletions docs/source/pretrained/whisper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from transformers import AutomaticSpeechRecognitionPipeline, pipeline

from todd.utils import get_audio

# Download the reference clip. Only the first value returned by get_audio is
# kept and fed to the pipeline.
# NOTE(review): the second returned value (presumably the sample rate) is
# discarded, so the pipeline receives a bare array -- confirm that the clip
# already matches the rate the model expects.
audio, _ = get_audio(
    'https://github.com/SWivid/F5-TTS/raw/refs/heads/main/'
    'src/f5_tts/infer/examples/basic/basic_ref_zh.wav',
)

# Build the speech-recognition pipeline from the locally cloned checkpoint.
pipe: AutomaticSpeechRecognitionPipeline = pipeline(
    'automatic-speech-recognition',
    model='pretrained/whisper/whisper-large-v3-turbo',
    torch_dtype='auto',
    device_map='auto',
)

# Keyword arguments for each successive call, in the order the results are
# printed.
call_options = (
    # no extra arguments
    {},
    # language='zh' forwarded through generate_kwargs
    {'generate_kwargs': {'language': 'zh'}},
    # task='translate' with language='en' forwarded through generate_kwargs
    {'generate_kwargs': {'task': 'translate', 'language': 'en'}},
    # return_timestamps=True
    {'return_timestamps': True},
    # return_timestamps='word'
    {'return_timestamps': 'word'},
)

for options in call_options:
    result = pipe(audio, **options)
    print(result)
17 changes: 17 additions & 0 deletions docs/source/pretrained/whisper.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
Whisper
=======

.. code-block:: bash

    root=pretrained/whisper
    mkdir -p ${root} && cd ${root}
    git clone git@hf.co:openai/whisper-large-v3-turbo

.. code::

    pretrained/whisper/
    └── whisper-large-v3-turbo

.. literalinclude:: whisper.py
    :language: python
    :linenos:
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ dependencies = [
'opencv-python',
'python-pptx',
'pycocotools',
'soundfile',
'tensorboard',
'timm',
'toml',
Expand Down Expand Up @@ -162,6 +163,7 @@ module = [
'pptx.*',
'scipy.*',
'setuptools.*',
'soundfile.*',
'torchvision.*',
'transformers.*',
'yapf.*',
Expand Down
15 changes: 13 additions & 2 deletions todd/utils/networks.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,29 @@
# Names exported by this module when it is star-imported.
__all__ = [
    'get_bytes',
    'get_image',
    'get_audio',
]

from io import BytesIO

import numpy as np
import numpy.typing as npt
import requests
import soundfile as sf
from PIL import Image


def get_image(url: str) -> npt.NDArray[np.uint8]:
def get_bytes(url: str, *, timeout: float = 5) -> BytesIO:
    """Download ``url`` and return the response body as an in-memory stream.

    Args:
        url: Address handed to ``requests.get``.
        timeout: Value forwarded to ``requests.get`` as its ``timeout``
            argument. Defaults to 5, the value that was previously
            hard-coded, so existing callers are unaffected.

    Returns:
        A ``BytesIO`` wrapping the complete response content.

    Raises:
        Whatever ``response.raise_for_status()`` raises for an unsuccessful
        response (``requests.HTTPError`` according to the requests
        documentation), plus any error raised by ``requests.get`` itself.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of wrapping an error page as if it were the payload.
    response.raise_for_status()
    return BytesIO(response.content)


def get_image(url: str) -> npt.NDArray[np.uint8]:
    """Fetch the image at ``url`` and return it as an RGB array.

    Args:
        url: Address of the image, passed to ``get_bytes``.

    Returns:
        The image converted to ``'RGB'`` mode and turned into a NumPy array
        (presumably of shape ``(height, width, 3)`` -- confirm if it matters).
    """
    payload = get_bytes(url)
    with Image.open(payload) as source:
        # convert() yields a separate image object, so the result remains
        # usable once the opened file has been closed by the context manager.
        rgb = source.convert('RGB')
    return np.array(rgb)


def get_audio(url: str) -> tuple[npt.NDArray[np.float64], int]:
    """Fetch the audio file at ``url`` and decode it with ``soundfile``.

    Args:
        url: Address of the audio file, passed to ``get_bytes``.

    Returns:
        The two values produced by ``sf.read``: the decoded samples as an
        array and an integer (presumably the sample rate of the file).
    """
    payload = get_bytes(url)
    samples, rate = sf.read(payload)
    return samples, rate

0 comments on commit 7df2da2

Please sign in to comment.