
Commit

Merge pull request #39 from Huanshere/whisperapi
v0.3.0
Huanshere authored Sep 4, 2024
2 parents 0bb99df + 5bebfd3 commit 2b5ac48
Showing 26 changed files with 471 additions and 386 deletions.
6 changes: 1 addition & 5 deletions .gitignore
@@ -158,9 +158,5 @@ _model_cache/
 # large files
 /ffmpeg.exe
 /ffmpeg
-*.mp4
-*.webm
-*.mp3
 .DS_Store
-runtime/
-dev/
+_config.py
16 changes: 6 additions & 10 deletions config.py
@@ -2,17 +2,18 @@
 # It is recommended to adjust these settings on the streamlit page
 ## ======================== Basic Settings ======================== ##
 # API settings; the recommended provider is https://api.wlai.vip, where sonnet costs only 10 RMB per 1M tokens
-# !Make sure the key uses the AZ channel
+# !WHISPER_API_KEY must currently use the `pure AZ channel`; API_KEY should use the default channel
 API_KEY = 'sk-xxx'
+WHISPER_API_KEY = 'sk-xxx'
 BASE_URL = 'https://api2.wlai.vip'
 MODEL = ['claude-3-5-sonnet-20240620']

 # Language settings, described in natural language
 TARGET_LANGUAGE = '简体中文'

 # Subtitle settings
-## Maximum number of characters per line of English subtitles
-MAX_ENGLISH_LENGTH = 80
+## Maximum number of characters per line of source-language subtitles
+MAX_SRC_LENGTH = 80
 ## Maximum length per line of translated subtitles; adjust for the target language (e.g. 30 characters for Chinese)
 MAX_TARGET_LANGUAGE_LENGTH = 30

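Note on the key split above: WHISPER_API_KEY and API_KEY are separate because transcription and chat traffic route through different channels. A minimal sketch of how the pair could be wired to OpenAI-compatible clients (the client construction here is an assumption, not this repo's actual code):

# Hypothetical sketch -- shows only how the two keys from config.py could be used.
from openai import OpenAI
from config import API_KEY, WHISPER_API_KEY, BASE_URL

chat_client = OpenAI(api_key=API_KEY, base_url=BASE_URL)             # default channel
whisper_client = OpenAI(api_key=WHISPER_API_KEY, base_url=BASE_URL)  # pure AZ channel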
@@ -22,9 +23,6 @@
 # Video resolution
 RESOLUTIOM = '854x480'

-# Language for whisper; 'auto' enables auto-detection, but anything other than English currently causes minor issues downstream
-AUDIO_LANGUAGE = 'en'
-
 ## ======================== Advanced Settings ======================== ##
 # Supported video formats
 ALLOWED_VIDEO_FORMATS = ['mp4', 'mov', 'avi', 'mkv', 'flv', 'wmv', 'webm']
@@ -49,15 +47,13 @@
 gemini_pretrain = os.getenv('GEMINI_PRETRAIN')
 cloud_model_dir = os.path.join(gemini_pretrain, "_model_cache")

-# Whisper / NLP configuration
+# GPT_SoVITS / uvr5 model directory
 MODEL_DIR = "./_model_cache" if not cloud else cloud_model_dir
-WHISPER_MODEL = "medium" # medium for GPUs under 12 GB, large-v2 for 12 GB and up
-SPACY_NLP_MODEL = "en_core_web_md" # _md is sufficient

 # Audio configuration
 MIN_SUBTITLE_DURATION = 5

-# Original vocal volume in the dubbed video
+# Original vocal volume in the dubbed video, 0.1 = 10%
 ORIGINAL_VOLUME = 0.1

 # Word count for the first rough split; below 18 sentences are cut too finely and hurt translation, above 22 they are too long and make later subtitle alignment difficult
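For reference, a sketch of how the per-line caps above might be enforced when laying out subtitles (hypothetical helper; the repo's real splitting logic lives in the core modules):

# Hypothetical helper -- illustrates the MAX_* settings only.
from config import MAX_SRC_LENGTH, MAX_TARGET_LANGUAGE_LENGTH

def fits_on_one_line(line: str, is_translation: bool) -> bool:
    # Translated lines use the target-language cap (e.g. 30 for Chinese);
    # source lines use the character cap for the original language.
    limit = MAX_TARGET_LANGUAGE_LENGTH if is_translation else MAX_SRC_LENGTH
    return len(line) <= limit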
120 changes: 65 additions & 55 deletions core/prompts_storage.py

Large diffs are not rendered by default.

62 changes: 54 additions & 8 deletions core/spacy_utils/load_nlp_model.py
@@ -1,16 +1,62 @@
-import os,sys
+import os,sys,json
 import spacy
 from spacy.cli import download
 sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
-from config import SPACY_NLP_MODEL
+from core.step2_whisperapi import get_whisper_language

+def get_spacy_model(language: str):
+    language_map = {
+        "english": "en_core_web_sm",
+        "chinese": "zh_core_web_sm",
+        "spanish": "es_core_news_sm",
+        "french": "fr_core_news_sm",
+        "german": "de_core_news_sm",
+        "italian": "it_core_news_sm",
+        "japanese": "ja_core_news_sm",
+        "portuguese": "pt_core_news_sm",
+        "dutch": "nl_core_news_sm",
+        "greek": "el_core_news_sm",
+        "russian": "ru_core_news_sm",
+        "arabic": "ar_core_news_sm",
+        "hindi": "hi_core_news_sm",
+        "korean": "ko_core_news_sm",
+        "polish": "pl_core_news_sm",
+        "ukrainian": "uk_core_news_sm",
+        "vietnamese": "vi_core_news_sm",
+        "turkish": "tr_core_news_sm",
+        "thai": "th_core_news_sm",
+        "romanian": "ro_core_news_sm",
+        "danish": "da_core_news_sm",
+        "finnish": "fi_core_news_sm",
+        "hungarian": "hu_core_news_sm",
+        "norwegian": "nb_core_news_sm",
+        "swedish": "sv_core_news_sm"
+    }
+
+    model = language_map.get(language.lower(), "en_core_web_sm")
+    if language not in language_map:
+        print(f"Spacy model does not support '{language}', falling back to en_core_web_sm...")
+    return model
+
 def init_nlp():
-    print(f"⏳ Loading NLP Spacy model: <{SPACY_NLP_MODEL}> ...")
     try:
-        nlp = spacy.load(SPACY_NLP_MODEL)
+        language = get_whisper_language()
+        model = get_spacy_model(language)
+        print(f"⏳ Loading NLP Spacy model: <{model}> ...")
+        try:
+            nlp = spacy.load(model)
+        except:
+            print(f"Downloading {model} model...")
+            download(model)
+            nlp = spacy.load(model)
     except:
-        print(f"Downloading {SPACY_NLP_MODEL} model...")
-        download(SPACY_NLP_MODEL)
-        nlp = spacy.load(SPACY_NLP_MODEL)
+        print(f"Language not detected, falling back to en_core_web_sm...")
+        model = "en_core_web_sm"
+        try:
+            nlp = spacy.load(model)
+        except:
+            print(f"Downloading {model} model...")
+            download(model)
+            nlp = spacy.load(model)
     print(f"✅ NLP Spacy model loaded successfully!")
     return nlp
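From the caller's side nothing changes; usage stays along these lines (a sketch, assuming a transcript already exists so get_whisper_language() can resolve a language):

# Sketch: the loader now picks the spaCy model by detected language and
# falls back to en_core_web_sm when detection fails.
from core.spacy_utils.load_nlp_model import init_nlp

nlp = init_nlp()
doc = nlp("This is one sentence. Here is another.")
print([sent.text for sent in doc.sents])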
9 changes: 5 additions & 4 deletions core/spacy_utils/split_by_comma.py
@@ -32,7 +32,7 @@ def split_by_comma(text, nlp):
     start = 0

     for i, token in enumerate(doc):
-        if token.text == ",":
+        if token.text == "," or token.text == ",":
             suitable_for_splitting = analyze_comma(start, doc, token)

             if suitable_for_splitting :
@@ -67,6 +67,7 @@ def split_by_comma_main():
print("💾 Sentences split by commas saved to → `sentences_by_comma.txt`")

if __name__ == "__main__":
# split_by_comma_main()
test = "So in the same frame, right there, almost in the exact same spot on the ice, Brown has committed himself, whereas McDavid has not."
print(split_by_comma(test))
split_by_comma_main()
# nlp = init_nlp()
# test = "So in the same frame, right there, almost in the exact same spot on the ice, Brown has committed himself, whereas McDavid has not."
# print(split_by_comma(test, nlp))
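The added second comparison matters because the full-width comma used in CJK text is a distinct code point, so the old exact match never fired on it:

# Half-width vs. full-width comma -- different characters entirely.
print("," == ",")          # False
print(ord(","), ord(","))  # 44 65292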
16 changes: 10 additions & 6 deletions core/spacy_utils/split_by_connector.py
@@ -89,14 +89,18 @@ def split_sentences_main():
         split_sentences = split_by_connectors(sentence.strip(), nlp = nlp)
         all_split_sentences.extend(split_sentences)

-    # output to sentence_splitbymark.txt
-    with open("output/log/sentence_splitbymark.txt", "w+", encoding="utf-8") as output_file:
+    # output to sentence_splitbynlp.txt
+    with open("output/log/sentence_splitbynlp.txt", "w+", encoding="utf-8") as output_file:
         for sentence in all_split_sentences:
             output_file.write(sentence + "\n")
+        # do not leave a trailing newline on the last line
+        output_file.seek(output_file.tell() - 1, os.SEEK_SET)
+        output_file.truncate()

-    print("💾 Sentences split by connectors saved to → `sentence_splitbymark.txt`")
+    print("💾 Sentences split by connectors saved to → `sentence_splitbynlp.txt`")

 if __name__ == "__main__":
-    # split_sentences_main()
-    a = "and show the specific differences that make a difference between a breakaway that results in a goal in the NHL versus one that doesn't."
-    print(split_by_connectors(a))
+    split_sentences_main()
+    # nlp = init_nlp()
+    # a = "and show the specific differences that make a difference between a breakaway that results in a goal in the NHL versus one that doesn't."
+    # print(split_by_connectors(a, nlp))
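The seek/truncate pair added above rewinds one byte after the loop so the log file does not end in a newline. The same trick in isolation (POSIX-style newlines assumed, hence newline="\n"):

import os

# Sketch of the trailing-newline trim used above.
with open("demo.txt", "w+", encoding="utf-8", newline="\n") as f:
    for line in ["a", "b", "c"]:
        f.write(line + "\n")
    f.seek(f.tell() - 1, os.SEEK_SET)  # step back over the final "\n"
    f.truncate()                       # cut the file right there

print(open("demo.txt", "rb").read())   # b'a\nb\nc'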
5 changes: 1 addition & 4 deletions core/spacy_utils/split_by_mark.py
@@ -1,15 +1,12 @@
 import warnings
 warnings.filterwarnings("ignore", category=FutureWarning)
-import pandas as pd
 import os,sys
 sys.path.append(os.path.dirname(os.path.abspath(__file__)))
 from load_nlp_model import init_nlp

 def split_by_mark():
     nlp = init_nlp()
-    df = pd.read_excel("output/log/cleaned_chunks.xlsx")
-    df['text'] = df['text'].str.strip('"').str.strip()
-    input_text = " ".join(df['text'])
+    input_text = open("output/log/raw_transcript.txt", "r", encoding="utf-8").read()
     doc = nlp(input_text)
     assert doc.has_annotation("SENT_START")

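The rest of the function is truncated in this diff. Presumably it iterates doc.sents and writes one sentence per line; a hypothetical continuation (the file name is a guess):

# Hypothetical continuation -- not shown in the diff.
with open("output/log/sentence_by_mark.txt", "w", encoding="utf-8") as f:
    for sent in doc.sents:
        f.write(sent.text.strip() + "\n")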
92 changes: 0 additions & 92 deletions core/step2_whisper_stamped.py

This file was deleted.
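This deletion is the point of the PR: local whisper-timestamped transcription is replaced by the Whisper API in core/step2_whisperapi.py, which the load_nlp_model diff above imports get_whisper_language from. A hypothetical sketch of that helper, assuming the detected language is persisted to a JSON log (path and schema are guesses):

# Hypothetical sketch of core/step2_whisperapi.get_whisper_language.
import json

def get_whisper_language():
    # Return the language Whisper reported for the current video.
    with open("output/log/transcript.json", "r", encoding="utf-8") as f:
        return json.load(f).get("language")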
