
Commit

Merge pull request #39 from Huanshere/whisperapi
v0.3.0
Huanshere authored Sep 4, 2024
2 parents 0bb99df + 5bebfd3 commit 2b5ac48
Showing 26 changed files with 471 additions and 386 deletions.
6 changes: 1 addition & 5 deletions .gitignore
@@ -158,9 +158,5 @@ _model_cache/
 # large files
 /ffmpeg.exe
 /ffmpeg
-*.mp4
-*.webm
-*.mp3
 .DS_Store
-runtime/
-dev/
+_config.py
16 changes: 6 additions & 10 deletions config.py
@@ -2,17 +2,18 @@
 # It is recommended to adjust these settings on the streamlit page
 ## ======================== Basic Settings ======================== ##
 # API settings; the recommended provider is https://api.wlai.vip, where sonnet costs only 10 RMB per 1M tokens
-# !Make sure the key uses the AZ channel
+# !WHISPER_API_KEY must currently use the `pure AZ channel`; API_KEY should use the default channel
 API_KEY = 'sk-xxx'
+WHISPER_API_KEY = 'sk-xxx'
 BASE_URL = 'https://api2.wlai.vip'
 MODEL = ['claude-3-5-sonnet-20240620']

 # Language settings, described in natural language
 TARGET_LANGUAGE = '简体中文'

 # Subtitle settings
-## Maximum number of characters per line of English subtitles
-MAX_ENGLISH_LENGTH = 80
+## Maximum number of characters per line of source-language subtitles
+MAX_SRC_LENGTH = 80
 ## Maximum length per line of translated subtitles; adjust for the target language (e.g. 30 characters for Chinese)
 MAX_TARGET_LANGUAGE_LENGTH = 30

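Note on the key split above: WHISPER_API_KEY and API_KEY are separate because transcription and chat traffic route through different channels. A minimal sketch of how the pair could be wired to OpenAI-compatible clients (the client construction here is an assumption, not this repo's actual code):

# Hypothetical sketch -- shows only how the two keys from config.py could be used.
from openai import OpenAI
from config import API_KEY, WHISPER_API_KEY, BASE_URL

chat_client = OpenAI(api_key=API_KEY, base_url=BASE_URL)             # default channel
whisper_client = OpenAI(api_key=WHISPER_API_KEY, base_url=BASE_URL)  # pure AZ channel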
@@ -22,9 +23,6 @@
 # Video resolution
 RESOLUTIOM = '854x480'

-# Language for whisper; 'auto' enables auto-detection, but anything other than English currently causes minor issues downstream
-AUDIO_LANGUAGE = 'en'
-
 ## ======================== Advanced Settings ======================== ##
 # Supported video formats
 ALLOWED_VIDEO_FORMATS = ['mp4', 'mov', 'avi', 'mkv', 'flv', 'wmv', 'webm']
@@ -49,15 +47,13 @@
 gemini_pretrain = os.getenv('GEMINI_PRETRAIN')
 cloud_model_dir = os.path.join(gemini_pretrain, "_model_cache")

-# Whisper / NLP configuration
+# GPT_SoVITS / uvr5 model directory
 MODEL_DIR = "./_model_cache" if not cloud else cloud_model_dir
-WHISPER_MODEL = "medium" # medium for GPUs under 12 GB, large-v2 for 12 GB and up
-SPACY_NLP_MODEL = "en_core_web_md" # _md is sufficient

 # Audio configuration
 MIN_SUBTITLE_DURATION = 5

-# Original vocal volume in the dubbed video
+# Original vocal volume in the dubbed video, 0.1 = 10%
 ORIGINAL_VOLUME = 0.1

 # Word count for the first rough split; below 18 sentences are cut too finely and hurt translation, above 22 they are too long and make later subtitle alignment difficult
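For reference, a sketch of how the per-line caps above might be enforced when laying out subtitles (hypothetical helper; the repo's real splitting logic lives in the core modules):

# Hypothetical helper -- illustrates the MAX_* settings only.
from config import MAX_SRC_LENGTH, MAX_TARGET_LANGUAGE_LENGTH

def fits_on_one_line(line: str, is_translation: bool) -> bool:
    # Translated lines use the target-language cap (e.g. 30 for Chinese);
    # source lines use the character cap for the original language.
    limit = MAX_TARGET_LANGUAGE_LENGTH if is_translation else MAX_SRC_LENGTH
    return len(line) <= limit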
120 changes: 65 additions & 55 deletions core/prompts_storage.py

Large diffs are not rendered by default.

62 changes: 54 additions & 8 deletions core/spacy_utils/load_nlp_model.py
@@ -1,16 +1,62 @@
-import os,sys
+import os,sys,json
 import spacy
 from spacy.cli import download
 sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
-from config import SPACY_NLP_MODEL
+from core.step2_whisperapi import get_whisper_language

+def get_spacy_model(language: str):
+    language_map = {
+        "english": "en_core_web_sm",
+        "chinese": "zh_core_web_sm",
+        "spanish": "es_core_news_sm",
+        "french": "fr_core_news_sm",
+        "german": "de_core_news_sm",
+        "italian": "it_core_news_sm",
+        "japanese": "ja_core_news_sm",
+        "portuguese": "pt_core_news_sm",
+        "dutch": "nl_core_news_sm",
+        "greek": "el_core_news_sm",
+        "russian": "ru_core_news_sm",
+        "arabic": "ar_core_news_sm",
+        "hindi": "hi_core_news_sm",
+        "korean": "ko_core_news_sm",
+        "polish": "pl_core_news_sm",
+        "ukrainian": "uk_core_news_sm",
+        "vietnamese": "vi_core_news_sm",
+        "turkish": "tr_core_news_sm",
+        "thai": "th_core_news_sm",
+        "romanian": "ro_core_news_sm",
+        "danish": "da_core_news_sm",
+        "finnish": "fi_core_news_sm",
+        "hungarian": "hu_core_news_sm",
+        "norwegian": "nb_core_news_sm",
+        "swedish": "sv_core_news_sm"
+    }
+
+    model = language_map.get(language.lower(), "en_core_web_sm")
+    if language not in language_map:
+        print(f"Spacy model does not support '{language}', falling back to en_core_web_sm...")
+    return model
+
 def init_nlp():
-    print(f"⏳ Loading NLP Spacy model: <{SPACY_NLP_MODEL}> ...")
     try:
-        nlp = spacy.load(SPACY_NLP_MODEL)
+        language = get_whisper_language()
+        model = get_spacy_model(language)
+        print(f"⏳ Loading NLP Spacy model: <{model}> ...")
+        try:
+            nlp = spacy.load(model)
+        except:
+            print(f"Downloading {model} model...")
+            download(model)
+            nlp = spacy.load(model)
     except:
-        print(f"Downloading {SPACY_NLP_MODEL} model...")
-        download(SPACY_NLP_MODEL)
-        nlp = spacy.load(SPACY_NLP_MODEL)
+        print(f"Language not detected, falling back to en_core_web_sm...")
+        model = "en_core_web_sm"
+        try:
+            nlp = spacy.load(model)
+        except:
+            print(f"Downloading {model} model...")
+            download(model)
+            nlp = spacy.load(model)
     print(f"✅ NLP Spacy model loaded successfully!")
     return nlp
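From the caller's side nothing changes; usage stays along these lines (a sketch, assuming a transcript already exists so get_whisper_language() can resolve a language):

# Sketch: the loader now picks the spaCy model by detected language and
# falls back to en_core_web_sm when detection fails.
from core.spacy_utils.load_nlp_model import init_nlp

nlp = init_nlp()
doc = nlp("This is one sentence. Here is another.")
print([sent.text for sent in doc.sents])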
9 changes: 5 additions & 4 deletions core/spacy_utils/split_by_comma.py
@@ -32,7 +32,7 @@ def split_by_comma(text, nlp):
     start = 0

     for i, token in enumerate(doc):
-        if token.text == ",":
+        if token.text == "," or token.text == ",":
             suitable_for_splitting = analyze_comma(start, doc, token)

             if suitable_for_splitting :
@@ -67,6 +67,7 @@ def split_by_comma_main():
print("💾 Sentences split by commas saved to → `sentences_by_comma.txt`")

if __name__ == "__main__":
# split_by_comma_main()
test = "So in the same frame, right there, almost in the exact same spot on the ice, Brown has committed himself, whereas McDavid has not."
print(split_by_comma(test))
split_by_comma_main()
# nlp = init_nlp()
# test = "So in the same frame, right there, almost in the exact same spot on the ice, Brown has committed himself, whereas McDavid has not."
# print(split_by_comma(test, nlp))
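The added second comparison matters because the full-width comma used in CJK text is a distinct code point, so the old exact match never fired on it:

# Half-width vs. full-width comma -- different characters entirely.
print("," == ",")          # False
print(ord(","), ord(","))  # 44 65292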
16 changes: 10 additions & 6 deletions core/spacy_utils/split_by_connector.py
@@ -89,14 +89,18 @@ def split_sentences_main():
         split_sentences = split_by_connectors(sentence.strip(), nlp = nlp)
         all_split_sentences.extend(split_sentences)

-    # output to sentence_splitbymark.txt
-    with open("output/log/sentence_splitbymark.txt", "w+", encoding="utf-8") as output_file:
+    # output to sentence_splitbynlp.txt
+    with open("output/log/sentence_splitbynlp.txt", "w+", encoding="utf-8") as output_file:
         for sentence in all_split_sentences:
             output_file.write(sentence + "\n")
+        # do not leave a trailing newline on the last line
+        output_file.seek(output_file.tell() - 1, os.SEEK_SET)
+        output_file.truncate()

-    print("💾 Sentences split by connectors saved to → `sentence_splitbymark.txt`")
+    print("💾 Sentences split by connectors saved to → `sentence_splitbynlp.txt`")

 if __name__ == "__main__":
-    # split_sentences_main()
-    a = "and show the specific differences that make a difference between a breakaway that results in a goal in the NHL versus one that doesn't."
-    print(split_by_connectors(a))
+    split_sentences_main()
+    # nlp = init_nlp()
+    # a = "and show the specific differences that make a difference between a breakaway that results in a goal in the NHL versus one that doesn't."
+    # print(split_by_connectors(a, nlp))
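The seek/truncate pair added above rewinds one byte after the loop so the log file does not end in a newline. The same trick in isolation (POSIX-style newlines assumed, hence newline="\n"):

import os

# Sketch of the trailing-newline trim used above.
with open("demo.txt", "w+", encoding="utf-8", newline="\n") as f:
    for line in ["a", "b", "c"]:
        f.write(line + "\n")
    f.seek(f.tell() - 1, os.SEEK_SET)  # step back over the final "\n"
    f.truncate()                       # cut the file right there

print(open("demo.txt", "rb").read())   # b'a\nb\nc'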
5 changes: 1 addition & 4 deletions core/spacy_utils/split_by_mark.py
@@ -1,15 +1,12 @@
 import warnings
 warnings.filterwarnings("ignore", category=FutureWarning)
-import pandas as pd
 import os,sys
 sys.path.append(os.path.dirname(os.path.abspath(__file__)))
 from load_nlp_model import init_nlp

 def split_by_mark():
     nlp = init_nlp()
-    df = pd.read_excel("output/log/cleaned_chunks.xlsx")
-    df['text'] = df['text'].str.strip('"').str.strip()
-    input_text = " ".join(df['text'])
+    input_text = open("output/log/raw_transcript.txt", "r", encoding="utf-8").read()
     doc = nlp(input_text)
     assert doc.has_annotation("SENT_START")

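The rest of the function is truncated in this diff. Presumably it iterates doc.sents and writes one sentence per line; a hypothetical continuation (the file name is a guess):

# Hypothetical continuation -- not shown in the diff.
with open("output/log/sentence_by_mark.txt", "w", encoding="utf-8") as f:
    for sent in doc.sents:
        f.write(sent.text.strip() + "\n")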
92 changes: 0 additions & 92 deletions core/step2_whisper_stamped.py

This file was deleted.
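This deletion is the point of the PR: local whisper-timestamped transcription is replaced by the Whisper API in core/step2_whisperapi.py, which the load_nlp_model diff above imports get_whisper_language from. A hypothetical sketch of that helper, assuming the detected language is persisted to a JSON log (path and schema are guesses):

# Hypothetical sketch of core/step2_whisperapi.get_whisper_language.
import json

def get_whisper_language():
    # Return the language Whisper reported for the current video.
    with open("output/log/transcript.json", "r", encoding="utf-8") as f:
        return json.load(f).get("language")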
