diff --git a/.gitignore b/.gitignore
index aaac5435..c2150805 100644
--- a/.gitignore
+++ b/.gitignore
@@ -158,9 +158,5 @@ _model_cache/
 # large files
 /ffmpeg.exe
 /ffmpeg
-*.mp4
-*.webm
-*.mp3
 .DS_Store
-runtime/
-dev/
\ No newline at end of file
+_config.py
\ No newline at end of file
diff --git a/config.py b/config.py
index 8d2740aa..a26d25ec 100644
--- a/config.py
+++ b/config.py
@@ -2,8 +2,9 @@
 # 建议在 streamlit 页面中调整设置
 ## ======================== 基本设置 ======================== ##
 # API 设置 建议使用唯一真神 https://api.wlai.vip, sonnet 价格仅 10r/1M
-# !一定确保 key 是 AZ 渠道
+# !WHISPER_API_KEY 暂时必须走 `纯AZ渠道`, API_KEY 建议走默认渠道
 API_KEY = 'sk-xxx'
+WHISPER_API_KEY = 'sk-xxx'
 BASE_URL = 'https://api2.wlai.vip'
 MODEL = ['claude-3-5-sonnet-20240620']
@@ -11,8 +12,8 @@
 TARGET_LANGUAGE = '简体中文'
 
 # 字幕设置
-## 每行英文字幕的最大长度字母数量
-MAX_ENGLISH_LENGTH = 80
+## 每行原字幕的最大长度字母数量
+MAX_SRC_LENGTH = 80
 
 ## 每行翻译字幕的最大长度 根据目标语言调整(如中文为30个字)
 MAX_TARGET_LANGUAGE_LENGTH = 30
@@ -22,9 +23,6 @@
 # 视频分辨率
 RESOLUTIOM = '854x480'
 
-# whisper 指定语言,auto 为自动识别但目前不用英文后续会有小问题
-AUDIO_LANGUAGE = 'en'
-
 ## ======================== 进阶设置设置 ======================== ##
 # 支持视频格式
 ALLOWED_VIDEO_FORMATS = ['mp4', 'mov', 'avi', 'mkv', 'flv', 'wmv', 'webm']
@@ -49,15 +47,13 @@
 gemini_pretrain = os.getenv('GEMINI_PRETRAIN')
 cloud_model_dir = os.path.join(gemini_pretrain, "_model_cache")
 
-# Whisper 和 NLP 配置
+# GPT_SoVITS 和 uvr5 模型目录
 MODEL_DIR = "./_model_cache" if not cloud else cloud_model_dir
-WHISPER_MODEL = "medium" # medium :12 GB < GPU > 12GB : large-v2
-SPACY_NLP_MODEL = "en_core_web_md" # _md 足够
 
 # 音频配置
 MIN_SUBTITLE_DURATION = 5
 
-# 配音视频中原始人声音量
+# 配音视频中原始人声音量 0.1=10%
 ORIGINAL_VOLUME = 0.1
 
 # 第一次粗切单词数,18以下会切太碎影响翻译,22 以上太长会导致后续为字幕切分难以对齐
diff --git a/core/prompts_storage.py b/core/prompts_storage.py
index bddb44fc..7f71ed7a 100644
--- a/core/prompts_storage.py
+++ b/core/prompts_storage.py
@@ -1,12 +1,14 @@
-import json
-
+import os,sys,json
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from core.step2_whisperapi import get_whisper_language
 ## ================================================================
 # @ step4_splitbymeaning.py
 def get_split_prompt(sentence, num_parts = 2, word_limit = 20):
     # ! only support num_parts = 2
+    language = get_whisper_language()
     split_prompt = f"""
 ### Role
-You are a professional and experienced Netflix subtitle splitter.
+You are a professional and experienced Netflix subtitle splitter in {language}.
 
 ### Task
 Your task is to split the given subtitle text into **{num_parts}** parts, each should be less than {word_limit} words.
@@ -43,15 +45,17 @@ def get_split_prompt(sentence, num_parts = 2, word_limit = 20):
 
 ## ================================================================
 # @ step4_1_summarize.py
-def get_summary_prompt(English_content, target_language):
+def get_summary_prompt(source_content):
+    src_language = get_whisper_language()
+    from config import TARGET_LANGUAGE
     summary_prompt = f"""
 ### Role
-You are a professional video translation expert and terminology consultant. Your expertise lies not only in accurately understanding the original text but also in extracting key professional terms and optimizing the translation to better suit the expression habits and cultural background of {target_language}.
+You are a professional video translation expert and terminology consultant. Your expertise lies not only in accurately understanding the original {src_language} text but also in extracting key professional terms and optimizing the translation to better suit the expression habits and cultural background of {TARGET_LANGUAGE}.
 
 ### Task Description
-For the provided original video text, you need to:
+For the provided original {src_language} video text, you need to:
 1. Summarize the video's main topic in one sentence
-2. Extract professional terms that appear in the video, and provide {target_language} translations or suggest keeping the original language terms. Avoid extracting simple, common words.
+2. Extract professional terms that appear in the video, and provide {TARGET_LANGUAGE} translations or suggest keeping the original language terms. Avoid extracting simple, common words.
 3. For each translated term, provide a brief explanation
 
 ### Analysis and Summary Steps
@@ -61,7 +65,7 @@ def get_summary_prompt(English_content, target_language):
    - Summarize the topic in one concise sentence
 2. Term extraction:
    - Carefully read the entire text, marking professional terms
-   - For each term, provide a {target_language} translation or suggest keeping the original, only the word itself is needed, not the pronunciation
+   - For each term, provide a {TARGET_LANGUAGE} translation or suggest keeping the original, only the word itself is needed, not the pronunciation
    - Add a brief explanation for each term to help the translator understand
    - If the word is a fixed abbreviation, please keep the original.
@@ -71,13 +75,13 @@ def get_summary_prompt(English_content, target_language):
 {{
     "theme": "<one-sentence theme summary>",
     "terms": [
         {{
-            "original": "<English term>",
-            "translation": "<{target_language} translation or keep original>",
+            "original": "<original term>",
+            "translation": "<{TARGET_LANGUAGE} translation or keep original>",
             "explanation": "<brief explanation>"
         }},
         {{
-            "original": "<English term>",
-            "translation": "<{target_language} translation or keep original>",
+            "original": "<original term>",
+            "translation": "<{TARGET_LANGUAGE} translation or keep original>",
             "explanation": "<brief explanation>"
         }},
         ...
     ]
@@ -105,7 +109,7 @@ def get_summary_prompt(English_content, target_language):
 
 ### Video text data to be processed
-{English_content}
+{source_content}
 """.strip()
 
@@ -113,7 +117,6 @@ def get_summary_prompt(English_content, target_language):
 ## ================================================================
 # @ step5_translate.py & translate_lines.py
-
 def generate_shared_prompt(previous_content_prompt, after_content_prompt, summary_prompt, things_to_note_prompt):
     return f'''### Context Information
@@ -130,9 +133,8 @@ def generate_shared_prompt(previous_content_prompt, after_content_prompt, summar
 ### Points to Note
 {things_to_note_prompt}'''
-
-
-def get_prompt_faithfulness(lines, shared_prompt, target_language = '简体中文'):
+def get_prompt_faithfulness(lines, shared_prompt):
+    from config import TARGET_LANGUAGE
     # Split lines by \n
     line_splits = lines.split('\n')
 
@@ -140,20 +142,21 @@ def get_prompt_faithfulness(lines, shared_prompt, target_language = '简体中
     json_format = {}
     for i, line in enumerate(line_splits, 1):
         json_format[i] = {
-            "Original English": line,
-            "Direct Translation": f"<<direct {target_language} translation>>"
+            "Original Subtitle": line,
+            "Direct Translation": f"<<direct {TARGET_LANGUAGE} translation>>"
         }
+    src_language = get_whisper_language()
     prompt_faithfulness = f'''
 ### Role Definition
-You are a professional Netflix subtitle translator, fluent in both the original video language and {target_language}, as well as their respective cultures. Your expertise lies in accurately understanding the semantics and structure of the original text and faithfully translating it into {target_language} while preserving the original meaning.
+You are a professional Netflix subtitle translator, fluent in both {src_language} and {TARGET_LANGUAGE}, as well as their respective cultures. Your expertise lies in accurately understanding the semantics and structure of the original {src_language} text and faithfully translating it into {TARGET_LANGUAGE} while preserving the original meaning.
 
 ### Task Background
-We have a segment of original subtitles that need to be directly translated into {target_language}. These subtitles come from a specific context and may contain specific themes and terminology.
+We have a segment of original {src_language} subtitles that need to be directly translated into {TARGET_LANGUAGE}. These subtitles come from a specific context and may contain specific themes and terminology.
 
 ### Task Description
-Based on the provided original subtitles, you need to:
-1. Translate the original subtitles into {target_language} line by line
+Based on the provided original {src_language} subtitles, you need to:
+1. Translate the original {src_language} subtitles into {TARGET_LANGUAGE} line by line
 2. Ensure the translation is faithful to the original, accurately conveying the original meaning
 3. Consider the context and professional terminology
@@ -177,25 +180,27 @@ def get_prompt_faithfulness(lines, shared_prompt, target_language = '简体中
 
-def get_prompt_expressiveness(faithfulness_result, lines, shared_prompt, target_language):
+def get_prompt_expressiveness(faithfulness_result, lines, shared_prompt):
+    from config import TARGET_LANGUAGE
     json_format = {}
     for key, value in faithfulness_result.items():
         json_format[key] = {
-            "Original English": value['Original English'],
+            "Original Subtitle": value['Original Subtitle'],
             "Direct Translation": value['Direct Translation'],
-            "Translation Reflection": "<<reflection on the direct translation version>>",
-            "Free Translation": f"<<free translation conforming to {target_language} expression habits>>"
+            "Translation Reflection": "<<reflection on the direct translation version>>",
+            "Free Translation": f"<<free translation conforming to {TARGET_LANGUAGE} expression habits>>"
         }
+    src_language = get_whisper_language()
     prompt_expressiveness = f'''
 ### Role Definition
-You are a professional Netflix subtitle translator and language consultant. Your expertise lies not only in accurately understanding the original video language but also in optimizing the {target_language} translation to better suit the target language's expression habits and cultural background.
+You are a professional Netflix subtitle translator and language consultant. Your expertise lies not only in accurately understanding the original {src_language} but also in optimizing the {TARGET_LANGUAGE} translation to better suit the target language's expression habits and cultural background.
 
 ### Task Background
-We already have a direct translation version of the original subtitles. Now we need you to reflect on and improve these direct translations to create more natural and fluent {target_language} subtitles.
+We already have a direct translation version of the original {src_language} subtitles. Now we need you to reflect on and improve these direct translations to create more natural and fluent {TARGET_LANGUAGE} subtitles.
 
 ### Task Description
-Based on the provided original text and {target_language} direct translation, you need to:
+Based on the provided original {src_language} text and {TARGET_LANGUAGE} direct translation, you need to:
 1. Analyze the direct translation results line by line, pointing out existing issues
 2. Provide detailed modification suggestions
 3. Perform free translation based on your analysis
@@ -210,11 +215,11 @@ def get_prompt_expressiveness(faithfulness_result, lines, shared_prompt, target_
    - Check if the language style is consistent with the original text
    - Check the conciseness of the subtitles, point out where the translation is too wordy, the translation should be close to the original text in length
 
-2. {target_language} Free Translation:
+2. {TARGET_LANGUAGE} Free Translation:
    - Based on the reflection in step 1, perform free translation
-   - Aim for contextual smoothness and naturalness, conforming to {target_language} expression habits
-   - Ensure it's easy for {target_language} audience to understand and accept
-   - Keep the subtitles concise, with a plain and natural language style, and maintain consistency in structure between the free translation and the English original
+   - Aim for contextual smoothness and naturalness, conforming to {TARGET_LANGUAGE} expression habits
+   - Ensure it's easy for {TARGET_LANGUAGE} audience to understand and accept
+   - Keep the subtitles concise, with a plain and natural language style, and maintain consistency in structure between the free translation and the {src_language} original
 
 ### Subtitle Data
@@ -230,42 +235,44 @@ def get_prompt_expressiveness(faithfulness_result, lines, shared_prompt, target_
 
 ## ================================================================
 # @ step6_splitforsub.py
-def get_align_prompt(en_original, target_original, en_part, target_language):
-    en_splits = en_part.split('\n')
-    num_parts = len(en_splits)
-    en_part = en_part.replace('\n', ' [br] ')
+def get_align_prompt(src_sub, tr_sub, src_part):
+    from config import TARGET_LANGUAGE
+    src_language = get_whisper_language()
+    src_splits = src_part.split('\n')
+    num_parts = len(src_splits)
+    src_part = src_part.replace('\n', ' [br] ')
     align_prompt = '''
 ### Role Definition
-You are a Netflix subtitle alignment expert fluent in both the original video language and {target_language}. Your expertise lies in accurately understanding the semantics and structure of both languages, enabling you to flexibly split sentences while preserving the original meaning.
+You are a Netflix subtitle alignment expert fluent in both {src_language} and {target_language}. Your expertise lies in accurately understanding the semantics and structure of both languages, enabling you to flexibly split sentences while preserving the original meaning.
 
 ### Task Background
-We have the original video language and {target_language} original subtitles for a Netflix program, as well as a pre-processed split version of the original video language subtitles. Your task is to create the best splitting scheme for the {target_language} subtitles based on this information.
+We have {src_language} and {target_language} original subtitles for a Netflix program, as well as a pre-processed split version of {src_language} subtitles. Your task is to create the best splitting scheme for the {target_language} subtitles based on this information.
 
 ### Task Description
-Based on the provided original video language and {target_language} original subtitles, as well as the pre-processed split version, you need to:
-1. Analyze the word order and structural correspondence between the original video language and {target_language} subtitles
+Based on the provided original {src_language} and {target_language} original subtitles, as well as the pre-processed split version, you need to:
+1. Analyze the word order and structural correspondence between {src_language} and {target_language} subtitles
 2. Provide 3 different splitting schemes for the {target_language} subtitles
 3. Evaluate these schemes and select the best one
 4. Never leave empty lines. If it's difficult to split based on meaning, you may appropriately rewrite the sentences that need to be aligned
 
 ### Subtitle Data
-Original: "{en_original}"
-{target_language} Original: "{target_original}"
-Pre-processed Original Video Language ([br] indicates split points): {en_part}
+{src_language} Original: "{src_sub}"
+{target_language} Original: "{tr_sub}"
+Pre-processed {src_language} Subtitles ([br] indicates split points): {src_part}
 
 ### Processing Steps
 Please follow these steps and provide the results for each step in the JSON output:
-1. Analysis and Comparison: Briefly analyze the word order, sentence structure, and semantic correspondence between the original video language and {target_language} subtitles. Point out key word correspondences, similarities and differences in sentence patterns, and language features that may affect splitting.
-2. Start Alignment: Based on your analysis, provide 3 different alignment methods for {target_language} subtitles according to the format. The split positions in the original video language must be consistent with the pre-processed original video language split version and cannot be changed arbitrarily.
+1. Analysis and Comparison: Briefly analyze the word order, sentence structure, and semantic correspondence between {src_language} and {target_language} subtitles. Point out key word correspondences, similarities and differences in sentence patterns, and language features that may affect splitting.
+2. Start Alignment: Based on your analysis, provide 3 different alignment methods for {target_language} subtitles according to the format. The split positions in {src_language} must be consistent with the pre-processed {src_language} split version and cannot be changed arbitrarily.
 3. Evaluation and Selection: Examine and briefly evaluate the 3 schemes, considering factors such as sentence completeness, semantic coherence, and appropriateness of split points.
 4. Best Scheme: Select the best alignment scheme, output only a single number, 1 or 2 or 3.
 
 ### Output Format
 Please complete the following JSON data, where << >> represents placeholders, and return your results in JSON format:
 {{
-    "analysis": "<<analysis of the word order and structure between the original video language and {target_language}>>",
+    "analysis": "<<analysis of the word order and structure between {src_language} and {target_language}>>",
     "align_way_1": [
         {align_parts_json}
     ],
@@ -280,22 +287,24 @@ def get_align_prompt(en_original, target_original, en_part, target_language):
 
     align_parts_json = ','.join(
         f'''
         {{
-            "en_part_{i+1}": "<<{en_splits[i]}>>",
-            "target_part_{i+1}": "<<corresponding {target_language} part>>"
+            "src_part_{i+1}": "<<{src_splits[i]}>>",
+            "target_part_{i+1}": "<<corresponding {TARGET_LANGUAGE} part>>"
         }}''' for i in range(num_parts)
     )
 
     return align_prompt.format(
-        en_original=en_original,
-        target_original=target_original,
-        en_part=en_part,
+        src_language=src_language,
+        target_language=TARGET_LANGUAGE,
+        src_sub=src_sub,
+        tr_sub=tr_sub,
+        src_part=src_part,
         align_parts_json=align_parts_json,
-        target_language=target_language
     )
 
 ## ================================================================
 # @ step9_generate_audio_task.py @ step10_generate_audio.py
 def get_subtitle_trim_prompt(trans_text, duration, fierce_mode = False):
+    src_language = get_whisper_language()
     if not fierce_mode:
         rule = 'Only consider a. Replacing commas with spaces to reduce pause time. b. Reducing filler words without modifying meaningful content. c. Omitting unnecessary modifiers or pronouns, for example "Please explain your thought process" can be shortened to "Please explain thought process"'
     else:
@@ -303,7 +312,7 @@
 
     trim_prompt = '''
 ### Role Definition
-You are a professional subtitle editor, editing and optimizing subtitles before handing them over to voice actors. Your expertise lies in cleverly condensing subtitles while ensuring the original meaning remains intact.
+You are a professional {src_language} subtitle editor, editing and optimizing subtitles before handing them over to voice actors. Your expertise lies in cleverly condensing subtitles while ensuring the original meaning remains intact.
 
 ### Subtitle Data
@@ -327,6 +336,7 @@ def get_subtitle_trim_prompt(trans_text, duration, fierce_mode = False):
 }}
 '''
     return trim_prompt.format(
+        src_language=src_language,
         trans_text=trans_text,
         duration=duration,
         rule=rule
diff --git a/core/spacy_utils/load_nlp_model.py b/core/spacy_utils/load_nlp_model.py
index 3c0f4cc2..26672f33 100644
--- a/core/spacy_utils/load_nlp_model.py
+++ b/core/spacy_utils/load_nlp_model.py
@@ -1,16 +1,62 @@
-import os,sys
+import os,sys,json
 import spacy
 from spacy.cli import download
 sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
-from config import SPACY_NLP_MODEL
+from core.step2_whisperapi import get_whisper_language
+
+def get_spacy_model(language: str):
+    language_map = {
+        "english": "en_core_web_sm",
+        "chinese": "zh_core_web_sm",
+        "spanish": "es_core_news_sm",
+        "french": "fr_core_news_sm",
+        "german": "de_core_news_sm",
+        "italian": "it_core_news_sm",
+        "japanese": "ja_core_news_sm",
+        "portuguese": "pt_core_news_sm",
+        "dutch": "nl_core_news_sm",
+        "greek": "el_core_news_sm",
+        "russian": "ru_core_news_sm",
+        "arabic": "ar_core_news_sm",
+        "hindi": "hi_core_news_sm",
+        "korean": "ko_core_news_sm",
+        "polish": "pl_core_news_sm",
+        "ukrainian": "uk_core_news_sm",
+        "vietnamese": "vi_core_news_sm",
+        "turkish": "tr_core_news_sm",
+        "thai": "th_core_news_sm",
+        "romanian": "ro_core_news_sm",
+        "danish": "da_core_news_sm",
+        "finnish": "fi_core_news_sm",
+        "hungarian": "hu_core_news_sm",
+        "norwegian": "nb_core_news_sm",
+        "swedish": "sv_core_news_sm"
+    }
+
+    model = language_map.get(language.lower(), "en_core_web_sm")
+    if language not in language_map:
+        print(f"Spacy 模型不支持'{language}',使用 en_core_web_sm 模型作为后备选项...")
+    return model
 
 def init_nlp():
-    print(f"⏳ Loading NLP Spacy model: <{SPACY_NLP_MODEL}> ...")
     try:
-        nlp = spacy.load(SPACY_NLP_MODEL)
+        language = get_whisper_language()
+        model = get_spacy_model(language)
+        print(f"⏳ 正在加载 NLP Spacy 模型: <{model}> ...")
+        try:
+            nlp = spacy.load(model)
+        except:
+            print(f"正在下载 {model} 模型...")
+            download(model)
+            nlp = spacy.load(model)
     except:
-        print(f"Downloading {SPACY_NLP_MODEL} model...")
-        download(SPACY_NLP_MODEL)
-        nlp = spacy.load(SPACY_NLP_MODEL)
-    print(f"✅ NLP Spacy model loaded successfully!")
+        print(f"未检测到语言,使用 en_core_web_sm 模型作为后备选项...")
+        model = "en_core_web_sm"
+        try:
+            nlp = spacy.load(model)
+        except:
+            print(f"正在下载 {model} 模型...")
+            download(model)
+            nlp = spacy.load(model)
+    print(f"✅ NLP Spacy 模型加载成功!")
     return nlp
\ No newline at end of file
diff --git a/core/spacy_utils/split_by_comma.py b/core/spacy_utils/split_by_comma.py
index e947191e..e84c6d78 100644
--- a/core/spacy_utils/split_by_comma.py
+++ b/core/spacy_utils/split_by_comma.py
@@ -32,7 +32,7 @@ def split_by_comma(text, nlp):
     start = 0
     for i, token in enumerate(doc):
-        if token.text == ",":
+        if token.text == "," or token.text == ",":
             suitable_for_splitting = analyze_comma(start, doc, token)
             if suitable_for_splitting :
@@ -67,6 +67,7 @@ def split_by_comma_main():
     print("💾 Sentences split by commas saved to → `sentences_by_comma.txt`")
 
 if __name__ == "__main__":
-    # split_by_comma_main()
-    test = "So in the same frame, right there, almost in the exact same spot on the ice, Brown has committed himself, whereas McDavid has not."
-    print(split_by_comma(test))
\ No newline at end of file
+    split_by_comma_main()
+    # nlp = init_nlp()
+    # test = "So in the same frame, right there, almost in the exact same spot on the ice, Brown has committed himself, whereas McDavid has not."
+    # print(split_by_comma(test, nlp))
\ No newline at end of file
diff --git a/core/spacy_utils/split_by_connector.py b/core/spacy_utils/split_by_connector.py
index 4f501744..db1ca60c 100644
--- a/core/spacy_utils/split_by_connector.py
+++ b/core/spacy_utils/split_by_connector.py
@@ -89,14 +89,18 @@ def split_sentences_main():
         split_sentences = split_by_connectors(sentence.strip(), nlp = nlp)
         all_split_sentences.extend(split_sentences)
 
-    # output to sentence_splitbymark.txt
-    with open("output/log/sentence_splitbymark.txt", "w+", encoding="utf-8") as output_file:
+    # output to sentence_splitbynlp.txt
+    with open("output/log/sentence_splitbynlp.txt", "w+", encoding="utf-8") as output_file:
         for sentence in all_split_sentences:
             output_file.write(sentence + "\n")
+        # 最后一行不加换行符
+        output_file.seek(output_file.tell() - 1, os.SEEK_SET)
+        output_file.truncate()
 
-    print("💾 Sentences split by connectors saved to → `sentence_splitbymark.txt`")
+    print("💾 Sentences split by connectors saved to → `sentence_splitbynlp.txt`")
 
 if __name__ == "__main__":
-    # split_sentences_main()
-    a = "and show the specific differences that make a difference between a breakaway that results in a goal in the NHL versus one that doesn't."
-    print(split_by_connectors(a))
\ No newline at end of file
+    split_sentences_main()
+    # nlp = init_nlp()
+    # a = "and show the specific differences that make a difference between a breakaway that results in a goal in the NHL versus one that doesn't."
+    # print(split_by_connectors(a, nlp))
\ No newline at end of file
diff --git a/core/spacy_utils/split_by_mark.py b/core/spacy_utils/split_by_mark.py
index 3c8be6f6..430314a9 100644
--- a/core/spacy_utils/split_by_mark.py
+++ b/core/spacy_utils/split_by_mark.py
@@ -1,15 +1,12 @@
 import warnings
 warnings.filterwarnings("ignore", category=FutureWarning)
-import pandas as pd
 import os,sys
 sys.path.append(os.path.dirname(os.path.abspath(__file__)))
 from load_nlp_model import init_nlp
 
 def split_by_mark():
     nlp = init_nlp()
-    df = pd.read_excel("output/log/cleaned_chunks.xlsx")
-    df['text'] = df['text'].str.strip('"').str.strip()
-    input_text = " ".join(df['text'])
+    input_text = open("output/log/raw_transcript.txt", "r", encoding="utf-8").read()
     doc = nlp(input_text)
     assert doc.has_annotation("SENT_START")
diff --git a/core/step2_whisper_stamped.py b/core/step2_whisper_stamped.py
deleted file mode 100644
index 7216e61d..00000000
--- a/core/step2_whisper_stamped.py
+++ /dev/null
@@ -1,92 +0,0 @@
-import os,sys
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-import subprocess
-import whisper_timestamped as whisper
-import torch
-import pandas as pd
-from typing import List, Dict
-import warnings
-from core.step1_ytdlp import find_video_files
-warnings.filterwarnings("ignore")
-
-def convert_video_to_audio_and_transcribe(input_file: str):
-    from config import WHISPER_MODEL, MODEL_DIR, AUDIO_LANGUAGE
-    # 🎬➡️🎵➡️📊 Convert video to audio and transcribe
-    audio_file = os.path.splitext(input_file)[0] + '_temp.mp3'
-
-    try:
-        if not os.path.exists(audio_file):
-            # Convert video to audio
-            ffmpeg_cmd = [
-                'ffmpeg',
-                '-i', input_file,
-                '-vn',
-                '-acodec', 'libmp3lame',
-                '-ar', '16000',
-                '-b:a', '64k',
-                audio_file
-            ]
-            print(f"🎬➡️🎵 正在转换为音频......")
-            subprocess.run(ffmpeg_cmd, check=True, stderr=subprocess.PIPE)
-            print(f"🎬➡️🎵 已将 <{input_file}> 转换为 <{audio_file}>\n")
-
-        # Check file size
-        if os.path.getsize(audio_file) > 25 * 1024 * 1024:
-            print("⚠️ 文件大小超过25MB。请使用更小的文件。")
-            return None
-
-        # Transcribe audio
-        device = 'cuda:0' if torch.cuda.is_available() else 'cpu' # sadly whisper does not support mps on mac
-        print(f"🚀 正在启动Whisper...\n🖥️ ASR设备: {device}")
-
-        audio = whisper.load_audio(audio_file)
-        os.makedirs(MODEL_DIR, exist_ok=True)
-        model = whisper.load_model(WHISPER_MODEL, device=device, download_root=MODEL_DIR)
-        # result = whisper.transcribe(model, audio, language="en")
-        if AUDIO_LANGUAGE == 'auto':
-            result = whisper.transcribe(model, audio)
-        else:
-            result = whisper.transcribe(model, audio, language=AUDIO_LANGUAGE)
-
-        # Process transcription results
-        all_words: List[Dict[str, float]] = [
-            {'text': f"{word['text']}", 'start': word['start'], 'end': word['end']}
-            for segment in result['segments']
-            for word in segment['words']
-        ]
-
-        df = pd.DataFrame(all_words)
-        return df
-
-    except subprocess.CalledProcessError as e:
-        print(f"❌ 转换 {input_file} 时出错: {e.stderr.decode()}")
-        return None
-    finally:
-        if os.path.exists(audio_file):
-            os.remove(audio_file)
-            print(f"🗑️ 临时音频文件 {audio_file} 已被删除。")
-
-
-def save_results(df: pd.DataFrame):
-    # 💾 Save transcription results as Excel and text files
-    os.makedirs('output', exist_ok=True)
-    os.makedirs('output/log', exist_ok=True)
-    excel_path = os.path.join('output/log', "cleaned_chunks.xlsx")
-    # 给df[text]列都加上"",防止数字被excel自动转换为数字
-    df['text'] = df['text'].apply(lambda x: f'"{x}"')
-    df.to_excel(excel_path, index=False)
-    print(f"📊 Excel文件已保存到 {excel_path}")
-
-def transcript(video_file: StopIteration):
-    if not os.path.exists("output/log/cleaned_chunks.xlsx"):
-        # 🎥➡️📝 Transcribe video to text
-        df = convert_video_to_audio_and_transcribe(video_file)
-        if df is not None:
-            save_results(df)
-    else:
-        print("📊 转录结果已存在,跳过转录步骤。")
-
-if __name__ == "__main__":
-    video_file = find_video_files()
-    print(f"🎬 找到的视频文件: {video_file}, 开始转录...")
-    transcript(video_file)
\ No newline at end of file
diff --git a/core/step2_whisperapi.py b/core/step2_whisperapi.py
new file mode 100644
index 00000000..37efa7e5
--- /dev/null
+++ b/core/step2_whisperapi.py
@@ -0,0 +1,153 @@
+import os,sys
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+import subprocess
+from openai import OpenAI
+import pandas as pd
+from typing import List, Dict
+from uvr5.uvr5_for_videolingo import uvr5_for_videolingo
+import librosa
+import numpy as np
+import json
+
+def convert_video_to_audio(input_file: str):
+    # 🎬➡️🎵 Convert video to audio
+    # audio_file = os.path.splitext(input_file)[0] + '_temp.mp3'
+    os.makedirs('output/audio', exist_ok=True)
+    audio_file = 'output/audio/raw_full_audio.wav'
+
+    if not os.path.exists(audio_file):
+        # Convert video to audio using single line ffmpeg command
+        ffmpeg_cmd = [
+            'ffmpeg',
+            '-i', input_file,
+            '-vn',
+            '-acodec', 'libmp3lame',
+            '-ar', '16000',
+            '-b:a', '64k',
+            audio_file
+        ]
+        print(f"🎬➡️🎵 正在转换为音频......")
+        subprocess.run(ffmpeg_cmd, check=True, stderr=subprocess.PIPE)
+        print(f"🎬➡️🎵 已将 <{input_file}> 转换为 <{audio_file}>\n")
+
+    # Check file size
+    if os.path.getsize(audio_file) > 25 * 1024 * 1024:
+        print("⚠️ 文件大小超过25MB。请使用更小的文件。")
+        return None
+
+    return audio_file
+
+def detect_background_music(audio_file: str, threshold: float = 20.0) -> bool:
+    print(f"🎵➡️🔍 正在检测背景音乐...")
+    y, sr = librosa.load(audio_file)
+    S = np.abs(librosa.stft(y))
+    contrast = librosa.feature.spectral_contrast(S=S, sr=sr)
+    mean_contrast = np.mean(contrast)
+
+    print(f"平均频谱对比度: {mean_contrast}")
+    return mean_contrast > threshold
+
+def uvr5_process(audio_file: str):
+    audio_dir = os.path.dirname(audio_file)
+    audio_name = os.path.basename(audio_file)
+    vocal_file = os.path.join(audio_dir, 'raw_vocal_uvr.wav')
+    bg_file = os.path.join(audio_dir, 'raw_background_uvr.wav')
+    comp_vocal = os.path.join(audio_dir, 'raw_vocal.mp3')
+    comp_bg = os.path.join(audio_dir, 'raw_background.mp3')
+
+    if not os.path.exists(comp_vocal) or not os.path.exists(comp_bg):
+        if not os.path.exists(vocal_file) and detect_background_music(audio_file):
+            print("🎵➡️🎵 正在使用uvr5分离人声和伴奏......")
+            uvr5_for_videolingo(audio_file, save_dir=audio_dir)
+            os.rename(os.path.join(audio_dir, f'vocal_{audio_name}_10.wav'), vocal_file)
+            os.rename(os.path.join(audio_dir, f'instrument_{audio_name}_10.wav'), bg_file)
+        else:
+            print("未检测到明显的背景音乐或已处理,跳过UVR处理。")
+            return audio_file
+
+        for in_file, out_file, type_name in [
+            (vocal_file, comp_vocal, "人声"),
+            (bg_file, comp_bg, "背景")
+        ]:
+            print(f"🎵➡️🗜️ 正在压缩{type_name}音频文件......")
+            subprocess.run([
+                'ffmpeg',
+                '-i', in_file,
+                '-ar', '16000',
+                '-b:a', '64k',
+                out_file
+            ], check=True, stderr=subprocess.PIPE)
+            print(f"🎵➡️🗜️ {type_name}音频文件已压缩: {out_file}")
+            # 删除原始文件
+            os.remove(in_file)
+    else:
+        print("🎵➡️🎵 UVR处理和压缩已完成,跳过处理。")
+
+    return comp_vocal
+
+def transcribe_audio(audio_file: str):
+    from config import WHISPER_API_KEY, BASE_URL
+    print(f"🎵➡️📝 正在转录音频{audio_file}为文本......")
+    client = OpenAI(
+        base_url=BASE_URL+"/v1",
+        api_key=WHISPER_API_KEY
+    )
+
+    audio = open(audio_file, "rb")
+    transcript = client.audio.transcriptions.create(
+        file=audio,
+        model="whisper-1",
+        response_format="verbose_json",
+        timestamp_granularities=["word"]
+    )
+
+    # 保存原始转录文本
+    os.makedirs('output/log', exist_ok=True)
+    with open("output/log/raw_transcript.txt", "w") as f:
+        f.write(transcript.text)
+
+    print(f"🎵➡️📝 转录音频为文本完成,识别语言为: {transcript.language}")
+    with open("output/log/transcript_language.json", "w") as f:
+        json.dump({"language": transcript.language}, f)
+
+    # 处理转录结果
+    all_words: List[Dict[str, float]] = [
+        {'text': f'"{word_info["word"]}"', 'start': round(word_info['start'], 2), 'end': round(word_info['end'], 2)}
+        for word_info in transcript.words
+    ]
+
+    df = pd.DataFrame(all_words)
+
+    # 💾 将转录结果保存为Excel文件
+    excel_path = os.path.join('output/log', "cleaned_chunks.xlsx")
+    df.to_excel(excel_path, index=False)
+    print(f"📊 Excel文件已保存到 {excel_path}")
+
+    return df
+
+def get_whisper_language():
+    try:
+        with open("output/log/transcript_language.json", "r") as f:
+            language = json.load(f)["language"]
+        return language
+    except:
+        print("无法读取语言信息")
+        return None
+
+def transcribe(video_file: str):
+    if not os.path.exists("output/log/cleaned_chunks.xlsx"):
+        # 🎥➡️🎵 将视频转换为音频
+        audio_file = convert_video_to_audio(video_file)
+        if audio_file:
+            #! 暂时保留, uvr5 效果一次不够感觉
+            # vocal_file = uvr5_process(audio_file)
+            # 🎵➡️📝 转录音频为文本并保存结果
+            # transcribe_audio(vocal_file)
+            transcribe_audio(audio_file)
+    else:
+        print("📊 转录结果已存在,跳过转录步骤。")
+
+if __name__ == "__main__":
+    from core.step1_ytdlp import find_video_files
+    video_file = find_video_files()
+    transcribe(video_file)
\ No newline at end of file
diff --git a/core/step3_1_spacy_split.py b/core/step3_1_spacy_split.py
index 022a5021..b410d526 100644
--- a/core/step3_1_spacy_split.py
+++ b/core/step3_1_spacy_split.py
@@ -14,6 +14,4 @@ def split_by_spacy():
     return
 
 if __name__ == '__main__':
-    split_by_spacy()
-
-
+    split_by_spacy()
\ No newline at end of file
diff --git a/core/step3_2_splitbymeaning.py b/core/step3_2_splitbymeaning.py
index c54ceafb..a7507d94 100644
--- a/core/step3_2_splitbymeaning.py
+++ b/core/step3_2_splitbymeaning.py
@@ -88,7 +88,7 @@ def parallel_split_sentences(sentences, max_length, max_workers, retry_attempt=0
 def split_sentences_by_meaning():
     """按意义分割句子的主要函数。"""
     # 读取输入的句子
-    with open('output/log/sentence_splitbymark.txt', 'r', encoding='utf-8') as f:
+    with open('output/log/sentence_splitbynlp.txt', 'r', encoding='utf-8') as f:
         sentences = [line.strip() for line in f.readlines()]
 
     # 🔄 多次处理句子以确保全部被分割
@@ -102,5 +102,5 @@ def split_sentences_by_meaning():
     print('✅ 所有句子已成功分割')
 
 if __name__ == '__main__':
-    print(split_sentence('Which makes no sense to the... average guy who always pushes the character creation slider all the way to the right.', 2, 22))
-    # split_sentences_by_meaning()
\ No newline at end of file
+    # print(split_sentence('Which makes no sense to the... average guy who always pushes the character creation slider all the way to the right.', 2, 22))
+    split_sentences_by_meaning()
\ No newline at end of file
diff --git a/core/step4_1_summarize.py b/core/step4_1_summarize.py
index daf89032..36f22e78 100644
--- a/core/step4_1_summarize.py
+++ b/core/step4_1_summarize.py
@@ -13,7 +13,7 @@ def combine_chunks():
 
 def search_things_to_note_in_prompt(sentence):
     """Search for terms to note in the given sentence"""
-    with open('output/log/translate terminology.json', 'r', encoding='utf-8') as file:
+    with open('output/log/terminology.json', 'r', encoding='utf-8') as file:
         things_to_note = json.load(file)
     things_to_note_list = [term['original'] for term in things_to_note['terms'] if term['original'].lower() in sentence.lower()]
     if things_to_note_list:
@@ -28,15 +28,15 @@ def search_things_to_note_in_prompt(sentence):
     return None
 
 def get_summary():
-    from config import step4_1_summarize_model, TARGET_LANGUAGE
-    English_content = combine_chunks()
-    summary_prompt = get_summary_prompt(English_content, TARGET_LANGUAGE)
+    from config import step4_1_summarize_model
+    src_content = combine_chunks()
+    summary_prompt = get_summary_prompt(src_content)
 
     summary = ask_gpt(summary_prompt, model=step4_1_summarize_model, response_json=True, log_title='summary')
 
-    with open('output/log/translate terminology.json', 'w', encoding='utf-8') as f:
+    with open('output/log/terminology.json', 'w', encoding='utf-8') as f:
         json.dump(summary, f, ensure_ascii=False, indent=4)
 
-    print('💾 Summary log saved to → `output/log/translate terminology.json`')
+    print('💾 Summary log saved to → `output/log/terminology.json`')
 
 if __name__ == '__main__':
     get_summary()
\ No newline at end of file
diff --git a/core/step4_2_translate_all.py b/core/step4_2_translate_all.py
index e2fc17ad..4ee2d06a 100644
--- a/core/step4_2_translate_all.py
+++ b/core/step4_2_translate_all.py
@@ -9,7 +9,7 @@
 # Function to split text into chunks
 def split_chunks_by_chars(chunk_size=600, max_i=12):
     """Split text into chunks based on character count, return a list of multi-line text chunks"""
-    with open("output/log/sentence_splitbymeaning.txt", "r", encoding="utf-8") as file:
+    with open("output/log/sentence_splitbynlp.txt", "r", encoding="utf-8") as file:
         sentences = file.read().strip().split('\n')
 
     chunks = []
@@ -42,17 +42,17 @@ def translate_chunk(chunk, chunks, theme_prompt, i):
 
 # 🚀 Main function to translate all chunks
 def translate_all():
-    from config import MAX_WORKERS
     # Check if the file exists
     if os.path.exists("output/log/translation_results.xlsx"):
-        print("🚨 The file `translation_results.xlsx` already exists, skipping this step.")
+        print("🚨 文件 `translation_results.xlsx` 已经存在,跳过此步。")
         return
 
     chunks = split_chunks_by_chars()
-    with open('output/log/translate terminology.json', 'r', encoding='utf-8') as file:
+    with open('output/log/terminology.json', 'r', encoding='utf-8') as file:
         theme_prompt = json.load(file).get('theme')
 
     # 🔄 Use concurrent execution for translation
+    from config import MAX_WORKERS
     with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
         futures = []
         for i, chunk in enumerate(chunks):
@@ -66,11 +66,11 @@ def translate_all():
     results.sort(key=lambda x: x[0])  # Sort results based on original order
 
     # 💾 Save results to lists and Excel file
-    en_text, trans_text = [], []
+    src_text, trans_text = [], []
     for _, chunk, translation in results:
-        en_text.extend(chunk.split('\n'))
+        src_text.extend(chunk.split('\n'))
         trans_text.extend(translation.split('\n'))
-    pd.DataFrame({'English': en_text, 'Translation': trans_text}).to_excel("output/log/translation_results.xlsx", index=False)
+    pd.DataFrame({'Source': src_text, 'Translation': trans_text}).to_excel("output/log/translation_results.xlsx", index=False)
 
 if __name__ == '__main__':
diff --git a/core/step4_2_translate_once.py b/core/step4_2_translate_once.py
index 725b2adc..054ae000 100644
--- a/core/step4_2_translate_once.py
+++ b/core/step4_2_translate_once.py
@@ -4,25 +4,23 @@
 from core.prompts_storage import generate_shared_prompt, get_prompt_faithfulness, get_prompt_expressiveness
 
 def translate_lines(lines, previous_content_prompt, after_cotent_prompt, things_to_note_prompt, summary_prompt, index = 0):
-    from config import step4_2_translate_direct_model, step4_2_translate_free_model, TARGET_LANGUAGE
-    """Translate multiple lines of English text separated by \n into Chinese, using a three-step translation approach for fidelity, expressiveness, and elegance. Returns multiple lines of translated results."""
+    from config import step4_2_translate_direct_model, step4_2_translate_free_model
     shared_prompt = generate_shared_prompt(previous_content_prompt, after_cotent_prompt, summary_prompt, things_to_note_prompt)
 
     ## Step 1: Faithful to the Original Text
-    prompt1 = get_prompt_faithfulness(lines, shared_prompt,target_language=TARGET_LANGUAGE)
+    prompt1 = get_prompt_faithfulness(lines, shared_prompt)
     faith_result = ask_gpt(prompt1, model=step4_2_translate_direct_model, response_json=True, log_title='translate_faithfulness')
-    print(faith_result)
     for i in faith_result:
-        print(f'📄 Original English: {faith_result[i]["Original English"]}')
-        print(f'📚 Direct Translation: {faith_result[i]["Direct Translation"]}')
+        print(f'📄 Original Subtitle: {faith_result[i]["Original Subtitle"]}')
+        print(f'📚 Direct Translation: {faith_result[i]["Direct Translation"]}')
 
     ## Step 2: Express Smoothly
-    prompt2 = get_prompt_expressiveness(faith_result, lines, shared_prompt,target_language=TARGET_LANGUAGE)
+    prompt2 = get_prompt_expressiveness(faith_result, lines, shared_prompt)
     express_result = ask_gpt(prompt2, model=step4_2_translate_free_model, response_json=True, log_title='translate_expressiveness')
     for i in express_result:
-        print(f'📄 Original English: {express_result[i]["Original English"]}')
-        print(f'🧠 Free Translation: {express_result[i]["Free Translation"]}')
+        print(f'📄 Original Subtitle: {express_result[i]["Original Subtitle"]}')
+        print(f'🧠 Free Translation: {express_result[i]["Free Translation"]}')
 
     translate_result = "\n".join([express_result[i]["Free Translation"].strip() for i in express_result])
 
     if len(lines.split('\n')) != len(translate_result.split('\n')):
diff --git a/core/step5_splitforsub.py b/core/step5_splitforsub.py
index bc87cecd..cfbc7f48 100644
--- a/core/step5_splitforsub.py
+++ b/core/step5_splitforsub.py
@@ -17,63 +17,60 @@ def calc_len(text: str) -> float:
     else:
         return len(text)
 
-def align_subs(en_sub: str, tr_sub: str, en_part: str) -> Tuple[List[str], List[str]]:
-    from config import TARGET_LANGUAGE
-    align_prompt = get_align_prompt(en_sub, tr_sub, en_part, target_language=TARGET_LANGUAGE)
+def align_subs(src_sub: str, tr_sub: str, src_part: str) -> Tuple[List[str], List[str]]:
+    align_prompt = get_align_prompt(src_sub, tr_sub, src_part)
     
     parsed = ask_gpt(align_prompt, model=step5_align_model, response_json=True, log_title='align_subs')
 
     best = int(parsed['best_way'])
     align_data = parsed[f'align_way_{best}']
-    en_parts = en_part.split('\n')
+    src_parts = src_part.split('\n')
     tr_parts = [item[f'target_part_{i+1}'].strip() for i, item in enumerate(align_data)]
 
-    print(f"🔗 Aligned parts:\nSRC_LANG: {en_parts}\nTARGET_LANG: {tr_parts}\n================")
-    return en_parts, tr_parts
+    print(f"🔗 Aligned parts:\nSRC_LANG: {src_parts}\nTARGET_LANG: {tr_parts}\n")
+    return src_parts, tr_parts
 
-def split_align_subs(en_lines: List[str], tr_lines: List[str], max_en_len=80, max_tr_len=30, max_retry=5) -> Tuple[List[str], List[str]]:
+def split_align_subs(src_lines: List[str], tr_lines: List[str], max_retry=5) -> Tuple[List[str], List[str]]:
+    from config import MAX_SRC_LENGTH, MAX_TARGET_LANGUAGE_LENGTH
    for attempt in range(max_retry):
-        print(f"🔄 Splitting attempt {attempt + 1}")
+        print(f"🔄 切割尝试第 {attempt + 1} 次")
         to_split = []
 
-        for i, (en, tr) in enumerate(zip(en_lines, tr_lines)):
-            if len(en) > max_en_len or calc_len(tr) > max_tr_len:
+        for i, (en, tr) in enumerate(zip(src_lines, tr_lines)):
+            if len(en) > MAX_SRC_LENGTH or calc_len(tr) > MAX_TARGET_LANGUAGE_LENGTH:
                 to_split.append(i)
-                print(f"📏 Line {i} needs splitting:\nSRC_LANG: {en}\nTARGET_LANG: {tr}\n================")
+                print(f"📏 第 {i} 行需要切割:\nSRC_LANG: {en}\nTARGET_LANG: {tr}\n")
 
         def process(i):
-            split_en = split_sentence(en_lines[i], num_parts=2).strip()
-            en_lines[i], tr_lines[i] = align_subs(en_lines[i], tr_lines[i], split_en)
+            split_en = split_sentence(src_lines[i], num_parts=2).strip()
+            src_lines[i], tr_lines[i] = align_subs(src_lines[i], tr_lines[i], split_en)
 
-        with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
+        from config import MAX_WORKERS
+        with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
             executor.map(process, to_split)
 
-        # Flatten `en_lines` and `tr_lines`
-        en_lines = [item for sublist in en_lines for item in (sublist if isinstance(sublist, list) else [sublist])]
+        # Flatten `src_lines` and `tr_lines`
+        src_lines = [item for sublist in src_lines for item in (sublist if isinstance(sublist, list) else [sublist])]
         tr_lines = [item for sublist in tr_lines for item in (sublist if isinstance(sublist, list) else [sublist])]
 
-        if all(len(en) <= max_en_len for en in en_lines) and all(calc_len(tr) <= max_tr_len for tr in tr_lines):
+        if all(len(en) <= MAX_SRC_LENGTH for en in src_lines) and all(calc_len(tr) <= MAX_TARGET_LANGUAGE_LENGTH for tr in tr_lines):
             break
 
-    return en_lines, tr_lines
+    return src_lines, tr_lines
 
 def split_for_sub_main():
-    from config import MAX_ENGLISH_LENGTH, MAX_TARGET_LANGUAGE_LENGTH
-
-    # check if "output/log/translation_results_for_subtitles.xlsx" exists
     if os.path.exists("output/log/translation_results_for_subtitles.xlsx"):
-        print("🚨 The file `translation_results_for_subtitles.xlsx` already exists, skipping this step.")
+        print("🚨 文件 `translation_results_for_subtitles.xlsx` 已经存在,跳过此步。")
         return
 
-    print('🚀 Starting subtitle splitting process...')
+    print('🚀 开始字幕分割...')
 
     df = pd.read_excel("output/log/translation_results.xlsx")
-
-    en_lines = df['English'].tolist()
+    src_lines = df['Source'].tolist()
     tr_lines = df['Translation'].tolist()
-
-    en_lines, tr_lines = split_align_subs(en_lines, tr_lines, MAX_ENGLISH_LENGTH, MAX_TARGET_LANGUAGE_LENGTH)
-    pd.DataFrame({'English': en_lines, 'Translation': tr_lines}).to_excel("output/log/translation_results_for_subtitles.xlsx", index=False)
-    print('✅ Subtitle splitting process completed!')
+    src_lines, tr_lines = split_align_subs(src_lines, tr_lines, max_retry=5)
+    pd.DataFrame({'Source': src_lines, 'Translation': tr_lines}).to_excel("output/log/translation_results_for_subtitles.xlsx", index=False)
+    print('✅ 字幕分割完成!')
 
 if __name__ == '__main__':
     split_for_sub_main()
\ No newline at end of file
diff --git a/core/step6_generate_final_timeline.py b/core/step6_generate_final_timeline.py
index e94037dd..999d9567 100644
--- a/core/step6_generate_final_timeline.py
+++ b/core/step6_generate_final_timeline.py
@@ -1,5 +1,6 @@
 import pandas as pd
 import os
+import string
 
 def convert_to_srt_format(start_time, end_time):
     """Convert time (in seconds) to the format: hours:minutes:seconds,milliseconds"""
@@ -18,13 +19,17 @@ def align_timestamp(df_text, df_translate, for_audio = False):
     """Align timestamps and add a new timestamp column to df_translate"""
     df_trans_time = df_translate.copy()
 
-    # Clean text 🧹
-    clean_list = [',', '.']
-    for char in clean_list:
-        df_text['text'] = df_text['text'].str.replace(char, '')
-        df_translate['English'] = df_translate['English'].str.replace(char, '')
+    #! 特殊符号特殊处理
+    # 1. 把df_translate['Source']中每一句的"-"替换为" ",(避免连词)
+    df_translate['Source'] = df_translate['Source'].str.replace('-', ' ')
+    # 2. 所有,和.都替换为" ",然后把"  "替换为" "(避免大数字)
+    df_translate['Source'] = df_translate['Source'].str.replace(',', ' ').str.replace('.', ' ').str.replace('  ', ' ')
+    # 3. 使用string.punctuation删除所有标点符号
+    df_text['text'] = df_text['text'].str.translate(str.maketrans('', '', string.punctuation))
+    df_translate['Source'] = df_translate['Source'].str.translate(str.maketrans('', '', string.punctuation))
+    # 4. 转换为小写
     df_text['text'] = df_text['text'].str.lower()
-    df_translate['English'] = df_translate['English'].str.lower()
+    df_translate['Source'] = df_translate['Source'].str.lower()
 
     # Assign an ID to each word in df_text['text'] and create a new DataFrame
     words = df_text['text'].str.split(expand=True).stack().reset_index(level=1, drop=True).reset_index()
@@ -37,7 +42,7 @@ def align_timestamp(df_text, df_translate, for_audio = False):
 
     line_index = 0
 
-    for line in df_translate['English']:
+    for line in df_translate['Source']:
         line_words = line.split()
         line_word_index = 0
         start_time_id = None
@@ -80,49 +85,47 @@ def align_timestamp(df_text, df_translate, for_audio = False):
     df_trans_time['timestamp'] = df_trans_time['timestamp'].apply(lambda x: convert_to_srt_format(x[0], x[1]))
 
     # Output subtitles 📜
-    en_sub_str = ''.join([f"{i}\n{row['timestamp']}\n{row['English']}\n\n" for i, row in df_trans_time.iterrows()]).strip()
+    src_sub_str = ''.join([f"{i}\n{row['timestamp']}\n{row['Source']}\n\n" for i, row in df_trans_time.iterrows()]).strip()
     trans_sub_str = ''.join([f"{i}\n{row['timestamp']}\n{row['Translation']}\n\n" for i, row in df_trans_time.iterrows()]).strip()
-    en_trans_sub_str = ''.join([f"{i}\n{row['timestamp']}\n{row['English']}\n{row['Translation']}\n\n" for i, row in df_trans_time.iterrows()]).strip()
-    trans_en_sub_str = ''.join([f"{i}\n{row['timestamp']}\n{row['Translation']}\n{row['English']}\n\n" for i, row in df_trans_time.iterrows()]).strip()
+    src_trans_sub_str = ''.join([f"{i}\n{row['timestamp']}\n{row['Source']}\n{row['Translation']}\n\n" for i, row in df_trans_time.iterrows()]).strip()
+    trans_en_sub_str = ''.join([f"{i}\n{row['timestamp']}\n{row['Translation']}\n{row['Source']}\n\n" for i, row in df_trans_time.iterrows()]).strip()
 
     if not for_audio:
         os.makedirs('output', exist_ok=True)
-        with open('output/english_subtitles.srt', 'w', encoding='utf-8') as f:
-            f.write(en_sub_str)
-        with open('output/translated_subtitles.srt', 'w', encoding='utf-8') as f:
+        with open('output/src_subtitles.srt', 'w', encoding='utf-8') as f:
+            f.write(src_sub_str)
+        with open('output/trans_subtitles.srt', 'w', encoding='utf-8') as f:
             f.write(trans_sub_str)
-        with open('output/bilingual_en_trans_subtitles.srt', 'w', encoding='utf-8') as f:
-            f.write(en_trans_sub_str)
-        with open('output/bilingual_trans_en_subtitles.srt', 'w', encoding='utf-8') as f:
+        with open('output/bilingual_src_trans_subtitles.srt', 'w', encoding='utf-8') as f:
+            f.write(src_trans_sub_str)
+        with open('output/bilingual_trans_src_subtitles.srt', 'w', encoding='utf-8') as f:
             f.write(trans_en_sub_str)
     else:
         os.makedirs('output/audio', exist_ok=True)
-        with open('output/audio/english_subtitles_for_audio.srt', 'w', encoding='utf-8') as f:
-            f.write(en_sub_str)
-        with open('output/audio/translated_subtitles_for_audio.srt', 'w', encoding='utf-8') as f:
-            f.write(trans_sub_str
-                    )
+        with open('output/audio/src_subs_for_audio.srt', 'w', encoding='utf-8') as f:
+            f.write(src_sub_str)
+        with open('output/audio/trans_subs_for_audio.srt', 'w', encoding='utf-8') as f:
+            f.write(trans_sub_str)
 
     return df_trans_time
 
 def align_timestamp_main():
     df_text = pd.read_excel('output/log/cleaned_chunks.xlsx')
     df_text['text'] = df_text['text'].str.strip('"').str.strip()
     df_translate = pd.read_excel('output/log/translation_results_for_subtitles.xlsx')
-    df_translate['Translation'] = df_translate['Translation'].apply(lambda x: str(x).strip('。').strip(',') if pd.notna(x) else '')
+    df_translate['Translation'] = df_translate['Translation'].apply(lambda x: str(x).strip('。').strip(',').strip('"') if pd.notna(x) else '')
     # check if there's empty translation
     if (df_translate['Translation'].str.len() == 0).sum() > 0:
-        raise ValueError(r'🚫 Empty translation detected! Please manually check the `output\log\translation_results_for_subtitles.xlsx` then rerun.')
+        raise ValueError(r'🚫 检测到空的翻译行!请手动检查 `output\log\translation_results_for_subtitles.xlsx` 中的空行填充内容,然后重新运行。')
     align_timestamp(df_text, df_translate)
-    print('🎉📝 Subtitles generated successfully! Go check it out inside `output` 👀')
+    print('🎉📝 字幕生成成功!请在 `output` 文件夹中查看 👀')
 
     # for audio
     df_translate_for_audio = pd.read_excel('output/log/translation_results.xlsx')
     df_translate_for_audio['Translation'] = df_translate_for_audio['Translation'].apply(lambda x: str(x).strip('。').strip(','))
     if (df_translate_for_audio['Translation'].str.len() == 0).sum() > 0:
-        raise ValueError(r'🚫 Empty translation detected! Please manually check the `output\log\translation_results.xlsx` then rerun.')
+        raise ValueError(r'🚫 检测到空的翻译行!请手动检查 `output\log\translation_results.xlsx` 中的空行填充内容,然后重新运行。')
     align_timestamp(df_text, df_translate_for_audio, for_audio=True)
-    print('🎉📝 Subtitles for audio generated successfully! Go check it out inside `output/audio` 👀')
-
+    print('🎉📝 音频字幕生成成功!请在 `output/audio` 文件夹中查看 👀')
 
 if __name__ == '__main__':
diff --git a/core/step7_merge_sub_to_vid.py b/core/step7_merge_sub_to_vid.py
index b6dbd143..931a827d 100644
--- a/core/step7_merge_sub_to_vid.py
+++ b/core/step7_merge_sub_to_vid.py
@@ -1,29 +1,27 @@
-import os, glob, subprocess, time, sys
+import os, subprocess, time, sys
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 from core.step1_ytdlp import find_video_files
 
-EN_FONT_SIZE = 16
-TRANS_FONT_SIZE = 18
+SRC_FONT_SIZE = 15
+TRANS_FONT_SIZE = 19
 FONT_NAME = 'Arial'
 TRANS_FONT_NAME = 'Arial'
-EN_FONT_COLOR = '&HFFFFFF'
-EN_OUTLINE_COLOR = '&H000000'
-EN_OUTLINE_WIDTH = 1
-EN_SHADOW_COLOR = '&H80000000'
+SRC_FONT_COLOR = '&HFFFFFF'
+SRC_OUTLINE_COLOR = '&H000000'
+SRC_OUTLINE_WIDTH = 1
+SRC_SHADOW_COLOR = '&H80000000'
 TRANS_FONT_COLOR = '&H00FFFF'
 TRANS_OUTLINE_COLOR = '&H000000'
 TRANS_OUTLINE_WIDTH = 1
 TRANS_BACK_COLOR = '&H33000000'
-
-
 def merge_subtitles_to_video():
     from config import RESOLUTIOM
     TARGET_WIDTH, TARGET_HEIGHT = RESOLUTIOM.split('x')
     ## merge subtitles to video and save the output video
     video_file = find_video_files()
-    en_srt = "output/english_subtitles.srt"
-    trans_srt = "output/translated_subtitles.srt"
+    en_srt = "output/src_subtitles.srt"
+    trans_srt = "output/trans_subtitles.srt"
 
     if not os.path.exists(en_srt) or not os.path.exists(trans_srt):
         print("Subtitle files not found in the 'output' directory.")
@@ -37,9 +35,9 @@ def merge_subtitles_to_video():
             '-vf', (
                 f"scale={TARGET_WIDTH}:{TARGET_HEIGHT}:force_original_aspect_ratio=decrease,"
                 f"pad={TARGET_WIDTH}:{TARGET_HEIGHT}:(ow-iw)/2:(oh-ih)/2,"
-                f"subtitles={en_srt}:force_style='FontSize={EN_FONT_SIZE},FontName={FONT_NAME},"
-                f"PrimaryColour={EN_FONT_COLOR},OutlineColour={EN_OUTLINE_COLOR},OutlineWidth={EN_OUTLINE_WIDTH},"
-                f"ShadowColour={EN_SHADOW_COLOR},BorderStyle=1',"
+                f"subtitles={en_srt}:force_style='FontSize={SRC_FONT_SIZE},FontName={FONT_NAME},"
+                f"PrimaryColour={SRC_FONT_COLOR},OutlineColour={SRC_OUTLINE_COLOR},OutlineWidth={SRC_OUTLINE_WIDTH},"
+                f"ShadowColour={SRC_SHADOW_COLOR},BorderStyle=1',"
                 f"subtitles={trans_srt}:force_style='FontSize={TRANS_FONT_SIZE},FontName={TRANS_FONT_NAME},"
                 f"PrimaryColour={TRANS_FONT_COLOR},OutlineColour={TRANS_OUTLINE_COLOR},OutlineWidth={TRANS_OUTLINE_WIDTH},"
                 f"BackColour={TRANS_BACK_COLOR},Alignment=2,MarginV=25,BorderStyle=4'"
diff --git a/core/step8_extract_refer_audio.py b/core/step8_extract_refer_audio.py
index 070b4744..d6a2e071 100644
--- a/core/step8_extract_refer_audio.py
+++ b/core/step8_extract_refer_audio.py
@@ -33,7 +33,7 @@ def step8_main(input_video):
         print('output/audio/background.wav already exists, skip.')
         return
 
-    with open('output/audio/english_subtitles_for_audio.srt', 'r', encoding='utf-8') as f:
+    with open('output/audio/src_subs_for_audio.srt', 'r', encoding='utf-8') as f:
         srt_content = f.read()
 
     subtitles = parse_srt(srt_content)
diff --git a/core/step9_generate_audio_task.py b/core/step9_generate_audio_task.py
index f4be65e0..aa77be69 100644
--- a/core/step9_generate_audio_task.py
+++ b/core/step9_generate_audio_task.py
@@ -6,7 +6,6 @@
 from core.ask_gpt import ask_gpt
 from core.prompts_storage import get_subtitle_trim_prompt
 
-
 def check(text, duration, max_chars_per_second=8):
     # 定义标点符号列表
     punctuations = ',,。!?:;"()《》【】'
@@ -85,12 +84,12 @@ def process_srt(file_path):
                 df.loc[i, 'text'] += ', ' + df.loc[i+1, 'text']
                 df.loc[i, 'end_time'] = df.loc[i+1, 'end_time']
                 df.loc[i, 'duration'] = (datetime.datetime.combine(datetime.date.today(), df.loc[i, 'end_time']) -
-                                        datetime.datetime.combine(datetime.date.today(), df.loc[i, 'start_time'])).total_seconds()
+                                         datetime.datetime.combine(datetime.date.today(), df.loc[i, 'start_time'])).total_seconds()
                 df = df.drop(i+1).reset_index(drop=True)
             else:
                 print(f"延长字幕 {i+1} 的持续时间到{MIN_SUBTITLE_DURATION}秒")
                 df.loc[i, 'end_time'] = (datetime.datetime.combine(datetime.date.today(), df.loc[i, 'start_time']) +
-                                        datetime.timedelta(seconds=MIN_SUBTITLE_DURATION)).time()
+                                         datetime.timedelta(seconds=MIN_SUBTITLE_DURATION)).time()
                 df.loc[i, 'duration'] = MIN_SUBTITLE_DURATION
                 i += 1
         else:
diff --git a/install.py b/install.py
index 8c972c0f..fa9641c9 100644
--- a/install.py
+++ b/install.py
@@ -5,7 +5,6 @@
 import zipfile
 import shutil
 sys.path.append(os.path.dirname(os.path.abspath(__file__)))
-from config import SPACY_NLP_MODEL, WHISPER_MODEL
 
 def install_package(*packages):
     subprocess.check_call([sys.executable, "-m", "pip", "install", *packages])
@@ -50,14 +49,14 @@ def install_torch(gpu_available):
             print("检测到GPU。正在安装支持CUDA的PyTorch...")
             subprocess.check_call([sys.executable, "-m", "pip", "install", "torch", "torchvision", "torchaudio", "--index-url", "https://download.pytorch.org/whl/cu118"])
         else:
-            print("未检测到GPU。正在安装不支持CUDA的PyTorch...")
+            print("正在安装cpu版本的PyTorch...")
             install_package("torch", "torchvision", "torchaudio")
     elif platform.system() == "Darwin":  # macOS
         if "arm" in platform.processor().lower():
             print("检测到Apple Silicon。正在安装支持MLX (Metal)的PyTorch...")
             subprocess.check_call([sys.executable, "-m", "pip", "install", "--pre", "torch", "torchvision", "torchaudio", "--extra-index-url", "https://download.pytorch.org/whl/nightly/cpu"])
         else:
-            print("正在为macOS安装不支持MLX的PyTorch...")
+            print("正在为macOS安装cpu版本的PyTorch...")
             install_package("torch", "torchvision", "torchaudio")
     elif platform.system() == "Linux":
         if gpu_available:
@@ -77,16 +76,6 @@ def install_requirements():
     else:
         print("未找到requirements.txt。跳过安装。")
 
-def download_spacy_model():
-    """Download the specified spaCy model."""
-    import spacy
-    from spacy.cli import download
-    try:
-        spacy.load(SPACY_NLP_MODEL)
-    except:
-        print(f"正在下载{SPACY_NLP_MODEL}模型...")
-        download(SPACY_NLP_MODEL)
-
 def dowanload_uvr_model():
     """Download the specified uvr model."""
     if not os.path.exists("_model_cache/uvr5_weights/HP2_all_vocals.pth"):
@@ -175,7 +164,6 @@ def download_and_extract_ffmpeg():
         ffmpeg_exe = "ffmpeg"
         url = "https://johnvansickle.com/ffmpeg/builds/ffmpeg-git-amd64-static.tar.xz"
     else:
-        print("FFmpeg下载仅支持Windows和macOS。")
         return
 
     if os.path.exists(ffmpeg_exe):
@@ -219,42 +207,34 @@ def download_and_extract_ffmpeg():
             print("下载FFmpeg失败")
 
 def main():
-    print("开始安装喽...")
+    print("开始安装...")
 
     # Install requests first
     install_package("requests")
 
     # Check GPU availability
     gpu_available = check_gpu()
-
+    print(f"GPU 可用: {gpu_available}")
+    if gpu_available:
+        if_gpu = input("是否安装GPU版本的PyTorch? (注意:Windows 下安装 GPU 版本需要额外安装 Cmake 及 Visual Studio, 详情见 Github 主页) (y/n): ")
+        gpu_available = if_gpu.lower() == 'y'
     # Install PyTorch
     install_torch(gpu_available)
 
     # Install other requirements
     install_requirements()
 
-    # Download nltk for sovits
-    import nltk
-    nltk.download('averaged_perceptron_tagger_eng')
+    #! 暂时停用配音功能
+    # # Download nltk for sovits
+    # import nltk
+    # nltk.download('averaged_perceptron_tagger_eng')
 
-    # Install spaCy model
-    install_package("spacy")
-    download_spacy_model()
-
-    # Download UVR model
-    dowanload_uvr_model()
+    # # Download UVR model
+    # dowanload_uvr_model()
 
-    # Download GPT-SoVITS model
-    download_sovits_model()
-    download_huanyu_model() # custom model
-
-    # Download Whisper model .pt
-    import torch
-    import whisper_timestamped as whisper
-    MODEL_DIR = "./_model_cache"
-    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
-    os.makedirs(MODEL_DIR, exist_ok=True)
-    model = whisper.load_model(WHISPER_MODEL, device=device, download_root=MODEL_DIR)
+    # # Download GPT-SoVITS model
+    # download_sovits_model()
+    # download_huanyu_model() # custom model
 
     # Download and extract FFmpeg
     download_and_extract_ffmpeg()
diff --git a/requirements.txt b/requirements.txt
index 7a2c8b41..a639871b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,7 +5,6 @@ jupyter
 openai
 openpyxl
 spacy
-whisper_timestamped
 yt-dlp
 pyyaml
 pydub
@@ -14,37 +13,31 @@ json-repair
 resampy
 streamlit
 
-# sovits
-requests
-pydub
-pydantic
-soundfile
-flash-attention
-numpy==1.26.4
-scipy
-tensorboard
-numba
-pytorch-lightning
-gradio>=4.29
-gradio_client
-ffmpeg-python
-onnxruntime
-tqdm
-funasr==1.0.0
-cn2an
-pypinyin
-pyopenjtalk
-g2p_en
-torchaudio
-modelscope==1.10.0
-sentencepiece
-transformers
-chardet
-PyYAML
-psutil
-jieba_fast
-jieba
-LangSegment>=0.3.1
-wordsegment
-srt
-pyloudnorm
\ No newline at end of file
+#! sovits依赖项 暂时停用配音功能,开发好后请更改此处及 install.py
+# pydantic
+# soundfile
+# flash-attention
+# numpy==1.26.4
+# scipy
+# tensorboard
+# numba
+# pytorch-lightning
+# ffmpeg-python
+# onnxruntime
+# tqdm
+# funasr==1.0.0
+# cn2an
+# pypinyin
+# pyopenjtalk
+# g2p_en
+# modelscope==1.10.0
+# sentencepiece
+# transformers
+# chardet
+# psutil
+# jieba_fast
+# jieba
+# LangSegment>=0.3.1
+# wordsegment
+# srt
+# pyloudnorm
\ No newline at end of file
diff --git a/st.py b/st.py
index ba9e660b..d40e283e 100644
--- a/st.py
+++ b/st.py
@@ -16,8 +16,8 @@ def text_processing_section():

该阶段包括以下步骤:

- 1. Whisper语音转录
- 2. Spacy 和 llm 分割句子
+ 1. Whisper 单词级转录
+ 2. Spacy 和 Claude 分割句子
3. 总结和多步翻译
4. 切割对齐长字幕
5. 生成时间轴和字幕
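Step 1 above is the core change in this diff: `process_text` (next hunk) now calls `step2_whisperapi.transcribe` instead of running `whisper_timestamped` locally, which is why the Whisper model download disappears from install.py and `WHISPER_API_KEY` appears in config.py. The module itself is not included in this diff; a minimal sketch of what an OpenAI-compatible word-level transcription call can look like, where the helper name and file paths are illustrative only:

```python
# Hypothetical sketch only: step2_whisperapi is not shown in this diff.
# Only WHISPER_API_KEY and BASE_URL come from config.py; paths are examples,
# and extracting audio from the input video (e.g. via ffmpeg) is omitted.
import json
from openai import OpenAI

from config import WHISPER_API_KEY, BASE_URL

def transcribe(audio_path: str, out_path: str = "output/transcript.json"):
    client = OpenAI(api_key=WHISPER_API_KEY, base_url=BASE_URL)
    with open(audio_path, "rb") as audio:
        result = client.audio.transcriptions.create(
            model="whisper-1",
            file=audio,
            response_format="verbose_json",    # also returns detected language
            timestamp_granularities=["word"],  # word-level timestamps
        )
    # result.words carries {word, start, end} entries, and result.language
    # records the detected source language for later pipeline steps.
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(result.model_dump(), f, ensure_ascii=False, indent=2)
    return result
```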
@@ -45,7 +45,8 @@ def process_text():
         video_file = step1_ytdlp.find_video_files()
         with st.spinner("使用Whisper进行转录..."):
-            step2_whisper_stamped.transcript(video_file)
+            # step2_whisper_stamped.transcript(video_file)
+            step2_whisperapi.transcribe(video_file)
         with st.spinner("分割长句..."):
             step3_1_spacy_split.split_by_spacy()
             step3_2_splitbymeaning.split_sentences_by_meaning()
@@ -112,8 +113,9 @@ def main():
         st.markdown(give_star_button, unsafe_allow_html=True)
         download_video_section(cloud)
         text_processing_section()
-        if not cloud:
-            audio_processing_section()
+        st.warning("配音功能仍在开发中,暂已停用,感谢理解!")
+        # if not cloud:
+        #     audio_processing_section()
 
 if __name__ == "__main__":
     main()
\ No newline at end of file
diff --git a/st_components/download_video_section.py b/st_components/download_video_section.py
index 45035c36..0c235014 100644
--- a/st_components/download_video_section.py
+++ b/st_components/download_video_section.py
@@ -2,6 +2,7 @@
 import os, sys, shutil
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 from core.step1_ytdlp import download_video_ytdlp, find_video_files
+from time import sleep
 
 def download_video_section(cloud):
     title1 = "上传视频 " if cloud else "下载或上传视频"
@@ -14,6 +15,7 @@
             os.remove(video_file)
             if os.path.exists("output"):
                 shutil.rmtree("output")
+            sleep(0.5)
             st.rerun()
             return True
         except:
@@ -25,7 +27,7 @@
                 download_video_ytdlp(url)
                 st.rerun()
         from config import ALLOWED_VIDEO_FORMATS
-        uploaded_file = st.file_uploader("或上传视频 <30min", type=ALLOWED_VIDEO_FORMATS)
+        uploaded_file = st.file_uploader("或上传视频 建议<40min", type=ALLOWED_VIDEO_FORMATS)
         if uploaded_file:
             os.makedirs("output", exist_ok=True)
             # 视频写入output文件夹
diff --git a/st_components/imports_and_utils.py b/st_components/imports_and_utils.py
index 4e255880..dd86b2b3 100644
--- a/st_components/imports_and_utils.py
+++ b/st_components/imports_and_utils.py
@@ -1,6 +1,6 @@
 import os, sys
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-from core import step1_ytdlp, step2_whisper_stamped, step3_1_spacy_split, step3_2_splitbymeaning
+from core import step1_ytdlp, step2_whisperapi, step3_1_spacy_split, step3_2_splitbymeaning
 from core import step4_1_summarize, step4_2_translate_all, step5_splitforsub, step6_generate_final_timeline
 from core import step7_merge_sub_to_vid, step8_extract_refer_audio, step9_generate_audio_task
 from core import step10_generate_audio, step11_merge_audio_to_vid
diff --git a/st_components/sidebar_setting.py b/st_components/sidebar_setting.py
index 8877408e..3bcc5ab9 100644
--- a/st_components/sidebar_setting.py
+++ b/st_components/sidebar_setting.py
@@ -21,36 +21,33 @@ def page_setting():
     st.header("LLM 配置")
-    api_key = st.text_input("API_key", value=config.API_KEY)
-    base_url = st.text_input("Base_url", value=config.BASE_URL)
-    models = st.text_input("Model", value=','.join(config.MODEL))
+    api_key = st.text_input("API_KEY", value=config.API_KEY)
+    whisper_api_key = st.text_input("WHISPER_API_KEY(纯 AZ 渠道)", value=config.WHISPER_API_KEY)
+    base_url = st.text_input("BASE_URL", value=config.BASE_URL)
+    models = st.text_input("MODEL", value=','.join(config.MODEL))
     if api_key != config.API_KEY:
         changes["API_KEY"] = api_key
+    if whisper_api_key != config.WHISPER_API_KEY:
+        changes["WHISPER_API_KEY"] = whisper_api_key
     if base_url != config.BASE_URL:
         changes["BASE_URL"] = base_url
     if models.split(',') != config.MODEL:
         changes["MODEL"] = models.split(',')
 
     st.header("字幕设置")
-    cols_audio = st.columns(2)
-    with cols_audio[0]:
-        audio_language = st.radio("whisper 识别语言:", options=["auto", "en"], index=0 if config.AUDIO_LANGUAGE == "auto" else 1)
-        if audio_language != config.AUDIO_LANGUAGE:
-            changes["AUDIO_LANGUAGE"] = audio_language
-    with cols_audio[1]:
-        target_language = st.text_input("翻译目标语言:", value=config.TARGET_LANGUAGE)
-        if target_language != config.TARGET_LANGUAGE:
-            changes["TARGET_LANGUAGE"] = target_language
+    target_language = st.text_input("翻译目标语言:", value=config.TARGET_LANGUAGE)
+    if target_language != config.TARGET_LANGUAGE:
+        changes["TARGET_LANGUAGE"] = target_language
 
     st.write("每行字幕最大字符数:")
-    col1, col2 = st.columns(2)
-    with col1:
-        max_english_length = st.number_input("英文:", value=config.MAX_ENGLISH_LENGTH)
-        if max_english_length != config.MAX_ENGLISH_LENGTH:
-            changes["MAX_ENGLISH_LENGTH"] = int(max_english_length)
+    cols_sub = st.columns(2)
+    with cols_sub[0]:
+        max_src_length = st.number_input("原字幕:", value=config.MAX_SRC_LENGTH)
+        if max_src_length != config.MAX_SRC_LENGTH:
+            changes["MAX_SRC_LENGTH"] = int(max_src_length)
-    with col2:
-        max_target_language_length = st.number_input("翻译:", value=config.MAX_TARGET_LANGUAGE_LENGTH)
+    with cols_sub[1]:
+        max_target_language_length = st.number_input("翻译字幕:", value=config.MAX_TARGET_LANGUAGE_LENGTH)
         if max_target_language_length != config.MAX_TARGET_LANGUAGE_LENGTH:
             changes["MAX_TARGET_LANGUAGE_LENGTH"] = int(max_target_language_length)
@@ -63,10 +60,12 @@ def page_setting():
     if resolution != config.RESOLUTIOM:
         changes["RESOLUTIOM"] = resolution
 
-    st.header("SoVITS 角色配置")
-    dubbing_character = st.text_input("配音角色:", value=config.DUBBING_CHARACTER)
-    if dubbing_character != config.DUBBING_CHARACTER:
-        changes["DUBBING_CHARACTER"] = dubbing_character
+
+    #! 配音功能仍在开发中,暂已停用,感谢理解!
+    # st.header("SoVITS 角色配置")
+    # dubbing_character = st.text_input("配音角色:", value=config.DUBBING_CHARACTER)
+    # if dubbing_character != config.DUBBING_CHARACTER:
+    #     changes["DUBBING_CHARACTER"] = dubbing_character
 
     if changes:
         st.toast("记得点击下方的'保存设置'按钮", icon="🔔")
diff --git "a/\344\270\200\351\224\256\345\220\257\345\212\250.command" "b/\344\270\200\351\224\256\345\220\257\345\212\250.command"
new file mode 100755
index 00000000..a888d67e
--- /dev/null
+++ "b/\344\270\200\351\224\256\345\220\257\345\212\250.command"
@@ -0,0 +1,5 @@
+#!/bin/bash
+cd "$(dirname "$0")"
+source .venv/bin/activate
+echo "初次启动会比较慢,请耐心等待..."
+streamlit run st.py
\ No newline at end of file
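`page_setting` only accumulates edited values into the `changes` dict and toasts a reminder to press the save button; the code that actually persists them is outside this diff. A minimal sketch of the kind of helper such a button might call, where the name `save_changes` and the line-rewriting strategy are assumptions rather than project code:

```python
# Hypothetical sketch: rewrites `KEY = value` assignments in config.py from
# the sidebar's `changes` dict. Keys not found in the file are left untouched.
import re

def save_changes(changes: dict, config_path: str = "config.py") -> None:
    with open(config_path, "r", encoding="utf-8") as f:
        lines = f.readlines()
    for key, value in changes.items():
        pattern = re.compile(rf"^{re.escape(key)}\s*=")
        for i, line in enumerate(lines):
            if pattern.match(line):
                lines[i] = f"{key} = {value!r}\n"  # repr keeps strings quoted
                break
    with open(config_path, "w", encoding="utf-8") as f:
        f.writelines(lines)

# Example: save_changes({"MAX_SRC_LENGTH": 80, "WHISPER_API_KEY": "sk-xxx"})
```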