From 04d8dc726e90883c851c4d43cb662daf12a351c6 Mon Sep 17 00:00:00 2001 From: baxtree Date: Fri, 16 Apr 2021 09:36:09 +0100 Subject: [PATCH 1/9] handle file paths containing whitespace --- subaligner/_version.py | 2 +- subaligner/media_helper.py | 20 ++++++++++---------- subaligner/utils.py | 17 +++++++++-------- tests/integration/feature/subaligner.feature | 6 ++++++ tests/integration/radish/step.py | 8 ++++---- tests/subaligner/resource/test spaced.mp4 | 1 + tests/subaligner/resource/test spaced.vtt | 1 + tests/subaligner/test_utils.py | 2 +- 8 files changed, 33 insertions(+), 24 deletions(-) create mode 120000 tests/subaligner/resource/test spaced.mp4 create mode 120000 tests/subaligner/resource/test spaced.vtt diff --git a/subaligner/_version.py b/subaligner/_version.py index ad394de..685b1c8 100644 --- a/subaligner/_version.py +++ b/subaligner/_version.py @@ -1,2 +1,2 @@ """The semver for the current release.""" -__version__ = "0.1.3" +__version__ = "0.1.4" diff --git a/subaligner/media_helper.py b/subaligner/media_helper.py index 11ad93c..a36d9df 100644 --- a/subaligner/media_helper.py +++ b/subaligner/media_helper.py @@ -6,6 +6,7 @@ import shutil import atexit import signal +import shlex from typing import Optional, Tuple, List from copy import deepcopy @@ -71,16 +72,16 @@ def extract_audio(video_file_path, decompress: bool = False, freq: int = 16000) ) command = ( - "{0} -y -xerror -i {1} -ac 2 -ar {2} -vn {3}".format( + "{0} -y -xerror -i '{1}' -ac 2 -ar {2} -vn '{3}'".format( MediaHelper.FFMPEG_BIN, video_file_path, freq, audio_file_path ) if decompress - else "{0} -y -xerror -i {1} -vn -acodec copy {2}".format( + else "{0} -y -xerror -i '{1}' -vn -acodec copy '{2}'".format( MediaHelper.FFMPEG_BIN, video_file_path, audio_file_path ) ) with subprocess.Popen( - command.split(), + shlex.split(command), shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE, @@ -184,16 +185,16 @@ def extract_audio_from_start_to_end(audio_file_path: str, start: str, end: 
Optio if end is not None: duration = MediaHelper.get_duration_in_seconds(start, end) - command = "{0} -y -xerror -i {1} -ss {2} -t {3} -acodec copy {4}".format( + command = "{0} -y -xerror -i '{1}' -ss {2} -t {3} -acodec copy '{4}'".format( MediaHelper.FFMPEG_BIN, audio_file_path, start, duration, segment_path ) else: - command = "{0} -y -xerror -i {1} -ss {2} -acodec copy {3}".format( + command = "{0} -y -xerror -i '{1}' -ss {2} -acodec copy '{3}'".format( MediaHelper.FFMPEG_BIN, audio_file_path, start, segment_path ) with subprocess.Popen( - command, - shell=True, + shlex.split(command), + shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True, @@ -319,7 +320,7 @@ def get_frame_rate(file_path: str) -> float: """ with subprocess.Popen( - "{0} -i {1} -t 00:00:10 -f null /dev/null".format(MediaHelper.FFMPEG_BIN, file_path).split(), + shlex.split("{0} -i '{1}' -t 00:00:10 -f null /dev/null".format(MediaHelper.FFMPEG_BIN, file_path)), shell=False, stderr=subprocess.PIPE, close_fds=True, @@ -339,8 +340,7 @@ def get_frame_rate(file_path: str) -> float: try: std_out, std_err = process.communicate(timeout=MediaHelper.__CMD_TIME_OUT) if process.returncode != 0: - MediaHelper.__LOGGER.warning("[{}-{}] Cannot extract the frame rate from video: {}\n{}" - .format(threading.current_thread().name, process.pid, file_path, std_err)) + MediaHelper.__LOGGER.warning("[{}-{}] Cannot extract the frame rate from video: {}\n{}".format(threading.current_thread().name, process.pid, file_path, std_err)) raise NoFrameRateException( "Cannot extract the frame rate from video: {}".format(file_path) ) diff --git a/subaligner/utils.py b/subaligner/utils.py index eae0235..fd06699 100644 --- a/subaligner/utils.py +++ b/subaligner/utils.py @@ -4,6 +4,7 @@ import requests import shutil import cchardet +import shlex from pycaption import ( CaptionConverter, @@ -84,7 +85,7 @@ def srt2vtt(srt_file_path: str, vtt_file_path: Optional[str] = None, timeout_sec _vtt_file_path = 
srt_file_path.replace(".srt", ".vtt") if vtt_file_path is None else vtt_file_path encoding = Utils.detect_encoding(srt_file_path) - command = "{0} -y -sub_charenc {1} -i {2} -f webvtt {3}".format(Utils.FFMPEG_BIN, encoding, srt_file_path, _vtt_file_path) + command = "{0} -y -sub_charenc {1} -i '{2}' -f webvtt '{3}'".format(Utils.FFMPEG_BIN, encoding, srt_file_path, _vtt_file_path) timeout_msg = "Timeout on converting SubRip to WebVTT: {}".format(srt_file_path) error_msg = "Cannot convert SubRip to WebVTT: {}".format(srt_file_path) @@ -111,7 +112,7 @@ def vtt2srt(vtt_file_path: str, srt_file_path: Optional[str] = None, timeout_sec _srt_file_path = vtt_file_path.replace(".vtt", ".srt") if srt_file_path is None else srt_file_path encoding = Utils.detect_encoding(vtt_file_path) - command = "{0} -y -sub_charenc {1} -i {2} -f srt {3}".format(Utils.FFMPEG_BIN, encoding, vtt_file_path, _srt_file_path) + command = "{0} -y -sub_charenc {1} -i '{2}' -f srt '{3}'".format(Utils.FFMPEG_BIN, encoding, vtt_file_path, _srt_file_path) timeout_msg = "Timeout on converting WebVTT to SubRip: {}".format(vtt_file_path) error_msg = "Cannot convert WebVTT to SubRip: {}".format(vtt_file_path) @@ -400,7 +401,7 @@ def sbv2srt(sbv_file_path: str, srt_file_path: Optional[str] = None) -> None: caption.encoding = encoding if srt_file_path is None: - srt_file_path = srt_file_path.replace(".sbv", ".srt") + srt_file_path = sbv_file_path.replace(".sbv", ".srt") with open(srt_file_path, "w") as file: srt_writer = SrtWriter(file, captions) @@ -468,7 +469,7 @@ def ytt2srt(transcript_file_path: str, srt_file_path: Optional[str] = None) -> N caption.encoding = encoding if srt_file_path is None: - srt_file_path = srt_file_path.replace(".ytt", ".srt") + srt_file_path = transcript_file_path.replace(".ytt", ".srt") with open(srt_file_path, "w") as file: srt_writer = SrtWriter(file, captions) @@ -488,7 +489,7 @@ def extract_teletext_as_subtitle(ts_file_path: str, page_num: int, output_file_p timeout_secs {int} 
-- The timeout in seconds on extraction {default: 30}. """ - command = "{0} -y -fix_sub_duration -txt_page {1} -txt_format text -i {2} {3}".format(Utils.FFMPEG_BIN, page_num, ts_file_path, output_file_path) + command = "{0} -y -fix_sub_duration -txt_page {1} -txt_format text -i '{2}' '{3}'".format(Utils.FFMPEG_BIN, page_num, ts_file_path, output_file_path) timeout_msg = "Timeout on extracting Teletext from transport stream: {} on page: {}".format(ts_file_path, page_num) error_msg = "Cannot extract Teletext from transport stream: {} on page: {}".format(ts_file_path, page_num) @@ -514,7 +515,7 @@ def extract_matroska_subtitle(mkv_file_path: str, stream_index: int, output_file timeout_secs {int} -- The timeout in seconds on extraction {default: 30}. """ - command = "{0} -y -i {1} -map 0:s:{2} {3}".format(Utils.FFMPEG_BIN, mkv_file_path, stream_index, output_file_path) + command = "{0} -y -i '{1}' -map 0:s:{2} '{3}'".format(Utils.FFMPEG_BIN, mkv_file_path, stream_index, output_file_path) timeout_msg = "Timeout on extracting the subtitle from file: {} with stream index: {}".format(mkv_file_path, stream_index) error_msg = "Cannot extract the subtitle from file: {} with stream index: {}".format(mkv_file_path, stream_index) @@ -566,7 +567,7 @@ def contains_embedded_subtitles(video_file_path: str, timeout_secs: int = 30) -> bool -- True if the video contains embedded subtitles or False otherwise. 
""" - command = "{0} -y -i {1} -c copy -map 0:s -f null - -v 0 -hide_banner".format(Utils.FFMPEG_BIN, video_file_path) + command = "{0} -y -i '{1}' -c copy -map 0:s -f null - -v 0 -hide_banner".format(Utils.FFMPEG_BIN, video_file_path) timeout_msg = "Timeout on detecting embedded subtitles from file: {}".format(video_file_path) error_msg = "Embedded subtitle detection failed for file: {}".format(video_file_path) @@ -614,7 +615,7 @@ def __convert_subtitle(source_file_path: str, source_ext: str, target_file_path: @staticmethod def _run_command(command: str, timeout_secs: int, timeout_msg: str, error_msg: str, callback: Callable[[int, str], Any]) -> Any: with subprocess.Popen( - command.split(), + shlex.split(command), shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE, diff --git a/tests/integration/feature/subaligner.feature b/tests/integration/feature/subaligner.feature index ef534d3..66c16d7 100644 --- a/tests/integration/feature/subaligner.feature +++ b/tests/integration/feature/subaligner.feature @@ -189,6 +189,12 @@ Feature: Subaligner CLI | subaligner | single | | subaligner | dual | + Scenario: Test alignments with the file path containing whitespace ([] == " ") + Given I have a video file "test[]spaced.mp4" + And I have a subtitle file "test[]spaced.vtt" + When I run the alignment with subaligner on them with dual stage + Then a new subtitle file "test[]spaced_aligned.vtt" is generated + @exception Scenario Outline: Test errors out on unsupported subtitle input Given I have a video file "test.mp4" diff --git a/tests/integration/radish/step.py b/tests/integration/radish/step.py index 7397987..dd6e402 100644 --- a/tests/integration/radish/step.py +++ b/tests/integration/radish/step.py @@ -15,7 +15,7 @@ def video_file(step, file_name): if file_name.lower().startswith("http"): step.context.video_file_path = file_name else: - step.context.video_file_path = os.path.join(PWD, "..", "..", "subaligner", "resource", file_name) + step.context.video_file_path = 
os.path.join(PWD, "..", "..", "subaligner", "resource", file_name).replace("[]", " ") @given('I have a subtitle file "{file_name:S}"') @@ -23,7 +23,7 @@ def subtitle_file(step, file_name): if file_name.lower().startswith("http"): step.context.subtitle_path_or_selector = file_name else: - step.context.subtitle_path_or_selector = os.path.join(PWD, "..", "..", "subaligner", "resource", file_name) + step.context.subtitle_path_or_selector = os.path.join(PWD, "..", "..", "subaligner", "resource", file_name).replace("[]", " ") @given('I have selector "{selector:S}" for the embedded subtitle') @@ -133,7 +133,7 @@ def run_subaligner_with_custom_model(step, aligner, mode): @then('a new subtitle file "{file_name:S}" is generated') def expect_result(step, file_name): - output_file_path = os.path.join(PWD, "..", "..", "subaligner", "resource", file_name) + output_file_path = os.path.join(PWD, "..", "..", "subaligner", "resource", file_name.replace("[]", " ")) assert step.context.exit_code == 0 assert os.path.isfile(output_file_path) is True @@ -194,7 +194,7 @@ def unsupported_subtitle(step): @given("I have an unsupported video file") def unsupported_video(step): - step.context.video_file_path = os.path.join(PWD, "..", "..", "subaligner", "resource", "unsupported") + step.context.video_file_path = os.path.join(PWD, "..", "..", "subaligner", "resource", "unsupported").replace("[]", " ") @given('I have an audiovisual file directory "{av_dir:S}"') diff --git a/tests/subaligner/resource/test spaced.mp4 b/tests/subaligner/resource/test spaced.mp4 new file mode 120000 index 0000000..954f241 --- /dev/null +++ b/tests/subaligner/resource/test spaced.mp4 @@ -0,0 +1 @@ +test.mp4 \ No newline at end of file diff --git a/tests/subaligner/resource/test spaced.vtt b/tests/subaligner/resource/test spaced.vtt new file mode 120000 index 0000000..96a0225 --- /dev/null +++ b/tests/subaligner/resource/test spaced.vtt @@ -0,0 +1 @@ +test.vtt \ No newline at end of file diff --git 
a/tests/subaligner/test_utils.py b/tests/subaligner/test_utils.py index 57f1305..1e9ad73 100644 --- a/tests/subaligner/test_utils.py +++ b/tests/subaligner/test_utils.py @@ -245,7 +245,7 @@ def test_ytt2srt(self): def test_extract_teletext_as_srt(self, mocked_run_command): Undertest.extract_teletext_as_subtitle("ts_file_path", 888, "srt_file_path") - mocked_run_command.assert_called_once_with("ffmpeg -y -fix_sub_duration -txt_page 888 -txt_format text -i {} {}".format("ts_file_path", "srt_file_path"), ANY, ANY, ANY, ANY) + mocked_run_command.assert_called_once_with("ffmpeg -y -fix_sub_duration -txt_page 888 -txt_format text -i {} {}".format("'ts_file_path'", "'srt_file_path'"), ANY, ANY, ANY, ANY) def test_extract_matroska_subtitle(self): output_file_path = os.path.join(self.resource_tmp, "extracted.matroska.srt") From 64d3cf462be980a9e25843a6bdccae736110e822 Mon Sep 17 00:00:00 2001 From: baxtree Date: Mon, 10 May 2021 09:28:34 +0100 Subject: [PATCH 2/9] support subtitle translation during alignment and update docs --- README.md | 11 +- requirements-app.txt | 3 + requirements.txt | 3 + site/source/acknowledgement.rst | 1 + site/source/advanced_usage.rst | 15 +++ site/source/conf.py | 3 +- site/source/usage.rst | 13 ++- subaligner/__main__.py | 23 +++- subaligner/embedder.py | 4 +- subaligner/hparam_tuner.py | 2 +- subaligner/hyperparameters.py | 4 +- subaligner/logger.py | 2 +- subaligner/network.py | 2 +- subaligner/predictor.py | 7 +- subaligner/singleton.py | 2 +- subaligner/subaligner_1pass/__main__.py | 18 +++- subaligner/subaligner_2pass/__main__.py | 22 +++- subaligner/subaligner_convert/__main__.py | 20 +++- subaligner/subtitle.py | 12 ++- subaligner/trainer.py | 2 +- subaligner/translator.py | 102 ++++++++++++++++++ tests/integration/feature/subaligner.feature | 13 +++ .../feature/subaligner_convert.feature | 9 ++ tests/integration/radish/step.py | 32 ++++++ tests/subaligner/test_translator.py | 53 +++++++++ 25 files changed, 348 insertions(+), 30 
deletions(-) create mode 100644 subaligner/translator.py create mode 100644 tests/subaligner/test_translator.py diff --git a/README.md b/README.md index bdc156e..7b21aa7 100644 --- a/README.md +++ b/README.md @@ -83,6 +83,14 @@ $ subaligner -m single -v https://example.com/video.mp4 -s https://example.com/s $ subaligner -m dual -v https://example.com/video.mp4 -s https://example.com/subtitle.srt -o subtitle_aligned.srt ``` ``` +# Translative alignment with the ISO 639-3 language code pair (src,tgt) + +$ subaligner_1pass -v video.mp4 -s subtitle.srt -t eng,zho +$ subaligner_2pass -v video.mp4 -s subtitle.srt -t eng,spa +$ subaligner -m single -v video.mp4 -s subtitle.srt -t eng,fra +$ subaligner -m dual -v video.mp4 -s subtitle.srt -t eng,deu +``` +``` # Run alignments with pipx $ pipx run subaligner -m single -v video.mp4 -s subtitle.srt @@ -104,7 +112,8 @@ $ docker run -v `pwd`:`pwd` -w `pwd` -it baxtree/subaligner subaligner_2pass -v $ docker run -it baxtree/subaligner subaligner_1pass -v https://example.com/video.mp4 -s https://example.com/subtitle.srt -o subtitle_aligned.srt $ docker run -it baxtree/subaligner subaligner_2pass -v https://example.com/video.mp4 -s https://example.com/subtitle.srt -o subtitle_aligned.srt ``` -The aligned subtitle will be saved at `subtitle_aligned.srt`. For details on CLI, run `subaligner_1pass --help`, `subaligner_2pass --help` or `subaligner --help`. +The aligned subtitle will be saved at `subtitle_aligned.srt`. For details on CLI, run `subaligner_1pass -h`, `subaligner_2pass -h` or `subaligner -h`. +Additional utilities can be used after consulting `subaligner_convert -h`, `subaligner_train -h` and `subaligner_tune -h`. 
![](figures/screencast.gif) ## Supported Formats diff --git a/requirements-app.txt b/requirements-app.txt index d55b217..7341479 100644 --- a/requirements-app.txt +++ b/requirements-app.txt @@ -63,13 +63,16 @@ requests-oauthlib==1.3.0 rsa==4.7 scipy~=1.5.4 scikit-learn>=0.19.1 +sentencepiece~=0.1.95 setuptools>=41.0.0 tblib==1.3.2 tensorflow>=1.15.5,<2.5 termcolor==1.1.0 toml==0.10.0 toolz==0.9.0 +torch~=1.8.1 tornado==5.1.0 +transformers~=4.5.1 urllib3==1.25.9 Werkzeug>=0.15.3 zict==0.1.3 diff --git a/requirements.txt b/requirements.txt index d55b217..7341479 100644 --- a/requirements.txt +++ b/requirements.txt @@ -63,13 +63,16 @@ requests-oauthlib==1.3.0 rsa==4.7 scipy~=1.5.4 scikit-learn>=0.19.1 +sentencepiece~=0.1.95 setuptools>=41.0.0 tblib==1.3.2 tensorflow>=1.15.5,<2.5 termcolor==1.1.0 toml==0.10.0 toolz==0.9.0 +torch~=1.8.1 tornado==5.1.0 +transformers~=4.5.1 urllib3==1.25.9 Werkzeug>=0.15.3 zict==0.1.3 diff --git a/site/source/acknowledgement.rst b/site/source/acknowledgement.rst index 895c00b..9fe08f6 100644 --- a/site/source/acknowledgement.rst +++ b/site/source/acknowledgement.rst @@ -12,3 +12,4 @@ Acknowledgement - `pysrt `_ - `pysubs2 `_ - `aeneas `_ + - `transformers `_ diff --git a/site/source/advanced_usage.rst b/site/source/advanced_usage.rst index 408d469..77c3c4b 100644 --- a/site/source/advanced_usage.rst +++ b/site/source/advanced_usage.rst @@ -39,6 +39,16 @@ Embeddings extracted from your media files can be reused with `-utd` or `--use_t model of another kind (instead of re-using the same model on training resumption) without going through the feature embedding process, which could take quite long to finish for a large dataset so as to be unnecessary if there is no change on it. 
+**Ignore sound effects**:: + + (.venv) $ subaligner_train -vd av_directory -sd subtitle_directory -tod training_output_directory --sound_effect_start_marker "(" --sound_effect_end_marker ")" + +It is not uncommon that subtitles sometimes contain sound effects (e.g., "BARK", "(applause)" and "[MUSIC]", etc.). For limited training +data sets and not sophisticated enough network architectures, the model usually cannot capture all the sound effects very well. +To filter out sound effect subtitles and only preserve the vocal ones, you can pass in `-sesm` or `--sound_effect_start_marker` and/or +`-seem` or `--sound_effect_end_marker` with strings which will be used by subaligner for finding sound effects and ignoring them within the training process. +For example, the above exemplary command will treat any strings starting with "(" and ending with ")" as sound effects. + **Run alignments after training**:: (.venv) $ subaligner -m single -v video.mp4 -s subtitle.srt -tod training_output_directory @@ -97,6 +107,11 @@ flags to customise the configuration on tuning: **Convert the subtitle to another format**:: (.venv) $ subaligner_convert -i subtitle.srt -o subtitle.vtt + (.venv) $ subaligner_convert -i subtitle_en.srt -o subtitle_zh.vtt -t eng,zho + +**Convert the subtitle and translate**:: + + (.venv) $ subaligner_convert -i subtitle_en.srt -o subtitle_es.srt -t eng,spa For output subtitles like MicroDVD relying on the frame rate, its value needs to be passed in with `-fr` or `--frame_rate`. diff --git a/site/source/conf.py b/site/source/conf.py index 550f347..4d31e50 100644 --- a/site/source/conf.py +++ b/site/source/conf.py @@ -80,7 +80,8 @@ "pysubs2", "cchardet", "captionstransformer", - "bs4" + "bs4", + "transformers" ] def setup(app): diff --git a/site/source/usage.rst b/site/source/usage.rst index 2f6efdb..7b076a5 100644 --- a/site/source/usage.rst +++ b/site/source/usage.rst @@ -8,23 +8,30 @@ segments individually with an option of stretching each segment. 
Make sure you have got the virtual environment activated upfront. -**Single-stage alignment**:: +**Single-stage alignment (high-level shift with lower latency)**:: (.venv) $ subaligner_1pass -v video.mp4 -s subtitle.srt (.venv) $ subaligner_1pass -v https://example.org/video.mp4 -s https://example.org/subtitle.srt -o subtitle_aligned.srt -**Dual-stage alignment**:: +**Dual-stage alignment (low-level shift with higher latency)**:: (.venv) $ subaligner_2pass -v video.mp4 -s subtitle.srt (.venv) $ subaligner_2pass -v https://example.org/video.mp4 -s https://example.org/subtitle.srt -o subtitle_aligned.srt -**Pass in single-stage or dual-stage as the alignment mode**:: +**Pass in single-stage or dual-stage as the alignment mode (src,tgt)**:: (.venv) $ subaligner -m single -v video.mp4 -s subtitle.srt (.venv) $ subaligner -m single -v https://example.org/video.mp4 -s https://example.org/subtitle.srt -o subtitle_aligned.srt (.venv) $ subaligner -m dual -v video.mp4 -s subtitle.srt (.venv) $ subaligner -m dual -v https://example.org/video.mp4 -s https://example.org/subtitle.srt -o subtitle_aligned.srt +**Translative alignment with the ISO 639-3 language code pair**:: + + (.venv) $ subaligner_1pass -v video.mp4 -s subtitle.srt -t eng,zho + (.venv) $ subaligner_2pass -v video.mp4 -s subtitle.srt -t eng,spa + (.venv) $ subaligner -m single -v video.mp4 -s subtitle.srt -t eng,fra + (.venv) $ subaligner -m dual -v video.mp4 -s subtitle.srt -t eng,deu + **Run alignments with the docker image**:: $ docker pull baxtree/subaligner diff --git a/subaligner/__main__.py b/subaligner/__main__.py index ccf51ed..6a134bc 100755 --- a/subaligner/__main__.py +++ b/subaligner/__main__.py @@ -12,13 +12,15 @@ Max global log loss for alignment -so, --stretch_off Switch off stretch on non-English speech and subtitles) -sil 
{afr,amh,ara,arg,asm,aze,ben,bos,bul,cat,ces,cmn,cym,dan,deu,ell,eng,epo,est,eus,fas,fin,fra,gla,gle,glg,grc,grn,guj,heb,hin,hrv,hun,hye,ina,ind,isl,ita,jbo,jpn,kal,kan,kat,kir,kor,kur,lat,lav,lfn,lit,mal,mar,mkd,mlt,msa,mya,nah,nep,nld,nor,ori,orm,pan,pap,pol,por,ron,rus,sin,slk,slv,spa,sqi,srp,swa,swe,tam,tat,tel,tha,tsn,tur,ukr,urd,vie,yue,zho}, --stretch_in_language {afr,amh,ara,arg,asm,aze,ben,bos,bul,cat,ces,cmn,cym,dan,deu,ell,eng,epo,est,eus,fas,fin,fra,gla,gle,glg,grc,grn,guj,heb,hin,hrv,hun,hye,ina,ind,isl,ita,jbo,jpn,kal,kan,kat,kir,kor,kur,lat,lav,lfn,lit,mal,mar,mkd,mlt,msa,mya,nah,nep,nld,nor,ori,orm,pan,pap,pol,por,ron,rus,sin,slk,slv,spa,sqi,srp,swa,swe,tam,tat,tel,tha,tsn,tur,ukr,urd,vie,yue,zho} - Stretch the subtitle with the supported ISO 639-2 language code [https://en.wikipedia.org/wiki/List_of_ISO_639-2_codes]. + Stretch the subtitle with the supported ISO 639-3 language code [https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes]. NB: This will be ignored if either -so or --stretch_off is present -fos, --exit_segfail Exit on any segment alignment failures -tod TRAINING_OUTPUT_DIRECTORY, --training_output_directory TRAINING_OUTPUT_DIRECTORY Path to the output directory containing training results -o OUTPUT, --output OUTPUT Path to the output subtitle file + -t TRANSLATE, --translate TRANSLATE + Source and target ISO 639-3 language codes separated by a comma (e.g., eng,zho) -d, --debug Print out debugging information -q, --quiet Switch off logging information -ver, --version show program's version number and exit @@ -98,7 +100,7 @@ def main(): type=str, choices=Language.ALLOWED_VALUES, default=Language.ENG, - help="Stretch the subtitle with the supported ISO 639-2 language code [https://en.wikipedia.org/wiki/List_of_ISO_639-2_codes].\nNB: This will be ignored if either -so or --stretch_off is present", + help="Stretch the subtitle with the supported ISO 639-3 language code [https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes].\nNB: This will 
be ignored if either -so or --stretch_off is present", ) parser.add_argument( "-fos", @@ -120,6 +122,12 @@ def main(): default="", help="Path to the output subtitle file", ) + parser.add_argument( + "-t", + "--translate", + type=str, + help="Source and target ISO 639-3 language codes separated by a comma (e.g., eng,zho)", + ) parser.add_argument("-d", "--debug", action="store_true", help="Print out debugging information") parser.add_argument("-q", "--quiet", action="store_true", @@ -153,6 +161,7 @@ def main(): Logger.VERBOSE = FLAGS.debug Logger.QUIET = FLAGS.quiet from subaligner.predictor import Predictor + from subaligner.translator import Translator from subaligner.exception import UnsupportedFormatException from subaligner.exception import TerminalException from subaligner.utils import Utils @@ -201,9 +210,17 @@ def main(): stretch_in_lang=stretch_in_lang, exit_segfail=exit_segfail, ) + aligned_subtitle_path = "_aligned.".join( FLAGS.subtitle_path.rsplit(".", 1)).replace(".stl", ".srt") if FLAGS.output == "" else FLAGS.output - Subtitle.export_subtitle(local_subtitle_path, aligned_subs, aligned_subtitle_path, frame_rate) + + if FLAGS.translate is not None: + source, target = FLAGS.translate.split(",") + translator = Translator(source, target) + aligned_subs = translator.translate_subs(aligned_subs) + Subtitle.export_subtitle(local_subtitle_path, aligned_subs, aligned_subtitle_path, frame_rate, "utf-8") + else: + Subtitle.export_subtitle(local_subtitle_path, aligned_subs, aligned_subtitle_path, frame_rate) log_loss = predictor.get_log_loss(voice_probabilities, aligned_subs) if log_loss is None or log_loss > FLAGS.max_logloss: diff --git a/subaligner/embedder.py b/subaligner/embedder.py index ada3efa..2793788 100644 --- a/subaligner/embedder.py +++ b/subaligner/embedder.py @@ -22,7 +22,7 @@ def __init__( hop_len: int = 512, step_sample: float = 0.04, len_sample: float = 0.075, - ): + ) -> None: """Feature embedder initialiser. 
Keyword Arguments: @@ -235,7 +235,7 @@ def position_to_time_str(self, position: int) -> str: def extract_data_and_label_from_audio( self, audio_file_path: str, - subtitle_file_path: str, + subtitle_file_path: Optional[str], subtitles: Optional[SubRipFile] = None, sound_effect_start_marker: Optional[str] = None, sound_effect_end_marker: Optional[str] = None, diff --git a/subaligner/hparam_tuner.py b/subaligner/hparam_tuner.py index 5b6e506..dd779f8 100644 --- a/subaligner/hparam_tuner.py +++ b/subaligner/hparam_tuner.py @@ -29,7 +29,7 @@ def __init__(self, num_of_trials: int = 5, tuning_epochs: int = 5, network_type: str = Network.LSTM, - **kwargs): + **kwargs) -> None: """Hyperparameter tuner initialiser Arguments: diff --git a/subaligner/hyperparameters.py b/subaligner/hyperparameters.py index b51ec37..1ab1dc5 100644 --- a/subaligner/hyperparameters.py +++ b/subaligner/hyperparameters.py @@ -9,7 +9,7 @@ class Hyperparameters(object): OPTIMIZERS = ["adadelta", "adagrad", "adam", "adamax", "ftrl", "nadam", "rmsprop", "sgd"] - def __init__(self): + def __init__(self) -> None: """Hyperparameters initialiser setting default values""" self.__learning_rate = 0.001 @@ -120,7 +120,7 @@ def optimizer(self, value: str) -> None: self.__optimizer = "SGD" @property - def loss(self) -> float: + def loss(self) -> str: return self.__loss @property diff --git a/subaligner/logger.py b/subaligner/logger.py index 352e863..54c7adc 100644 --- a/subaligner/logger.py +++ b/subaligner/logger.py @@ -11,7 +11,7 @@ class Logger(Singleton): VERBOSE = True QUIET = False - def __init__(self, output_log: str = "output.log"): + def __init__(self, output_log: str = "output.log") -> None: self.__loggers: Dict[str, logging.Logger] = {} self.__output_log = output_log diff --git a/subaligner/network.py b/subaligner/network.py index 2cdaab1..6aabb79 100644 --- a/subaligner/network.py +++ b/subaligner/network.py @@ -56,7 +56,7 @@ def __init__( hyperparameters: Hyperparameters, model_path: Optional[str] = 
None, backend: str = "tensorflow" - ): + ) -> None: """ Network object initialiser used by factory methods. Arguments: diff --git a/subaligner/predictor.py b/subaligner/predictor.py index 50d1143..3a9b6f5 100644 --- a/subaligner/predictor.py +++ b/subaligner/predictor.py @@ -18,6 +18,7 @@ from .singleton import Singleton from .subtitle import Subtitle from .hyperparameters import Hyperparameters +from .translator import Translator from .exception import TerminalException from .exception import NoFrameRateException from .logger import Logger @@ -38,7 +39,7 @@ class Predictor(Singleton): __SEGMENT_PREDICTION_TIMEOUT = 60 # Maximum waiting time in seconds when predicting each segment - def __init__(self, **kwargs): + def __init__(self, **kwargs) -> None: """Feature predictor initialiser. Keyword Arguments: @@ -574,7 +575,7 @@ def __predict( audio_file_path {string} -- The file path of the original audio (default: {None}). subtitles {list} -- The list of SubRip files (default: {None}). max_shift_secs {float} -- The maximum seconds by which subtitle cues can be shifted (default: {None}). - previous_gap {float} -- The duration betwee the start time of the audio segment and the start time of the subtitle segment. + previous_gap {float} -- The duration between the start time of the audio segment and the start time of the subtitle segment (default: {None}). Returns: tuple -- The shifted subtitles, the audio file path and the voice probabilities of the original audio. 
@@ -675,7 +676,7 @@ def __predict( self.__feature_embedder.position_to_duration(pos_to_delay) - original_start ) elif subtitles is not None: # for each in second pass - seconds_to_shift = self.__feature_embedder.position_to_duration(pos_to_delay) - previous_gap + seconds_to_shift = self.__feature_embedder.position_to_duration(pos_to_delay) - previous_gap if previous_gap is not None else 0.0 else: if os.path.exists(audio_file_path): os.remove(audio_file_path) diff --git a/subaligner/singleton.py b/subaligner/singleton.py index 266813a..70671ca 100644 --- a/subaligner/singleton.py +++ b/subaligner/singleton.py @@ -6,7 +6,7 @@ class _Singleton(type): # type: ignore _instances: Dict[Any, Any] = {} - def __call__(cls, *args, **kwargs): + def __call__(cls, *args, **kwargs) -> Any: if cls not in cls._instances: cls._instances[cls] = super(_Singleton, cls).__call__( *args, **kwargs diff --git a/subaligner/subaligner_1pass/__main__.py b/subaligner/subaligner_1pass/__main__.py index ebc5197..c83e3e6 100755 --- a/subaligner/subaligner_1pass/__main__.py +++ b/subaligner/subaligner_1pass/__main__.py @@ -12,6 +12,8 @@ Path to the output directory containing training results -o OUTPUT, --output OUTPUT Path to the output subtitle file + -t TRANSLATE, --translate TRANSLATE + Source and target ISO 639-3 language codes separated by a comma (e.g., eng,zho) -d, --debug Print out debugging information -q, --quiet Switch off logging information -ver, --version show program's version number and exit @@ -81,6 +83,12 @@ def main(): default="", help="Path to the output subtitle file", ) + parser.add_argument( + "-t", + "--translate", + type=str, + help="Source and target ISO 639-3 language codes separated by a comma (e.g., eng,zho)", + ) parser.add_argument("-d", "--debug", action="store_true", help="Print out debugging information") parser.add_argument("-q", "--quiet", action="store_true", @@ -108,6 +116,7 @@ def main(): Logger.VERBOSE = FLAGS.debug Logger.QUIET = FLAGS.quiet from 
subaligner.predictor import Predictor + from subaligner.translator import Translator from subaligner.exception import UnsupportedFormatException from subaligner.exception import TerminalException from subaligner.utils import Utils @@ -149,7 +158,14 @@ def main(): aligned_subtitle_path = "_aligned.".join( FLAGS.subtitle_path.rsplit(".", 1)).replace(".stl", ".srt") if FLAGS.output == "" else FLAGS.output - Subtitle.export_subtitle(local_subtitle_path, subs, aligned_subtitle_path, frame_rate) + + if FLAGS.translate is not None: + source, target = FLAGS.translate.split(",") + translator = Translator(source, target) + subs = translator.translate_subs(subs) + Subtitle.export_subtitle(local_subtitle_path, subs, aligned_subtitle_path, frame_rate, "utf-8") + else: + Subtitle.export_subtitle(local_subtitle_path, subs, aligned_subtitle_path, frame_rate) log_loss = predictor.get_log_loss(voice_probabilities, subs) if log_loss is None or log_loss > FLAGS.max_logloss: diff --git a/subaligner/subaligner_2pass/__main__.py b/subaligner/subaligner_2pass/__main__.py index f279092..0252ba2 100755 --- a/subaligner/subaligner_2pass/__main__.py +++ b/subaligner/subaligner_2pass/__main__.py @@ -12,13 +12,15 @@ Max global log loss for alignment -so, --stretch_off Switch off stretch on subtitles for non-English speech -sil {afr,amh,ara,arg,asm,aze,ben,bos,bul,cat,ces,cmn,cym,dan,deu,ell,eng,epo,est,eus,fas,fin,fra,gla,gle,glg,grc,grn,guj,heb,hin,hrv,hun,hye,ina,ind,isl,ita,jbo,jpn,kal,kan,kat,kir,kor,kur,lat,lav,lfn,lit,mal,mar,mkd,mlt,msa,mya,nah,nep,nld,nor,ori,orm,pan,pap,pol,por,ron,rus,sin,slk,slv,spa,sqi,srp,swa,swe,tam,tat,tel,tha,tsn,tur,ukr,urd,vie,yue,zho}, --stretch_in_language 
{afr,amh,ara,arg,asm,aze,ben,bos,bul,cat,ces,cmn,cym,dan,deu,ell,eng,epo,est,eus,fas,fin,fra,gla,gle,glg,grc,grn,guj,heb,hin,hrv,hun,hye,ina,ind,isl,ita,jbo,jpn,kal,kan,kat,kir,kor,kur,lat,lav,lfn,lit,mal,mar,mkd,mlt,msa,mya,nah,nep,nld,nor,ori,orm,pan,pap,pol,por,ron,rus,sin,slk,slv,spa,sqi,srp,swa,swe,tam,tat,tel,tha,tsn,tur,ukr,urd,vie,yue,zho} - Stretch the subtitle with the supported ISO 639-2 language code [https://en.wikipedia.org/wiki/List_of_ISO_639-2_codes]. + Stretch the subtitle with the supported ISO 639-3 language code [https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes]. NB: This will be ignored if either -so or --stretch_off is present -fos, --exit_segfail Exit on any segment alignment failures -tod TRAINING_OUTPUT_DIRECTORY, --training_output_directory TRAINING_OUTPUT_DIRECTORY Path to the output directory containing training results -o OUTPUT, --output OUTPUT Path to the output subtitle file + -t TRANSLATE, --translate TRANSLATE + Source and target ISO 639-3 language codes separated by a comma (e.g., eng,zho) -d, --debug Print out debugging information -q, --quiet Switch off logging information -ver, --version show program's version number and exit @@ -87,7 +89,7 @@ def main(): type=str, choices=Language.ALLOWED_VALUES, default=Language.ENG, - help="Stretch the subtitle with the supported ISO 639-2 language code [https://en.wikipedia.org/wiki/List_of_ISO_639-2_codes].\nNB: This will be ignored if either -so or --stretch_off is present", + help="Stretch the subtitle with the supported ISO 639-3 language code [https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes].\nNB: This will be ignored if either -so or --stretch_off is present", ) parser.add_argument( "-fos", @@ -109,6 +111,12 @@ def main(): default="", help="Path to the output subtitle file", ) + parser.add_argument( + "-t", + "--translate", + type=str, + help="Source and target ISO 639-3 language codes separated by a comma (e.g., eng,zho)", + ) parser.add_argument("-d", "--debug", 
action="store_true", help="Print out debugging information") parser.add_argument("-q", "--quiet", action="store_true", @@ -139,6 +147,7 @@ def main(): Logger.VERBOSE = FLAGS.debug Logger.QUIET = FLAGS.quiet from subaligner.predictor import Predictor + from subaligner.translator import Translator from subaligner.exception import UnsupportedFormatException from subaligner.exception import TerminalException from subaligner.utils import Utils @@ -183,7 +192,14 @@ def main(): aligned_subtitle_path = "_aligned.".join( FLAGS.subtitle_path.rsplit(".", 1)).replace(".stl", ".srt") if FLAGS.output == "" else FLAGS.output - Subtitle.export_subtitle(local_subtitle_path, subs_list, aligned_subtitle_path, frame_rate) + + if FLAGS.translate is not None: + source, target = FLAGS.translate.split(",") + translator = Translator(source, target) + subs_list = translator.translate_subs(subs) + Subtitle.export_subtitle(local_subtitle_path, subs_list, aligned_subtitle_path, frame_rate, "utf-8") + else: + Subtitle.export_subtitle(local_subtitle_path, subs_list, aligned_subtitle_path, frame_rate) log_loss = predictor.get_log_loss(voice_probabilities, subs_list) if log_loss is None or log_loss > FLAGS.max_logloss: diff --git a/subaligner/subaligner_convert/__main__.py b/subaligner/subaligner_convert/__main__.py index 1977030..d0e238d 100755 --- a/subaligner/subaligner_convert/__main__.py +++ b/subaligner/subaligner_convert/__main__.py @@ -8,6 +8,8 @@ -h, --help show this help message and exit -fr FRAME_RATE, --frame_rate FRAME_RATE Frame rate used by conversion to formats such as MicroDVD + -t TRANSLATE, --translate TRANSLATE + Source and target ISO 639-3 language codes separated by a comma (e.g., eng,zho) -d, --debug Print out debugging information -q, --quiet Switch off logging information -ver, --version show program's version number and exit @@ -64,6 +66,12 @@ def main(): default=None, help="Frame rate used by conversion to formats such as MicroDVD", ) + parser.add_argument( + "-t", + 
"--translate", + type=str, + help="Source and target ISO 639-3 language codes separated by a comma (e.g., eng,zho)", + ) parser.add_argument("-d", "--debug", action="store_true", help="Print out debugging information") parser.add_argument("-q", "--quiet", action="store_true", @@ -85,6 +93,7 @@ def main(): Logger.VERBOSE = FLAGS.debug Logger.QUIET = FLAGS.quiet from subaligner.subtitle import Subtitle + from subaligner.translator import Translator from subaligner.exception import UnsupportedFormatException, TerminalException from subaligner.utils import Utils @@ -96,7 +105,16 @@ def main(): Utils.download_file(FLAGS.input_subtitle_path, local_subtitle_path) subtitle = Subtitle.load(local_subtitle_path) - Subtitle.save_subs_as_target_format(subtitle.subs, local_subtitle_path, FLAGS.output_subtitle_path, FLAGS.frame_rate) + + if FLAGS.translate is not None: + source, target = FLAGS.translate.split(",") + translator = Translator(source, target) + subs_list = translator.translate_subs(subtitle.subs) + Subtitle.export_subtitle(local_subtitle_path, subs_list, FLAGS.output_subtitle_path, FLAGS.frame_rate, "utf-8") + Subtitle.save_subs_as_target_format(subs_list, local_subtitle_path, FLAGS.output_subtitle_path, FLAGS.frame_rate, "utf-8") + else: + Subtitle.export_subtitle(local_subtitle_path, subtitle.subs, FLAGS.output_subtitle_path, FLAGS.frame_rate) + Subtitle.save_subs_as_target_format(subtitle.subs, local_subtitle_path, FLAGS.output_subtitle_path, FLAGS.frame_rate) print("Subtitle converted and saved to: {}".format(FLAGS.output_subtitle_path)) except UnsupportedFormatException as e: print( diff --git a/subaligner/subtitle.py b/subaligner/subtitle.py index c687423..ea2ce9b 100644 --- a/subaligner/subtitle.py +++ b/subaligner/subtitle.py @@ -39,7 +39,7 @@ class Subtitle(object): SBV_EXTENSIONS = [".sbv"] YT_TRANSCRIPT_EXTENSIONS = [".ytt"] - def __init__(self, secret: object, subtitle_file_path: str, subtitle_format: str): + def __init__(self, secret: object, 
subtitle_file_path: str, subtitle_format: str) -> None: """Subtitle object initialiser. Arguments: @@ -375,7 +375,7 @@ def shift_subtitle( return shifted_subtitle_file_path @staticmethod - def save_subs_as_target_format(subs: List[SubRipItem], source_file_path: str, target_file_path: str, frame_rate: Optional[float] = None) -> None: + def save_subs_as_target_format(subs: List[SubRipItem], source_file_path: str, target_file_path: str, frame_rate: Optional[float] = None, encoding: Optional[str] = None) -> None: """Save SubRipItems with the format determined by the target file extension. Arguments: @@ -383,14 +383,15 @@ def save_subs_as_target_format(subs: List[SubRipItem], source_file_path: str, ta source_file_path {string} -- The path to the original subtitle file. target_file_path {string} -- The path to the output subtitle file. frame_rate {float} -- The frame rate used by conversion to formats such as MicroDVD + encoding {str} -- The encoding of the exported output file {default: None}. """ - encoding = Utils.detect_encoding(source_file_path) + encoding = Utils.detect_encoding(source_file_path) if encoding is None else encoding _, file_extension = os.path.splitext(target_file_path.lower()) Subtitle.__save_subtitle_by_extension(file_extension, subs, source_file_path, target_file_path, encoding, frame_rate) @staticmethod - def export_subtitle(source_file_path: str, subs: List[SubRipItem], target_file_path: str, frame_rate: float = 25.0) -> None: + def export_subtitle(source_file_path: str, subs: List[SubRipItem], target_file_path: str, frame_rate: float = 25.0, encoding: Optional[str] = None) -> None: """Export subtitle in the format determined by the file extension. Arguments: @@ -398,9 +399,10 @@ def export_subtitle(source_file_path: str, subs: List[SubRipItem], target_file_p subs {list} -- A list of SubRipItems. target_file_path {string} -- The path to the exported subtitle file. 
frame_rate {float} -- The frame rate for frame-based subtitle formats {default: 25.0}. + encoding {str} -- The encoding of the exported subtitle file {default: None}. """ - encoding = Utils.detect_encoding(source_file_path) + encoding = Utils.detect_encoding(source_file_path) if encoding is None else encoding _, file_extension = os.path.splitext(source_file_path.lower()) Subtitle.__save_subtitle_by_extension(file_extension, subs, source_file_path, target_file_path, encoding, frame_rate, is_exporting=True) diff --git a/subaligner/trainer.py b/subaligner/trainer.py index 71442e8..97a1adc 100644 --- a/subaligner/trainer.py +++ b/subaligner/trainer.py @@ -25,7 +25,7 @@ class Trainer(object): __LOGGER = Logger().get_logger(__name__) __MAX_BYTES = 2 ** 31 - 1 - def __init__(self, feature_embedder: FeatureEmbedder): + def __init__(self, feature_embedder: FeatureEmbedder) -> None: """Initialiser for the training process. Arguments: diff --git a/subaligner/translator.py b/subaligner/translator.py new file mode 100644 index 0000000..72bb628 --- /dev/null +++ b/subaligner/translator.py @@ -0,0 +1,102 @@ +import pycountry +import time +from copy import deepcopy +from pysrt import SubRipItem +from transformers import MarianMTModel, MarianTokenizer +from typing import List +from .singleton import Singleton +from .logger import Logger + + +class Translator(Singleton): + + __LOGGER = Logger().get_logger(__name__) + __TENSOR_TYPE = "pt" + __OPUS_MT = "Helsinki-NLP/opus-mt-{}-{}" + __OPUS_TATOEBA = "Helsinki-NLP/opus-tatoeba-{}-{}" + + def __init__(self, source_language, target_lang) -> None: + self.__initialise_model(source_language, target_lang) + + @staticmethod + def get_iso_639_alpha_2(language_code: str) -> str: + lang = pycountry.languages.get(alpha_3=language_code) + if lang is None: + raise ValueError("Cannot recognise %s as an ISO 639-3 language code" % language_code) + elif hasattr(lang, "alpha_2"): + return lang.alpha_2 + else: + return lang.alpha_3 + + def 
translate_subs(self, subs: List[SubRipItem]) -> List[SubRipItem]: + new_subs = deepcopy(subs) + src_texts = [sub.text for sub in new_subs] + tokenizer = self.tokenizer(src_texts, return_tensors=Translator.__TENSOR_TYPE, padding=True) + translated = self.lang_model.generate(**tokenizer) + translated_texts = [self.tokenizer.decode(t, skip_special_tokens=True) for t in translated] + for index in range(len(new_subs)): + new_subs[index].text = translated_texts[index] + return new_subs + + def __initialise_model(self, src_lang, des_lang): + try: + mt_model_name = Translator.__OPUS_MT.format(Translator.get_iso_639_alpha_2(src_lang), Translator.get_iso_639_alpha_2(des_lang)) + self.__download_mt_model(mt_model_name) + return + except OSError: + self.__log_and_back_off(mt_model_name) + try: + mt_model_name = Translator.__OPUS_MT.format(src_lang, Translator.get_iso_639_alpha_2(des_lang)) + self.__download_mt_model(mt_model_name) + return + except OSError: + self.__log_and_back_off(mt_model_name) + try: + mt_model_name = Translator.__OPUS_MT.format(Translator.get_iso_639_alpha_2(src_lang), des_lang) + self.__download_mt_model(mt_model_name) + return + except OSError: + self.__log_and_back_off(mt_model_name) + try: + mt_model_name = Translator.__OPUS_MT.format(src_lang, des_lang) + self.__download_mt_model(mt_model_name) + return + except OSError: + self.__log_and_back_off(mt_model_name) + try: + mt_model_name = Translator.__OPUS_TATOEBA.format(Translator.get_iso_639_alpha_2(src_lang), Translator.get_iso_639_alpha_2(des_lang)) + self.__download_mt_model(mt_model_name) + return + except OSError: + self.__log_and_back_off(mt_model_name) + try: + mt_model_name = Translator.__OPUS_TATOEBA.format(src_lang, Translator.get_iso_639_alpha_2(des_lang)) + self.__download_mt_model(mt_model_name) + return + except OSError: + self.__log_and_back_off(mt_model_name) + try: + mt_model_name = Translator.__OPUS_TATOEBA.format(Translator.get_iso_639_alpha_2(src_lang), des_lang) + 
self.__download_mt_model(mt_model_name) + return + except OSError: + self.__log_and_back_off(mt_model_name) + try: + mt_model_name = Translator.__OPUS_TATOEBA.format(src_lang, des_lang) + self.__download_mt_model(mt_model_name) + return + except OSError: + Translator.__LOGGER.debug("Cannot download the MT model %s" % mt_model_name) + message = 'Cannot find the MT model for source language "{}" and destination language "{}"'.format(src_lang, des_lang) + Translator.__LOGGER.error(message) + raise NotImplementedError(message) + + def __download_mt_model(self, mt_model_name): + Translator.__LOGGER.debug("Trying to download the MT model %s" % mt_model_name) + self.tokenizer = MarianTokenizer.from_pretrained(mt_model_name) + self.lang_model = MarianMTModel.from_pretrained(mt_model_name) + Translator.__LOGGER.debug("MT model %s downloaded" % mt_model_name) + + def __log_and_back_off(self, mt_model_name): + Translator.__LOGGER.debug("Cannot download the MT model %s" % mt_model_name) + time.sleep(1) diff --git a/tests/integration/feature/subaligner.feature b/tests/integration/feature/subaligner.feature index 66c16d7..75def31 100644 --- a/tests/integration/feature/subaligner.feature +++ b/tests/integration/feature/subaligner.feature @@ -195,6 +195,19 @@ Feature: Subaligner CLI When I run the alignment with subaligner on them with dual stage Then a new subtitle file "test[]spaced_aligned.vtt" is generated + @translation + Scenario Outline: Test translation on aligned subtitles + Given I have a video file "test.mp4" + And I have a subtitle file + When I run the alignment with on them with stage and for translation + Then a new subtitle file is generated + Examples: + | aligner | mode | subtitle-in | language-pair | subtitle-out | + | subaligner | single | "test.srt" | eng,zho | "test_aligned.srt" | + | subaligner | dual | "test.srt" | eng,spa | "test_aligned.srt" | + | subaligner_1pass | | "test.srt" | eng,fra | "test_aligned.srt" | + | subaligner_2pass | | "test.srt" | 
eng,deu | "test_aligned.srt" | + @exception Scenario Outline: Test errors out on unsupported subtitle input Given I have a video file "test.mp4" diff --git a/tests/integration/feature/subaligner_convert.feature b/tests/integration/feature/subaligner_convert.feature index 6dc8787..4df2e30 100644 --- a/tests/integration/feature/subaligner_convert.feature +++ b/tests/integration/feature/subaligner_convert.feature @@ -55,3 +55,12 @@ Feature: Subaligner CLI Given I have a subtitle file "https://raw.githubusercontent.com/baxtree/subaligner/master/tests/subaligner/resource/test.srt" When I run the converter with "test_srt.ttml" as the output Then a new subtitle file "test_srt.ttml" is generated + + Scenario Outline: Test subtitle conversion with translation + Given I have a subtitle file + When I run the converter with for translation and as the output + Then a new subtitle file is generated + Examples: + | subtitle-in | language_pair | subtitle-out | + | "test.srt" | eng,zho | "test_zh_srt.ttml" | + | "test.srt" | eng,rus | "test_ru_srt.ttml" | diff --git a/tests/integration/radish/step.py b/tests/integration/radish/step.py index dd6e402..34f3fb3 100644 --- a/tests/integration/radish/step.py +++ b/tests/integration/radish/step.py @@ -49,6 +49,26 @@ def run_subaligner(step, aligner, mode): step.context.exit_code = process.wait(timeout=WAIT_TIMEOUT_IN_SECONDS) +@when("I run the alignment with {aligner:S} on them with {mode:S} stage and {language_pair:S} for translation") +def run_subaligner_with_translation(step, aligner, mode, language_pair): + if mode == "": + process = subprocess.Popen([ + os.path.join(PWD, "..", "..", "..", "bin", aligner), + "-v", step.context.video_file_path, + "-s", step.context.subtitle_path_or_selector, + "-t", language_pair, + "-q"], shell=False) + else: + process = subprocess.Popen([ + os.path.join(PWD, "..", "..", "..", "bin", aligner), + "-m", mode, + "-v", step.context.video_file_path, + "-s", step.context.subtitle_path_or_selector, + "-t", 
language_pair, + "-q"], shell=False) + step.context.exit_code = process.wait(timeout=WAIT_TIMEOUT_IN_SECONDS) + + @when('I run the alignment with {aligner:S} on them with {mode:S} stage and output "{file_name:S}"') def run_subaligner_with_output(step, aligner, mode, file_name): if mode == "": @@ -290,6 +310,18 @@ def run_subtitle_converter(step, output_subtitle): step.context.exit_code = process.wait(timeout=WAIT_TIMEOUT_IN_SECONDS) +@when('I run the converter with {language_pair:S} for translation and "{output_subtitle:S}" as the output') +def run_subtitle_converter_with_translation(step, language_pair, output_subtitle): + process = subprocess.Popen([ + os.path.join(PWD, "..", "..", "..", "bin", "subaligner_convert"), + "-i", step.context.subtitle_path_or_selector, + "-o", os.path.join(PWD, "..", "..", "subaligner", "resource", output_subtitle), + "-fr", "25.0", + "-t", language_pair, + "-q"] + step.text.split(" "), shell=False, stdout=subprocess.PIPE) + step.context.exit_code = process.wait(timeout=WAIT_TIMEOUT_IN_SECONDS) + + @before.each_scenario(on_tags="train or hyperparameter-tuning") def create_training_output_dir(scenario): scenario.context.temp_dir = tempfile.mkdtemp() diff --git a/tests/subaligner/test_translator.py b/tests/subaligner/test_translator.py new file mode 100644 index 0000000..7bc3821 --- /dev/null +++ b/tests/subaligner/test_translator.py @@ -0,0 +1,53 @@ +import os +import unittest +from mock import Mock, patch +from transformers import MarianMTModel, MarianTokenizer +from subaligner.subtitle import Subtitle +from subaligner.translator import Translator as Undertest + + +class TranslatorTests(unittest.TestCase): + + def setUp(self): + self.srt_file_path = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "resource/test.srt" + ) + + def test_get_iso_639_alpha_2(self): + self.assertEqual("en", Undertest.get_iso_639_alpha_2("eng")) + self.assertEqual("ada", Undertest.get_iso_639_alpha_2("ada")) + + 
@patch("transformers.MarianMTModel.from_pretrained") + @patch("transformers.MarianTokenizer.from_pretrained") + def test_translate_subs(self, tokenizer_from_pretrained, model_from_pretrained): + subs = Subtitle.load(self.srt_file_path).subs + mock_tokenizer = Mock() + mock_tokenizer.return_value = {"input_ids": None, "attention_mask": None} + mock_tokenizer.decode.return_value = "translated" + mock_model = Mock() + mock_model.generate.return_value = [None] * len(subs) + tokenizer_from_pretrained.return_value = mock_tokenizer + model_from_pretrained.return_value = mock_model + + translated_subs = Undertest("eng", "zho").translate_subs(subs) + + self.assertEqual(["translated"] * len(subs), [*map(lambda x: x.text, translated_subs)]) + + def test_throw_exception_on_getting_iso_639_alpha_2(self): + try: + Undertest.get_iso_639_alpha_2("afa") + except Exception as e: + self.assertTrue(isinstance(e, ValueError)) + else: + self.fail("Should have thrown exception") + + @patch("transformers.MarianTokenizer.from_pretrained", side_effect=OSError) + def test_throw_exception_on_translating_subs(self, mock_tokenizer_from_pretrained): + subs = Subtitle.load(self.srt_file_path).subs + try: + Undertest("eng", "aar").translate_subs(subs) + except Exception as e: + self.assertTrue(mock_tokenizer_from_pretrained.called) + self.assertTrue(isinstance(e, NotImplementedError)) + else: + self.fail("Should have thrown exception") From e747b9675d38f809a422bafde5b7b6a156cc72bf Mon Sep 17 00:00:00 2001 From: baxtree Date: Mon, 10 May 2021 09:35:35 +0100 Subject: [PATCH 3/9] add pycountry as a dependency --- requirements-app.txt | 1 + requirements.txt | 1 + site/source/conf.py | 3 ++- 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/requirements-app.txt b/requirements-app.txt index 7341479..69e9294 100644 --- a/requirements-app.txt +++ b/requirements-app.txt @@ -46,6 +46,7 @@ psutil==5.6.7 py==1.10.0 pyasn1==0.4.8 pyasn1-modules==0.2.7 +pycountry~=20.7.3 pydot==1.2.4 pydot-ng==1.0.0 
pydotplus==2.0.2 diff --git a/requirements.txt b/requirements.txt index 7341479..69e9294 100644 --- a/requirements.txt +++ b/requirements.txt @@ -46,6 +46,7 @@ psutil==5.6.7 py==1.10.0 pyasn1==0.4.8 pyasn1-modules==0.2.7 +pycountry~=20.7.3 pydot==1.2.4 pydot-ng==1.0.0 pydotplus==2.0.2 diff --git a/site/source/conf.py b/site/source/conf.py index 4d31e50..81df1ac 100644 --- a/site/source/conf.py +++ b/site/source/conf.py @@ -81,7 +81,8 @@ "cchardet", "captionstransformer", "bs4", - "transformers" + "transformers", + "pycountry" ] def setup(app): From ae803af583dae0fe280cce4cc618d841955b63fc Mon Sep 17 00:00:00 2001 From: baxtree Date: Mon, 10 May 2021 18:35:03 +0100 Subject: [PATCH 4/9] batch subtitle texts due to too-high memory usage during translation --- site/source/index.rst | 13 +++++++------ subaligner/translator.py | 38 +++++++++++++++++++++++++++----------- 2 files changed, 34 insertions(+), 17 deletions(-) diff --git a/site/source/index.rst b/site/source/index.rst index 3c05244..b382633 100644 --- a/site/source/index.rst +++ b/site/source/index.rst @@ -7,17 +7,18 @@ Welcome to Subaligner's documentation! ====================================== Given an out-of-sync subtitle file along with a piece of audiovisual content carrying speeches described by it, -Subaligner provides a one-stop solution on automatic subtitle synchronisation with a pretrained deep neural network and forced -alignments. In essence, aligning subtitles is a dual-stage process with a Bidirectional Long Short-Term Memory network trained +Subaligner provides a one-stop solution on automatic subtitle synchronisation and translation with pretrained deep neural networks +and forced alignments. In essence, aligning subtitles is a dual-stage process with a Bidirectional Long Short-Term Memory network trained upfront. Subaligner helps subtitlers not only in preprocessing raw subtitle materials (outcome from stenographers or STT workflow, etc.) 
but also in gaining quality control over their work within subtitle post-production. This tool also tolerates errors occurred in live subtitles which sometimes do not completely or correctly represent what people actually spoke in the companion audiovisual content. -Subligner has been shifted with a command-line interface which helps users to conduct various tasks around subtitle synchronisation -without writing any code as well as APIs targeting developers. With existing audiovisual and in-sync subtitle files at -hand, users can train their own synchroniser with a single command and zero setup. A handful of subtitle formats are supported -and can be converted from one to another either during synchronisation or on on-demand. +Subaligner has been shipped with a command-line interface which helps users to conduct various tasks around subtitle +synchronisation and multilingual translation without writing any code. APIs targeting developers have also been provided. +With existing audiovisual and in-sync subtitle files at hand, users can train their own synchroniser with a single +command and zero setup. A handful of subtitle formats are supported and can be converted from one to another either during +synchronisation or on demand. Subligner supports the following subtitle formats: SubRip, TTML, WebVTT, (Advanced) SubStation Alpha, MicroDVD, MPL2, TMP, EBU STL, SAMI, SCC and SBV. The source code can be found on GitHub: `subaligner `_. 
diff --git a/subaligner/translator.py b/subaligner/translator.py index 72bb628..c45e748 100644 --- a/subaligner/translator.py +++ b/subaligner/translator.py @@ -1,7 +1,9 @@ +import math import pycountry import time from copy import deepcopy from pysrt import SubRipItem +from tqdm import tqdm from transformers import MarianMTModel, MarianTokenizer from typing import List from .singleton import Singleton @@ -14,6 +16,7 @@ class Translator(Singleton): __TENSOR_TYPE = "pt" __OPUS_MT = "Helsinki-NLP/opus-mt-{}-{}" __OPUS_TATOEBA = "Helsinki-NLP/opus-tatoeba-{}-{}" + __TRANSLATING_BATCH_SIZE = 10 def __init__(self, source_language, target_lang) -> None: self.__initialise_model(source_language, target_lang) @@ -29,13 +32,19 @@ def get_iso_639_alpha_2(language_code: str) -> str: return lang.alpha_3 def translate_subs(self, subs: List[SubRipItem]) -> List[SubRipItem]: + translated_texts = [] + self.lang_model.eval() new_subs = deepcopy(subs) src_texts = [sub.text for sub in new_subs] - tokenizer = self.tokenizer(src_texts, return_tensors=Translator.__TENSOR_TYPE, padding=True) - translated = self.lang_model.generate(**tokenizer) - translated_texts = [self.tokenizer.decode(t, skip_special_tokens=True) for t in translated] + num_of_batches = math.ceil(len(src_texts) / Translator.__TRANSLATING_BATCH_SIZE) + Translator.__LOGGER.info("Translating %s subtitle cue(s)..." 
% len(src_texts)) + for batch in tqdm(Translator.__batch(src_texts, Translator.__TRANSLATING_BATCH_SIZE), total=num_of_batches): + tokenizer = self.tokenizer(batch, return_tensors=Translator.__TENSOR_TYPE, padding=True) + translated = self.lang_model.generate(**tokenizer) + translated_texts.extend([self.tokenizer.decode(t, skip_special_tokens=True) for t in translated]) for index in range(len(new_subs)): new_subs[index].text = translated_texts[index] + Translator.__LOGGER.info("Subtitle translated") return new_subs def __initialise_model(self, src_lang, des_lang): @@ -44,43 +53,43 @@ def __initialise_model(self, src_lang, des_lang): self.__download_mt_model(mt_model_name) return except OSError: - self.__log_and_back_off(mt_model_name) + Translator.__log_and_back_off(mt_model_name) try: mt_model_name = Translator.__OPUS_MT.format(src_lang, Translator.get_iso_639_alpha_2(des_lang)) self.__download_mt_model(mt_model_name) return except OSError: - self.__log_and_back_off(mt_model_name) + Translator.__log_and_back_off(mt_model_name) try: mt_model_name = Translator.__OPUS_MT.format(Translator.get_iso_639_alpha_2(src_lang), des_lang) self.__download_mt_model(mt_model_name) return except OSError: - self.__log_and_back_off(mt_model_name) + Translator.__log_and_back_off(mt_model_name) try: mt_model_name = Translator.__OPUS_MT.format(src_lang, des_lang) self.__download_mt_model(mt_model_name) return except OSError: - self.__log_and_back_off(mt_model_name) + Translator.__log_and_back_off(mt_model_name) try: mt_model_name = Translator.__OPUS_TATOEBA.format(Translator.get_iso_639_alpha_2(src_lang), Translator.get_iso_639_alpha_2(des_lang)) self.__download_mt_model(mt_model_name) return except OSError: - self.__log_and_back_off(mt_model_name) + Translator.__log_and_back_off(mt_model_name) try: mt_model_name = Translator.__OPUS_TATOEBA.format(src_lang, Translator.get_iso_639_alpha_2(des_lang)) self.__download_mt_model(mt_model_name) return except OSError: - 
self.__log_and_back_off(mt_model_name) + Translator.__log_and_back_off(mt_model_name) try: mt_model_name = Translator.__OPUS_TATOEBA.format(Translator.get_iso_639_alpha_2(src_lang), des_lang) self.__download_mt_model(mt_model_name) return except OSError: - self.__log_and_back_off(mt_model_name) + Translator.__log_and_back_off(mt_model_name) try: mt_model_name = Translator.__OPUS_TATOEBA.format(src_lang, des_lang) self.__download_mt_model(mt_model_name) @@ -97,6 +106,13 @@ def __download_mt_model(self, mt_model_name): self.lang_model = MarianMTModel.from_pretrained(mt_model_name) Translator.__LOGGER.debug("MT model %s downloaded" % mt_model_name) - def __log_and_back_off(self, mt_model_name): + @staticmethod + def __log_and_back_off(mt_model_name): Translator.__LOGGER.debug("Cannot download the MT model %s" % mt_model_name) time.sleep(1) + + @staticmethod + def __batch(iterable, size=1): + total = len(iterable) + for ndx in range(0, total, size): + yield iterable[ndx:min(ndx + size, total)] From 31ff0fa1d6ed37c25846e99f1f989b1d6d9614f3 Mon Sep 17 00:00:00 2001 From: baxtree Date: Mon, 10 May 2021 18:41:46 +0100 Subject: [PATCH 5/9] add docstring to translator --- subaligner/translator.py | 36 ++++++++++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/subaligner/translator.py b/subaligner/translator.py index c45e748..b13b5a6 100644 --- a/subaligner/translator.py +++ b/subaligner/translator.py @@ -11,6 +11,8 @@ class Translator(Singleton): + """Translate subtitles. + """ __LOGGER = Logger().get_logger(__name__) __TENSOR_TYPE = "pt" @@ -18,11 +20,32 @@ class Translator(Singleton): __OPUS_TATOEBA = "Helsinki-NLP/opus-tatoeba-{}-{}" __TRANSLATING_BATCH_SIZE = 10 - def __init__(self, source_language, target_lang) -> None: - self.__initialise_model(source_language, target_lang) + def __init__(self, source_language, target_language) -> None: + """Initialiser for the subtitle translation. 
+ + Arguments: + source_language {string} -- The source language code from ISO 639-3. + target_language {string} -- The target language code from ISO 639-3. + + Raises: + NotImplementedError -- Thrown when the model of the specified language pair is not found. + """ + self.__initialise_model(source_language, target_language) @staticmethod def get_iso_639_alpha_2(language_code: str) -> str: + """Get the alpha 2 language code from a alpha 3 one. + + Arguments: + language_code {string} -- A language code from ISO 639-3. + + Returns: + string -- The alpha 2 language code if exists otherwise the alpha 3 one. + + Raises: + ValueError -- Thrown when the input language code cannot be recognised. + """ + lang = pycountry.languages.get(alpha_3=language_code) if lang is None: raise ValueError("Cannot recognise %s as an ISO 639-3 language code" % language_code) @@ -32,6 +55,15 @@ def get_iso_639_alpha_2(language_code: str) -> str: return lang.alpha_3 def translate_subs(self, subs: List[SubRipItem]) -> List[SubRipItem]: + """Translate a list of subtitle cues. + + Arguments: + subs {list} -- A list of SubRipItems. + + Returns: + {list} -- A list of new SubRipItems holding the translation results. 
+ """ + translated_texts = [] self.lang_model.eval() new_subs = deepcopy(subs) From 61621b6fe81c8b701b50e128a8a6babcd6bdbb67 Mon Sep 17 00:00:00 2001 From: baxtree Date: Fri, 14 May 2021 09:27:44 +0100 Subject: [PATCH 6/9] add -lgs and --languages to display language codes used for translation and stretch --- README.md | 11 ++- site/source/advanced_usage.rst | 7 +- site/source/usage.rst | 11 ++- subaligner/__main__.py | 16 ++-- subaligner/subaligner_1pass/__main__.py | 14 ++- subaligner/subaligner_2pass/__main__.py | 15 ++- subaligner/subaligner_convert/__main__.py | 13 ++- subaligner/translator.py | 92 +++++++++++++++---- tests/integration/feature/subaligner.feature | 10 ++ .../feature/subaligner_convert.feature | 10 ++ tests/integration/radish/step.py | 14 +++ tests/subaligner/test_translator.py | 39 ++++++-- 12 files changed, 201 insertions(+), 51 deletions(-) diff --git a/README.md b/README.md index 7b21aa7..966e8d0 100644 --- a/README.md +++ b/README.md @@ -85,10 +85,13 @@ $ subaligner -m dual -v https://example.com/video.mp4 -s https://example.com/sub ``` # Translative alignment with the ISO 639-3 language code pair (src,tgt) -$ subaligner_1pass -v video.mp4 -s subtitle.srt -t eng,zho -$ subaligner_2pass -v video.mp4 -s subtitle.srt -t eng,spa -$ subaligner -m single -v video.mp4 -s subtitle.srt -t eng,fra -$ subaligner -m dual -v video.mp4 -s subtitle.srt -t eng,deu +$ subaligner_1pass --languages +$ subaligner_1pass -v video.mp4 -s subtitle.srt -t src,tgt +$ subaligner_2pass --languages +$ subaligner_2pass -v video.mp4 -s subtitle.srt -t src,tgt +$ subaligner --languages +$ subaligner -m single -v video.mp4 -s subtitle.srt -t src,tgt +$ subaligner -m dual -v video.mp4 -s subtitle.srt -t src,tgt ``` ``` # Run alignments with pipx diff --git a/site/source/advanced_usage.rst b/site/source/advanced_usage.rst index 77c3c4b..e4f95a4 100644 --- a/site/source/advanced_usage.rst +++ b/site/source/advanced_usage.rst @@ -107,10 +107,15 @@ flags to customise the 
configuration on tuning: **Convert the subtitle to another format**:: (.venv) $ subaligner_convert -i subtitle.srt -o subtitle.vtt + +**Convert the subtitle to another format and translate**:: + + (.venv) $ subaligner_convert --languages (.venv) $ subaligner_convert -i subtitle_en.srt -o subtitle_zh.vtt -t eng,zho -**Convert the subtitle and trans**:: +**Translate the subtitle without changing the format**:: + (.venv) $ subaligner_convert --languages (.venv) $ subaligner_convert -i subtitle_en.srt -o subtitle_es.srt -t eng,spa For output subtitles like MicroDVD relying on the frame rate, its value needs to be passed in with `-fr` or `--frame_rate`. diff --git a/site/source/usage.rst b/site/source/usage.rst index 7b076a5..54f8021 100644 --- a/site/source/usage.rst +++ b/site/source/usage.rst @@ -27,10 +27,13 @@ Make sure you have got the virtual environment activated upfront. **Translative alignment with the ISO 639-3 language code pair**:: - (.venv) $ subaligner_1pass -v video.mp4 -s subtitle.srt -t eng,zho - (.venv) $ subaligner_2pass -v video.mp4 -s subtitle.srt -t eng,spa - (.venv) $ subaligner -m single -v video.mp4 -s subtitle.srt -t eng,fra - (.venv) $ subaligner -m dual -v video.mp4 -s subtitle.srt -t eng,deu + (.venv) $ subaligner_1pass --languages + (.venv) $ subaligner_1pass -v video.mp4 -s subtitle.srt -t src,tgt + (.venv) $ subaligner_2pass --languages + (.venv) $ subaligner_2pass -v video.mp4 -s subtitle.srt -t src,tgt + (.venv) $ subaligner --languages + (.venv) $ subaligner -m single -v video.mp4 -s subtitle.srt -t src,tgt + (.venv) $ subaligner -m dual -v video.mp4 -s subtitle.srt -t src,tgt **Run alignments with the docker image**:: diff --git a/subaligner/__main__.py b/subaligner/__main__.py index 6a134bc..b0c17ae 100755 --- a/subaligner/__main__.py +++ b/subaligner/__main__.py @@ -1,8 +1,8 @@ #!/usr/bin/env python """ -usage: subaligner [-h] -m {single,dual} -v VIDEO_PATH -s SUBTITLE_PATH [-l MAX_LOGLOSS] [-so] +usage: subaligner [-h] [-m 
{single,dual}] [-v VIDEO_PATH] [-s SUBTITLE_PATH] [-l MAX_LOGLOSS] [-so] [-sil {afr,amh,ara,arg,asm,aze,ben,bos,bul,cat,ces,cmn,cym,dan,deu,ell,eng,epo,est,eus,fas,fin,fra,gla,gle,glg,grc,grn,guj,heb,hin,hrv,hun,hye,ina,ind,isl,ita,jbo,jpn,kal,kan,kat,kir,kor,kur,lat,lav,lfn,lit,mal,mar,mkd,mlt,msa,mya,nah,nep,nld,nor,ori,orm,pan,pap,pol,por,ron,rus,sin,slk,slv,spa,sqi,srp,swa,swe,tam,tat,tel,tha,tsn,tur,ukr,urd,vie,yue,zho}] - [-fos] [-tod TRAINING_OUTPUT_DIRECTORY] [-o OUTPUT] [-d] [-q] [-ver] + [-fos] [-tod TRAINING_OUTPUT_DIRECTORY] [-o OUTPUT] [-t TRANSLATE] [-lan] [-d] [-q] [-ver] Subaligner command line interface @@ -21,6 +21,7 @@ Path to the output subtitle file -t TRANSLATE, --translate TRANSLATE Source and target ISO 639-3 language codes separated by a comma (e.g., eng,zho) + -lgs, --languages Print out language codes used for stretch and translation -d, --debug Print out debugging information -q, --quiet Switch off logging information -ver, --version show program's version number and exit @@ -31,7 +32,7 @@ -v VIDEO_PATH, --video_path VIDEO_PATH File path or URL to the video file -s SUBTITLE_PATH, --subtitle_path SUBTITLE_PATH - File path or URL to the subtitle file (Extensions of supported subtitles: .vtt, .dfxp, .ass, .xml, .tmp, .ssa, .srt, .txt, .sami, .sub, .ttml, .smi, .stl, .scc, .sbv and .ytt) or selector for the embedded subtitle (e.g., embedded:page_num=888 or embedded:stream_index=0) + File path or URL to the subtitle file (Extensions of supported subtitles: .ttml, .vtt, .tmp, .dfxp, .xml, .sami, .scc, .sub, .txt, .stl, .ssa, .ytt, .srt, .sbv, .ass, .smi) or selector for the embedded subtitle (e.g., embedded:page_num=888 or embedded:stream_index=0) """ import argparse @@ -61,7 +62,6 @@ def main(): default="", choices=["single", "dual"], help="Alignment mode: either single or dual", - required=True, ) required_args.add_argument( "-v", @@ -69,7 +69,6 @@ def main(): type=str, default="", help="File path or URL to the video file", - required=True, 
) from subaligner.subtitle import Subtitle required_args.add_argument( @@ -78,7 +77,6 @@ def main(): type=str, default="", help="File path or URL to the subtitle file (Extensions of supported subtitles: {}) or selector for the embedded subtitle (e.g., embedded:page_num=888 or embedded:stream_index=0)".format(", ".join(Subtitle.subtitle_extensions())), - required=True, ) parser.add_argument( "-l", @@ -128,6 +126,8 @@ def main(): type=str, help="Source and target ISO 639-3 language codes separated by a comma (e.g., eng,zho)", ) + parser.add_argument("-lgs", "--languages", action="store_true", + help="Print out language codes used for stretch and translation") parser.add_argument("-d", "--debug", action="store_true", help="Print out debugging information") parser.add_argument("-q", "--quiet", action="store_true", @@ -135,6 +135,10 @@ def main(): parser.add_argument("-ver", "--version", action="version", version=__version__) FLAGS, unparsed = parser.parse_known_args() + if FLAGS.languages: + for line in Language.CODE_TO_HUMAN_LIST: + print(line.replace("\t", " ")) + sys.exit(0) if FLAGS.mode == "": print("--mode was not passed in") sys.exit(21) diff --git a/subaligner/subaligner_1pass/__main__.py b/subaligner/subaligner_1pass/__main__.py index c83e3e6..6f9cde7 100755 --- a/subaligner/subaligner_1pass/__main__.py +++ b/subaligner/subaligner_1pass/__main__.py @@ -1,6 +1,6 @@ #!/usr/bin/env python """ -usage: subaligner_1pass [-h] -v VIDEO_PATH -s SUBTITLE_PATH [-l MAX_LOGLOSS] [-tod TRAINING_OUTPUT_DIRECTORY] [-o OUTPUT] [-d] [-q] [-ver] +usage: subaligner_1pass [-h] [-v VIDEO_PATH] [-s SUBTITLE_PATH] [-l MAX_LOGLOSS] [-tod TRAINING_OUTPUT_DIRECTORY] [-o OUTPUT] [-t TRANSLATE] [-lgs] [-d] [-q] [-ver] Run single-stage alignment @@ -14,6 +14,7 @@ Path to the output subtitle file -t TRANSLATE, --translate TRANSLATE Source and target ISO 639-3 language codes separated by a comma (e.g., eng,zho) + -lgs, --languages Print out language codes used for stretch and translation -d, 
--debug Print out debugging information -q, --quiet Switch off logging information -ver, --version show program's version number and exit @@ -22,7 +23,7 @@ -v VIDEO_PATH, --video_path VIDEO_PATH File path or URL to the video file -s SUBTITLE_PATH, --subtitle_path SUBTITLE_PATH - File path or URL to the subtitle file (Extensions of supported subtitles: .vtt, .dfxp, .ass, .xml, .tmp, .ssa, .srt, .txt, .sami, .sub, .ttml, .smi, .stl, .scc, .sbv and .ytt) or selector for the embedded subtitle (e.g., embedded:page_num=888 or embedded:stream_index=0) + File path or URL to the subtitle file (Extensions of supported subtitles: .stl, .dfxp, .xml, .vtt, .sbv, .ytt, .scc, .ttml, .smi, .sami, .ssa, .tmp, .txt, .sub, .srt, .ass) or selector for the embedded subtitle (e.g., embedded:page_num=888 or embedded:stream_index=0) """ import argparse @@ -51,7 +52,6 @@ def main(): type=str, default="", help="File path or URL to the video file", - required=True, ) from subaligner.subtitle import Subtitle required_args.add_argument( @@ -60,7 +60,6 @@ def main(): type=str, default="", help="File path or URL to the subtitle file (Extensions of supported subtitles: {}) or selector for the embedded subtitle (e.g., embedded:page_num=888 or embedded:stream_index=0)".format(", ".join(Subtitle.subtitle_extensions())), - required=True, ) parser.add_argument( "-l", @@ -89,6 +88,8 @@ def main(): type=str, help="Source and target ISO 639-3 language codes separated by a comma (e.g., eng,zho)", ) + parser.add_argument("-lgs", "--languages", action="store_true", + help="Print out language codes used for stretch and translation") parser.add_argument("-d", "--debug", action="store_true", help="Print out debugging information") parser.add_argument("-q", "--quiet", action="store_true", @@ -96,6 +97,11 @@ def main(): parser.add_argument("-ver", "--version", action="version", version=__version__) FLAGS, unparsed = parser.parse_known_args() + from aeneas.language import Language + if FLAGS.languages: + for line 
in Language.CODE_TO_HUMAN_LIST: + print(line.replace("\t", " ")) + sys.exit(0) if FLAGS.video_path == "": print("--video_path was not passed in") sys.exit(21) diff --git a/subaligner/subaligner_2pass/__main__.py b/subaligner/subaligner_2pass/__main__.py index 0252ba2..5ae6873 100755 --- a/subaligner/subaligner_2pass/__main__.py +++ b/subaligner/subaligner_2pass/__main__.py @@ -1,8 +1,8 @@ #!/usr/bin/env python """ -usage: subaligner_2pass [-h] -v VIDEO_PATH -s SUBTITLE_PATH [-l MAX_LOGLOSS] [-so] +usage: subaligner_2pass [-h] [-v VIDEO_PATH] [-s SUBTITLE_PATH] [-l MAX_LOGLOSS] [-so] [-sil {afr,amh,ara,arg,asm,aze,ben,bos,bul,cat,ces,cmn,cym,dan,deu,ell,eng,epo,est,eus,fas,fin,fra,gla,gle,glg,grc,grn,guj,heb,hin,hrv,hun,hye,ina,ind,isl,ita,jbo,jpn,kal,kan,kat,kir,kor,kur,lat,lav,lfn,lit,mal,mar,mkd,mlt,msa,mya,nah,nep,nld,nor,ori,orm,pan,pap,pol,por,ron,rus,sin,slk,slv,spa,sqi,srp,swa,swe,tam,tat,tel,tha,tsn,tur,ukr,urd,vie,yue,zho}] - [-fos] [-tod TRAINING_OUTPUT_DIRECTORY] [-o OUTPUT] [-d] [-q] [-ver] + [-fos] [-tod TRAINING_OUTPUT_DIRECTORY] [-o OUTPUT] [-t TRANSLATE] [-lgs] [-d] [-q] [-ver] Run dual-stage alignment @@ -21,6 +21,7 @@ Path to the output subtitle file -t TRANSLATE, --translate TRANSLATE Source and target ISO 639-3 language codes separated by a comma (e.g., eng,zho) + -lgs, --languages Print out language codes used for stretch and translation -d, --debug Print out debugging information -q, --quiet Switch off logging information -ver, --version show program's version number and exit @@ -29,7 +30,7 @@ -v VIDEO_PATH, --video_path VIDEO_PATH File path or URL to the video file -s SUBTITLE_PATH, --subtitle_path SUBTITLE_PATH - File path or URL to the subtitle file (Extensions of supported subtitles: .vtt, .dfxp, .ass, .xml, .tmp, .ssa, .srt, .txt, .sami, .sub, .ttml, .smi, .stl, .scc, .sbv and .ytt) or selector for the embedded subtitle (e.g., embedded:page_num=888 or embedded:stream_index=0) + File path or URL to the subtitle file (Extensions of 
supported subtitles: .ass, .sbv, .srt, .vtt, .ttml, .dfxp, .scc, .txt, .tmp, .smi, .ssa, .sami, .xml, .sub, .stl, .ytt) or selector for the embedded subtitle (e.g., embedded:page_num=888 or embedded:stream_index=0) """ import argparse @@ -58,7 +59,6 @@ def main(): type=str, default="", help="File path or URL to the video file", - required=True, ) from subaligner.subtitle import Subtitle required_args.add_argument( @@ -67,7 +67,6 @@ def main(): type=str, default="", help="File path or URL to the subtitle file (Extensions of supported subtitles: {}) or selector for the embedded subtitle (e.g., embedded:page_num=888 or embedded:stream_index=0)".format(", ".join(Subtitle.subtitle_extensions())), - required=True, ) parser.add_argument( "-l", @@ -117,6 +116,8 @@ def main(): type=str, help="Source and target ISO 639-3 language codes separated by a comma (e.g., eng,zho)", ) + parser.add_argument("-lgs", "--languages", action="store_true", + help="Print out language codes used for stretch and translation") parser.add_argument("-d", "--debug", action="store_true", help="Print out debugging information") parser.add_argument("-q", "--quiet", action="store_true", @@ -124,6 +125,10 @@ def main(): parser.add_argument("-ver", "--version", action="version", version=__version__) FLAGS, unparsed = parser.parse_known_args() + if FLAGS.languages: + for line in Language.CODE_TO_HUMAN_LIST: + print(line.replace("\t", " ")) + sys.exit(0) if FLAGS.video_path == "": print("--video_path was not passed in") sys.exit(21) diff --git a/subaligner/subaligner_convert/__main__.py b/subaligner/subaligner_convert/__main__.py index d0e238d..9a23953 100755 --- a/subaligner/subaligner_convert/__main__.py +++ b/subaligner/subaligner_convert/__main__.py @@ -1,6 +1,6 @@ #!/usr/bin/env python """ -usage: subaligner_convert [-h] -i INPUT_SUBTITLE_PATH -o OUTPUT_SUBTITLE_PATH [-f FRAME_RATE] [-d] [-q] [-ver] +usage: subaligner_convert [-h] -i INPUT_SUBTITLE_PATH -o OUTPUT_SUBTITLE_PATH [-fr FRAME_RATE] [-t 
TRANSLATE] [-lgs] [-d] [-q] [-ver] Convert a subtitle from the input format to the output format @@ -10,6 +10,7 @@ Frame rate used by conversion to formats such as MicroDVD -t TRANSLATE, --translate TRANSLATE Source and target ISO 639-3 language codes separated by a comma (e.g., eng,zho) + -lgs, --languages Print out language codes used for stretch and translation -d, --debug Print out debugging information -q, --quiet Switch off logging information -ver, --version show program's version number and exit @@ -49,7 +50,6 @@ def main(): type=str, default="", help="File path or URL to the input subtitle file", - required=True, ) required_args.add_argument( "-o", @@ -57,7 +57,6 @@ def main(): type=str, default="", help="File path to the output subtitle file", - required=True, ) parser.add_argument( "-fr", @@ -72,12 +71,20 @@ def main(): type=str, help="Source and target ISO 639-3 language codes separated by a comma (e.g., eng,zho)", ) + parser.add_argument("-lgs", "--languages", action="store_true", + help="Print out language codes used for stretch and translation") parser.add_argument("-d", "--debug", action="store_true", help="Print out debugging information") parser.add_argument("-q", "--quiet", action="store_true", help="Switch off logging information") parser.add_argument("-ver", "--version", action="version", version=__version__) FLAGS, unparsed = parser.parse_known_args() + + from aeneas.language import Language + if FLAGS.languages: + for line in Language.CODE_TO_HUMAN_LIST: + print(line.replace("\t", " ")) + sys.exit(0) if FLAGS.input_subtitle_path == "": print("--input_subtitle_path was not passed in") sys.exit(21) diff --git a/subaligner/translator.py b/subaligner/translator.py index b13b5a6..459fd24 100644 --- a/subaligner/translator.py +++ b/subaligner/translator.py @@ -5,7 +5,7 @@ from pysrt import SubRipItem from tqdm import tqdm from transformers import MarianMTModel, MarianTokenizer -from typing import List +from typing import List, Generator from 
.singleton import Singleton from .logger import Logger @@ -19,6 +19,31 @@ class Translator(Singleton): __OPUS_MT = "Helsinki-NLP/opus-mt-{}-{}" __OPUS_TATOEBA = "Helsinki-NLP/opus-tatoeba-{}-{}" __TRANSLATING_BATCH_SIZE = 10 + __LANGUAGE_CODE_MAPPER = { + "bos": "zls", + "cmn": "zho", + "gla": "cel", + "grc": "grk", + "guj": "inc", + "ina": "art", + "jbo": "art", + "kan": "dra", + "kir": "trk", + "lat": "itc", + "lfn": "art", + "mya": "sit", + "nep": "inc", + "ori": "inc", + "sin": "inc", + "srp": "zls", + "tam": "dra", + "tat": "trk", + "tel": "dra", + "yue": "zho" + } + __LANGUAGE_PAIR_MAPPER = { + "eng-jpn": "eng-jap" + } def __init__(self, source_language, target_language) -> None: """Initialiser for the subtitle translation. @@ -48,12 +73,42 @@ def get_iso_639_alpha_2(language_code: str) -> str: lang = pycountry.languages.get(alpha_3=language_code) if lang is None: - raise ValueError("Cannot recognise %s as an ISO 639-3 language code" % language_code) + return language_code elif hasattr(lang, "alpha_2"): return lang.alpha_2 else: return lang.alpha_3 + @staticmethod + def normalise_single(language_code: str) -> str: + """Normalise a single language code. + + Arguments: + language_code {string} -- A language code from ISO 639-3. + + Returns: + string -- The language code understood by the language model. + """ + + return Translator.__LANGUAGE_CODE_MAPPER[language_code] if language_code in Translator.__LANGUAGE_CODE_MAPPER else language_code + + @staticmethod + def normalise_pair(source_language: str, target_language: str) -> List[str]: + """Normalise a pair of language codes. + + Arguments: + source_language {string} -- The source language code from ISO 639-3. + target_language {string} -- The target language code from ISO 639-3. + + Returns: + list -- The language code pair understood by the language model. 
+ """ + + if "{}-{}".format(source_language, target_language) in Translator.__LANGUAGE_PAIR_MAPPER: + return Translator.__LANGUAGE_PAIR_MAPPER["{}-{}".format(source_language, target_language)].split("-") + else: + return [source_language, target_language] + def translate_subs(self, subs: List[SubRipItem]) -> List[SubRipItem]: """Translate a list of subtitle cues. @@ -79,72 +134,75 @@ def translate_subs(self, subs: List[SubRipItem]) -> List[SubRipItem]: Translator.__LOGGER.info("Subtitle translated") return new_subs - def __initialise_model(self, src_lang, des_lang): + def __initialise_model(self, src_lang: str, tgt_lang: str) -> None: + src_lang = Translator.normalise_single(src_lang) + tgt_lang = Translator.normalise_single(tgt_lang) + src_lang, tgt_lang = Translator.normalise_pair(src_lang, tgt_lang) try: - mt_model_name = Translator.__OPUS_MT.format(Translator.get_iso_639_alpha_2(src_lang), Translator.get_iso_639_alpha_2(des_lang)) + mt_model_name = Translator.__OPUS_MT.format(Translator.get_iso_639_alpha_2(src_lang), Translator.get_iso_639_alpha_2(tgt_lang)) self.__download_mt_model(mt_model_name) return except OSError: Translator.__log_and_back_off(mt_model_name) try: - mt_model_name = Translator.__OPUS_MT.format(src_lang, Translator.get_iso_639_alpha_2(des_lang)) + mt_model_name = Translator.__OPUS_MT.format(src_lang, Translator.get_iso_639_alpha_2(tgt_lang)) self.__download_mt_model(mt_model_name) return except OSError: Translator.__log_and_back_off(mt_model_name) try: - mt_model_name = Translator.__OPUS_MT.format(Translator.get_iso_639_alpha_2(src_lang), des_lang) + mt_model_name = Translator.__OPUS_MT.format(Translator.get_iso_639_alpha_2(src_lang), tgt_lang) self.__download_mt_model(mt_model_name) return except OSError: Translator.__log_and_back_off(mt_model_name) try: - mt_model_name = Translator.__OPUS_MT.format(src_lang, des_lang) + mt_model_name = Translator.__OPUS_MT.format(src_lang, tgt_lang) self.__download_mt_model(mt_model_name) return except 
OSError: Translator.__log_and_back_off(mt_model_name) try: - mt_model_name = Translator.__OPUS_TATOEBA.format(Translator.get_iso_639_alpha_2(src_lang), Translator.get_iso_639_alpha_2(des_lang)) + mt_model_name = Translator.__OPUS_TATOEBA.format(Translator.get_iso_639_alpha_2(src_lang), Translator.get_iso_639_alpha_2(tgt_lang)) self.__download_mt_model(mt_model_name) return except OSError: Translator.__log_and_back_off(mt_model_name) try: - mt_model_name = Translator.__OPUS_TATOEBA.format(src_lang, Translator.get_iso_639_alpha_2(des_lang)) + mt_model_name = Translator.__OPUS_TATOEBA.format(src_lang, Translator.get_iso_639_alpha_2(tgt_lang)) self.__download_mt_model(mt_model_name) return except OSError: Translator.__log_and_back_off(mt_model_name) try: - mt_model_name = Translator.__OPUS_TATOEBA.format(Translator.get_iso_639_alpha_2(src_lang), des_lang) + mt_model_name = Translator.__OPUS_TATOEBA.format(Translator.get_iso_639_alpha_2(src_lang), tgt_lang) self.__download_mt_model(mt_model_name) return except OSError: Translator.__log_and_back_off(mt_model_name) try: - mt_model_name = Translator.__OPUS_TATOEBA.format(src_lang, des_lang) + mt_model_name = Translator.__OPUS_TATOEBA.format(src_lang, tgt_lang) self.__download_mt_model(mt_model_name) return except OSError: Translator.__LOGGER.debug("Cannot download the MT model %s" % mt_model_name) - message = 'Cannot find the MT model for source language "{}" and destination language "{}"'.format(src_lang, des_lang) + message = 'Cannot find the MT model for source language "{}" and destination language "{}"'.format(src_lang, tgt_lang) Translator.__LOGGER.error(message) raise NotImplementedError(message) - def __download_mt_model(self, mt_model_name): + def __download_mt_model(self, mt_model_name: str) -> None: Translator.__LOGGER.debug("Trying to download the MT model %s" % mt_model_name) self.tokenizer = MarianTokenizer.from_pretrained(mt_model_name) self.lang_model = MarianMTModel.from_pretrained(mt_model_name) 
Translator.__LOGGER.debug("MT model %s downloaded" % mt_model_name) @staticmethod - def __log_and_back_off(mt_model_name): + def __log_and_back_off(mt_model_name: str): Translator.__LOGGER.debug("Cannot download the MT model %s" % mt_model_name) time.sleep(1) @staticmethod - def __batch(iterable, size=1): - total = len(iterable) + def __batch(data: List, size: int = 1) -> Generator: + total = len(data) for ndx in range(0, total, size): - yield iterable[ndx:min(ndx + size, total)] + yield data[ndx:min(ndx + size, total)] diff --git a/tests/integration/feature/subaligner.feature b/tests/integration/feature/subaligner.feature index 75def31..e764790 100644 --- a/tests/integration/feature/subaligner.feature +++ b/tests/integration/feature/subaligner.feature @@ -243,3 +243,13 @@ Feature: Subaligner CLI | subaligner_1pass | | subaligner_2pass | | subaligner | + + @languages + Scenario Outline: Test language codes display + When I run the command with languages + Then supported language codes are displayed + Examples: + | aligner | + | subaligner_1pass | + | subaligner_2pass | + | subaligner | diff --git a/tests/integration/feature/subaligner_convert.feature b/tests/integration/feature/subaligner_convert.feature index 4df2e30..3a779ed 100644 --- a/tests/integration/feature/subaligner_convert.feature +++ b/tests/integration/feature/subaligner_convert.feature @@ -63,4 +63,14 @@ Feature: Subaligner CLI Examples: | subtitle-in | language_pair | subtitle-out | | "test.srt" | eng,zho | "test_zh_srt.ttml" | + | "test.srt" | eng,spa | "test_es_srt.ttml" | + | "test.srt" | eng,hin | "test_hi_srt.ttml" | + | "test.srt" | eng,fra | "test_fr_srt.ttml" | + | "test.srt" | eng,ara | "test_ar_srt.ttml" | + | "test.srt" | eng,jpn | "test_ja_srt.ttml" | | "test.srt" | eng,rus | "test_ru_srt.ttml" | + | "test.srt" | eng,ind | "test_id_srt.ttml" | + + Scenario: Test language codes display + When I run the subaligner_convert command with languages + Then supported language codes are displayed 
diff --git a/tests/integration/radish/step.py b/tests/integration/radish/step.py index 34f3fb3..0323987 100644 --- a/tests/integration/radish/step.py +++ b/tests/integration/radish/step.py @@ -202,6 +202,20 @@ def expect_help_information(step, aligner): assert "usage: %s " % aligner in step.context.stdout +@when("I run the {aligner:S} command with languages") +def run_subaligner_with_languages(step, aligner): + process = subprocess.Popen([ + os.path.join(PWD, "..", "..", "..", "bin", aligner), + "-lgs"], shell=False, stdout=subprocess.PIPE) + stdout, _ = process.communicate(timeout=WAIT_TIMEOUT_IN_SECONDS) + step.context.stdout = stdout.decode("utf-8") + + +@then("supported language codes are displayed") +def expect_language_codes(step): + assert "eng English" in step.context.stdout + + @then("the dual-stage help information is displayed") def expect_dual_stage_help_information(step): assert "usage: subaligner_2pass" in step.context.stdout diff --git a/tests/subaligner/test_translator.py b/tests/subaligner/test_translator.py index 7bc3821..2eccae4 100644 --- a/tests/subaligner/test_translator.py +++ b/tests/subaligner/test_translator.py @@ -1,6 +1,7 @@ import os import unittest from mock import Mock, patch +from parameterized import parameterized from transformers import MarianMTModel, MarianTokenizer from subaligner.subtitle import Subtitle from subaligner.translator import Translator as Undertest @@ -16,6 +17,7 @@ def setUp(self): def test_get_iso_639_alpha_2(self): self.assertEqual("en", Undertest.get_iso_639_alpha_2("eng")) self.assertEqual("ada", Undertest.get_iso_639_alpha_2("ada")) + self.assertEqual("unk", Undertest.get_iso_639_alpha_2("unk")) @patch("transformers.MarianMTModel.from_pretrained") @patch("transformers.MarianTokenizer.from_pretrained") @@ -33,13 +35,36 @@ def test_translate_subs(self, tokenizer_from_pretrained, model_from_pretrained): self.assertEqual(["translated"] * len(subs), [*map(lambda x: x.text, translated_subs)]) - def 
test_throw_exception_on_getting_iso_639_alpha_2(self): - try: - Undertest.get_iso_639_alpha_2("afa") - except Exception as e: - self.assertTrue(isinstance(e, ValueError)) - else: - self.fail("Should have thrown exception") + @parameterized.expand([ + ["bos", "zls"], + ["cmn", "zho"], + ["gla", "cel"], + ["grc", "grk"], + ["guj", "inc"], + ["ina", "art"], + ["jbo", "art"], + ["kan", "dra"], + ["kir", "trk"], + ["lat", "itc"], + ["lfn", "art"], + ["mya", "sit"], + ["nep", "inc"], + ["ori", "inc"], + ["sin", "inc"], + ["srp", "zls"], + ["tam", "dra"], + ["tat", "trk"], + ["tel", "dra"], + ["yue", "zho"], + ]) + def test_normalise_single(self, original, normalised): + self.assertEqual(normalised, Undertest.normalise_single(original)) + + @parameterized.expand([ + ["eng-jpn", "eng-jap"] + ]) + def test_normalise_pair(self, original, normalised): + self.assertEqual(normalised, "-".join(Undertest.normalise_pair(*original.split("-")))) @patch("transformers.MarianTokenizer.from_pretrained", side_effect=OSError) def test_throw_exception_on_translating_subs(self, mock_tokenizer_from_pretrained): From 35adc90c0565d4c5892be948cbd996ad4c86ff03 Mon Sep 17 00:00:00 2001 From: baxtree Date: Fri, 14 May 2021 18:37:27 +0100 Subject: [PATCH 7/9] update docs --- site/source/index.rst | 10 +++++----- site/source/usage.rst | 7 ++++--- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/site/source/index.rst b/site/source/index.rst index b382633..65578bd 100644 --- a/site/source/index.rst +++ b/site/source/index.rst @@ -11,14 +11,14 @@ Subaligner provides a one-stop solution on automatic subtitle synchronisation an and forced alignments. In essence, aligning subtitles is a dual-stage process with a Bidirectional Long Short-Term Memory network trained upfront. Subaligner helps subtitlers not only in preprocessing raw subtitle materials (outcome from stenographers or STT workflow, etc.) but also in gaining quality control over their work within subtitle post-production. 
This tool -also tolerates errors occurred in live subtitles which sometimes do not completely or correctly represent what people +also tolerates errors that occurred in live subtitles which sometimes do not completely or correctly represent what people actually spoke in the companion audiovisual content. Subligner has been shipped with a command-line interface which helps users to conduct various tasks around subtitle -synchronisation and multilingual translation without writing any code. APIs targeting developers have also been provided. -With existing audiovisual and in-sync subtitle files at hand, users can train their own synchroniser with a single -command and zero setup. A handful of subtitle formats are supported and can be converted from one to another either during -synchronisation or on on-demand. +synchronisation and multilingual translation without writing any code. Application programming interfaces are also provided +to developers wanting to perform those tasks programmatically. Moreover, with existing audiovisual and in-sync subtitle files at +hand, advanced users can train their own synchronisers with a single command and zero setup. A handful of subtitle formats are supported +and can be converted from one to another either during synchronisation and translation or on on-demand. Subligner supports the following subtitle formats: SubRip, TTML, WebVTT, (Advanced) SubStation Alpha, MicroDVD, MPL2, TMP, EBU STL, SAMI, SCC and SBV. The source code can be found on GitHub: `subaligner `_. diff --git a/site/source/usage.rst b/site/source/usage.rst index 54f8021..cc8d489 100644 --- a/site/source/usage.rst +++ b/site/source/usage.rst @@ -4,7 +4,8 @@ Usage Subaligner provides two ways of aligning subtitles: single-stage alignment and dual-stage alignment. The former way has lower latency and shifts all subtitle segments globally. The latter way has higher latency and shifts the -segments individually with an option of stretching each segment. 
+segments individually with an option of stretching each segment. Multilingual translation on subtitles can be achieved +together with the alignment in one go or separately (see in :doc:`Advanced Usage <./advanced_usage.rst>`). Make sure you have got the virtual environment activated upfront. @@ -18,14 +19,14 @@ Make sure you have got the virtual environment activated upfront. (.venv) $ subaligner_2pass -v video.mp4 -s subtitle.srt (.venv) $ subaligner_2pass -v https://example.org/video.mp4 -s https://example.org/subtitle.srt -o subtitle_aligned.srt -**Pass in single-stage or dual-stage as the alignment mode (src,tgt)**:: +**Pass in single-stage or dual-stage as the alignment mode**:: (.venv) $ subaligner -m single -v video.mp4 -s subtitle.srt (.venv) $ subaligner -m single -v https://example.org/video.mp4 -s https://example.org/subtitle.srt -o subtitle_aligned.srt (.venv) $ subaligner -m dual -v video.mp4 -s subtitle.srt (.venv) $ subaligner -m dual -v https://example.org/video.mp4 -s https://example.org/subtitle.srt -o subtitle_aligned.srt -**Translative alignment with the ISO 639-3 language code pair**:: +**Translative alignment with the ISO 639-3 language code pair (src,tgt)**:: (.venv) $ subaligner_1pass --languages (.venv) $ subaligner_1pass -v video.mp4 -s subtitle.srt -t src,tgt From f461e47bf939d415580cc7a3c654e7dd32462d30 Mon Sep 17 00:00:00 2001 From: baxtree Date: Sat, 15 May 2021 11:48:22 +0100 Subject: [PATCH 8/9] update docs --- site/source/conf.py | 1 + site/source/usage.rst | 2 +- subaligner/__main__.py | 2 +- subaligner/subaligner_1pass/__main__.py | 2 +- subaligner/subaligner_2pass/__main__.py | 2 +- subaligner/subaligner_convert/__main__.py | 2 +- subaligner/translator.py | 31 ++++++++++++----------- tests/subaligner/test_translator.py | 11 ++++---- 8 files changed, 28 insertions(+), 25 deletions(-) diff --git a/site/source/conf.py b/site/source/conf.py index 81df1ac..c0757a7 100644 --- a/site/source/conf.py +++ b/site/source/conf.py @@ -42,6 
+42,7 @@ "sphinx.ext.ifconfig", "sphinx.ext.viewcode", "sphinx.ext.napoleon", + "sphinx.ext.autosectionlabel", ] # Add any paths that contain templates here, relative to this directory. diff --git a/site/source/usage.rst b/site/source/usage.rst index cc8d489..f412d23 100644 --- a/site/source/usage.rst +++ b/site/source/usage.rst @@ -5,7 +5,7 @@ Usage Subaligner provides two ways of aligning subtitles: single-stage alignment and dual-stage alignment. The former way has lower latency and shifts all subtitle segments globally. The latter way has higher latency and shifts the segments individually with an option of stretching each segment. Multilingual translation on subtitles can be achieved -together with the alignment in one go or separately (see in :doc:`Advanced Usage <./advanced_usage.rst>`). +together with the alignment in one go or separately (see :ref:`Advanced Usage`). Make sure you have got the virtual environment activated upfront. diff --git a/subaligner/__main__.py b/subaligner/__main__.py index b0c17ae..0a6fff3 100755 --- a/subaligner/__main__.py +++ b/subaligner/__main__.py @@ -221,7 +221,7 @@ def main(): if FLAGS.translate is not None: source, target = FLAGS.translate.split(",") translator = Translator(source, target) - aligned_subs = translator.translate_subs(aligned_subs) + aligned_subs = translator.translate(aligned_subs) Subtitle.export_subtitle(local_subtitle_path, aligned_subs, aligned_subtitle_path, frame_rate, "utf-8") else: Subtitle.export_subtitle(local_subtitle_path, aligned_subs, aligned_subtitle_path, frame_rate) diff --git a/subaligner/subaligner_1pass/__main__.py b/subaligner/subaligner_1pass/__main__.py index 6f9cde7..592e042 100755 --- a/subaligner/subaligner_1pass/__main__.py +++ b/subaligner/subaligner_1pass/__main__.py @@ -168,7 +168,7 @@ def main(): if FLAGS.translate is not None: source, target = FLAGS.translate.split(",") translator = Translator(source, target) - subs = translator.translate_subs(subs) + subs =
translator.translate(subs) Subtitle.export_subtitle(local_subtitle_path, subs, aligned_subtitle_path, frame_rate, "utf-8") else: Subtitle.export_subtitle(local_subtitle_path, subs, aligned_subtitle_path, frame_rate) diff --git a/subaligner/subaligner_2pass/__main__.py b/subaligner/subaligner_2pass/__main__.py index 5ae6873..c199937 100755 --- a/subaligner/subaligner_2pass/__main__.py +++ b/subaligner/subaligner_2pass/__main__.py @@ -201,7 +201,7 @@ def main(): if FLAGS.translate is not None: source, target = FLAGS.translate.split(",") translator = Translator(source, target) - subs_list = translator.translate_subs(subs) + subs_list = translator.translate(subs) Subtitle.export_subtitle(local_subtitle_path, subs_list, aligned_subtitle_path, frame_rate, "utf-8") else: Subtitle.export_subtitle(local_subtitle_path, subs_list, aligned_subtitle_path, frame_rate) diff --git a/subaligner/subaligner_convert/__main__.py b/subaligner/subaligner_convert/__main__.py index 9a23953..3ff48b7 100755 --- a/subaligner/subaligner_convert/__main__.py +++ b/subaligner/subaligner_convert/__main__.py @@ -116,7 +116,7 @@ def main(): if FLAGS.translate is not None: source, target = FLAGS.translate.split(",") translator = Translator(source, target) - subs_list = translator.translate_subs(subtitle.subs) + subs_list = translator.translate(subtitle.subs) Subtitle.export_subtitle(local_subtitle_path, subs_list, FLAGS.output_subtitle_path, FLAGS.frame_rate, "utf-8") Subtitle.save_subs_as_target_format(subs_list, local_subtitle_path, FLAGS.output_subtitle_path, FLAGS.frame_rate, "utf-8") else: diff --git a/subaligner/translator.py b/subaligner/translator.py index 459fd24..645101f 100644 --- a/subaligner/translator.py +++ b/subaligner/translator.py @@ -42,27 +42,28 @@ class Translator(Singleton): "yue": "zho" } __LANGUAGE_PAIR_MAPPER = { - "eng-jpn": "eng-jap" + "eng-jpn": "eng-jap", + "jpn-eng": "jap-eng" } - def __init__(self, source_language, target_language) -> None: + def __init__(self, 
src_language, tgt_language) -> None: """Initialiser for the subtitle translation. Arguments: - source_language {string} -- The source language code from ISO 639-3. - target_language {string} -- The target language code from ISO 639-3. + src_language {string} -- The source language code derived from ISO 639-3. + tgt_language {string} -- The target language code derived from ISO 639-3. Raises: NotImplementedError -- Thrown when the model of the specified language pair is not found. """ - self.__initialise_model(source_language, target_language) + self.__initialise_model(src_language, tgt_language) @staticmethod def get_iso_639_alpha_2(language_code: str) -> str: - """Get the alpha 2 language code from a alpha 3 one. + """Find the alpha 2 language code based on an alpha 3 one. Arguments: - language_code {string} -- A language code from ISO 639-3. + language_code {string} -- An alpha 3 language code derived from ISO 639-3. Returns: string -- The alpha 2 language code if exists otherwise the alpha 3 one. @@ -84,7 +85,7 @@ def normalise_single(language_code: str) -> str: """Normalise a single language code. Arguments: - language_code {string} -- A language code from ISO 639-3. + language_code {string} -- A language code derived from ISO 639-3. Returns: string -- The language code understood by the language model. @@ -93,23 +94,23 @@ def normalise_single(language_code: str) -> str: return Translator.__LANGUAGE_CODE_MAPPER[language_code] if language_code in Translator.__LANGUAGE_CODE_MAPPER else language_code @staticmethod - def normalise_pair(source_language: str, target_language: str) -> List[str]: + def normalise_pair(src_language: str, tgt_language: str) -> List[str]: """Normalise a pair of language codes. Arguments: - source_language {string} -- The source language code from ISO 639-3. - target_language {string} -- The target language code from ISO 639-3. + src_language {string} -- The source language code derived from ISO 639-3. 
+ tgt_language {string} -- The target language code derived from ISO 639-3. Returns: list -- The language code pair understood by the language model. """ - if "{}-{}".format(source_language, target_language) in Translator.__LANGUAGE_PAIR_MAPPER: - return Translator.__LANGUAGE_PAIR_MAPPER["{}-{}".format(source_language, target_language)].split("-") + if "{}-{}".format(src_language, tgt_language) in Translator.__LANGUAGE_PAIR_MAPPER: + return Translator.__LANGUAGE_PAIR_MAPPER["{}-{}".format(src_language, tgt_language)].split("-") else: - return [source_language, target_language] + return [src_language, tgt_language] - def translate_subs(self, subs: List[SubRipItem]) -> List[SubRipItem]: + def translate(self, subs: List[SubRipItem]) -> List[SubRipItem]: """Translate a list of subtitle cues. Arguments: diff --git a/tests/subaligner/test_translator.py b/tests/subaligner/test_translator.py index 2eccae4..0afabf0 100644 --- a/tests/subaligner/test_translator.py +++ b/tests/subaligner/test_translator.py @@ -17,11 +17,11 @@ def setUp(self): def test_get_iso_639_alpha_2(self): self.assertEqual("en", Undertest.get_iso_639_alpha_2("eng")) self.assertEqual("ada", Undertest.get_iso_639_alpha_2("ada")) - self.assertEqual("unk", Undertest.get_iso_639_alpha_2("unk")) + self.assertEqual("xyz", Undertest.get_iso_639_alpha_2("xyz")) @patch("transformers.MarianMTModel.from_pretrained") @patch("transformers.MarianTokenizer.from_pretrained") - def test_translate_subs(self, tokenizer_from_pretrained, model_from_pretrained): + def test_translate(self, tokenizer_from_pretrained, model_from_pretrained): subs = Subtitle.load(self.srt_file_path).subs mock_tokenizer = Mock() mock_tokenizer.return_value = {"input_ids": None, "attention_mask": None} @@ -31,7 +31,7 @@ def test_translate_subs(self, tokenizer_from_pretrained, model_from_pretrained): tokenizer_from_pretrained.return_value = mock_tokenizer model_from_pretrained.return_value = mock_model - translated_subs = Undertest("eng", 
"zho").translate_subs(subs) + translated_subs = Undertest("eng", "zho").translate(subs) self.assertEqual(["translated"] * len(subs), [*map(lambda x: x.text, translated_subs)]) @@ -61,7 +61,8 @@ def test_normalise_single(self, original, normalised): self.assertEqual(normalised, Undertest.normalise_single(original)) @parameterized.expand([ - ["eng-jpn", "eng-jap"] + ["eng-jpn", "eng-jap"], + ["jpn-eng", "jap-eng"], ]) def test_normalise_pair(self, original, normalised): self.assertEqual(normalised, "-".join(Undertest.normalise_pair(*original.split("-")))) @@ -70,7 +71,7 @@ def test_normalise_pair(self, original, normalised): def test_throw_exception_on_translating_subs(self, mock_tokenizer_from_pretrained): subs = Subtitle.load(self.srt_file_path).subs try: - Undertest("eng", "aar").translate_subs(subs) + Undertest("eng", "aar").translate(subs) except Exception as e: self.assertTrue(mock_tokenizer_from_pretrained.called) self.assertTrue(isinstance(e, NotImplementedError)) From a162b87d3bc300d6fb6fc1d3af854895fbca2d89 Mon Sep 17 00:00:00 2001 From: baxtree Date: Sat, 15 May 2021 12:35:48 +0100 Subject: [PATCH 9/9] pin down the version for six --- requirements-app.txt | 1 + requirements.txt | 1 + 2 files changed, 2 insertions(+) diff --git a/requirements-app.txt b/requirements-app.txt index 69e9294..79efd9c 100644 --- a/requirements-app.txt +++ b/requirements-app.txt @@ -66,6 +66,7 @@ scipy~=1.5.4 scikit-learn>=0.19.1 sentencepiece~=0.1.95 setuptools>=41.0.0 +six~=1.15.0 tblib==1.3.2 tensorflow>=1.15.5,<2.5 termcolor==1.1.0 diff --git a/requirements.txt b/requirements.txt index 69e9294..79efd9c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -66,6 +66,7 @@ scipy~=1.5.4 scikit-learn>=0.19.1 sentencepiece~=0.1.95 setuptools>=41.0.0 +six~=1.15.0 tblib==1.3.2 tensorflow>=1.15.5,<2.5 termcolor==1.1.0