diff --git a/README.md b/README.md index bdc156e..966e8d0 100644 --- a/README.md +++ b/README.md @@ -83,6 +83,17 @@ $ subaligner -m single -v https://example.com/video.mp4 -s https://example.com/s $ subaligner -m dual -v https://example.com/video.mp4 -s https://example.com/subtitle.srt -o subtitle_aligned.srt ``` ``` +# Translative alignment with the ISO 639-3 language code pair (src,tgt) + +$ subaligner_1pass --languages +$ subaligner_1pass -v video.mp4 -s subtitle.srt -t src,tgt +$ subaligner_2pass --languages +$ subaligner_2pass -v video.mp4 -s subtitle.srt -t src,tgt +$ subaligner --languages +$ subaligner -m single -v video.mp4 -s subtitle.srt -t src,tgt +$ subaligner -m dual -v video.mp4 -s subtitle.srt -t src,tgt +``` +``` # Run alignments with pipx $ pipx run subaligner -m single -v video.mp4 -s subtitle.srt @@ -104,7 +115,8 @@ $ docker run -v `pwd`:`pwd` -w `pwd` -it baxtree/subaligner subaligner_2pass -v $ docker run -it baxtree/subaligner subaligner_1pass -v https://example.com/video.mp4 -s https://example.com/subtitle.srt -o subtitle_aligned.srt $ docker run -it baxtree/subaligner subaligner_2pass -v https://example.com/video.mp4 -s https://example.com/subtitle.srt -o subtitle_aligned.srt ``` -The aligned subtitle will be saved at `subtitle_aligned.srt`. For details on CLI, run `subaligner_1pass --help`, `subaligner_2pass --help` or `subaligner --help`. +The aligned subtitle will be saved at `subtitle_aligned.srt`. For details on CLI, run `subaligner_1pass -h`, `subaligner_2pass -h` or `subaligner -h`. +Additional utilities can be used after consulting `subaligner_convert -h`, `subaligner_train -h` and `subaligner_tune -h`. ![](figures/screencast.gif) ## Supported Formats diff --git a/requirements-app.txt b/requirements-app.txt index d55b217..79efd9c 100644 --- a/requirements-app.txt +++ b/requirements-app.txt @@ -46,6 +46,7 @@ psutil==5.6.7 py==1.10.0 pyasn1==0.4.8 pyasn1-modules==0.2.7 +pycountry~=20.7.3 pydot==1.2.4 pydot-ng==1.0.0 pydotplus==2.0.2 @@ -63,13 +64,17 @@ requests-oauthlib==1.3.0 rsa==4.7 scipy~=1.5.4 scikit-learn>=0.19.1 +sentencepiece~=0.1.95 setuptools>=41.0.0 +six~=1.15.0 tblib==1.3.2 tensorflow>=1.15.5,<2.5 termcolor==1.1.0 toml==0.10.0 toolz==0.9.0 +torch~=1.8.1 tornado==5.1.0 +transformers~=4.5.1 urllib3==1.25.9 Werkzeug>=0.15.3 zict==0.1.3 diff --git a/requirements.txt b/requirements.txt index d55b217..79efd9c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -46,6 +46,7 @@ psutil==5.6.7 py==1.10.0 pyasn1==0.4.8 pyasn1-modules==0.2.7 +pycountry~=20.7.3 pydot==1.2.4 pydot-ng==1.0.0 pydotplus==2.0.2 @@ -63,13 +64,17 @@ requests-oauthlib==1.3.0 rsa==4.7 scipy~=1.5.4 scikit-learn>=0.19.1 +sentencepiece~=0.1.95 setuptools>=41.0.0 +six~=1.15.0 tblib==1.3.2 tensorflow>=1.15.5,<2.5 termcolor==1.1.0 toml==0.10.0 toolz==0.9.0 +torch~=1.8.1 tornado==5.1.0 +transformers~=4.5.1 urllib3==1.25.9 Werkzeug>=0.15.3 zict==0.1.3 diff --git a/site/source/acknowledgement.rst b/site/source/acknowledgement.rst index 895c00b..9fe08f6 100644 --- a/site/source/acknowledgement.rst +++ b/site/source/acknowledgement.rst @@ -12,3 +12,4 @@ Acknowledgement - `pysrt `_ - `pysubs2 `_ - `aeneas `_ + - `transformers `_ diff --git a/site/source/advanced_usage.rst b/site/source/advanced_usage.rst index 408d469..e4f95a4 100644 --- a/site/source/advanced_usage.rst +++ b/site/source/advanced_usage.rst @@ -39,6 +39,16 @@ Embeddings extracted from your media files can be reused with `-utd` or `--use_t model of another kind (instead of re-using the same model on training resumption) without 
going through the feature embedding process, which could take quite long to finish for a large dataset so as to be unnecessary if there is no change on it.
 
+**Ignore sound effects**::
+
+    (.venv) $ subaligner_train -vd av_directory -sd subtitle_directory -tod training_output_directory --sound_effect_start_marker "(" --sound_effect_end_marker ")"
+
+It is not uncommon for subtitles to contain sound effects (e.g., "BARK", "(applause)" and "[MUSIC]"). With a limited training
+data set or an insufficiently sophisticated network architecture, the model usually cannot capture all the sound effects very well.
+To filter out sound effect subtitles and only preserve the vocal ones, you can pass in `-sesm` or `--sound_effect_start_marker` and/or
+`-seem` or `--sound_effect_end_marker` with strings which subaligner will use to find sound effects and ignore them during training.
+For example, the above command will treat any strings starting with "(" and ending with ")" as sound effects.
+
 **Run alignments after training**::
 
     (.venv) $ subaligner -m single -v video.mp4 -s subtitle.srt -tod training_output_directory
@@ -98,6 +108,16 @@ flags to customise the configuration on tuning:
 
     (.venv) $ subaligner_convert -i subtitle.srt -o subtitle.vtt
 
+**Convert the subtitle to another format and translate**::
+
+    (.venv) $ subaligner_convert --languages
+    (.venv) $ subaligner_convert -i subtitle_en.srt -o subtitle_zh.vtt -t eng,zho
+
+**Translate the subtitle without changing the format**::
+
+    (.venv) $ subaligner_convert --languages
+    (.venv) $ subaligner_convert -i subtitle_en.srt -o subtitle_es.srt -t eng,spa
+
 For output subtitles like MicroDVD relying on the frame rate, its value needs to be passed in with `-fr` or `--frame_rate`.
 
 **On Windows**::
diff --git a/site/source/conf.py b/site/source/conf.py
index 550f347..c0757a7 100644
--- a/site/source/conf.py
+++ b/site/source/conf.py
@@ -42,6 +42,7 @@
     "sphinx.ext.ifconfig",
     "sphinx.ext.viewcode",
     "sphinx.ext.napoleon",
+    "sphinx.ext.autosectionlabel",
 ]
 
 # Add any paths that contain templates here, relative to this directory.
@@ -80,7 +81,9 @@
     "pysubs2",
     "cchardet",
     "captionstransformer",
-    "bs4"
+    "bs4",
+    "transformers",
+    "pycountry"
 ]
 
 def setup(app):
diff --git a/site/source/index.rst b/site/source/index.rst
index 3c05244..65578bd 100644
--- a/site/source/index.rst
+++ b/site/source/index.rst
@@ -7,17 +7,18 @@ Welcome to Subaligner's documentation!
 ======================================
 
 Given an out-of-sync subtitle file along with a piece of audiovisual content carrying speeches described by it,
-Subaligner provides a one-stop solution on automatic subtitle synchronisation with a pretrained deep neural network and forced
-alignments. In essence, aligning subtitles is a dual-stage process with a Bidirectional Long Short-Term Memory network trained
+Subaligner provides a one-stop solution for automatic subtitle synchronisation and translation with pretrained deep neural networks
+and forced alignments. In essence, aligning subtitles is a dual-stage process with a Bidirectional Long Short-Term Memory network trained
 upfront. Subaligner helps subtitlers not only in preprocessing raw subtitle materials (outcome from stenographers or STT workflow,
 etc.) but also in gaining quality control over their work within subtitle post-production. This tool
-also tolerates errors occurred in live subtitles which sometimes do not completely or correctly represent what people
+also tolerates errors that occur in live subtitles, which sometimes do not completely or correctly represent what people
 actually spoke in the companion audiovisual content.
 
-Subligner has been shifted with a command-line interface which helps users to conduct various tasks around subtitle synchronisation
-without writing any code as well as APIs targeting developers. With existing audiovisual and in-sync subtitle files at
-hand, users can train their own synchroniser with a single command and zero setup. A handful of subtitle formats are supported
-and can be converted from one to another either during synchronisation or on on-demand.
+Subaligner has been shipped with a command-line interface which helps users to conduct various tasks around subtitle
+synchronisation and multilingual translation without writing any code. Application programming interfaces are also provided
+to developers wanting to perform those tasks programmatically. Moreover, with existing audiovisual and in-sync subtitle files at
+hand, advanced users can train their own synchronisers with a single command and zero setup. A handful of subtitle formats are supported
+and can be converted from one to another either during synchronisation and translation or on demand.
 
 Subligner supports the following subtitle formats: SubRip, TTML, WebVTT, (Advanced) SubStation Alpha, MicroDVD, MPL2,
 TMP, EBU STL, SAMI, SCC and SBV. The source code can be found on GitHub: `subaligner <https://github.com/baxtree/subaligner>`_.
diff --git a/site/source/usage.rst b/site/source/usage.rst
index 2f6efdb..f412d23 100644
--- a/site/source/usage.rst
+++ b/site/source/usage.rst
@@ -4,16 +4,17 @@ Usage
 
 Subaligner provides two ways of aligning subtitles: single-stage alignment and dual-stage alignment. The former way has
 lower latency and shifts all subtitle segments globally. The latter way has higher latency and shifts the
-segments individually with an option of stretching each segment.
+segments individually with an option of stretching each segment. Multilingual translation on subtitles can be achieved
+together with the alignment in one go or separately (see :ref:`Advanced Usage`).
 
 Make sure you have got the virtual environment activated upfront.
 
-**Single-stage alignment**::
+**Single-stage alignment (high-level shift with lower latency)**::
 
     (.venv) $ subaligner_1pass -v video.mp4 -s subtitle.srt
    (.venv) $ subaligner_1pass -v https://example.org/video.mp4 -s https://example.org/subtitle.srt -o subtitle_aligned.srt
 
-**Dual-stage alignment**::
+**Dual-stage alignment (low-level shift with higher latency)**::
 
     (.venv) $ subaligner_2pass -v video.mp4 -s subtitle.srt
    (.venv) $ subaligner_2pass -v https://example.org/video.mp4 -s https://example.org/subtitle.srt -o subtitle_aligned.srt
@@ -25,6 +26,16 @@
(.venv) $ subaligner -m dual -v video.mp4 -s subtitle.srt (.venv) $ subaligner -m dual -v https://example.org/video.mp4 -s https://example.org/subtitle.srt -o subtitle_aligned.srt +**Translative alignment with the ISO 639-3 language code pair (src,tgt)**:: + + (.venv) $ subaligner_1pass --languages + (.venv) $ subaligner_1pass -v video.mp4 -s subtitle.srt -t src,tgt + (.venv) $ subaligner_2pass --languages + (.venv) $ subaligner_2pass -v video.mp4 -s subtitle.srt -t src,tgt + (.venv) $ subaligner --languages + (.venv) $ subaligner -m single -v video.mp4 -s subtitle.srt -t src,tgt + (.venv) $ subaligner -m dual -v video.mp4 -s subtitle.srt -t src,tgt + **Run alignments with the docker image**:: $ docker pull baxtree/subaligner diff --git a/subaligner/__main__.py b/subaligner/__main__.py index ccf51ed..0a6fff3 100755 --- a/subaligner/__main__.py +++ b/subaligner/__main__.py @@ -1,8 +1,8 @@ #!/usr/bin/env python """ -usage: subaligner [-h] -m {single,dual} -v VIDEO_PATH -s SUBTITLE_PATH [-l MAX_LOGLOSS] [-so] +usage: subaligner [-h] [-m {single,dual}] [-v VIDEO_PATH] [-s SUBTITLE_PATH] [-l MAX_LOGLOSS] [-so] [-sil {afr,amh,ara,arg,asm,aze,ben,bos,bul,cat,ces,cmn,cym,dan,deu,ell,eng,epo,est,eus,fas,fin,fra,gla,gle,glg,grc,grn,guj,heb,hin,hrv,hun,hye,ina,ind,isl,ita,jbo,jpn,kal,kan,kat,kir,kor,kur,lat,lav,lfn,lit,mal,mar,mkd,mlt,msa,mya,nah,nep,nld,nor,ori,orm,pan,pap,pol,por,ron,rus,sin,slk,slv,spa,sqi,srp,swa,swe,tam,tat,tel,tha,tsn,tur,ukr,urd,vie,yue,zho}] - [-fos] [-tod TRAINING_OUTPUT_DIRECTORY] [-o OUTPUT] [-d] [-q] [-ver] + [-fos] [-tod TRAINING_OUTPUT_DIRECTORY] [-o OUTPUT] [-t TRANSLATE] [-lan] [-d] [-q] [-ver] Subaligner command line interface @@ -12,13 +12,16 @@ Max global log loss for alignment -so, --stretch_off Switch off stretch on non-English speech and subtitles) -sil {afr,amh,ara,arg,asm,aze,ben,bos,bul,cat,ces,cmn,cym,dan,deu,ell,eng,epo,est,eus,fas,fin,fra,gla,gle,glg,grc,grn,guj,heb,hin,hrv,hun,hye,ina,ind,isl,ita,jbo,jpn,kal,kan,kat,kir,kor,kur,lat,lav,lfn,lit,mal,mar,mkd,mlt,msa,mya,nah,nep,nld,nor,ori,orm,pan,pap,pol,por,ron,rus,sin,slk,slv,spa,sqi,srp,swa,swe,tam,tat,tel,tha,tsn,tur,ukr,urd,vie,yue,zho}, --stretch_in_language {afr,amh,ara,arg,asm,aze,ben,bos,bul,cat,ces,cmn,cym,dan,deu,ell,eng,epo,est,eus,fas,fin,fra,gla,gle,glg,grc,grn,guj,heb,hin,hrv,hun,hye,ina,ind,isl,ita,jbo,jpn,kal,kan,kat,kir,kor,kur,lat,lav,lfn,lit,mal,mar,mkd,mlt,msa,mya,nah,nep,nld,nor,ori,orm,pan,pap,pol,por,ron,rus,sin,slk,slv,spa,sqi,srp,swa,swe,tam,tat,tel,tha,tsn,tur,ukr,urd,vie,yue,zho} - Stretch the subtitle with the supported ISO 639-2 language code [https://en.wikipedia.org/wiki/List_of_ISO_639-2_codes]. + Stretch the subtitle with the supported ISO 639-3 language code [https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes]. 
NB: This will be ignored if either -so or --stretch_off is present -fos, --exit_segfail Exit on any segment alignment failures -tod TRAINING_OUTPUT_DIRECTORY, --training_output_directory TRAINING_OUTPUT_DIRECTORY Path to the output directory containing training results -o OUTPUT, --output OUTPUT Path to the output subtitle file + -t TRANSLATE, --translate TRANSLATE + Source and target ISO 639-3 language codes separated by a comma (e.g., eng,zho) + -lgs, --languages Print out language codes used for stretch and translation -d, --debug Print out debugging information -q, --quiet Switch off logging information -ver, --version show program's version number and exit @@ -29,7 +32,7 @@ -v VIDEO_PATH, --video_path VIDEO_PATH File path or URL to the video file -s SUBTITLE_PATH, --subtitle_path SUBTITLE_PATH - File path or URL to the subtitle file (Extensions of supported subtitles: .vtt, .dfxp, .ass, .xml, .tmp, .ssa, .srt, .txt, .sami, .sub, .ttml, .smi, .stl, .scc, .sbv and .ytt) or selector for the embedded subtitle (e.g., embedded:page_num=888 or embedded:stream_index=0) + File path or URL to the subtitle file (Extensions of supported subtitles: .ttml, .vtt, .tmp, .dfxp, .xml, .sami, .scc, .sub, .txt, .stl, .ssa, .ytt, .srt, .sbv, .ass, .smi) or selector for the embedded subtitle (e.g., embedded:page_num=888 or embedded:stream_index=0) """ import argparse @@ -59,7 +62,6 @@ def main(): default="", choices=["single", "dual"], help="Alignment mode: either single or dual", - required=True, ) required_args.add_argument( "-v", @@ -67,7 +69,6 @@ def main(): type=str, default="", help="File path or URL to the video file", - required=True, ) from subaligner.subtitle import Subtitle required_args.add_argument( @@ -76,7 +77,6 @@ def main(): type=str, default="", help="File path or URL to the subtitle file (Extensions of supported subtitles: {}) or selector for the embedded subtitle (e.g., embedded:page_num=888 or embedded:stream_index=0)".format(", ".join(Subtitle.subtitle_extensions())), - required=True, ) parser.add_argument( "-l", @@ -98,7 +98,7 @@ def main(): type=str, choices=Language.ALLOWED_VALUES, default=Language.ENG, - help="Stretch the subtitle with the supported ISO 639-2 language code [https://en.wikipedia.org/wiki/List_of_ISO_639-2_codes].\nNB: This will be ignored if either -so or --stretch_off is present", + help="Stretch the subtitle with the supported ISO 639-3 language code [https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes].\nNB: This will be ignored if either -so or --stretch_off is present", ) parser.add_argument( "-fos", @@ -120,6 +120,14 @@ def main(): default="", help="Path to the output subtitle file", ) + parser.add_argument( + "-t", + "--translate", + type=str, + help="Source and target ISO 639-3 language codes separated by a comma (e.g., eng,zho)", + ) + parser.add_argument("-lgs", "--languages", action="store_true", + help="Print out language codes used for stretch and translation") parser.add_argument("-d", "--debug", action="store_true", help="Print out debugging information") parser.add_argument("-q", "--quiet", action="store_true", @@ -127,6 +135,10 @@ def main(): parser.add_argument("-ver", "--version", action="version", version=__version__) FLAGS, unparsed = parser.parse_known_args() + if FLAGS.languages: + for line in Language.CODE_TO_HUMAN_LIST: + print(line.replace("\t", " ")) + sys.exit(0) if FLAGS.mode == "": print("--mode was not passed in") sys.exit(21) @@ -153,6 +165,7 @@ def main(): Logger.VERBOSE = FLAGS.debug Logger.QUIET = FLAGS.quiet from 
subaligner.predictor import Predictor + from subaligner.translator import Translator from subaligner.exception import UnsupportedFormatException from subaligner.exception import TerminalException from subaligner.utils import Utils @@ -201,9 +214,17 @@ def main(): stretch_in_lang=stretch_in_lang, exit_segfail=exit_segfail, ) + aligned_subtitle_path = "_aligned.".join( FLAGS.subtitle_path.rsplit(".", 1)).replace(".stl", ".srt") if FLAGS.output == "" else FLAGS.output - Subtitle.export_subtitle(local_subtitle_path, aligned_subs, aligned_subtitle_path, frame_rate) + + if FLAGS.translate is not None: + source, target = FLAGS.translate.split(",") + translator = Translator(source, target) + aligned_subs = translator.translate(aligned_subs) + Subtitle.export_subtitle(local_subtitle_path, aligned_subs, aligned_subtitle_path, frame_rate, "utf-8") + else: + Subtitle.export_subtitle(local_subtitle_path, aligned_subs, aligned_subtitle_path, frame_rate) log_loss = predictor.get_log_loss(voice_probabilities, aligned_subs) if log_loss is None or log_loss > FLAGS.max_logloss: diff --git a/subaligner/_version.py b/subaligner/_version.py index ad394de..685b1c8 100644 --- a/subaligner/_version.py +++ b/subaligner/_version.py @@ -1,2 +1,2 @@ """The semver for the current release.""" -__version__ = "0.1.3" +__version__ = "0.1.4" diff --git a/subaligner/embedder.py b/subaligner/embedder.py index ada3efa..2793788 100644 --- a/subaligner/embedder.py +++ b/subaligner/embedder.py @@ -22,7 +22,7 @@ def __init__( hop_len: int = 512, step_sample: float = 0.04, len_sample: float = 0.075, - ): + ) -> None: """Feature embedder initialiser. Keyword Arguments: @@ -235,7 +235,7 @@ def position_to_time_str(self, position: int) -> str: def extract_data_and_label_from_audio( self, audio_file_path: str, - subtitle_file_path: str, + subtitle_file_path: Optional[str], subtitles: Optional[SubRipFile] = None, sound_effect_start_marker: Optional[str] = None, sound_effect_end_marker: Optional[str] = None, diff --git a/subaligner/hparam_tuner.py b/subaligner/hparam_tuner.py index 5b6e506..dd779f8 100644 --- a/subaligner/hparam_tuner.py +++ b/subaligner/hparam_tuner.py @@ -29,7 +29,7 @@ def __init__(self, num_of_trials: int = 5, tuning_epochs: int = 5, network_type: str = Network.LSTM, - **kwargs): + **kwargs) -> None: """Hyperparameter tuner initialiser Arguments: diff --git a/subaligner/hyperparameters.py b/subaligner/hyperparameters.py index b51ec37..1ab1dc5 100644 --- a/subaligner/hyperparameters.py +++ b/subaligner/hyperparameters.py @@ -9,7 +9,7 @@ class Hyperparameters(object): OPTIMIZERS = ["adadelta", "adagrad", "adam", "adamax", "ftrl", "nadam", "rmsprop", "sgd"] - def __init__(self): + def __init__(self) -> None: """Hyperparameters initialiser setting default values""" self.__learning_rate = 0.001 @@ -120,7 +120,7 @@ def optimizer(self, value: str) -> None: self.__optimizer = "SGD" @property - def loss(self) -> float: + def loss(self) -> str: return self.__loss @property diff --git a/subaligner/logger.py b/subaligner/logger.py index 352e863..54c7adc 100644 --- a/subaligner/logger.py +++ b/subaligner/logger.py @@ -11,7 +11,7 @@ class Logger(Singleton): VERBOSE = True QUIET = False - def __init__(self, output_log: str = "output.log"): + def __init__(self, output_log: str = "output.log") -> None: self.__loggers: Dict[str, logging.Logger] = {} self.__output_log = output_log diff --git a/subaligner/media_helper.py b/subaligner/media_helper.py index 11ad93c..a36d9df 100644 --- a/subaligner/media_helper.py +++ 
b/subaligner/media_helper.py @@ -6,6 +6,7 @@ import shutil import atexit import signal +import shlex from typing import Optional, Tuple, List from copy import deepcopy @@ -71,16 +72,16 @@ def extract_audio(video_file_path, decompress: bool = False, freq: int = 16000) ) command = ( - "{0} -y -xerror -i {1} -ac 2 -ar {2} -vn {3}".format( + "{0} -y -xerror -i '{1}' -ac 2 -ar {2} -vn '{3}'".format( MediaHelper.FFMPEG_BIN, video_file_path, freq, audio_file_path ) if decompress - else "{0} -y -xerror -i {1} -vn -acodec copy {2}".format( + else "{0} -y -xerror -i '{1}' -vn -acodec copy '{2}'".format( MediaHelper.FFMPEG_BIN, video_file_path, audio_file_path ) ) with subprocess.Popen( - command.split(), + shlex.split(command), shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE, @@ -184,16 +185,16 @@ def extract_audio_from_start_to_end(audio_file_path: str, start: str, end: Optio if end is not None: duration = MediaHelper.get_duration_in_seconds(start, end) - command = "{0} -y -xerror -i {1} -ss {2} -t {3} -acodec copy {4}".format( + command = "{0} -y -xerror -i '{1}' -ss {2} -t {3} -acodec copy '{4}'".format( MediaHelper.FFMPEG_BIN, audio_file_path, start, duration, segment_path ) else: - command = "{0} -y -xerror -i {1} -ss {2} -acodec copy {3}".format( + command = "{0} -y -xerror -i '{1}' -ss {2} -acodec copy '{3}'".format( MediaHelper.FFMPEG_BIN, audio_file_path, start, segment_path ) with subprocess.Popen( - command, - shell=True, + shlex.split(command), + shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True, @@ -319,7 +320,7 @@ def get_frame_rate(file_path: str) -> float: """ with subprocess.Popen( - "{0} -i {1} -t 00:00:10 -f null /dev/null".format(MediaHelper.FFMPEG_BIN, file_path).split(), + shlex.split("{0} -i '{1}' -t 00:00:10 -f null /dev/null".format(MediaHelper.FFMPEG_BIN, file_path)), shell=False, stderr=subprocess.PIPE, close_fds=True, @@ -339,8 +340,7 @@ def get_frame_rate(file_path: str) -> float: try: std_out, std_err = process.communicate(timeout=MediaHelper.__CMD_TIME_OUT) if process.returncode != 0: - MediaHelper.__LOGGER.warning("[{}-{}] Cannot extract the frame rate from video: {}\n{}" - .format(threading.current_thread().name, process.pid, file_path, std_err)) + MediaHelper.__LOGGER.warning("[{}-{}] Cannot extract the frame rate from video: {}\n{}".format(threading.current_thread().name, process.pid, file_path, std_err)) raise NoFrameRateException( "Cannot extract the frame rate from video: {}".format(file_path) ) diff --git a/subaligner/network.py b/subaligner/network.py index 2cdaab1..6aabb79 100644 --- a/subaligner/network.py +++ b/subaligner/network.py @@ -56,7 +56,7 @@ def __init__( hyperparameters: Hyperparameters, model_path: Optional[str] = None, backend: str = "tensorflow" - ): + ) -> None: """ Network object initialiser used by factory methods. Arguments: diff --git a/subaligner/predictor.py b/subaligner/predictor.py index 50d1143..3a9b6f5 100644 --- a/subaligner/predictor.py +++ b/subaligner/predictor.py @@ -18,6 +18,7 @@ from .singleton import Singleton from .subtitle import Subtitle from .hyperparameters import Hyperparameters +from .translator import Translator from .exception import TerminalException from .exception import NoFrameRateException from .logger import Logger @@ -38,7 +39,7 @@ class Predictor(Singleton): __SEGMENT_PREDICTION_TIMEOUT = 60 # Maximum waiting time in seconds when predicting each segment - def __init__(self, **kwargs): + def __init__(self, **kwargs) -> None: """Feature predictor initialiser. 
Keyword Arguments: @@ -574,7 +575,7 @@ def __predict( audio_file_path {string} -- The file path of the original audio (default: {None}). subtitles {list} -- The list of SubRip files (default: {None}). max_shift_secs {float} -- The maximum seconds by which subtitle cues can be shifted (default: {None}). - previous_gap {float} -- The duration betwee the start time of the audio segment and the start time of the subtitle segment. + previous_gap {float} -- The duration between the start time of the audio segment and the start time of the subtitle segment (default: {None}). Returns: tuple -- The shifted subtitles, the audio file path and the voice probabilities of the original audio. @@ -675,7 +676,7 @@ def __predict( self.__feature_embedder.position_to_duration(pos_to_delay) - original_start ) elif subtitles is not None: # for each in second pass - seconds_to_shift = self.__feature_embedder.position_to_duration(pos_to_delay) - previous_gap + seconds_to_shift = self.__feature_embedder.position_to_duration(pos_to_delay) - previous_gap if previous_gap is not None else 0.0 else: if os.path.exists(audio_file_path): os.remove(audio_file_path) diff --git a/subaligner/singleton.py b/subaligner/singleton.py index 266813a..70671ca 100644 --- a/subaligner/singleton.py +++ b/subaligner/singleton.py @@ -6,7 +6,7 @@ class _Singleton(type): # type: ignore _instances: Dict[Any, Any] = {} - def __call__(cls, *args, **kwargs): + def __call__(cls, *args, **kwargs) -> Any: if cls not in cls._instances: cls._instances[cls] = super(_Singleton, cls).__call__( *args, **kwargs diff --git a/subaligner/subaligner_1pass/__main__.py b/subaligner/subaligner_1pass/__main__.py index ebc5197..592e042 100755 --- a/subaligner/subaligner_1pass/__main__.py +++ b/subaligner/subaligner_1pass/__main__.py @@ -1,6 +1,6 @@ #!/usr/bin/env python """ -usage: subaligner_1pass [-h] -v VIDEO_PATH -s SUBTITLE_PATH [-l MAX_LOGLOSS] [-tod TRAINING_OUTPUT_DIRECTORY] [-o OUTPUT] [-d] [-q] [-ver] +usage: subaligner_1pass [-h] [-v VIDEO_PATH] [-s SUBTITLE_PATH] [-l MAX_LOGLOSS] [-tod TRAINING_OUTPUT_DIRECTORY] [-o OUTPUT] [-t TRANSLATE] [-lgs] [-d] [-q] [-ver] Run single-stage alignment @@ -12,6 +12,9 @@ Path to the output directory containing training results -o OUTPUT, --output OUTPUT Path to the output subtitle file + -t TRANSLATE, --translate TRANSLATE + Source and target ISO 639-3 language codes separated by a comma (e.g., eng,zho) + -lgs, --languages Print out language codes used for stretch and translation -d, --debug Print out debugging information -q, --quiet Switch off logging information -ver, --version show program's version number and exit @@ -20,7 +23,7 @@ -v VIDEO_PATH, --video_path VIDEO_PATH File path or URL to the video file -s SUBTITLE_PATH, --subtitle_path SUBTITLE_PATH - File path or URL to the subtitle file (Extensions of supported subtitles: .vtt, .dfxp, .ass, .xml, .tmp, .ssa, .srt, .txt, .sami, .sub, .ttml, .smi, .stl, .scc, .sbv and .ytt) or selector for the embedded subtitle (e.g., embedded:page_num=888 or embedded:stream_index=0) + File path or URL to the subtitle file (Extensions of supported subtitles: .stl, .dfxp, .xml, .vtt, .sbv, .ytt, .scc, .ttml, .smi, .sami, .ssa, .tmp, .txt, .sub, .srt, .ass) or selector for the embedded subtitle (e.g., embedded:page_num=888 or embedded:stream_index=0) """ import argparse @@ -49,7 +52,6 @@ def main(): type=str, default="", help="File path or URL to the video file", - required=True, ) from subaligner.subtitle import Subtitle required_args.add_argument( @@ -58,7 +60,6 @@ def 
main(): type=str, default="", help="File path or URL to the subtitle file (Extensions of supported subtitles: {}) or selector for the embedded subtitle (e.g., embedded:page_num=888 or embedded:stream_index=0)".format(", ".join(Subtitle.subtitle_extensions())), - required=True, ) parser.add_argument( "-l", @@ -81,6 +82,14 @@ def main(): default="", help="Path to the output subtitle file", ) + parser.add_argument( + "-t", + "--translate", + type=str, + help="Source and target ISO 639-3 language codes separated by a comma (e.g., eng,zho)", + ) + parser.add_argument("-lgs", "--languages", action="store_true", + help="Print out language codes used for stretch and translation") parser.add_argument("-d", "--debug", action="store_true", help="Print out debugging information") parser.add_argument("-q", "--quiet", action="store_true", @@ -88,6 +97,11 @@ def main(): parser.add_argument("-ver", "--version", action="version", version=__version__) FLAGS, unparsed = parser.parse_known_args() + from aeneas.language import Language + if FLAGS.languages: + for line in Language.CODE_TO_HUMAN_LIST: + print(line.replace("\t", " ")) + sys.exit(0) if FLAGS.video_path == "": print("--video_path was not passed in") sys.exit(21) @@ -108,6 +122,7 @@ def main(): Logger.VERBOSE = FLAGS.debug Logger.QUIET = FLAGS.quiet from subaligner.predictor import Predictor + from subaligner.translator import Translator from subaligner.exception import UnsupportedFormatException from subaligner.exception import TerminalException from subaligner.utils import Utils @@ -149,7 +164,14 @@ def main(): aligned_subtitle_path = "_aligned.".join( FLAGS.subtitle_path.rsplit(".", 1)).replace(".stl", ".srt") if FLAGS.output == "" else FLAGS.output - Subtitle.export_subtitle(local_subtitle_path, subs, aligned_subtitle_path, frame_rate) + + if FLAGS.translate is not None: + source, target = FLAGS.translate.split(",") + translator = Translator(source, target) + subs = translator.translate(subs) + Subtitle.export_subtitle(local_subtitle_path, subs, aligned_subtitle_path, frame_rate, "utf-8") + else: + Subtitle.export_subtitle(local_subtitle_path, subs, aligned_subtitle_path, frame_rate) log_loss = predictor.get_log_loss(voice_probabilities, subs) if log_loss is None or log_loss > FLAGS.max_logloss: diff --git a/subaligner/subaligner_2pass/__main__.py b/subaligner/subaligner_2pass/__main__.py index f279092..c199937 100755 --- a/subaligner/subaligner_2pass/__main__.py +++ b/subaligner/subaligner_2pass/__main__.py @@ -1,8 +1,8 @@ #!/usr/bin/env python """ -usage: subaligner_2pass [-h] -v VIDEO_PATH -s SUBTITLE_PATH [-l MAX_LOGLOSS] [-so] +usage: subaligner_2pass [-h] [-v VIDEO_PATH] [-s SUBTITLE_PATH] [-l MAX_LOGLOSS] [-so] [-sil {afr,amh,ara,arg,asm,aze,ben,bos,bul,cat,ces,cmn,cym,dan,deu,ell,eng,epo,est,eus,fas,fin,fra,gla,gle,glg,grc,grn,guj,heb,hin,hrv,hun,hye,ina,ind,isl,ita,jbo,jpn,kal,kan,kat,kir,kor,kur,lat,lav,lfn,lit,mal,mar,mkd,mlt,msa,mya,nah,nep,nld,nor,ori,orm,pan,pap,pol,por,ron,rus,sin,slk,slv,spa,sqi,srp,swa,swe,tam,tat,tel,tha,tsn,tur,ukr,urd,vie,yue,zho}] - [-fos] [-tod TRAINING_OUTPUT_DIRECTORY] [-o OUTPUT] [-d] [-q] [-ver] + [-fos] [-tod TRAINING_OUTPUT_DIRECTORY] [-o OUTPUT] [-t TRANSLATE] [-lgs] [-d] [-q] [-ver] Run dual-stage alignment @@ -12,13 +12,16 @@ Max global log loss for alignment -so, --stretch_off Switch off stretch on subtitles for non-English speech -sil 
{afr,amh,ara,arg,asm,aze,ben,bos,bul,cat,ces,cmn,cym,dan,deu,ell,eng,epo,est,eus,fas,fin,fra,gla,gle,glg,grc,grn,guj,heb,hin,hrv,hun,hye,ina,ind,isl,ita,jbo,jpn,kal,kan,kat,kir,kor,kur,lat,lav,lfn,lit,mal,mar,mkd,mlt,msa,mya,nah,nep,nld,nor,ori,orm,pan,pap,pol,por,ron,rus,sin,slk,slv,spa,sqi,srp,swa,swe,tam,tat,tel,tha,tsn,tur,ukr,urd,vie,yue,zho}, --stretch_in_language {afr,amh,ara,arg,asm,aze,ben,bos,bul,cat,ces,cmn,cym,dan,deu,ell,eng,epo,est,eus,fas,fin,fra,gla,gle,glg,grc,grn,guj,heb,hin,hrv,hun,hye,ina,ind,isl,ita,jbo,jpn,kal,kan,kat,kir,kor,kur,lat,lav,lfn,lit,mal,mar,mkd,mlt,msa,mya,nah,nep,nld,nor,ori,orm,pan,pap,pol,por,ron,rus,sin,slk,slv,spa,sqi,srp,swa,swe,tam,tat,tel,tha,tsn,tur,ukr,urd,vie,yue,zho} - Stretch the subtitle with the supported ISO 639-2 language code [https://en.wikipedia.org/wiki/List_of_ISO_639-2_codes]. + Stretch the subtitle with the supported ISO 639-3 language code [https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes]. NB: This will be ignored if either -so or --stretch_off is present -fos, --exit_segfail Exit on any segment alignment failures -tod TRAINING_OUTPUT_DIRECTORY, --training_output_directory TRAINING_OUTPUT_DIRECTORY Path to the output directory containing training results -o OUTPUT, --output OUTPUT Path to the output subtitle file + -t TRANSLATE, --translate TRANSLATE + Source and target ISO 639-3 language codes separated by a comma (e.g., eng,zho) + -lgs, --languages Print out language codes used for stretch and translation -d, --debug Print out debugging information -q, --quiet Switch off logging information -ver, --version show program's version number and exit @@ -27,7 +30,7 @@ -v VIDEO_PATH, --video_path VIDEO_PATH File path or URL to the video file -s SUBTITLE_PATH, --subtitle_path SUBTITLE_PATH - File path or URL to the subtitle file (Extensions of supported subtitles: .vtt, .dfxp, .ass, .xml, .tmp, .ssa, .srt, .txt, .sami, .sub, .ttml, .smi, .stl, .scc, .sbv and .ytt) or selector for the embedded subtitle (e.g., embedded:page_num=888 or embedded:stream_index=0) + File path or URL to the subtitle file (Extensions of supported subtitles: .ass, .sbv, .srt, .vtt, .ttml, .dfxp, .scc, .txt, .tmp, .smi, .ssa, .sami, .xml, .sub, .stl, .ytt) or selector for the embedded subtitle (e.g., embedded:page_num=888 or embedded:stream_index=0) """ import argparse @@ -56,7 +59,6 @@ def main(): type=str, default="", help="File path or URL to the video file", - required=True, ) from subaligner.subtitle import Subtitle required_args.add_argument( @@ -65,7 +67,6 @@ def main(): type=str, default="", help="File path or URL to the subtitle file (Extensions of supported subtitles: {}) or selector for the embedded subtitle (e.g., embedded:page_num=888 or embedded:stream_index=0)".format(", ".join(Subtitle.subtitle_extensions())), - required=True, ) parser.add_argument( "-l", @@ -87,7 +88,7 @@ def main(): type=str, choices=Language.ALLOWED_VALUES, default=Language.ENG, - help="Stretch the subtitle with the supported ISO 639-2 language code [https://en.wikipedia.org/wiki/List_of_ISO_639-2_codes].\nNB: This will be ignored if either -so or --stretch_off is present", + help="Stretch the subtitle with the supported ISO 639-3 language code [https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes].\nNB: This will be ignored if either -so or --stretch_off is present", ) parser.add_argument( "-fos", @@ -109,6 +110,14 @@ def main(): default="", help="Path to the output subtitle file", ) + parser.add_argument( + "-t", + "--translate", + type=str, + help="Source and target 
ISO 639-3 language codes separated by a comma (e.g., eng,zho)",
+    )
+    parser.add_argument("-lgs", "--languages", action="store_true",
+                        help="Print out language codes used for stretch and translation")
     parser.add_argument("-d", "--debug", action="store_true",
                         help="Print out debugging information")
     parser.add_argument("-q", "--quiet", action="store_true",
@@ -116,6 +125,10 @@
     parser.add_argument("-ver", "--version", action="version", version=__version__)
     FLAGS, unparsed = parser.parse_known_args()
 
+    if FLAGS.languages:
+        for line in Language.CODE_TO_HUMAN_LIST:
+            print(line.replace("\t", " "))
+        sys.exit(0)
     if FLAGS.video_path == "":
         print("--video_path was not passed in")
         sys.exit(21)
@@ -139,6 +152,7 @@
     Logger.VERBOSE = FLAGS.debug
     Logger.QUIET = FLAGS.quiet
     from subaligner.predictor import Predictor
+    from subaligner.translator import Translator
     from subaligner.exception import UnsupportedFormatException
     from subaligner.exception import TerminalException
     from subaligner.utils import Utils
@@ -183,7 +197,14 @@
         aligned_subtitle_path = "_aligned.".join(
             FLAGS.subtitle_path.rsplit(".", 1)).replace(".stl", ".srt") if FLAGS.output == "" else FLAGS.output
-        Subtitle.export_subtitle(local_subtitle_path, subs_list, aligned_subtitle_path, frame_rate)
+
+        if FLAGS.translate is not None:
+            source, target = FLAGS.translate.split(",")
+            translator = Translator(source, target)
+            subs_list = translator.translate(subs_list)
+            Subtitle.export_subtitle(local_subtitle_path, subs_list, aligned_subtitle_path, frame_rate, "utf-8")
+        else:
+            Subtitle.export_subtitle(local_subtitle_path, subs_list, aligned_subtitle_path, frame_rate)
 
         log_loss = predictor.get_log_loss(voice_probabilities, subs_list)
         if log_loss is None or log_loss > FLAGS.max_logloss:
diff --git a/subaligner/subaligner_convert/__main__.py b/subaligner/subaligner_convert/__main__.py
index 1977030..3ff48b7 100755
--- a/subaligner/subaligner_convert/__main__.py
+++ b/subaligner/subaligner_convert/__main__.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 """
-usage: subaligner_convert [-h] -i INPUT_SUBTITLE_PATH -o OUTPUT_SUBTITLE_PATH [-f FRAME_RATE] [-d] [-q] [-ver]
+usage: subaligner_convert [-h] -i INPUT_SUBTITLE_PATH -o OUTPUT_SUBTITLE_PATH [-fr FRAME_RATE] [-t TRANSLATE] [-lgs] [-d] [-q] [-ver]
 
 Convert a subtitle from the input format to the output format
 
@@ -8,6 +8,9 @@
   -h, --help            show this help message and exit
   -fr FRAME_RATE, --frame_rate FRAME_RATE
                         Frame rate used by conversion to formats such as MicroDVD
+  -t TRANSLATE, --translate TRANSLATE
+                        Source and target ISO 639-3 language codes separated by a comma (e.g., eng,zho)
+  -lgs, --languages     Print out language codes used for stretch and translation
   -d, --debug           Print out debugging information
   -q, --quiet           Switch off logging information
   -ver, --version       show program's version number and exit
@@ -47,7 +50,6 @@ def main():
         type=str,
         default="",
         help="File path or URL to the input subtitle file",
-        required=True,
     )
     required_args.add_argument(
         "-o",
@@ -55,7 +57,6 @@ def main():
         type=str,
         default="",
         help="File path to the output subtitle file",
-        required=True,
     )
     parser.add_argument(
         "-fr",
@@ -64,12 +65,26 @@ def main():
         "--frame_rate",
         type=float,
         default=None,
         help="Frame rate used by conversion to formats such as MicroDVD",
     )
+    parser.add_argument(
+        "-t",
+        "--translate",
+        type=str,
+        help="Source and target ISO 639-3 language codes separated by a comma (e.g., eng,zho)",
+    )
+    parser.add_argument("-lgs", "--languages", action="store_true",
+                        help="Print out language codes used for stretch and
translation") parser.add_argument("-d", "--debug", action="store_true", help="Print out debugging information") parser.add_argument("-q", "--quiet", action="store_true", help="Switch off logging information") parser.add_argument("-ver", "--version", action="version", version=__version__) FLAGS, unparsed = parser.parse_known_args() + + from aeneas.language import Language + if FLAGS.languages: + for line in Language.CODE_TO_HUMAN_LIST: + print(line.replace("\t", " ")) + sys.exit(0) if FLAGS.input_subtitle_path == "": print("--input_subtitle_path was not passed in") sys.exit(21) @@ -85,6 +100,7 @@ def main(): Logger.VERBOSE = FLAGS.debug Logger.QUIET = FLAGS.quiet from subaligner.subtitle import Subtitle + from subaligner.translator import Translator from subaligner.exception import UnsupportedFormatException, TerminalException from subaligner.utils import Utils @@ -96,7 +112,16 @@ def main(): Utils.download_file(FLAGS.input_subtitle_path, local_subtitle_path) subtitle = Subtitle.load(local_subtitle_path) - Subtitle.save_subs_as_target_format(subtitle.subs, local_subtitle_path, FLAGS.output_subtitle_path, FLAGS.frame_rate) + + if FLAGS.translate is not None: + source, target = FLAGS.translate.split(",") + translator = Translator(source, target) + subs_list = translator.translate(subtitle.subs) + Subtitle.export_subtitle(local_subtitle_path, subs_list, FLAGS.output_subtitle_path, FLAGS.frame_rate, "utf-8") + Subtitle.save_subs_as_target_format(subs_list, local_subtitle_path, FLAGS.output_subtitle_path, FLAGS.frame_rate, "utf-8") + else: + Subtitle.export_subtitle(local_subtitle_path, subtitle.subs, FLAGS.output_subtitle_path, FLAGS.frame_rate) + Subtitle.save_subs_as_target_format(subtitle.subs, local_subtitle_path, FLAGS.output_subtitle_path, FLAGS.frame_rate) print("Subtitle converted and saved to: {}".format(FLAGS.output_subtitle_path)) except UnsupportedFormatException as e: print( diff --git a/subaligner/subtitle.py b/subaligner/subtitle.py index c687423..ea2ce9b 100644 --- a/subaligner/subtitle.py +++ b/subaligner/subtitle.py @@ -39,7 +39,7 @@ class Subtitle(object): SBV_EXTENSIONS = [".sbv"] YT_TRANSCRIPT_EXTENSIONS = [".ytt"] - def __init__(self, secret: object, subtitle_file_path: str, subtitle_format: str): + def __init__(self, secret: object, subtitle_file_path: str, subtitle_format: str) -> None: """Subtitle object initialiser. Arguments: @@ -375,7 +375,7 @@ def shift_subtitle( return shifted_subtitle_file_path @staticmethod - def save_subs_as_target_format(subs: List[SubRipItem], source_file_path: str, target_file_path: str, frame_rate: Optional[float] = None) -> None: + def save_subs_as_target_format(subs: List[SubRipItem], source_file_path: str, target_file_path: str, frame_rate: Optional[float] = None, encoding: Optional[str] = None) -> None: """Save SubRipItems with the format determined by the target file extension. Arguments: @@ -383,14 +383,15 @@ def save_subs_as_target_format(subs: List[SubRipItem], source_file_path: str, ta source_file_path {string} -- The path to the original subtitle file. target_file_path {string} -- The path to the output subtitle file. frame_rate {float} -- The frame rate used by conversion to formats such as MicroDVD + encoding {str} -- The encoding of the exported output file {default: None}. 
""" - encoding = Utils.detect_encoding(source_file_path) + encoding = Utils.detect_encoding(source_file_path) if encoding is None else encoding _, file_extension = os.path.splitext(target_file_path.lower()) Subtitle.__save_subtitle_by_extension(file_extension, subs, source_file_path, target_file_path, encoding, frame_rate) @staticmethod - def export_subtitle(source_file_path: str, subs: List[SubRipItem], target_file_path: str, frame_rate: float = 25.0) -> None: + def export_subtitle(source_file_path: str, subs: List[SubRipItem], target_file_path: str, frame_rate: float = 25.0, encoding: Optional[str] = None) -> None: """Export subtitle in the format determined by the file extension. Arguments: @@ -398,9 +399,10 @@ def export_subtitle(source_file_path: str, subs: List[SubRipItem], target_file_p subs {list} -- A list of SubRipItems. target_file_path {string} -- The path to the exported subtitle file. frame_rate {float} -- The frame rate for frame-based subtitle formats {default: 25.0}. + encoding {str} -- The encoding of the exported subtitle file {default: None}. """ - encoding = Utils.detect_encoding(source_file_path) + encoding = Utils.detect_encoding(source_file_path) if encoding is None else encoding _, file_extension = os.path.splitext(source_file_path.lower()) Subtitle.__save_subtitle_by_extension(file_extension, subs, source_file_path, target_file_path, encoding, frame_rate, is_exporting=True) diff --git a/subaligner/trainer.py b/subaligner/trainer.py index 71442e8..97a1adc 100644 --- a/subaligner/trainer.py +++ b/subaligner/trainer.py @@ -25,7 +25,7 @@ class Trainer(object): __LOGGER = Logger().get_logger(__name__) __MAX_BYTES = 2 ** 31 - 1 - def __init__(self, feature_embedder: FeatureEmbedder): + def __init__(self, feature_embedder: FeatureEmbedder) -> None: """Initialiser for the training process. Arguments: diff --git a/subaligner/translator.py b/subaligner/translator.py new file mode 100644 index 0000000..645101f --- /dev/null +++ b/subaligner/translator.py @@ -0,0 +1,209 @@ +import math +import pycountry +import time +from copy import deepcopy +from pysrt import SubRipItem +from tqdm import tqdm +from transformers import MarianMTModel, MarianTokenizer +from typing import List, Generator +from .singleton import Singleton +from .logger import Logger + + +class Translator(Singleton): + """Translate subtitles. + """ + + __LOGGER = Logger().get_logger(__name__) + __TENSOR_TYPE = "pt" + __OPUS_MT = "Helsinki-NLP/opus-mt-{}-{}" + __OPUS_TATOEBA = "Helsinki-NLP/opus-tatoeba-{}-{}" + __TRANSLATING_BATCH_SIZE = 10 + __LANGUAGE_CODE_MAPPER = { + "bos": "zls", + "cmn": "zho", + "gla": "cel", + "grc": "grk", + "guj": "inc", + "ina": "art", + "jbo": "art", + "kan": "dra", + "kir": "trk", + "lat": "itc", + "lfn": "art", + "mya": "sit", + "nep": "inc", + "ori": "inc", + "sin": "inc", + "srp": "zls", + "tam": "dra", + "tat": "trk", + "tel": "dra", + "yue": "zho" + } + __LANGUAGE_PAIR_MAPPER = { + "eng-jpn": "eng-jap", + "jpn-eng": "jap-eng" + } + + def __init__(self, src_language, tgt_language) -> None: + """Initialiser for the subtitle translation. + + Arguments: + src_language {string} -- The source language code derived from ISO 639-3. + tgt_language {string} -- The target language code derived from ISO 639-3. + + Raises: + NotImplementedError -- Thrown when the model of the specified language pair is not found. 
+ """ + self.__initialise_model(src_language, tgt_language) + + @staticmethod + def get_iso_639_alpha_2(language_code: str) -> str: + """Find the alpha 2 language code based on an alpha 3 one. + + Arguments: + language_code {string} -- An alpha 3 language code derived from ISO 639-3. + + Returns: + string -- The alpha 2 language code if exists otherwise the alpha 3 one. + + Raises: + ValueError -- Thrown when the input language code cannot be recognised. + """ + + lang = pycountry.languages.get(alpha_3=language_code) + if lang is None: + return language_code + elif hasattr(lang, "alpha_2"): + return lang.alpha_2 + else: + return lang.alpha_3 + + @staticmethod + def normalise_single(language_code: str) -> str: + """Normalise a single language code. + + Arguments: + language_code {string} -- A language code derived from ISO 639-3. + + Returns: + string -- The language code understood by the language model. + """ + + return Translator.__LANGUAGE_CODE_MAPPER[language_code] if language_code in Translator.__LANGUAGE_CODE_MAPPER else language_code + + @staticmethod + def normalise_pair(src_language: str, tgt_language: str) -> List[str]: + """Normalise a pair of language codes. + + Arguments: + src_language {string} -- The source language code derived from ISO 639-3. + tgt_language {string} -- The target language code derived from ISO 639-3. + + Returns: + list -- The language code pair understood by the language model. + """ + + if "{}-{}".format(src_language, tgt_language) in Translator.__LANGUAGE_PAIR_MAPPER: + return Translator.__LANGUAGE_PAIR_MAPPER["{}-{}".format(src_language, tgt_language)].split("-") + else: + return [src_language, tgt_language] + + def translate(self, subs: List[SubRipItem]) -> List[SubRipItem]: + """Translate a list of subtitle cues. + + Arguments: + subs {list} -- A list of SubRipItems. + + Returns: + {list} -- A list of new SubRipItems holding the translation results. + """ + + translated_texts = [] + self.lang_model.eval() + new_subs = deepcopy(subs) + src_texts = [sub.text for sub in new_subs] + num_of_batches = math.ceil(len(src_texts) / Translator.__TRANSLATING_BATCH_SIZE) + Translator.__LOGGER.info("Translating %s subtitle cue(s)..." 
% len(src_texts)) + for batch in tqdm(Translator.__batch(src_texts, Translator.__TRANSLATING_BATCH_SIZE), total=num_of_batches): + tokenizer = self.tokenizer(batch, return_tensors=Translator.__TENSOR_TYPE, padding=True) + translated = self.lang_model.generate(**tokenizer) + translated_texts.extend([self.tokenizer.decode(t, skip_special_tokens=True) for t in translated]) + for index in range(len(new_subs)): + new_subs[index].text = translated_texts[index] + Translator.__LOGGER.info("Subtitle translated") + return new_subs + + def __initialise_model(self, src_lang: str, tgt_lang: str) -> None: + src_lang = Translator.normalise_single(src_lang) + tgt_lang = Translator.normalise_single(tgt_lang) + src_lang, tgt_lang = Translator.normalise_pair(src_lang, tgt_lang) + try: + mt_model_name = Translator.__OPUS_MT.format(Translator.get_iso_639_alpha_2(src_lang), Translator.get_iso_639_alpha_2(tgt_lang)) + self.__download_mt_model(mt_model_name) + return + except OSError: + Translator.__log_and_back_off(mt_model_name) + try: + mt_model_name = Translator.__OPUS_MT.format(src_lang, Translator.get_iso_639_alpha_2(tgt_lang)) + self.__download_mt_model(mt_model_name) + return + except OSError: + Translator.__log_and_back_off(mt_model_name) + try: + mt_model_name = Translator.__OPUS_MT.format(Translator.get_iso_639_alpha_2(src_lang), tgt_lang) + self.__download_mt_model(mt_model_name) + return + except OSError: + Translator.__log_and_back_off(mt_model_name) + try: + mt_model_name = Translator.__OPUS_MT.format(src_lang, tgt_lang) + self.__download_mt_model(mt_model_name) + return + except OSError: + Translator.__log_and_back_off(mt_model_name) + try: + mt_model_name = Translator.__OPUS_TATOEBA.format(Translator.get_iso_639_alpha_2(src_lang), Translator.get_iso_639_alpha_2(tgt_lang)) + self.__download_mt_model(mt_model_name) + return + except OSError: + Translator.__log_and_back_off(mt_model_name) + try: + mt_model_name = Translator.__OPUS_TATOEBA.format(src_lang, Translator.get_iso_639_alpha_2(tgt_lang)) + self.__download_mt_model(mt_model_name) + return + except OSError: + Translator.__log_and_back_off(mt_model_name) + try: + mt_model_name = Translator.__OPUS_TATOEBA.format(Translator.get_iso_639_alpha_2(src_lang), tgt_lang) + self.__download_mt_model(mt_model_name) + return + except OSError: + Translator.__log_and_back_off(mt_model_name) + try: + mt_model_name = Translator.__OPUS_TATOEBA.format(src_lang, tgt_lang) + self.__download_mt_model(mt_model_name) + return + except OSError: + Translator.__LOGGER.debug("Cannot download the MT model %s" % mt_model_name) + message = 'Cannot find the MT model for source language "{}" and destination language "{}"'.format(src_lang, tgt_lang) + Translator.__LOGGER.error(message) + raise NotImplementedError(message) + + def __download_mt_model(self, mt_model_name: str) -> None: + Translator.__LOGGER.debug("Trying to download the MT model %s" % mt_model_name) + self.tokenizer = MarianTokenizer.from_pretrained(mt_model_name) + self.lang_model = MarianMTModel.from_pretrained(mt_model_name) + Translator.__LOGGER.debug("MT model %s downloaded" % mt_model_name) + + @staticmethod + def __log_and_back_off(mt_model_name: str): + Translator.__LOGGER.debug("Cannot download the MT model %s" % mt_model_name) + time.sleep(1) + + @staticmethod + def __batch(data: List, size: int = 1) -> Generator: + total = len(data) + for ndx in range(0, total, size): + yield data[ndx:min(ndx + size, total)] diff --git a/subaligner/utils.py b/subaligner/utils.py index eae0235..fd06699 100644 --- 
a/subaligner/utils.py
+++ b/subaligner/utils.py
@@ -4,6 +4,7 @@
 import requests
 import shutil
 import cchardet
+import shlex
 
 from pycaption import (
     CaptionConverter,
@@ -84,7 +85,7 @@ def srt2vtt(srt_file_path: str, vtt_file_path: Optional[str] = None, timeout_sec
         _vtt_file_path = srt_file_path.replace(".srt", ".vtt") if vtt_file_path is None else vtt_file_path
         encoding = Utils.detect_encoding(srt_file_path)
-        command = "{0} -y -sub_charenc {1} -i {2} -f webvtt {3}".format(Utils.FFMPEG_BIN, encoding, srt_file_path, _vtt_file_path)
+        command = "{0} -y -sub_charenc {1} -i '{2}' -f webvtt '{3}'".format(Utils.FFMPEG_BIN, encoding, srt_file_path, _vtt_file_path)
         timeout_msg = "Timeout on converting SubRip to WebVTT: {}".format(srt_file_path)
         error_msg = "Cannot convert SubRip to WebVTT: {}".format(srt_file_path)
 
@@ -111,7 +112,7 @@ def vtt2srt(vtt_file_path: str, srt_file_path: Optional[str] = None, timeout_sec
         _srt_file_path = vtt_file_path.replace(".vtt", ".srt") if srt_file_path is None else srt_file_path
         encoding = Utils.detect_encoding(vtt_file_path)
-        command = "{0} -y -sub_charenc {1} -i {2} -f srt {3}".format(Utils.FFMPEG_BIN, encoding, vtt_file_path, _srt_file_path)
+        command = "{0} -y -sub_charenc {1} -i '{2}' -f srt '{3}'".format(Utils.FFMPEG_BIN, encoding, vtt_file_path, _srt_file_path)
         timeout_msg = "Timeout on converting WebVTT to SubRip: {}".format(vtt_file_path)
         error_msg = "Cannot convert WebVTT to SubRip: {}".format(vtt_file_path)
 
@@ -400,7 +401,7 @@ def sbv2srt(sbv_file_path: str, srt_file_path: Optional[str] = None) -> None:
             caption.encoding = encoding
 
         if srt_file_path is None:
-            srt_file_path = srt_file_path.replace(".sbv", ".srt")
+            srt_file_path = sbv_file_path.replace(".sbv", ".srt")
         with open(srt_file_path, "w") as file:
             srt_writer = SrtWriter(file, captions)
 
@@ -468,7 +469,7 @@ def ytt2srt(transcript_file_path: str, srt_file_path: Optional[str] = None) -> N
             caption.encoding = encoding
 
         if srt_file_path is None:
-            srt_file_path = srt_file_path.replace(".ytt", ".srt")
+            srt_file_path = transcript_file_path.replace(".ytt", ".srt")
         with open(srt_file_path, "w") as file:
             srt_writer = SrtWriter(file, captions)
 
@@ -488,7 +489,7 @@ def extract_teletext_as_subtitle(ts_file_path: str, page_num: int, output_file_p
             timeout_secs {int} -- The timeout in seconds on extraction {default: 30}.
         """
 
-        command = "{0} -y -fix_sub_duration -txt_page {1} -txt_format text -i {2} {3}".format(Utils.FFMPEG_BIN, page_num, ts_file_path, output_file_path)
+        command = "{0} -y -fix_sub_duration -txt_page {1} -txt_format text -i '{2}' '{3}'".format(Utils.FFMPEG_BIN, page_num, ts_file_path, output_file_path)
         timeout_msg = "Timeout on extracting Teletext from transport stream: {} on page: {}".format(ts_file_path, page_num)
         error_msg = "Cannot extract Teletext from transport stream: {} on page: {}".format(ts_file_path, page_num)
 
@@ -514,7 +515,7 @@ def extract_matroska_subtitle(mkv_file_path: str, stream_index: int, output_file
             timeout_secs {int} -- The timeout in seconds on extraction {default: 30}.
         """
 
-        command = "{0} -y -i {1} -map 0:s:{2} {3}".format(Utils.FFMPEG_BIN, mkv_file_path, stream_index, output_file_path)
+        command = "{0} -y -i '{1}' -map 0:s:{2} '{3}'".format(Utils.FFMPEG_BIN, mkv_file_path, stream_index, output_file_path)
         timeout_msg = "Timeout on extracting the subtitle from file: {} with stream index: {}".format(mkv_file_path, stream_index)
         error_msg = "Cannot extract the subtitle from file: {} with stream index: {}".format(mkv_file_path, stream_index)
 
@@ -566,7 +567,7 @@ def contains_embedded_subtitles(video_file_path: str, timeout_secs: int = 30) ->
             bool -- True if the video contains embedded subtitles or False otherwise.
         """
 
-        command = "{0} -y -i {1} -c copy -map 0:s -f null - -v 0 -hide_banner".format(Utils.FFMPEG_BIN, video_file_path)
+        command = "{0} -y -i '{1}' -c copy -map 0:s -f null - -v 0 -hide_banner".format(Utils.FFMPEG_BIN, video_file_path)
         timeout_msg = "Timeout on detecting embedded subtitles from file: {}".format(video_file_path)
         error_msg = "Embedded subtitle detection failed for file: {}".format(video_file_path)
 
@@ -614,7 +615,7 @@ def __convert_subtitle(source_file_path: str, source_ext: str, target_file_path:
     @staticmethod
     def _run_command(command: str, timeout_secs: int, timeout_msg: str, error_msg: str, callback: Callable[[int, str], Any]) -> Any:
         with subprocess.Popen(
-            command.split(),
+            shlex.split(command),
             shell=False,
             stdout=subprocess.PIPE,
             stderr=subprocess.PIPE,
diff --git a/tests/integration/feature/subaligner.feature b/tests/integration/feature/subaligner.feature
index ef534d3..e764790 100644
--- a/tests/integration/feature/subaligner.feature
+++ b/tests/integration/feature/subaligner.feature
@@ -189,6 +189,25 @@ Feature: Subaligner CLI
       | subaligner       | single |
       | subaligner       | dual   |
 
+  Scenario: Test alignments with the file path containing whitespace ([] == " ")
+    Given I have a video file "test[]spaced.mp4"
+    And I have a subtitle file "test[]spaced.vtt"
+    When I run the alignment with subaligner on them with dual stage
+    Then a new subtitle file "test[]spaced_aligned.vtt" is generated
+
+  @translation
+  Scenario Outline: Test translation on aligned subtitles
+    Given I have a video file "test.mp4"
+    And I have a subtitle file <subtitle-in>
+    When I run the alignment with <aligner> on them with <mode> stage and <language-pair> for translation
+    Then a new subtitle file <subtitle-out> is generated
+    Examples:
+      | aligner          | mode   | subtitle-in | language-pair | subtitle-out       |
+      | subaligner       | single | "test.srt"  | eng,zho       | "test_aligned.srt" |
+      | subaligner       | dual   | "test.srt"  | eng,spa       | "test_aligned.srt" |
+      | subaligner_1pass |        | "test.srt"  | eng,fra       | "test_aligned.srt" |
+      | subaligner_2pass |        | "test.srt"  | eng,deu       | "test_aligned.srt" |
+
   @exception
   Scenario Outline: Test errors out on unsupported subtitle input
     Given I have a video file "test.mp4"
@@ -224,3 +243,13 @@ Feature: Subaligner CLI
       | subaligner_1pass |
       | subaligner_2pass |
       | subaligner       |
+
+  @languages
+  Scenario Outline: Test language codes display
+    When I run the <aligner> command with languages
+    Then supported language codes are displayed
+    Examples:
+      | aligner          |
+      | subaligner_1pass |
+      | subaligner_2pass |
+      | subaligner       |
diff --git a/tests/integration/feature/subaligner_convert.feature b/tests/integration/feature/subaligner_convert.feature
index 6dc8787..3a779ed 100644
--- a/tests/integration/feature/subaligner_convert.feature
+++ b/tests/integration/feature/subaligner_convert.feature
@@ -55,3 +55,22 @@ Feature: Subaligner CLI
     Given I have a subtitle file "https://raw.githubusercontent.com/baxtree/subaligner/master/tests/subaligner/resource/test.srt"
     When I run the converter with "test_srt.ttml" as the output
     Then a new subtitle file "test_srt.ttml" is generated
+
+  Scenario Outline: Test subtitle conversion with translation
+    Given I have a subtitle file <subtitle-in>
+    When I run the converter with <language_pair> for translation and <subtitle-out> as the output
+    Then a new subtitle file <subtitle-out> is generated
+    Examples:
+      | subtitle-in | language_pair | subtitle-out       |
+      | "test.srt"  | eng,zho       | "test_zh_srt.ttml" |
+      | "test.srt"  | eng,spa       | "test_es_srt.ttml" |
+      | "test.srt"  | eng,hin       | "test_hi_srt.ttml" |
+      | "test.srt"  | eng,fra       | "test_fr_srt.ttml" |
+      | "test.srt"  | eng,ara       | "test_ar_srt.ttml" |
+      | "test.srt"  | eng,jpn       | "test_ja_srt.ttml" |
+      | "test.srt"  | eng,rus       | "test_ru_srt.ttml" |
+      | "test.srt"  | eng,ind       | "test_id_srt.ttml" |
+
+  Scenario: Test language codes display
+    When I run the subaligner_convert command with languages
+    Then supported language codes are displayed
diff --git a/tests/integration/radish/step.py b/tests/integration/radish/step.py
index 7397987..0323987 100644
--- a/tests/integration/radish/step.py
+++ b/tests/integration/radish/step.py
@@ -15,7 +15,7 @@ def video_file(step, file_name):
     if file_name.lower().startswith("http"):
         step.context.video_file_path = file_name
     else:
-        step.context.video_file_path = os.path.join(PWD, "..", "..", "subaligner", "resource", file_name)
+        step.context.video_file_path = os.path.join(PWD, "..", "..", "subaligner", "resource", file_name).replace("[]", " ")
 
 
 @given('I have a subtitle file "{file_name:S}"')
@@ -23,7 +23,7 @@ def subtitle_file(step, file_name):
     if file_name.lower().startswith("http"):
         step.context.subtitle_path_or_selector = file_name
     else:
-        step.context.subtitle_path_or_selector = os.path.join(PWD, "..", "..", "subaligner", "resource", file_name)
+        step.context.subtitle_path_or_selector = os.path.join(PWD, "..", "..", "subaligner", "resource", file_name).replace("[]", " ")
 
 
 @given('I have selector "{selector:S}" for the embedded subtitle')
@@ -49,6 +49,26 @@ def run_subaligner(step, aligner, mode):
     step.context.exit_code = process.wait(timeout=WAIT_TIMEOUT_IN_SECONDS)
 
 
+@when("I run the alignment with {aligner:S} on them with {mode:S} stage and {language_pair:S} for translation")
+def run_subaligner_with_translation(step, aligner, mode, language_pair):
+    if mode == "":
+        process = subprocess.Popen([
+            os.path.join(PWD, "..", "..", "..", "bin", aligner),
+            "-v", step.context.video_file_path,
+            "-s", step.context.subtitle_path_or_selector,
+            "-t", language_pair,
+            "-q"], shell=False)
+    else:
+        process = subprocess.Popen([
+            os.path.join(PWD, "..", "..", "..", "bin", aligner),
+            "-m", mode,
+            "-v", step.context.video_file_path,
+            "-s", step.context.subtitle_path_or_selector,
+            "-t", language_pair,
+            "-q"], shell=False)
+    step.context.exit_code = process.wait(timeout=WAIT_TIMEOUT_IN_SECONDS)
+
+
 @when('I run the alignment with {aligner:S} on them with {mode:S} stage and output "{file_name:S}"')
 def run_subaligner_with_output(step, aligner, mode, file_name):
     if mode == "":
@@ -133,7 +153,7 @@ def run_subaligner_with_custom_model(step, aligner, mode):
 
 @then('a new subtitle file "{file_name:S}" is generated')
 def expect_result(step, file_name):
-    output_file_path = os.path.join(PWD, "..", "..", "subaligner", "resource", file_name)
+    output_file_path = os.path.join(PWD, "..", "..", "subaligner", "resource", file_name.replace("[]", " "))
     assert step.context.exit_code == 0
     assert os.path.isfile(output_file_path) is
diff --git a/tests/subaligner/resource/test spaced.mp4 b/tests/subaligner/resource/test spaced.mp4
new file mode 120000
index 0000000..954f241
--- /dev/null
+++ b/tests/subaligner/resource/test spaced.mp4
@@ -0,0 +1 @@
+test.mp4
\ No newline at end of file
diff --git a/tests/subaligner/resource/test spaced.vtt b/tests/subaligner/resource/test spaced.vtt
new file mode 120000
index 0000000..96a0225
--- /dev/null
+++ b/tests/subaligner/resource/test spaced.vtt
@@ -0,0 +1 @@
+test.vtt
\ No newline at end of file
diff --git a/tests/subaligner/test_translator.py b/tests/subaligner/test_translator.py
new file mode 100644
index 0000000..0afabf0
--- /dev/null
+++ b/tests/subaligner/test_translator.py
@@ -0,0 +1,79 @@
+import os
+import unittest
+from mock import Mock, patch
+from parameterized import parameterized
+from transformers import MarianMTModel, MarianTokenizer
+from subaligner.subtitle import Subtitle
+from subaligner.translator import Translator as Undertest
+
+
+class TranslatorTests(unittest.TestCase):
+
+    def setUp(self):
+        self.srt_file_path = os.path.join(
+            os.path.dirname(os.path.abspath(__file__)), "resource/test.srt"
+        )
+
+    def test_get_iso_639_alpha_2(self):
+        self.assertEqual("en", Undertest.get_iso_639_alpha_2("eng"))
+        self.assertEqual("ada", Undertest.get_iso_639_alpha_2("ada"))
+        self.assertEqual("xyz", Undertest.get_iso_639_alpha_2("xyz"))
+
+    @patch("transformers.MarianMTModel.from_pretrained")
+    @patch("transformers.MarianTokenizer.from_pretrained")
+    def test_translate(self, tokenizer_from_pretrained, model_from_pretrained):
+        subs = Subtitle.load(self.srt_file_path).subs
+        mock_tokenizer = Mock()
+        mock_tokenizer.return_value = {"input_ids": None, "attention_mask": None}
+        mock_tokenizer.decode.return_value = "translated"
+        mock_model = Mock()
+        mock_model.generate.return_value = [None] * len(subs)
+        tokenizer_from_pretrained.return_value = mock_tokenizer
+        model_from_pretrained.return_value = mock_model
+
+        translated_subs = Undertest("eng", "zho").translate(subs)
+
+        self.assertEqual(["translated"] * len(subs), [*map(lambda x: x.text, translated_subs)])
+
+    @parameterized.expand([
+        ["bos", "zls"],
+        ["cmn", "zho"],
+        ["gla", "cel"],
+        ["grc", "grk"],
+        ["guj", "inc"],
+        ["ina", "art"],
+        ["jbo", "art"],
+        ["kan", "dra"],
+        ["kir", "trk"],
+        ["lat", "itc"],
+        ["lfn", "art"],
+        ["mya", "sit"],
+        ["nep", "inc"],
+        ["ori", "inc"],
+        ["sin", "inc"],
+        ["srp", "zls"],
+        ["tam", "dra"],
+        ["tat", "trk"],
+        ["tel", "dra"],
+        ["yue", "zho"],
+    ])
+    def test_normalise_single(self, original, normalised):
+        self.assertEqual(normalised, Undertest.normalise_single(original))
+
+    @parameterized.expand([
+        ["eng-jpn", "eng-jap"],
+        ["jpn-eng", "jap-eng"],
+    ])
+    def test_normalise_pair(self, original, normalised):
+        self.assertEqual(normalised, "-".join(Undertest.normalise_pair(*original.split("-"))))
+
+    @patch("transformers.MarianTokenizer.from_pretrained", side_effect=OSError)
+    def test_throw_exception_on_translating_subs(self, mock_tokenizer_from_pretrained):
+        subs = Subtitle.load(self.srt_file_path).subs
+        try:
+            Undertest("eng", "aar").translate(subs)
+        except Exception as e:
+            self.assertTrue(mock_tokenizer_from_pretrained.called)
+            self.assertTrue(isinstance(e, NotImplementedError))
+        else:
+            self.fail("Should have thrown exception")
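`test_get_iso_639_alpha_2` above pins down a pass-through fallback for codes that have no two-letter form. A minimal sketch of that behaviour, assuming a pycountry-backed lookup (illustration only; the real `Translator.get_iso_639_alpha_2` may be implemented differently):

```python
import pycountry


def get_iso_639_alpha_2(code: str) -> str:
    """Map an ISO 639-3 code to ISO 639-1, falling back to the input code."""
    language = pycountry.languages.get(alpha_3=code)
    # Codes without a two-letter equivalent (e.g. "ada") and unknown codes fall through unchanged.
    return getattr(language, "alpha_2", code) if language is not None else code


assert get_iso_639_alpha_2("eng") == "en"
assert get_iso_639_alpha_2("ada") == "ada"
assert get_iso_639_alpha_2("xyz") == "xyz"
```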
@patch("transformers.MarianTokenizer.from_pretrained") + def test_translate(self, tokenizer_from_pretrained, model_from_pretrained): + subs = Subtitle.load(self.srt_file_path).subs + mock_tokenizer = Mock() + mock_tokenizer.return_value = {"input_ids": None, "attention_mask": None} + mock_tokenizer.decode.return_value = "translated" + mock_model = Mock() + mock_model.generate.return_value = [None] * len(subs) + tokenizer_from_pretrained.return_value = mock_tokenizer + model_from_pretrained.return_value = mock_model + + translated_subs = Undertest("eng", "zho").translate(subs) + + self.assertEqual(["translated"] * len(subs), [*map(lambda x: x.text, translated_subs)]) + + @parameterized.expand([ + ["bos", "zls"], + ["cmn", "zho"], + ["gla", "cel"], + ["grc", "grk"], + ["guj", "inc"], + ["ina", "art"], + ["jbo", "art"], + ["kan", "dra"], + ["kir", "trk"], + ["lat", "itc"], + ["lfn", "art"], + ["mya", "sit"], + ["nep", "inc"], + ["ori", "inc"], + ["sin", "inc"], + ["srp", "zls"], + ["tam", "dra"], + ["tat", "trk"], + ["tel", "dra"], + ["yue", "zho"], + ]) + def test_normalise_single(self, original, normalised): + self.assertEqual(normalised, Undertest.normalise_single(original)) + + @parameterized.expand([ + ["eng-jpn", "eng-jap"], + ["jpn-eng", "jap-eng"], + ]) + def test_normalise_pair(self, original, normalised): + self.assertEqual(normalised, "-".join(Undertest.normalise_pair(*original.split("-")))) + + @patch("transformers.MarianTokenizer.from_pretrained", side_effect=OSError) + def test_throw_exception_on_translating_subs(self, mock_tokenizer_from_pretrained): + subs = Subtitle.load(self.srt_file_path).subs + try: + Undertest("eng", "aar").translate(subs) + except Exception as e: + self.assertTrue(mock_tokenizer_from_pretrained.called) + self.assertTrue(isinstance(e, NotImplementedError)) + else: + self.fail("Should have thrown exception") diff --git a/tests/subaligner/test_utils.py b/tests/subaligner/test_utils.py index 57f1305..1e9ad73 100644 --- a/tests/subaligner/test_utils.py +++ b/tests/subaligner/test_utils.py @@ -245,7 +245,7 @@ def test_ytt2srt(self): def test_extract_teletext_as_srt(self, mocked_run_command): Undertest.extract_teletext_as_subtitle("ts_file_path", 888, "srt_file_path") - mocked_run_command.assert_called_once_with("ffmpeg -y -fix_sub_duration -txt_page 888 -txt_format text -i {} {}".format("ts_file_path", "srt_file_path"), ANY, ANY, ANY, ANY) + mocked_run_command.assert_called_once_with("ffmpeg -y -fix_sub_duration -txt_page 888 -txt_format text -i {} {}".format("'ts_file_path'", "'srt_file_path'"), ANY, ANY, ANY, ANY) def test_extract_matroska_subtitle(self): output_file_path = os.path.join(self.resource_tmp, "extracted.matroska.srt")