
Commit

Merge pull request #61 from baxtree/development
prepare for the new release
baxtree authored May 15, 2021
2 parents 201b266 + a162b87 commit c3908c9
Showing 32 changed files with 614 additions and 79 deletions.
14 changes: 13 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -83,6 +83,17 @@ $ subaligner -m single -v https://example.com/video.mp4 -s https://example.com/s
$ subaligner -m dual -v https://example.com/video.mp4 -s https://example.com/subtitle.srt -o subtitle_aligned.srt
```
```
# Translative alignment with the ISO 639-3 language code pair (src,tgt)
$ subaligner_1pass --languages
$ subaligner_1pass -v video.mp4 -s subtitle.srt -t src,tgt
$ subaligner_2pass --languages
$ subaligner_2pass -v video.mp4 -s subtitle.srt -t src,tgt
$ subaligner --languages
$ subaligner -m single -v video.mp4 -s subtitle.srt -t src,tgt
$ subaligner -m dual -v video.mp4 -s subtitle.srt -t src,tgt
```
```
# Run alignments with pipx
$ pipx run subaligner -m single -v video.mp4 -s subtitle.srt
@@ -104,7 +115,8 @@ $ docker run -v `pwd`:`pwd` -w `pwd` -it baxtree/subaligner subaligner_2pass -v
$ docker run -it baxtree/subaligner subaligner_1pass -v https://example.com/video.mp4 -s https://example.com/subtitle.srt -o subtitle_aligned.srt
$ docker run -it baxtree/subaligner subaligner_2pass -v https://example.com/video.mp4 -s https://example.com/subtitle.srt -o subtitle_aligned.srt
```
The aligned subtitle will be saved at `subtitle_aligned.srt`. For details on CLI, run `subaligner_1pass --help`, `subaligner_2pass --help` or `subaligner --help`.
The aligned subtitle will be saved at `subtitle_aligned.srt`. For details on CLI, run `subaligner_1pass -h`, `subaligner_2pass -h` or `subaligner -h`.
Additional utilities can be used after consulting `subaligner_convert -h`, `subaligner_train -h` and `subaligner_tune -h`.

![](figures/screencast.gif)
## Supported Formats
5 changes: 5 additions & 0 deletions requirements-app.txt
@@ -46,6 +46,7 @@ psutil==5.6.7
py==1.10.0
pyasn1==0.4.8
pyasn1-modules==0.2.7
pycountry~=20.7.3
pydot==1.2.4
pydot-ng==1.0.0
pydotplus==2.0.2
@@ -63,13 +64,17 @@ requests-oauthlib==1.3.0
rsa==4.7
scipy~=1.5.4
scikit-learn>=0.19.1
sentencepiece~=0.1.95
setuptools>=41.0.0
six~=1.15.0
tblib==1.3.2
tensorflow>=1.15.5,<2.5
termcolor==1.1.0
toml==0.10.0
toolz==0.9.0
torch~=1.8.1
tornado==5.1.0
transformers~=4.5.1
urllib3==1.25.9
Werkzeug>=0.15.3
zict==0.1.3
5 changes: 5 additions & 0 deletions requirements.txt
@@ -46,6 +46,7 @@ psutil==5.6.7
py==1.10.0
pyasn1==0.4.8
pyasn1-modules==0.2.7
pycountry~=20.7.3
pydot==1.2.4
pydot-ng==1.0.0
pydotplus==2.0.2
@@ -63,13 +64,17 @@ requests-oauthlib==1.3.0
rsa==4.7
scipy~=1.5.4
scikit-learn>=0.19.1
sentencepiece~=0.1.95
setuptools>=41.0.0
six~=1.15.0
tblib==1.3.2
tensorflow>=1.15.5,<2.5
termcolor==1.1.0
toml==0.10.0
toolz==0.9.0
torch~=1.8.1
tornado==5.1.0
transformers~=4.5.1
urllib3==1.25.9
Werkzeug>=0.15.3
zict==0.1.3
1 change: 1 addition & 0 deletions site/source/acknowledgement.rst
@@ -12,3 +12,4 @@ Acknowledgement
- `pysrt <https://github.com/byroot/pysrt>`_
- `pysubs2 <https://github.com/tkarabela/pysubs2>`_
- `aeneas <https://www.readbeyond.it/aeneas/>`_
- `transformers <https://huggingface.co/transformers/>`_
20 changes: 20 additions & 0 deletions site/source/advanced_usage.rst
@@ -39,6 +39,16 @@ Embeddings extracted from your media files can be reused with `-utd` or `--use_t
model of another kind (instead of re-using the same model on training resumption) without going through the feature embedding process,
which could take quite a long time for a large dataset and is unnecessary if the dataset has not changed.

**Ignore sound effects**::

(.venv) $ subaligner_train -vd av_directory -sd subtitle_directory -tod training_output_directory --sound_effect_start_marker "(" --sound_effect_end_marker ")"

It is not uncommon for subtitles to contain sound effects (e.g., "BARK", "(applause)" and "[MUSIC]"). With limited training
data or an insufficiently sophisticated network architecture, the model usually cannot capture all the sound effects very well.
To filter out sound-effect subtitles and preserve only the vocal ones, you can pass in `-sesm` or `--sound_effect_start_marker` and/or
`-seem` or `--sound_effect_end_marker` with strings which Subaligner will use to find sound effects and ignore them during training.
For example, the above command will treat any string starting with "(" and ending with ")" as a sound effect.
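
The marker-based filtering described here can be approximated with a few lines of Python. The sketch below is illustrative only, using the standard `re` module rather than Subaligner's actual implementation, and the function name is hypothetical:

```python
import re
from typing import List

def drop_sound_effect_cues(cues: List[str], start_marker: str, end_marker: str) -> List[str]:
    """Keep only cues that still carry vocal content once spans wrapped in
    the markers (e.g. "(applause)") are stripped out."""
    effect = re.compile(re.escape(start_marker) + r".*?" + re.escape(end_marker), re.DOTALL)
    return [cue for cue in cues if effect.sub("", cue).strip()]

cues = ["(applause)", "Hello there.", "(dog barks) Run!"]
print(drop_sound_effect_cues(cues, "(", ")"))  # ['Hello there.', '(dog barks) Run!']
```

A cue consisting entirely of a marked span is dropped, while a cue that mixes a sound effect with speech is kept intact.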

**Run alignments after training**::

(.venv) $ subaligner -m single -v video.mp4 -s subtitle.srt -tod training_output_directory
@@ -98,6 +108,16 @@ flags to customise the configuration on tuning:

(.venv) $ subaligner_convert -i subtitle.srt -o subtitle.vtt

**Convert the subtitle to another format and translate**::

(.venv) $ subaligner_convert --languages
(.venv) $ subaligner_convert -i subtitle_en.srt -o subtitle_zh.vtt -t eng,zho

**Translate the subtitle without changing the format**::

(.venv) $ subaligner_convert --languages
(.venv) $ subaligner_convert -i subtitle_en.srt -o subtitle_es.srt -t eng,spa

For output subtitles like MicroDVD relying on the frame rate, its value needs to be passed in with `-fr` or `--frame_rate`.

**On Windows**::
5 changes: 4 additions & 1 deletion site/source/conf.py
@@ -42,6 +42,7 @@
"sphinx.ext.ifconfig",
"sphinx.ext.viewcode",
"sphinx.ext.napoleon",
"sphinx.ext.autosectionlabel",
]

# Add any paths that contain templates here, relative to this directory.
@@ -80,7 +81,9 @@
"pysubs2",
"cchardet",
"captionstransformer",
"bs4"
"bs4",
"transformers",
"pycountry"
]

def setup(app):
15 changes: 8 additions & 7 deletions site/source/index.rst
@@ -7,17 +7,18 @@ Welcome to Subaligner's documentation!
======================================

Given an out-of-sync subtitle file along with a piece of audiovisual content carrying speeches described by it,
Subaligner provides a one-stop solution on automatic subtitle synchronisation with a pretrained deep neural network and forced
alignments. In essence, aligning subtitles is a dual-stage process with a Bidirectional Long Short-Term Memory network trained
Subaligner provides a one-stop solution on automatic subtitle synchronisation and translation with pretrained deep neural networks
and forced alignments. In essence, aligning subtitles is a dual-stage process with a Bidirectional Long Short-Term Memory network trained
upfront. Subaligner helps subtitlers not only in preprocessing raw subtitle materials (outcome from stenographers or
STT workflow, etc.) but also in gaining quality control over their work within subtitle post-production. This tool
also tolerates errors occurred in live subtitles which sometimes do not completely or correctly represent what people
also tolerates errors that occur in live subtitles, which sometimes do not completely or correctly represent what people
actually spoke in the companion audiovisual content.

Subligner has been shifted with a command-line interface which helps users to conduct various tasks around subtitle synchronisation
without writing any code as well as APIs targeting developers. With existing audiovisual and in-sync subtitle files at
hand, users can train their own synchroniser with a single command and zero setup. A handful of subtitle formats are supported
and can be converted from one to another either during synchronisation or on on-demand.
Subaligner has been shipped with a command-line interface which helps users conduct various tasks around subtitle
synchronisation and multilingual translation without writing any code. Application programming interfaces are also provided
to developers wanting to perform those tasks programmatically. Moreover, with existing audiovisual and in-sync subtitle files at
hand, advanced users can train their own synchronisers with a single command and zero setup. A handful of subtitle formats are supported
and can be converted from one to another either during synchronisation and translation or on demand.

Subaligner supports the following subtitle formats: SubRip, TTML, WebVTT, (Advanced) SubStation Alpha, MicroDVD, MPL2, TMP,
EBU STL, SAMI, SCC and SBV. The source code can be found on GitHub: `subaligner <https://github.com/baxtree/subaligner>`_.
17 changes: 14 additions & 3 deletions site/source/usage.rst
@@ -4,16 +4,17 @@ Usage

Subaligner provides two ways of aligning subtitles: single-stage alignment and dual-stage alignment. The former way has
lower latency and shifts all subtitle segments globally. The latter way has higher latency and shifts the
segments individually with an option of stretching each segment.
segments individually with an option of stretching each segment. Multilingual translation of subtitles can be performed
together with the alignment in one go or separately (see :ref:`Advanced Usage`).

Make sure you have got the virtual environment activated upfront.

**Single-stage alignment**::
**Single-stage alignment (high-level shift with lower latency)**::

(.venv) $ subaligner_1pass -v video.mp4 -s subtitle.srt
(.venv) $ subaligner_1pass -v https://example.org/video.mp4 -s https://example.org/subtitle.srt -o subtitle_aligned.srt

**Dual-stage alignment**::
**Dual-stage alignment (low-level shift with higher latency)**::

(.venv) $ subaligner_2pass -v video.mp4 -s subtitle.srt
(.venv) $ subaligner_2pass -v https://example.org/video.mp4 -s https://example.org/subtitle.srt -o subtitle_aligned.srt
@@ -25,6 +26,16 @@ Make sure you have got the virtual environment activated upfront.
(.venv) $ subaligner -m dual -v video.mp4 -s subtitle.srt
(.venv) $ subaligner -m dual -v https://example.org/video.mp4 -s https://example.org/subtitle.srt -o subtitle_aligned.srt

**Translative alignment with the ISO 639-3 language code pair (src,tgt)**::

(.venv) $ subaligner_1pass --languages
(.venv) $ subaligner_1pass -v video.mp4 -s subtitle.srt -t src,tgt
(.venv) $ subaligner_2pass --languages
(.venv) $ subaligner_2pass -v video.mp4 -s subtitle.srt -t src,tgt
(.venv) $ subaligner --languages
(.venv) $ subaligner -m single -v video.mp4 -s subtitle.srt -t src,tgt
(.venv) $ subaligner -m dual -v video.mp4 -s subtitle.srt -t src,tgt
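
The `src,tgt` placeholder stands for a comma-separated ISO 639-3 code pair such as `eng,zho`. As a rough illustration of how such a pair can be split and shape-checked (a sketch only; Subaligner validates codes against its own supported list, and the helper name is hypothetical):

```python
def parse_language_pair(pair: str):
    """Split an ISO 639-3 'src,tgt' pair such as 'eng,zho' and perform
    a basic shape check on both codes."""
    codes = [code.strip().lower() for code in pair.split(",")]
    if len(codes) != 2:
        raise ValueError("Expected exactly two codes separated by a comma")
    for code in codes:
        if len(code) != 3 or not code.isalpha():
            raise ValueError(f"'{code}' is not a three-letter language code")
    return tuple(codes)

print(parse_language_pair("eng,zho"))  # ('eng', 'zho')
```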

**Run alignments with the docker image**::

$ docker pull baxtree/subaligner
39 changes: 30 additions & 9 deletions subaligner/__main__.py
@@ -1,8 +1,8 @@
#!/usr/bin/env python
"""
usage: subaligner [-h] -m {single,dual} -v VIDEO_PATH -s SUBTITLE_PATH [-l MAX_LOGLOSS] [-so]
usage: subaligner [-h] [-m {single,dual}] [-v VIDEO_PATH] [-s SUBTITLE_PATH] [-l MAX_LOGLOSS] [-so]
[-sil {afr,amh,ara,arg,asm,aze,ben,bos,bul,cat,ces,cmn,cym,dan,deu,ell,eng,epo,est,eus,fas,fin,fra,gla,gle,glg,grc,grn,guj,heb,hin,hrv,hun,hye,ina,ind,isl,ita,jbo,jpn,kal,kan,kat,kir,kor,kur,lat,lav,lfn,lit,mal,mar,mkd,mlt,msa,mya,nah,nep,nld,nor,ori,orm,pan,pap,pol,por,ron,rus,sin,slk,slv,spa,sqi,srp,swa,swe,tam,tat,tel,tha,tsn,tur,ukr,urd,vie,yue,zho}]
[-fos] [-tod TRAINING_OUTPUT_DIRECTORY] [-o OUTPUT] [-d] [-q] [-ver]
[-fos] [-tod TRAINING_OUTPUT_DIRECTORY] [-o OUTPUT] [-t TRANSLATE] [-lgs] [-d] [-q] [-ver]
Subaligner command line interface
@@ -12,13 +12,16 @@
Max global log loss for alignment
-so, --stretch_off Switch off stretch on non-English speech and subtitles
-sil {afr,amh,ara,arg,asm,aze,ben,bos,bul,cat,ces,cmn,cym,dan,deu,ell,eng,epo,est,eus,fas,fin,fra,gla,gle,glg,grc,grn,guj,heb,hin,hrv,hun,hye,ina,ind,isl,ita,jbo,jpn,kal,kan,kat,kir,kor,kur,lat,lav,lfn,lit,mal,mar,mkd,mlt,msa,mya,nah,nep,nld,nor,ori,orm,pan,pap,pol,por,ron,rus,sin,slk,slv,spa,sqi,srp,swa,swe,tam,tat,tel,tha,tsn,tur,ukr,urd,vie,yue,zho}, --stretch_in_language {afr,amh,ara,arg,asm,aze,ben,bos,bul,cat,ces,cmn,cym,dan,deu,ell,eng,epo,est,eus,fas,fin,fra,gla,gle,glg,grc,grn,guj,heb,hin,hrv,hun,hye,ina,ind,isl,ita,jbo,jpn,kal,kan,kat,kir,kor,kur,lat,lav,lfn,lit,mal,mar,mkd,mlt,msa,mya,nah,nep,nld,nor,ori,orm,pan,pap,pol,por,ron,rus,sin,slk,slv,spa,sqi,srp,swa,swe,tam,tat,tel,tha,tsn,tur,ukr,urd,vie,yue,zho}
Stretch the subtitle with the supported ISO 639-2 language code [https://en.wikipedia.org/wiki/List_of_ISO_639-2_codes].
Stretch the subtitle with the supported ISO 639-3 language code [https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes].
NB: This will be ignored if either -so or --stretch_off is present
-fos, --exit_segfail Exit on any segment alignment failures
-tod TRAINING_OUTPUT_DIRECTORY, --training_output_directory TRAINING_OUTPUT_DIRECTORY
Path to the output directory containing training results
-o OUTPUT, --output OUTPUT
Path to the output subtitle file
-t TRANSLATE, --translate TRANSLATE
Source and target ISO 639-3 language codes separated by a comma (e.g., eng,zho)
-lgs, --languages Print out language codes used for stretch and translation
-d, --debug Print out debugging information
-q, --quiet Switch off logging information
-ver, --version show program's version number and exit
@@ -29,7 +32,7 @@
-v VIDEO_PATH, --video_path VIDEO_PATH
File path or URL to the video file
-s SUBTITLE_PATH, --subtitle_path SUBTITLE_PATH
File path or URL to the subtitle file (Extensions of supported subtitles: .vtt, .dfxp, .ass, .xml, .tmp, .ssa, .srt, .txt, .sami, .sub, .ttml, .smi, .stl, .scc, .sbv and .ytt) or selector for the embedded subtitle (e.g., embedded:page_num=888 or embedded:stream_index=0)
File path or URL to the subtitle file (Extensions of supported subtitles: .ttml, .vtt, .tmp, .dfxp, .xml, .sami, .scc, .sub, .txt, .stl, .ssa, .ytt, .srt, .sbv, .ass, .smi) or selector for the embedded subtitle (e.g., embedded:page_num=888 or embedded:stream_index=0)
"""

import argparse
@@ -59,15 +62,13 @@ def main():
default="",
choices=["single", "dual"],
help="Alignment mode: either single or dual",
required=True,
)
required_args.add_argument(
"-v",
"--video_path",
type=str,
default="",
help="File path or URL to the video file",
required=True,
)
from subaligner.subtitle import Subtitle
required_args.add_argument(
@@ -76,7 +77,6 @@
type=str,
default="",
help="File path or URL to the subtitle file (Extensions of supported subtitles: {}) or selector for the embedded subtitle (e.g., embedded:page_num=888 or embedded:stream_index=0)".format(", ".join(Subtitle.subtitle_extensions())),
required=True,
)
parser.add_argument(
"-l",
@@ -98,7 +98,7 @@
type=str,
choices=Language.ALLOWED_VALUES,
default=Language.ENG,
help="Stretch the subtitle with the supported ISO 639-2 language code [https://en.wikipedia.org/wiki/List_of_ISO_639-2_codes].\nNB: This will be ignored if either -so or --stretch_off is present",
help="Stretch the subtitle with the supported ISO 639-3 language code [https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes].\nNB: This will be ignored if either -so or --stretch_off is present",
)
parser.add_argument(
"-fos",
@@ -120,13 +120,25 @@
default="",
help="Path to the output subtitle file",
)
parser.add_argument(
"-t",
"--translate",
type=str,
help="Source and target ISO 639-3 language codes separated by a comma (e.g., eng,zho)",
)
parser.add_argument("-lgs", "--languages", action="store_true",
help="Print out language codes used for stretch and translation")
parser.add_argument("-d", "--debug", action="store_true",
help="Print out debugging information")
parser.add_argument("-q", "--quiet", action="store_true",
help="Switch off logging information")
parser.add_argument("-ver", "--version", action="version", version=__version__)
FLAGS, unparsed = parser.parse_known_args()

if FLAGS.languages:
for line in Language.CODE_TO_HUMAN_LIST:
print(line.replace("\t", " "))
sys.exit(0)
if FLAGS.mode == "":
print("--mode was not passed in")
sys.exit(21)
@@ -153,6 +165,7 @@
Logger.VERBOSE = FLAGS.debug
Logger.QUIET = FLAGS.quiet
from subaligner.predictor import Predictor
from subaligner.translator import Translator
from subaligner.exception import UnsupportedFormatException
from subaligner.exception import TerminalException
from subaligner.utils import Utils
@@ -201,9 +214,17 @@
stretch_in_lang=stretch_in_lang,
exit_segfail=exit_segfail,
)

aligned_subtitle_path = "_aligned.".join(
FLAGS.subtitle_path.rsplit(".", 1)).replace(".stl", ".srt") if FLAGS.output == "" else FLAGS.output
Subtitle.export_subtitle(local_subtitle_path, aligned_subs, aligned_subtitle_path, frame_rate)

if FLAGS.translate is not None:
source, target = FLAGS.translate.split(",")
translator = Translator(source, target)
aligned_subs = translator.translate(aligned_subs)
Subtitle.export_subtitle(local_subtitle_path, aligned_subs, aligned_subtitle_path, frame_rate, "utf-8")
else:
Subtitle.export_subtitle(local_subtitle_path, aligned_subs, aligned_subtitle_path, frame_rate)
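
The default output naming used when `-o`/`--output` is omitted (joining `_aligned.` into the input path, with STL input exported as SRT) can be reproduced in isolation; the helper name below is hypothetical:

```python
def default_aligned_path(subtitle_path: str) -> str:
    """Derive the default output path: 'movie.srt' -> 'movie_aligned.srt';
    an .stl input is exported with an .srt extension."""
    return "_aligned.".join(subtitle_path.rsplit(".", 1)).replace(".stl", ".srt")

print(default_aligned_path("movie.srt"))  # movie_aligned.srt
print(default_aligned_path("movie.stl"))  # movie_aligned.srt
```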

log_loss = predictor.get_log_loss(voice_probabilities, aligned_subs)
if log_loss is None or log_loss > FLAGS.max_logloss:
2 changes: 1 addition & 1 deletion subaligner/_version.py
@@ -1,2 +1,2 @@
"""The semver for the current release."""
__version__ = "0.1.3"
__version__ = "0.1.4"
4 changes: 2 additions & 2 deletions subaligner/embedder.py
@@ -22,7 +22,7 @@ def __init__(
hop_len: int = 512,
step_sample: float = 0.04,
len_sample: float = 0.075,
):
) -> None:
"""Feature embedder initialiser.
Keyword Arguments:
@@ -235,7 +235,7 @@ def position_to_time_str(self, position: int) -> str:
def extract_data_and_label_from_audio(
self,
audio_file_path: str,
subtitle_file_path: str,
subtitle_file_path: Optional[str],
subtitles: Optional[SubRipFile] = None,
sound_effect_start_marker: Optional[str] = None,
sound_effect_end_marker: Optional[str] = None,
2 changes: 1 addition & 1 deletion subaligner/hparam_tuner.py
@@ -29,7 +29,7 @@ def __init__(self,
num_of_trials: int = 5,
tuning_epochs: int = 5,
network_type: str = Network.LSTM,
**kwargs):
**kwargs) -> None:
"""Hyperparameter tuner initialiser
Arguments:
4 changes: 2 additions & 2 deletions subaligner/hyperparameters.py
@@ -9,7 +9,7 @@ class Hyperparameters(object):

OPTIMIZERS = ["adadelta", "adagrad", "adam", "adamax", "ftrl", "nadam", "rmsprop", "sgd"]

def __init__(self):
def __init__(self) -> None:
"""Hyperparameters initialiser setting default values"""

self.__learning_rate = 0.001
@@ -120,7 +120,7 @@ def optimizer(self, value: str) -> None:
self.__optimizer = "SGD"

@property
def loss(self) -> float:
def loss(self) -> str:
return self.__loss

@property
2 changes: 1 addition & 1 deletion subaligner/logger.py
@@ -11,7 +11,7 @@ class Logger(Singleton):
VERBOSE = True
QUIET = False

def __init__(self, output_log: str = "output.log"):
def __init__(self, output_log: str = "output.log") -> None:
self.__loggers: Dict[str, logging.Logger] = {}
self.__output_log = output_log

