fix normalizer and add config for fine-tuning

traderpedroso · Aug 28, 2024 · 0318871 · 0318871
1 parent ca97496
commit 0318871
Show file tree

Hide file tree

Showing 4 changed files with 123 additions and 30 deletions.
diff --git a/configs/autoreg_config.yaml b/configs/autoreg_config.yaml
@@ -0,0 +1,51 @@
+
+paths:
+  checkpoint_dir: checkpoints   # Directory to store model checkpoints and TensorBoard output; created if it does not exist.
+  data_dir: datasets            # Directory to store processed data; created if it does not exist.
+
+preprocessing:
+  languages: ['de', 'en_us']    # All languages in the dataset.
+
+  # Text (grapheme) and phoneme symbols, either provide a string or list of strings.
+  # Symbols in the dataset will be filtered according to these lists!
+  text_symbols: 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZäöüÄÖÜß'
+  phoneme_symbols: ['a', 'b', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'æ', 'ç', 'ð', 'ø', 'ŋ', 'œ', 'ɐ', 'ɑ', 'ɔ', 'ə', 'ɛ', 'ɝ', 'ɹ', 'ɡ', 'ɪ', 'ʁ', 'ʃ', 'ʊ', 'ʌ', 'ʏ', 'ʒ', 'ʔ', 'ˈ', 'ˌ', 'ː', '̃', '̍', '̥', '̩', '̯', '͡', 'θ']
+
+  char_repeats: 1                # Number of grapheme character repeats to allow for mapping to longer phoneme sequences.
+                                 # Set to 1 for autoreg_transformer.
+  lowercase: true                # Whether to lowercase the grapheme input.
+  n_val: 5000                    # Default number of validation data points if no explicit validation data is provided.
+
+
+model:
+  type: 'autoreg_transformer'        # Whether to use a forward transformer or autoregressive transformer model.
+                                     # Choices: ['transformer', 'autoreg_transformer']
+  d_model: 512
+  d_fft: 1024
+  layers: 4
+  dropout: 0.1
+  heads: 4
+
+training:
+
+  # Hyperparams for learning rate and scheduler.
+  # The scheduler reduces the lr on a plateau of the phoneme error rate (tested every generate_steps).
+
+  learning_rate: 0.0001              # Learning rate of Adam.
+  warmup_steps: 10000                # Linear increase of the lr from zero to the given lr within the given number of steps.
+  scheduler_plateau_factor: 0.5      # Factor to multiply learning rate on plateau.
+  scheduler_plateau_patience: 10     # Number of text generations with no improvement to tolerate.
+  batch_size: 32                     # Training batch size.
+  batch_size_val: 32                 # Validation batch size.
+  epochs: 500                        # Number of epochs to train.
+  generate_steps: 10000              # Interval of training steps to generate sample outputs. Also, at this step the phoneme and word
+                                     # error rates are calculated for the scheduler.
+  validate_steps: 10000              # Interval of training steps to validate the model
+                                     # (for the autoregressive model this is teacher-forced).
+  checkpoint_steps: 100000           # Interval of training steps to save the model.
+  n_generate_samples: 10             # Number of result samples to show on tensorboard.
+  store_phoneme_dict_in_model: true  # Whether to store the raw phoneme dict in the model.
+                                     # It will be loaded by the phonemizer object.
+  ddp_backend: 'nccl'                # Backend used by Torch DDP
+  ddp_host: 'localhost'              # Hostname used by Torch DDP
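+  ddp_post: '12355'                    # Port used by Torch DDP. NOTE(review): key is spelled 'ddp_post'; probably meant 'ddp_port' - confirm against the training code.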
diff --git a/configs/forward_config.yaml b/configs/forward_config.yaml
@@ -0,0 +1,51 @@
+
+paths:
+  checkpoint_dir: checkpoints   # Directory to store model checkpoints and TensorBoard output; created if it does not exist.
+  data_dir: datasets            # Directory to store processed data; created if it does not exist.
+
+preprocessing:
+  languages: ['de', 'en_us']    # All languages in the dataset.
+
+  # Text (grapheme) and phoneme symbols, either provide a string or list of strings.
+  # Symbols in the dataset will be filtered according to these lists!
+  text_symbols: 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZäöüÄÖÜß'
+  phoneme_symbols: ['a', 'b', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'æ', 'ç', 'ð', 'ø', 'ŋ', 'œ', 'ɐ', 'ɑ', 'ɔ', 'ə', 'ɛ', 'ɝ', 'ɹ', 'ɡ', 'ɪ', 'ʁ', 'ʃ', 'ʊ', 'ʌ', 'ʏ', 'ʒ', 'ʔ', 'ˈ', 'ˌ', 'ː', '̃', '̍', '̥', '̩', '̯', '͡', 'θ']
+
+  char_repeats: 3                # Number of grapheme character repeats to allow for mapping to longer phoneme sequences.
+                                 # Set to 1 for autoreg_transformer.
+  lowercase: true                # Whether to lowercase the grapheme input.
+  n_val: 5000                    # Default number of validation data points if no explicit validation data is provided.
+
+
+model:
+  type: 'transformer'            # Whether to use a forward transformer or autoregressive transformer model.
+                                 # Choices: ['transformer', 'autoreg_transformer']
+  d_model: 512
+  d_fft: 1024
+  layers: 6
+  dropout: 0.1
+  heads: 4
+
+training:
+
+  # Hyperparams for learning rate and scheduler.
+  # The scheduler reduces the lr on a plateau of the phoneme error rate (tested every generate_steps).
+
+  learning_rate: 0.0001              # Learning rate of Adam.
+  warmup_steps: 10000                # Linear increase of the lr from zero to the given lr within the given number of steps.
+  scheduler_plateau_factor: 0.5      # Factor to multiply learning rate on plateau.
+  scheduler_plateau_patience: 10     # Number of text generations with no improvement to tolerate.
+  batch_size: 32                     # Training batch size.
+  batch_size_val: 32                 # Validation batch size.
+  epochs: 500                        # Number of epochs to train.
+  generate_steps: 10000              # Interval of training steps to generate sample outputs. Also, at this step the phoneme and word
+                                     # error rates are calculated for the scheduler.
+  validate_steps: 10000              # Interval of training steps to validate the model
+                                     # (for the autoregressive model this is teacher-forced).
+  checkpoint_steps: 100000           # Interval of training steps to save the model.
+  n_generate_samples: 10             # Number of result samples to show on tensorboard.
+  store_phoneme_dict_in_model: true  # Whether to store the raw phoneme dict in the model.
+                                     # It will be loaded by the phonemizer object.
+  ddp_backend: 'nccl'                # Backend used by Torch DDP
+  ddp_host: 'localhost'              # Hostname used by Torch DDP
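+  ddp_post: '12355'                    # Port used by Torch DDP. NOTE(review): key is spelled 'ddp_post'; probably meant 'ddp_port' - confirm against the training code.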
diff --git a/setup.py b/setup.py
@@ -7,10 +7,10 @@
 DeepPhonemizerBR is compatible with Python 3.6+ and is distributed under the MIT license.
 
 """
-# Version: 0.0.6
+# Version: 0.0.7
 setup(
     name="xphonebr",
-    version="0.0.6",
+    version="0.0.7",
     author="Emerson Pedroso",
     author_email="traderpedroso@icloud.com",
     description="Grapheme to phoneme conversion and tools for tts with deep learning.",

diff --git a/xphonebr/Util/norm.py b/xphonebr/Util/norm.py
@@ -65,18 +65,6 @@
         ("ghz", "giga-hertz"),
         ("km", "quilômetro"),
         ("ltda", "limitada"),
-        ("jan", "janeiro"),
-        ("fev", "fevereiro"),
-        ("mar", "março"),
-        ("abr", "abril"),
-        ("mai", "maio"),
-        ("jun", "junho"),
-        ("jul", "julho"),
-        ("ago", "agosto"),
-        ("set", "setembro"),
-        ("out", "outubro"),
-        ("nov", "novembro"),
-        ("dez", "dezembro"),
         ("pág", "página"),
         ("págs", "páginas"),
         ("s.a", "sociedade anônima"),
@@ -166,8 +154,12 @@ def money_to_words_integers(match):
         return f"{amount_text} {currency_text}"
 
     # Expressão regular para valores com milhões e bilhões
-    text = re.sub(r"(R\$|€|£|\$) (\d+)( milhões| bilhões)", money_to_words_millions, text)
-    text = re.sub(r"(R\$|€|£|\$)(\d+)( milhões| bilhões)", money_to_words_millions, text)
+    text = re.sub(
+        r"(R\$|€|£|\$) (\d+)( milhões| bilhões)", money_to_words_millions, text
+    )
+    text = re.sub(
+        r"(R\$|€|£|\$)(\d+)( milhões| bilhões)", money_to_words_millions, text
+    )
 
     # Expressão regular para valores com centavos
     text = re.sub(r"(R\$|€|£|\$) (\d+),(\d{2})", money_to_words_cents, text)
@@ -212,18 +204,17 @@ def _normalize_numbers_with_letters(text):
     )
 
 
-
 def normalizer(text):
-        text = _normalize_percentages(text)
-        text = _normalize_time(text)
-        text = _normalize_money(text)
-        text = _normalize_am_pm_times(text)
-        text = _normalize_numbers_with_letters(text)
-        text = _normalize_numbers(text)
-        text = _normalize_abbreviations(text)
-        text = replace_punctuation(text)
-        text = remove_aux_symbols(text)
-        text = remove_punctuation_at_begin(text)
-        text = collapse_whitespace(text)
-        text = re.sub(r"([^\.,!\?\-…])$", r"\1.", text)
-        return text
+    text = _normalize_percentages(text)
+    text = _normalize_time(text)
+    text = _normalize_money(text)
+    text = _normalize_am_pm_times(text)
+    text = _normalize_numbers_with_letters(text)
+    text = _normalize_numbers(text)
+    text = _normalize_abbreviations(text)
+    text = replace_punctuation(text)
+    text = remove_aux_symbols(text)
+    text = remove_punctuation_at_begin(text)
+    text = collapse_whitespace(text)
+    text = re.sub(r"([^\.,!\?\-…])$", r"\1.", text)
+    return text