# pyproject.toml

# Poetry package metadata: distribution name, release version, summary,
# author list, and license identifier.
[tool.poetry]
name = "stepwise_tacotron"
version = "0.1.1"
description = "Tacotron with stepwise monotonic attention"
authors = ["Thông Nguyễn"]
license = "MIT"

# Runtime dependencies. A caret constraint ("^X.Y.Z") allows updates that do
# not change the left-most non-zero version component.
[tool.poetry.dependencies]
# Supported interpreters: 3.7 and newer, excluding 3.11 and above.
python = ">=3.7,<3.11"
Unidecode = "^1.3.2"
toml = "^0.10.2"
tensorflow-cpu = "^2.8.0"
pax3 = "^0.5.5"
opax = "^0.2.8"
# Declared optional, so it is not part of a default install.
# NOTE(review): no [tool.poetry.extras] table is visible in this file, so
# nothing appears to expose this optional dependency — confirm whether jax is
# meant to be installed separately by the user.
jax = { version = "^0.3.0", optional = true }
matplotlib = "^3.5.1"
librosa = "^0.9.1"
tqdm = "^4.62.3"
pooch = "^1.6.0"
fire = "^0.4.0"
soxr = "^0.2.7"

# Build configuration: build front-ends install poetry-core and use its
# masonry API as the build backend.
[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"

# Project-specific hyperparameters, kept in a custom table.
# NOTE(review): pyproject.toml convention places tool-specific tables under
# [tool.<name>]; this one sits at the root. Moving it would require updating
# whatever reads this table, so it is left in place.
[tacotron]

# Training settings.
TRAINING_STEPS = 500000
BATCH_SIZE = 64
# Learning rate.
LR = 1024e-6
MODEL_PREFIX = "mono_tts_cbhg_small"
LOG_DIR = "./logs"
CKPT_DIR = "./ckpts"
# Whether to train with mixed precision.
USE_MP = false
STEPS_PER_CALL = 10
TEST_DATA_SIZE = 100

# Data settings.
# Directory holding the TensorFlow data.
TF_DATA_DIR = "./tf_data"
# Directory holding the TF GTA data
# (GTA presumably means ground-truth-aligned — confirm).
TF_GTA_DATA_DIR = "./tf_gta_data"
# Target sample rate; audio is converted to it if needed.
SAMPLE_RATE = 24000
# Dimension of the mel-spectrogram features.
MEL_DIM = 80
MEL_MIN = 1e-5
# Padding character.
PAD = "_"
PAD_TOKEN = 0
# Character that signals the end of the transcript.
END_CHARACTER = "■"

# Model settings.
# Reduction factor.
RR = 1
MAX_RR = 2
# Controls how slowly the attention moves forward.
ATTN_BIAS = 0.0
SIGMOID_NOISE = 2.0
PRENET_DIM = 128
TEXT_DIM = 256
RNN_DIM = 512
ATTN_RNN_DIM = 256
ATTN_HIDDEN_DIM = 128
POSTNET_DIM = 512