
Commit

v1
zhenye234 committed Aug 29, 2023
1 parent 8662604 commit 18ce9d3
Showing 38 changed files with 173,377 additions and 2 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -0,0 +1,3 @@
DUMMY/
__pycache__/

19 changes: 19 additions & 0 deletions LICENSE
@@ -0,0 +1,19 @@
Copyright (c) 2023 Zhen YE

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
51 changes: 49 additions & 2 deletions README.md
@@ -1,2 +1,49 @@
# CoMoSpeech
coming soon


# COMOSPEECH

Implementation of CoMoSpeech. For details, check out our paper accepted to ACM MM 2023: "CoMoSpeech: One-Step Speech and Singing Voice Synthesis via Consistency Model".

**Authors**: Zhen Ye, Wei Xue, Xu Tan, Jie Chen, Qifeng Liu, Yike Guo.



## Abstract

**Demo page**: [link](https://comospeech.github.io/).

Denoising diffusion probabilistic models (DDPMs) have shown promising performance for speech synthesis. However, a large number of iterative steps are required to achieve high sample quality, which restricts inference speed. Maintaining sample quality while increasing sampling speed has become a challenging task. In this paper, we propose a **Co**nsistency **Mo**del-based Speech synthesis method, CoMoSpeech, which achieves speech synthesis through a single diffusion sampling step while maintaining high audio quality. A consistency constraint is applied to distill a consistency model from a well-designed diffusion-based teacher model, which ultimately yields superior performance in the distilled CoMoSpeech.
Our experiments show that, by generating audio recordings in a single sampling step, CoMoSpeech achieves an inference speed more than 150 times faster than real-time on a single NVIDIA A100 GPU, comparable to FastSpeech2, making diffusion-sampling-based speech synthesis truly practical. Meanwhile, objective and subjective evaluations on text-to-speech and singing voice synthesis show that the proposed teacher models yield the best audio quality, and the one-step CoMoSpeech achieves the best inference speed with better or comparable audio quality to other conventional multi-step diffusion-model baselines.
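
As background only (this is the standard consistency-distillation objective of Song et al., not a formula taken from this repository or the paper; see the paper for the exact formulation used in CoMoSpeech), the consistency constraint typically trains the student $f_\theta$ to be self-consistent along the teacher's probability-flow ODE:

$$
\mathcal{L}_{\mathrm{CD}} = \mathbb{E}\big[\, d\big(f_\theta(\mathbf{x}_{t_{n+1}}, t_{n+1}),\; f_{\theta^-}(\hat{\mathbf{x}}^{\phi}_{t_n}, t_n)\big) \,\big],
$$

where $\hat{\mathbf{x}}^{\phi}_{t_n}$ is produced from $\mathbf{x}_{t_{n+1}}$ by one ODE-solver step of the teacher model $\phi$, $\theta^-$ is an exponential moving average of the student parameters $\theta$, and $d(\cdot,\cdot)$ is a distance metric.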

## Prepare

Build `monotonic_align` code (Cython):

```bash
cd model/monotonic_align; python setup.py build_ext --inplace; cd ../..
```
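
If the build succeeds, a quick import check should pass (this assumes the package exposes `maximum_path`, as it does in Grad-TTS, from which this code is adapted):

```bash
python -c "from model.monotonic_align import maximum_path; print('monotonic_align build OK')"
```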



## Inference

Run the script `inference.py`, providing the path to the text file, the path to the checkpoint, and the number of sampling steps:
```bash
python inference.py -f <text file> -c <checkpoint> -t <sampling steps>
```
Generated audio is written to the `out` folder. Note that in the params file, `Teacher = True` selects our teacher model and `Teacher = False` selects our CoMoSpeech. In addition, we use the same vocoder as [Grad-TTS](https://github.com/huawei-noah/Speech-Backbones/blob/main/Grad-TTS/); you can download it and put it into the `checkpts` folder.
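
For example, a single-step CoMoSpeech run could look like the following (the text file and checkpoint paths here are only placeholders; point them at your own files):

```bash
python inference.py -f ./resources/filelists/synthesis.txt -c ./checkpts/como.pt -t 1
```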


## Training

We use the LJSpeech dataset and follow the train/test/val split from FastSpeech2; you can change the split in the `fs2_txt` folder. Then run the script `train.py`:
```bash
python train.py
```
Note that in the params file, `Teacher = True` is for our teacher model and `Teacher = False` is for our CoMoSpeech. While training CoMoSpeech, the teacher checkpoint directory should be provided.
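
As a rough illustration, the switches involved might look like the sketch below. The variable names and paths are illustrative only, not copied from this repository's params file; check that file for the real names.

```python
# Illustrative sketch only -- see the params file in this repo for the actual variable names.
Teacher = False                                        # True: train the teacher diffusion model; False: train CoMoSpeech
teacher_checkpoint = './logs/teacher_exp/teacher.pt'   # hypothetical path to a trained teacher checkpoint
```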

## Acknowledgement
I would like to extend special thanks to the authors of Grad-TTS, since our codebase is mainly borrowed from [Grad-TTS](https://github.com/huawei-noah/Speech-Backbones/blob/main/Grad-TTS/).

## Contact
You are welcome to send pull requests or share some ideas with me. Contact information: Zhen YE ( zhenye312@gmail.com )
178 changes: 178 additions & 0 deletions data.py
@@ -0,0 +1,178 @@
import random
import numpy as np

import torch
import torchaudio as ta

from text import text_to_sequence, cmudict
from text.symbols import symbols
from utils import parse_filelist, intersperse
from model.utils import fix_len_compatibility
from params import seed as random_seed

import sys
sys.path.insert(0, 'hifi-gan')
from meldataset import mel_spectrogram


class TextMelDataset(torch.utils.data.Dataset):
    def __init__(self, filelist_path, cmudict_path, add_blank=True,
                 n_fft=1024, n_mels=80, sample_rate=22050,
                 hop_length=256, win_length=1024, f_min=0., f_max=8000):
        self.filepaths_and_text = parse_filelist(filelist_path)
        self.cmudict = cmudict.CMUDict(cmudict_path)
        self.add_blank = add_blank
        self.n_fft = n_fft
        self.n_mels = n_mels
        self.sample_rate = sample_rate
        self.hop_length = hop_length
        self.win_length = win_length
        self.f_min = f_min
        self.f_max = f_max
        random.seed(random_seed)
        # random.shuffle(self.filepaths_and_text)

    def get_pair(self, filepath_and_text):
        filepath, text = filepath_and_text[0], filepath_and_text[1]
        text = self.get_text(text, add_blank=self.add_blank)
        mel = self.get_mel(filepath)
        return (text, mel)

    def get_mel(self, filepath):
        audio, sr = ta.load(filepath)
        assert sr == self.sample_rate
        mel = mel_spectrogram(audio, self.n_fft, self.n_mels, self.sample_rate, self.hop_length,
                              self.win_length, self.f_min, self.f_max, center=False).squeeze()
        return mel

    def get_text(self, text, add_blank=True):
        text_norm = text_to_sequence(text, dictionary=self.cmudict)
        if self.add_blank:
            text_norm = intersperse(text_norm, len(symbols))  # add a blank token, whose id number is len(symbols)
        text_norm = torch.IntTensor(text_norm)
        return text_norm

    def __getitem__(self, index):
        text, mel = self.get_pair(self.filepaths_and_text[index])
        item = {'y': mel, 'x': text}
        return item

    def __len__(self):
        return len(self.filepaths_and_text)

    def sample_test_batch(self, size):
        idx = [0, 1]  # np.random.choice(range(len(self)), size=size, replace=False)
        test_batch = []
        for index in idx:
            test_batch.append(self.__getitem__(index))
        return test_batch


class TextMelBatchCollate(object):
    def __call__(self, batch):
        B = len(batch)
        y_max_length = max([item['y'].shape[-1] for item in batch])
        y_max_length = fix_len_compatibility(y_max_length)
        x_max_length = max([item['x'].shape[-1] for item in batch])
        n_feats = batch[0]['y'].shape[-2]

        y = torch.zeros((B, n_feats, y_max_length), dtype=torch.float32)
        x = torch.zeros((B, x_max_length), dtype=torch.long)
        y_lengths, x_lengths = [], []

        for i, item in enumerate(batch):
            y_, x_ = item['y'], item['x']
            y_lengths.append(y_.shape[-1])
            x_lengths.append(x_.shape[-1])
            y[i, :, :y_.shape[-1]] = y_
            x[i, :x_.shape[-1]] = x_

        y_lengths = torch.LongTensor(y_lengths)
        x_lengths = torch.LongTensor(x_lengths)
        return {'x': x, 'x_lengths': x_lengths, 'y': y, 'y_lengths': y_lengths}


class TextMelSpeakerDataset(torch.utils.data.Dataset):
    def __init__(self, filelist_path, cmudict_path, add_blank=True,
                 n_fft=1024, n_mels=80, sample_rate=22050,
                 hop_length=256, win_length=1024, f_min=0., f_max=8000):
        super().__init__()
        self.filelist = parse_filelist(filelist_path, split_char='|')
        self.cmudict = cmudict.CMUDict(cmudict_path)
        self.n_fft = n_fft
        self.n_mels = n_mels
        self.sample_rate = sample_rate
        self.hop_length = hop_length
        self.win_length = win_length
        self.f_min = f_min
        self.f_max = f_max
        self.add_blank = add_blank
        random.seed(random_seed)
        random.shuffle(self.filelist)

    def get_triplet(self, line):
        filepath, text, speaker = line[0], line[1], line[2]
        text = self.get_text(text, add_blank=self.add_blank)
        mel = self.get_mel(filepath)
        speaker = self.get_speaker(speaker)
        return (text, mel, speaker)

    def get_mel(self, filepath):
        audio, sr = ta.load(filepath)
        assert sr == self.sample_rate
        mel = mel_spectrogram(audio, self.n_fft, self.n_mels, self.sample_rate, self.hop_length,
                              self.win_length, self.f_min, self.f_max, center=False).squeeze()
        return mel

    def get_text(self, text, add_blank=True):
        text_norm = text_to_sequence(text, dictionary=self.cmudict)
        if self.add_blank:
            text_norm = intersperse(text_norm, len(symbols))  # add a blank token, whose id number is len(symbols)
        text_norm = torch.LongTensor(text_norm)
        return text_norm

    def get_speaker(self, speaker):
        speaker = torch.LongTensor([int(speaker)])
        return speaker

    def __getitem__(self, index):
        text, mel, speaker = self.get_triplet(self.filelist[index])
        item = {'y': mel, 'x': text, 'spk': speaker}
        return item

    def __len__(self):
        return len(self.filelist)

    def sample_test_batch(self, size):
        idx = np.random.choice(range(len(self)), size=size, replace=False)
        test_batch = []
        for index in idx:
            test_batch.append(self.__getitem__(index))
        return test_batch


class TextMelSpeakerBatchCollate(object):
    def __call__(self, batch):
        B = len(batch)
        y_max_length = max([item['y'].shape[-1] for item in batch])
        y_max_length = fix_len_compatibility(y_max_length)
        x_max_length = max([item['x'].shape[-1] for item in batch])
        n_feats = batch[0]['y'].shape[-2]

        y = torch.zeros((B, n_feats, y_max_length), dtype=torch.float32)
        x = torch.zeros((B, x_max_length), dtype=torch.long)
        y_lengths, x_lengths = [], []
        spk = []

        for i, item in enumerate(batch):
            y_, x_, spk_ = item['y'], item['x'], item['spk']
            y_lengths.append(y_.shape[-1])
            x_lengths.append(x_.shape[-1])
            y[i, :, :y_.shape[-1]] = y_
            x[i, :x_.shape[-1]] = x_
            spk.append(spk_)

        y_lengths = torch.LongTensor(y_lengths)
        x_lengths = torch.LongTensor(x_lengths)
        spk = torch.cat(spk, dim=0)
        return {'x': x, 'x_lengths': x_lengths, 'y': y, 'y_lengths': y_lengths, 'spk': spk}
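
A minimal usage sketch for these classes (the paths, batch size, and worker count below are placeholders, not values taken from this repository):

```python
from torch.utils.data import DataLoader

from data import TextMelDataset, TextMelBatchCollate

# Placeholder paths -- point these at your own filelist and CMU dictionary.
dataset = TextMelDataset('resources/filelists/train.txt', 'resources/cmu_dictionary')
loader = DataLoader(dataset,
                    batch_size=16,
                    collate_fn=TextMelBatchCollate(),
                    shuffle=True,
                    num_workers=4)

for batch in loader:
    x, x_lengths = batch['x'], batch['x_lengths']   # padded phoneme ids and their lengths
    y, y_lengths = batch['y'], batch['y_lengths']   # padded mel-spectrograms and their lengths
    break
```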
