Commit 3901e9d
Haoran Zhang committed on Jan 9, 2025
1 parent: 00a6e1b
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Showing 12 changed files with 2,685 additions and 75 deletions.
@@ -0,0 +1,3 @@
"""
Datasets
"""
Large diffs are not rendered by default.
@@ -0,0 +1,53 @@
"""
Dataset / task interface
"""

from __future__ import annotations
from typing import Dict, Callable, Sequence, Optional
import dataclasses

from returnn_common.datasets_old_2022_10.interface import DatasetConfig
from i6_experiments.users.zeyer.datasets.score_results import RecogOutput, ScoreResult, ScoreResultCollection, MeasureType


@dataclasses.dataclass
class Task:
    """
    Covers the training dataset and the dev/eval datasets for recognition, including how to score them.
    This goes beyond :class:`DatasetConfig`, or rather covers multiple :class:`DatasetConfig` instances.
    It should be possible to replace Librispeech by Switchboard, and maybe later even translation tasks.
    Note that the dataset also already includes things like feature extraction details and output labels (BPE etc.).
    """
    name: str  # to differentiate between different tasks; might be used for the output dir name

    # for training
    train_dataset: DatasetConfig  # also includes the cross-validation dataset for learning-rate scheduling etc.
    train_epoch_split: int

    # for recognition
    dev_dataset: DatasetConfig  # used to select the best epoch, and maybe to tune the LM scale
    eval_datasets: Dict[str, DatasetConfig]

    main_measure_type: MeasureType  # e.g. WER%
    main_measure_name: str  # e.g. a dataset name, but arbitrary; just describes the main measure value

    score_recog_output_func: Callable[[DatasetConfig, RecogOutput], ScoreResult]

    # for prior calculation
    prior_dataset: Optional[DatasetConfig] = None

    # e.g. for bpe_to_words. This is here because it depends on the type of vocab.
    recog_post_proc_funcs: Sequence[Callable[[RecogOutput], RecogOutput]] = ()

    def default_collect_score_results(self, score_results: Dict[str, ScoreResult]) -> ScoreResultCollection:
        """Uses main_measure_name as the main key in score_results."""
        from i6_experiments.users.zeyer.datasets.score_results import join_score_results
        return join_score_results(score_results, main_measure_key=self.main_measure_name)

    collect_score_results_func: Optional[Callable[[Dict[str, ScoreResult]], ScoreResultCollection]] = None

    def __post_init__(self):
        if self.collect_score_results_func is None:
            self.collect_score_results_func = self.default_collect_score_results
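For orientation, here is a minimal sketch of how a Task could be instantiated. The DatasetConfig instances, the scoring helper, and the measure-type value are hypothetical stand-ins, not part of this commit:

def score_with_sclite(dataset: DatasetConfig, recog_output: RecogOutput) -> ScoreResult:
    ...  # hypothetical scorer: e.g. convert the recog output to CTM and score it against the reference

task = Task(
    name="librispeech",
    train_dataset=train_dataset_config,  # some DatasetConfig (hypothetical)
    train_epoch_split=20,
    dev_dataset=dev_other_config,  # some DatasetConfig (hypothetical)
    eval_datasets={"dev-other": dev_other_config, "test-other": test_other_config},
    main_measure_type=wer_measure_type,  # some MeasureType instance, e.g. WER%
    main_measure_name="dev-other",
    score_recog_output_func=score_with_sclite,
)
# collect_score_results_func falls back to default_collect_score_results via __post_init__.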
@@ -0,0 +1,118 @@
import os

from typing import Dict, Tuple, Union, Any, Optional, Sequence

from i6_core.lib import corpus
from sisyphus import Job, Task as SisTask, tk
from i6_core.util import uopen


class CorpusReplaceOrthFromPyDictJob(Job):
    """
    Merge HDF pseudo labels back into a Bliss corpus.
    """

    def __init__(self, bliss_corpus, recog_words_file, segment_file=None):
        """
        :param Path bliss_corpus: Bliss corpus
        :param Path recog_words_file: a recog_words file
        :param Path|None segment_file: only replace the segments specified in the segment file
        """
        self.bliss_corpus = bliss_corpus
        self.recog_words_file = recog_words_file
        self.segment_file = segment_file

        self.out_corpus = self.output_path("corpus.xml.gz")

    def tasks(self):
        yield SisTask("run", mini_task=True)

    def run(self):
        c = corpus.Corpus()
        c.load(self.bliss_corpus.get_path())

        if self.segment_file:
            with uopen(self.segment_file.get_path(), "rt") as f:
                segments_whitelist = set(l.strip() for l in f.readlines() if len(l.strip()) > 0)
            segment_iterator = filter(lambda s: s.fullname() in segments_whitelist, c.segments())
        else:
            segment_iterator = c.segments()

        # The recog_words file is a Python dict literal whose "path" entry maps the corpus name to the search output file.
        d = eval(uopen(self.recog_words_file, "rt").read(), {"nan": float("nan"), "inf": float("inf")})
        assert isinstance(d, dict), "Has to be a dict containing the path to the search output file"

        assert c.fullname() in d["path"], "Corpus not in search output"

        d = eval(uopen(d["path"][c.fullname()], "rt").read(), {"nan": float("nan"), "inf": float("inf")})
        assert isinstance(d, dict), "only search output file with dict format is supported"

        j = 0  # number of segments with an empty pseudo label (their recordings get removed)
        for segment in segment_iterator:
            assert segment.fullname() in d, f"Segment {segment.fullname()} not in search output"
            line = d[segment.fullname()]
            if len(line) == 0:
                assert segment.recording is not None, f"Segment {segment.fullname()} has no recording"
                assert len(segment.recording.segments) == 1, (
                    f"Recording {segment.recording.fullname()} has more than one segment"
                    f" ({segment.recording.segments})"
                )
                print(f"Segment {segment.fullname()} has an empty pseudo label. It should be {segment.orth}")
                c.remove_recording(segment.recording)
                j += 1
            else:
                segment.orth = line.strip()
        n = len(c.recordings)
        m = len(d)
        assert m == n + j, f"Number of segments in corpus ({n + j}) does not match number of segments in search output ({m})"

        print(f"Number of segments with empty pseudo label: {j} out of {m}, percentage: {j / m}")
        c.dump(self.out_corpus.get_path())
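For context, a minimal sketch of how this job might be wired into a Sisyphus setup; the two paths below are hypothetical placeholders, not values from this commit:

from sisyphus import tk

bliss = tk.Path("/path/to/train-other-860.corpus.xml.gz")  # hypothetical Bliss corpus
recog_words = tk.Path("/path/to/recog_results.py")  # hypothetical dict file with the "path" entry expected by run()
replace_job = CorpusReplaceOrthFromPyDictJob(bliss_corpus=bliss, recog_words_file=recog_words)
tk.register_output("pseudo_labels/corpus.xml.gz", replace_job.out_corpus)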
def get_ogg_zip_dict_pseudo_labels(bliss_corpus_dict: Dict[str, tk.Path]) -> Dict[str, tk.Path]:
    from i6_core.returnn.oggzip import BlissToOggZipJob

    ogg_zip_dict = {}
    for name, bliss_corpus in bliss_corpus_dict.items():
        ogg_zip_job = BlissToOggZipJob(
            bliss_corpus,
            no_audio=True,
            returnn_python_exe=None,
            returnn_root=None,
        )
        ogg_zip_job.add_alias(
            os.path.join("datasets", "LibriSpeech-PseudoLabels", "%s_ogg_zip_job" % name.replace("-", "_"))
        )
        ogg_zip_dict[name] = ogg_zip_job.out_ogg_zip

    return ogg_zip_dict
class MetaDataset:
    """
    Represents a :class:`MetaDataset` in RETURNN.
    Only allows the MetaDataset to be used with an explicit control dataset.
    """

    def __init__(self,
                 data_map: Dict[str, Tuple[str, str]],
                 datasets: Dict[str, Dict],
                 seq_order_control_dataset: str,
                 other_opts: Optional[Dict[str, Any]] = None):
        """
        :param data_map: maps a data key of the MetaDataset to (sub-dataset name, data key within that sub-dataset)
        :param datasets: maps a sub-dataset name to its RETURNN dataset options dict
        :param seq_order_control_dataset: name of the sub-dataset which controls the sequence order
        :param dict other_opts: additional options merged into the resulting dataset dict
        """
        self.data_map = data_map
        self.datasets = datasets
        assert seq_order_control_dataset in datasets
        self.seq_order_control_dataset = seq_order_control_dataset
        if other_opts is None:
            other_opts = {}
        self.other_opts = other_opts

    def as_returnn_opts(self):
        d = {
            'class': 'MetaDataset',
            'data_map': self.data_map,
            'datasets': self.datasets,
            'seq_order_control_dataset': self.seq_order_control_dataset
        }
        d.update(self.other_opts)
        return d
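As a usage sketch (the sub-dataset names and options below are made up for illustration, not taken from this commit), as_returnn_opts() simply merges the fixed MetaDataset keys with other_opts into a plain dict that can be put into a RETURNN config:

meta = MetaDataset(
    data_map={"data": ("ogg_zip", "data"), "classes": ("pseudo_labels_hdf", "data")},
    datasets={
        "ogg_zip": {"class": "OggZipDataset", "path": ["train.ogg.zip"]},  # illustrative opts
        "pseudo_labels_hdf": {"class": "HDFDataset", "files": ["pseudo_labels.hdf"]},  # illustrative opts
    },
    seq_order_control_dataset="ogg_zip",
)
train = meta.as_returnn_opts()
# -> {"class": "MetaDataset", "data_map": {...}, "datasets": {...}, "seq_order_control_dataset": "ogg_zip"}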