# Copyright 2019-2023 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
15"""
16This file contains specific audio dataset loading classes. You can easily use
17these classes to load the prepared dataset. For example:
18    LJSpeechDataset: which is lj speech dataset.
19    YesNoDataset: which is yes or no dataset.
20    SpeechCommandsDataset: which is speech commands dataset.
21    TedliumDataset: which is tedlium dataset.
22    ...
23After declaring the dataset object, you can further apply dataset operations
24(e.g. filter, skip, concat, map, batch) on it.
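
A minimal, illustrative sketch of such a pipeline (the directory path is a
placeholder, not part of the original docs):

    import mindspore.dataset as ds

    # Load a prepared dataset, then chain operations onto it.
    dataset = ds.LJSpeechDataset("/path/to/LJSpeech-1.1", num_samples=100)
    dataset = dataset.shuffle(buffer_size=4)  # reorder samples
    dataset = dataset.skip(10)                # drop the first 10 samples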
25"""
26import mindspore._c_dataengine as cde
27
28from .datasets import AudioBaseDataset, MappableDataset
29from .validators import check_cmu_arctic_dataset, check_gtzan_dataset, check_libri_tts_dataset, \
30    check_lj_speech_dataset, check_speech_commands_dataset, check_tedlium_dataset, check_yes_no_dataset
31
32from ..core.validator_helpers import replace_none
33
34
class CMUArcticDataset(MappableDataset, AudioBaseDataset):
    """
    CMU Arctic dataset.

    The generated dataset has four columns: :py:obj:`[waveform, sample_rate, transcript, utterance_id]` .
    The tensor of column :py:obj:`waveform` is of the float32 type.
    The tensor of column :py:obj:`sample_rate` is a scalar of the uint32 type.
    The tensor of column :py:obj:`transcript` is a scalar of the string type.
    The tensor of column :py:obj:`utterance_id` is a scalar of the string type.

    Args:
        dataset_dir (str): Path to the root directory that contains the dataset.
        name (str, optional): Part of this dataset, can be ``'aew'``, ``'ahw'``, ``'aup'``,
            ``'awb'``, ``'axb'``, ``'bdl'``, ``'clb'``, ``'eey'``, ``'fem'``, ``'gka'``, ``'jmk'``,
            ``'ksp'``, ``'ljm'``, ``'lnh'``, ``'rms'``, ``'rxr'``, ``'slp'`` or ``'slt'``.
            Default: ``None``, means ``'aew'``.
        num_samples (int, optional): The number of audio samples to be included in the dataset.
            Default: ``None``, will read all audio samples.
        num_parallel_workers (int, optional): Number of worker threads to read the data.
            Default: ``None``, will use the global default number of workers (8); it can be set
            by :func:`mindspore.dataset.config.set_num_parallel_workers` .
        shuffle (bool, optional): Whether or not to perform shuffle on the dataset.
            Default: ``None``, expected order behavior shown in the table below.
        sampler (Sampler, optional): Object used to choose samples from the
            dataset. Default: ``None``, expected order behavior shown in the table below.
        num_shards (int, optional): Number of shards that the dataset will be divided into.
            Default: ``None``, no dividing. When this argument is specified, `num_samples`
            reflects the maximum number of samples per shard.
        shard_id (int, optional): The shard ID within `num_shards` . Default: ``None``, will use ``0``. This
            argument can only be specified when `num_shards` is also specified.
        cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
            `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
            Default: ``None``, which means no cache is used.

    Raises:
        RuntimeError: If `dataset_dir` does not contain data files.
        ValueError: If `num_parallel_workers` exceeds the max thread numbers.
        RuntimeError: If `sampler` and `shuffle` are specified at the same time.
        RuntimeError: If `sampler` and `num_shards`/`shard_id` are specified at the same time.
        RuntimeError: If `num_shards` is specified but `shard_id` is None.
        RuntimeError: If `shard_id` is specified but `num_shards` is None.
        ValueError: If `shard_id` is not in range of [0, `num_shards` ).

    Tutorial Examples:
        - `Load & Process Data With Dataset Pipeline
          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/dataset_gallery.html>`_

    Note:
        - The `sampler` parameter does not support :class:`mindspore.dataset.PKSampler` yet.
        - The parameters `num_samples` , `shuffle` , `num_shards` , `shard_id` can be used to control the sampler
          used in the dataset, and their effects when combined with parameter `sampler` are as follows.

    .. include:: mindspore.dataset.sampler.txt

    Examples:
        >>> import mindspore.dataset as ds
        >>> cmu_arctic_dataset_directory = "/path/to/cmu_arctic_dataset_directory"
        >>>
        >>> # 1) Read 500 samples (audio files) in cmu_arctic_dataset_directory
        >>> dataset = ds.CMUArcticDataset(cmu_arctic_dataset_directory, name="ahw", num_samples=500)
        >>>
        >>> # 2) Read all samples (audio files) in cmu_arctic_dataset_directory
        >>> dataset = ds.CMUArcticDataset(cmu_arctic_dataset_directory)
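        >>>
        >>> # 3) An illustrative sketch (not from the original docs): iterate the loaded
        >>> # samples; the directory path above is a placeholder
        >>> for item in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        ...     waveform, transcript = item["waveform"], item["transcript"]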

    About CMUArctic dataset:

    The CMU Arctic databases are designed for the purpose of speech synthesis research.
    These single speaker speech databases have been carefully recorded under studio conditions
    and consist of approximately 1200 phonetically balanced English utterances. In addition to wavefiles,
    the databases provide complete support for the Festival Speech Synthesis System, including pre-built
    voices that may be used as is. The entire package is distributed as free software, without restriction
    on commercial or non-commercial use.

    You can construct the following directory structure from the CMUArctic dataset and read it with MindSpore's API.

    .. code-block::

        .
        └── cmu_arctic_dataset_directory
            ├── cmu_us_aew_arctic
            │    ├── wav
            │    │    ├──arctic_a0001.wav
            │    │    ├──arctic_a0002.wav
            │    │    ├──...
            │    └── etc
            │         └── txt.done.data
            ├── cmu_us_ahw_arctic
            │    ├── wav
            │    │    ├──arctic_a0001.wav
            │    │    ├──arctic_a0002.wav
            │    │    ├──...
            │    └── etc
            │         └── txt.done.data
            └──...

    Citation:

    .. code-block::

        @article{LTI2003CMUArctic,
        title        = {CMU ARCTIC databases for speech synthesis},
        author       = {John Kominek and Alan W Black},
        journal      = {Language Technologies Institute [Online]},
        year         = {2003},
        howpublished = {http://www.festvox.org/cmu_arctic/}
        }
    """

    @check_cmu_arctic_dataset
    def __init__(self, dataset_dir, name=None, num_samples=None, num_parallel_workers=None, shuffle=None,
                 sampler=None, num_shards=None, shard_id=None, cache=None):
        super().__init__(num_parallel_workers=num_parallel_workers, sampler=sampler, num_samples=num_samples,
                         shuffle=shuffle, num_shards=num_shards, shard_id=shard_id, cache=cache)

        self.dataset_dir = dataset_dir
        self.name = replace_none(name, "aew")

    def parse(self, children=None):
        return cde.CMUArcticNode(self.dataset_dir, self.name, self.sampler)


class GTZANDataset(MappableDataset, AudioBaseDataset):
    """
    GTZAN dataset.

    The generated dataset has three columns: :py:obj:`[waveform, sample_rate, label]` .
    The tensor of column :py:obj:`waveform` is of the float32 type.
    The tensor of column :py:obj:`sample_rate` is a scalar of the uint32 type.
    The tensor of column :py:obj:`label` is a scalar of the string type.

    Args:
        dataset_dir (str): Path to the root directory that contains the dataset.
        usage (str, optional): Usage of this dataset, can be ``'train'``, ``'valid'``, ``'test'`` or ``'all'``.
            Default: ``None`` , will read all samples.
        num_samples (int, optional): The number of audio samples to be included in the dataset.
            Default: ``None`` , will read all audio samples.
        num_parallel_workers (int, optional): Number of worker threads to read the data.
            Default: ``None`` , will use the global default number of workers (8); it can be set
            by :func:`mindspore.dataset.config.set_num_parallel_workers` .
        shuffle (bool, optional): Whether or not to perform shuffle on the dataset.
            Default: ``None`` , expected order behavior shown in the table below.
        sampler (Sampler, optional): Object used to choose samples from the
            dataset. Default: ``None`` , expected order behavior shown in the table below.
        num_shards (int, optional): Number of shards that the dataset will be divided into. Default: ``None`` .
            When this argument is specified, `num_samples` reflects the maximum number of samples per shard.
        shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
            argument can only be specified when `num_shards` is also specified.
        cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
            `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
            Default: ``None`` , which means no cache is used.

    Raises:
        RuntimeError: If `dataset_dir` does not contain data files.
        ValueError: If `num_parallel_workers` exceeds the max thread numbers.
        RuntimeError: If `sampler` and `shuffle` are specified at the same time.
        RuntimeError: If `sampler` and `num_shards`/`shard_id` are specified at the same time.
        RuntimeError: If `num_shards` is specified but `shard_id` is None.
        RuntimeError: If `shard_id` is specified but `num_shards` is None.
        ValueError: If `shard_id` is not in range of [0, `num_shards` ).

    Tutorial Examples:
        - `Load & Process Data With Dataset Pipeline
          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/dataset_gallery.html>`_

    Note:
        - The `sampler` parameter does not support :class:`mindspore.dataset.PKSampler` yet.
        - The parameters `num_samples` , `shuffle` , `num_shards` , `shard_id` can be used to control the sampler
          used in the dataset, and their effects when combined with parameter `sampler` are as follows.

    .. include:: mindspore.dataset.sampler.txt

    Examples:
        >>> import mindspore.dataset as ds
        >>> gtzan_dataset_directory = "/path/to/gtzan_dataset_directory"
        >>>
        >>> # 1) Read 500 samples (audio files) in gtzan_dataset_directory
        >>> dataset = ds.GTZANDataset(gtzan_dataset_directory, usage="all", num_samples=500)
        >>>
        >>> # 2) Read all samples (audio files) in gtzan_dataset_directory
        >>> dataset = ds.GTZANDataset(gtzan_dataset_directory)

    About GTZAN dataset:

    The GTZAN dataset appears in at least 100 published works and is the most commonly used
    public dataset for evaluation in machine listening research for music genre recognition.
    It consists of 1000 audio tracks, each of which is 30 seconds long. It contains 10 genres (blues,
    classical, country, disco, hiphop, jazz, metal, pop, reggae and rock), each of which is
    represented by 100 tracks. The tracks are all 22050 Hz mono 16-bit audio files in .wav format.

    You can construct the following directory structure from the GTZAN dataset and read it with MindSpore's API.

    .. code-block::

        .
        └── gtzan_dataset_directory
            ├── blues
            │    ├──blues.00000.wav
            │    ├──blues.00001.wav
            │    ├──blues.00002.wav
            │    ├──...
            ├── disco
            │    ├──disco.00000.wav
            │    ├──disco.00001.wav
            │    ├──disco.00002.wav
            │    └──...
            └──...

    Citation:

    .. code-block::

        @misc{tzanetakis_essl_cook_2001,
        author    = "Tzanetakis, George and Essl, Georg and Cook, Perry",
        title     = "Automatic Musical Genre Classification Of Audio Signals",
        url       = "http://ismir2001.ismir.net/pdf/tzanetakis.pdf",
        publisher = "The International Society for Music Information Retrieval",
        year      = "2001"
        }
    """

    @check_gtzan_dataset
    def __init__(self, dataset_dir, usage=None, num_samples=None, num_parallel_workers=None, shuffle=None,
                 sampler=None, num_shards=None, shard_id=None, cache=None):
        super().__init__(num_parallel_workers=num_parallel_workers, sampler=sampler, num_samples=num_samples,
                         shuffle=shuffle, num_shards=num_shards, shard_id=shard_id, cache=cache)

        self.dataset_dir = dataset_dir
        self.usage = replace_none(usage, "all")

    def parse(self, children=None):
        return cde.GTZANNode(self.dataset_dir, self.usage, self.sampler)


class LibriTTSDataset(MappableDataset, AudioBaseDataset):
    """
    LibriTTS dataset.

    The generated dataset has seven columns :py:obj:`[waveform, sample_rate, original_text, normalized_text,
    speaker_id, chapter_id, utterance_id]` .
    The tensor of column :py:obj:`waveform` is of the float32 type.
    The tensor of column :py:obj:`sample_rate` is a scalar of the uint32 type.
    The tensor of column :py:obj:`original_text` is a scalar of the string type.
    The tensor of column :py:obj:`normalized_text` is a scalar of the string type.
    The tensor of column :py:obj:`speaker_id` is a scalar of the uint32 type.
    The tensor of column :py:obj:`chapter_id` is a scalar of the uint32 type.
    The tensor of column :py:obj:`utterance_id` is a scalar of the string type.

    Args:
        dataset_dir (str): Path to the root directory that contains the dataset.
        usage (str, optional): Part of this dataset, can be ``'dev-clean'``, ``'dev-other'``, ``'test-clean'``,
            ``'test-other'``, ``'train-clean-100'``, ``'train-clean-360'``, ``'train-other-500'``, or ``'all'``.
            Default: ``None`` , means ``'all'``.
        num_samples (int, optional): The number of audio samples to be included in the dataset.
            Default: ``None`` , will read all audio samples.
        num_parallel_workers (int, optional): Number of worker threads to read the data.
            Default: ``None`` , will use the global default number of workers (8); it can be set
            by :func:`mindspore.dataset.config.set_num_parallel_workers` .
        shuffle (bool, optional): Whether or not to perform shuffle on the dataset.
            Default: ``None`` , expected order behavior shown in the table below.
        sampler (Sampler, optional): Object used to choose samples from the
            dataset. Default: ``None`` , expected order behavior shown in the table below.
        num_shards (int, optional): Number of shards that the dataset will be divided into. Default: ``None`` .
            When this argument is specified, `num_samples` reflects the maximum number of samples per shard.
        shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
            argument can only be specified when `num_shards` is also specified.
        cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
            `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
            Default: ``None`` , which means no cache is used.

    Raises:
        RuntimeError: If `dataset_dir` does not contain data files.
        ValueError: If `num_parallel_workers` exceeds the max thread numbers.
        RuntimeError: If `sampler` and `shuffle` are specified at the same time.
        RuntimeError: If `sampler` and `num_shards`/`shard_id` are specified at the same time.
        RuntimeError: If `num_shards` is specified but `shard_id` is None.
        RuntimeError: If `shard_id` is specified but `num_shards` is None.
        ValueError: If `shard_id` is not in range of [0, `num_shards` ).

    Tutorial Examples:
        - `Load & Process Data With Dataset Pipeline
          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/dataset_gallery.html>`_

    Note:
        - The `sampler` parameter does not support :class:`mindspore.dataset.PKSampler` yet.
        - The parameters `num_samples` , `shuffle` , `num_shards` , `shard_id` can be used to control the sampler
          used in the dataset, and their effects when combined with parameter `sampler` are as follows.

    .. include:: mindspore.dataset.sampler.txt

    Examples:
        >>> import mindspore.dataset as ds
        >>> libri_tts_dataset_dir = "/path/to/libri_tts_dataset_directory"
        >>>
        >>> # 1) Read 500 samples (audio files) in libri_tts_dataset_directory
        >>> dataset = ds.LibriTTSDataset(libri_tts_dataset_dir, usage="train-clean-100", num_samples=500)
        >>>
        >>> # 2) Read all samples (audio files) in libri_tts_dataset_directory
        >>> dataset = ds.LibriTTSDataset(libri_tts_dataset_dir)

    About LibriTTS dataset:

    LibriTTS is a multi-speaker English corpus of approximately 585 hours of read English speech at 24kHz
    sampling rate, prepared by Heiga Zen with the assistance of Google Speech and Google Brain team members.
    The LibriTTS corpus is designed for TTS research. It is derived from the original materials (mp3 audio
    files from LibriVox and text files from Project Gutenberg) of the LibriSpeech corpus.

    You can construct the following directory structure from the LibriTTS dataset and read it with MindSpore's API.

    .. code-block::

        .
        └── libri_tts_dataset_directory
            ├── dev-clean
            │    ├── 116
            │    │    ├── 288045
            │    │    │    ├── 116_288045.trans.tsv
            │    │    │    ├── 116_288045_000003_000000.wav
            │    │    │    └── ...
            │    │    ├── 288046
            │    │    │    ├── 116_288046.trans.tsv
            │    │    │    ├── 116_288046_000003_000000.wav
            │    │    │    └── ...
            │    │    └── ...
            │    ├── 1255
            │    │    ├── 138279
            │    │    │    ├── 1255_138279.trans.tsv
            │    │    │    ├── 1255_138279_000001_000000.wav
            │    │    │    └── ...
            │    │    ├── 74899
            │    │    │    ├── 1255_74899.trans.tsv
            │    │    │    ├── 1255_74899_000001_000000.wav
            │    │    │    └── ...
            │    │    └── ...
            │    └── ...
            └── ...

    Citation:

    .. code-block::

        @article{zen2019libritts,
        title        = {LibriTTS: A Corpus Derived from LibriSpeech for Text-to-Speech},
        author       = {Heiga Zen and Viet Dang and Rob Clark and Yu Zhang and Ron J. Weiss and
                        Ye Jia and Zhifeng Chen and Yonghui Wu},
        journal      = {arXiv preprint arXiv:1904.02882},
        year         = {2019},
        howpublished = {http://www.openslr.org/resources/60/},
        description  = {The LibriSpeech ASR corpus (http://www.openslr.org/12/) has been used in
                        various research projects. However, as it was originally designed for ASR research,
                        there are some undesired properties when used for TTS research}
        }
    """

    @check_libri_tts_dataset
    def __init__(self, dataset_dir, usage=None, num_samples=None, num_parallel_workers=None, shuffle=None,
                 sampler=None, num_shards=None, shard_id=None, cache=None):
        super().__init__(num_parallel_workers=num_parallel_workers, sampler=sampler, num_samples=num_samples,
                         shuffle=shuffle, num_shards=num_shards, shard_id=shard_id, cache=cache)

        self.dataset_dir = dataset_dir
        self.usage = replace_none(usage, "all")

    def parse(self, children=None):
        return cde.LibriTTSNode(self.dataset_dir, self.usage, self.sampler)


class LJSpeechDataset(MappableDataset, AudioBaseDataset):
    """
    LJSpeech dataset.

    The generated dataset has four columns :py:obj:`[waveform, sample_rate, transcription, normalized_transcript]` .
    The column :py:obj:`waveform` is a tensor of the float32 type.
    The column :py:obj:`sample_rate` is a scalar of the int32 type.
    The column :py:obj:`transcription` is a scalar of the string type.
    The column :py:obj:`normalized_transcript` is a scalar of the string type.

    Args:
        dataset_dir (str): Path to the root directory that contains the dataset.
        num_samples (int, optional): The number of audio samples to be included in the dataset.
            Default: ``None`` , all audio samples.
        num_parallel_workers (int, optional): Number of worker threads to read the data.
            Default: ``None`` , will use the global default number of workers (8); it can be set
            by :func:`mindspore.dataset.config.set_num_parallel_workers` .
        shuffle (bool, optional): Whether to perform shuffle on the dataset. Default: ``None`` , expected
            order behavior shown in the table below.
        sampler (Sampler, optional): Object used to choose samples from the dataset.
            Default: ``None`` , expected order behavior shown in the table below.
        num_shards (int, optional): Number of shards that the dataset will be divided into.
            Default: ``None`` . When this argument is specified, `num_samples` reflects
            the maximum number of samples per shard.
        shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
            argument can only be specified when `num_shards` is also specified.
        cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
            `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
            Default: ``None`` , which means no cache is used.

    Raises:
        RuntimeError: If `dataset_dir` does not contain data files.
        RuntimeError: If `sampler` and `shuffle` are specified at the same time.
        RuntimeError: If `sampler` and `num_shards`/`shard_id` are specified at the same time.
        RuntimeError: If `num_shards` is specified but `shard_id` is None.
        RuntimeError: If `shard_id` is specified but `num_shards` is None.
        ValueError: If `num_parallel_workers` exceeds the max thread numbers.
        ValueError: If `shard_id` is not in range of [0, `num_shards` ).

    Tutorial Examples:
        - `Load & Process Data With Dataset Pipeline
          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/dataset_gallery.html>`_

    Note:
        - The parameters `num_samples` , `shuffle` , `num_shards` , `shard_id` can be used to control the sampler
          used in the dataset, and their effects when combined with parameter `sampler` are as follows.

    .. include:: mindspore.dataset.sampler.txt

    Examples:
        >>> import mindspore.dataset as ds
        >>> lj_speech_dataset_dir = "/path/to/lj_speech_dataset_directory"
        >>>
        >>> # 1) Get all samples from LJSPEECH dataset in sequence
        >>> dataset = ds.LJSpeechDataset(dataset_dir=lj_speech_dataset_dir, shuffle=False)
        >>>
        >>> # 2) Randomly select 350 samples from LJSPEECH dataset
        >>> dataset = ds.LJSpeechDataset(dataset_dir=lj_speech_dataset_dir, num_samples=350, shuffle=True)
        >>>
        >>> # 3) Get samples from LJSPEECH dataset for shard 0 in a 2-way distributed training
        >>> dataset = ds.LJSpeechDataset(dataset_dir=lj_speech_dataset_dir, num_shards=2, shard_id=0)
        >>>
        >>> # In LJSPEECH dataset, each dictionary has keys "waveform", "sample_rate", "transcription"
        >>> # and "normalized_transcript"

    About LJSPEECH dataset:

    This is a public domain speech dataset consisting of 13,100 short audio clips of a single speaker
    reading passages from 7 non-fiction books. A transcription is provided for each clip.
    Clips vary in length from 1 to 10 seconds and have a total length of approximately 24 hours.

    The texts were published between 1884 and 1964, and are in the public domain.
    The audio was recorded in 2016-17 by the LibriVox project and is also in the public domain.

    Here is the original LJSPEECH dataset structure.
    You can unzip the dataset files into the following directory structure and read them with MindSpore's API.

    .. code-block::

        .
        └── LJSpeech-1.1
            ├── README
            ├── metadata.csv
            └── wavs
                ├── LJ001-0001.wav
                ├── LJ001-0002.wav
                ├── LJ001-0003.wav
                ├── LJ001-0004.wav
                ├── LJ001-0005.wav
                ├── LJ001-0006.wav
                ├── LJ001-0007.wav
                ├── LJ001-0008.wav
                ...
                ├── LJ050-0277.wav
                └── LJ050-0278.wav

    Citation:

    .. code-block::

        @misc{lj_speech17,
        author       = {Keith Ito and Linda Johnson},
        title        = {The LJ Speech Dataset},
        howpublished = {\url{https://keithito.com/LJ-Speech-Dataset}},
        year         = 2017
        }
    """

    @check_lj_speech_dataset
    def __init__(self, dataset_dir, num_samples=None, num_parallel_workers=None, shuffle=None,
                 sampler=None, num_shards=None, shard_id=None, cache=None):
        super().__init__(num_parallel_workers=num_parallel_workers, sampler=sampler, num_samples=num_samples,
                         shuffle=shuffle, num_shards=num_shards, shard_id=shard_id, cache=cache)
        self.dataset_dir = dataset_dir

    def parse(self, children=None):
        return cde.LJSpeechNode(self.dataset_dir, self.sampler)


class SpeechCommandsDataset(MappableDataset, AudioBaseDataset):
    """
    Speech Commands dataset.

    The generated dataset has five columns :py:obj:`[waveform, sample_rate, label, speaker_id, utterance_number]` .
    The tensor of column :py:obj:`waveform` is a vector of the float32 type.
    The tensor of column :py:obj:`sample_rate` is a scalar of the int32 type.
    The tensor of column :py:obj:`label` is a scalar of the string type.
    The tensor of column :py:obj:`speaker_id` is a scalar of the string type.
    The tensor of column :py:obj:`utterance_number` is a scalar of the int32 type.

    Args:
        dataset_dir (str): Path to the root directory that contains the dataset.
        usage (str, optional): Usage of this dataset, can be ``'train'``, ``'test'``, ``'valid'`` or ``'all'``.
            ``'train'`` will read from 84,843 samples, ``'test'`` will read from 11,005 samples, ``'valid'``
            will read from 9,981 validation samples and ``'all'`` will read from all 105,829 samples.
            Default: ``None`` , will read all samples.
        num_samples (int, optional): The number of samples to be included in the dataset.
            Default: ``None`` , will read all samples.
        num_parallel_workers (int, optional): Number of worker threads to read the data.
            Default: ``None`` , will use the global default number of workers (8); it can be set
            by :func:`mindspore.dataset.config.set_num_parallel_workers` .
        shuffle (bool, optional): Whether or not to perform shuffle on the dataset.
            Default: ``None`` , expected order behavior shown in the table below.
        sampler (Sampler, optional): Object used to choose samples from the dataset.
            Default: ``None`` , expected order behavior shown in the table below.
        num_shards (int, optional): Number of shards that the dataset will be divided into. Default: ``None`` .
            When this argument is specified, `num_samples` reflects the maximum number of samples per shard.
        shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` .
            This argument can only be specified when `num_shards` is also specified.
        cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
            `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
            Default: ``None`` , which means no cache is used.

    Raises:
        RuntimeError: If `dataset_dir` does not contain data files.
        RuntimeError: If `sampler` and `shuffle` are specified at the same time.
        RuntimeError: If `sampler` and `num_shards`/`shard_id` are specified at the same time.
        RuntimeError: If `num_shards` is specified but `shard_id` is None.
        RuntimeError: If `shard_id` is specified but `num_shards` is None.
        ValueError: If `num_parallel_workers` exceeds the max thread numbers.
        ValueError: If `shard_id` is not in range of [0, `num_shards` ).

    Tutorial Examples:
        - `Load & Process Data With Dataset Pipeline
          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/dataset_gallery.html>`_

    Note:
        - The parameters `num_samples` , `shuffle` , `num_shards` , `shard_id` can be used to control the sampler
          used in the dataset, and their effects when combined with parameter `sampler` are as follows.

    .. include:: mindspore.dataset.sampler.txt

    Examples:
        >>> import mindspore.dataset as ds
        >>> speech_commands_dataset_dir = "/path/to/speech_commands_dataset_directory"
        >>>
        >>> # Read 3 samples from SpeechCommands dataset
        >>> dataset = ds.SpeechCommandsDataset(dataset_dir=speech_commands_dataset_dir, num_samples=3)

    About SpeechCommands dataset:

    SpeechCommands is a database for limited-vocabulary speech recognition, containing 105,829 audio samples in
    '.wav' format.

    Here is the original SpeechCommands dataset structure.
    You can unzip the dataset files into this directory structure and read them with MindSpore's API.

    .. code-block::

        .
        └── speech_commands_dataset_dir
             ├── cat
                  ├── b433eff_nohash_0.wav
                  ├── 5a33edf_nohash_1.wav
                  └──....
             ├── dog
                  ├── b433w2w_nohash_0.wav
                  └──....
             ├── four
             └── ....

    Citation:

    .. code-block::

        @article{2018Speech,
        title={Speech Commands: A Dataset for Limited-Vocabulary Speech Recognition},
        author={Warden, P.},
        year={2018}
        }
    """

    @check_speech_commands_dataset
    def __init__(self, dataset_dir, usage=None, num_samples=None, num_parallel_workers=None, shuffle=None,
                 sampler=None, num_shards=None, shard_id=None, cache=None):
        super().__init__(num_parallel_workers=num_parallel_workers, sampler=sampler, num_samples=num_samples,
                         shuffle=shuffle, num_shards=num_shards, shard_id=shard_id, cache=cache)

        self.dataset_dir = dataset_dir
        self.usage = replace_none(usage, "all")

    def parse(self, children=None):
        return cde.SpeechCommandsNode(self.dataset_dir, self.usage, self.sampler)


class TedliumDataset(MappableDataset, AudioBaseDataset):
    """
    Tedlium dataset.
    The columns of the generated dataset depend on the source SPH files and the corresponding STM files.

    The generated dataset has six columns :py:obj:`[waveform, sample_rate, transcript, talk_id, speaker_id,
    identifier]` .

    The data type of column `waveform` is float32, the data type of column `sample_rate` is int32,
    and the data type of columns `transcript` , `talk_id` , `speaker_id` and `identifier` is string.

    Args:
        dataset_dir (str): Path to the root directory that contains the dataset.
        release (str): Release of the dataset, can be ``'release1'``, ``'release2'`` or ``'release3'``.
        usage (str, optional): Usage of this dataset.
            For release1 or release2, can be ``'train'``, ``'test'``, ``'dev'`` or ``'all'``.
            ``'train'`` will read from train samples,
            ``'test'`` will read from test samples,
            ``'dev'`` will read from dev samples,
            ``'all'`` will read from all samples.
            For release3, it can only be ``'all'``, which will read from data samples.
            Default: ``None`` , all samples.
        extensions (str, optional): Extensions of the SPH files, only ``'.sph'`` is valid.
            Default: ``None`` , set to ``".sph"``.
        num_samples (int, optional): The number of audio samples to be included in the dataset.
            Default: ``None`` , all samples.
        num_parallel_workers (int, optional): Number of worker threads to read the data.
            Default: ``None`` , will use the global default number of workers (8); it can be set
            by :func:`mindspore.dataset.config.set_num_parallel_workers` .
        shuffle (bool, optional): Whether to perform shuffle on the dataset. Default: ``None`` , expected
            order behavior shown in the table below.
        sampler (Sampler, optional): Object used to choose samples from the
            dataset. Default: ``None`` , expected order behavior shown in the table below.
        num_shards (int, optional): Number of shards that the dataset will be divided
            into. Default: ``None`` . When this argument is specified, `num_samples` reflects
            the maximum number of samples per shard.
        shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
            argument can only be specified when `num_shards` is also specified.
        cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
            `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
            Default: ``None`` , which means no cache is used.

    Raises:
        RuntimeError: If `dataset_dir` does not contain stm files.
        RuntimeError: If `sampler` and `shuffle` are specified at the same time.
        RuntimeError: If `sampler` and `num_shards`/`shard_id` are specified at the same time.
        RuntimeError: If `num_shards` is specified but `shard_id` is None.
        RuntimeError: If `shard_id` is specified but `num_shards` is None.
        ValueError: If `num_parallel_workers` exceeds the max thread numbers.
        ValueError: If `shard_id` is not in range of [0, `num_shards` ).

    Tutorial Examples:
        - `Load & Process Data With Dataset Pipeline
          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/dataset_gallery.html>`_

    Note:
        - The parameters `num_samples` , `shuffle` , `num_shards` , `shard_id` can be used to control the sampler
          used in the dataset, and their effects when combined with parameter `sampler` are as follows.

    .. include:: mindspore.dataset.sampler.txt

    Examples:
        >>> import mindspore.dataset as ds
        >>> # 1) Get all train samples from TEDLIUM_release1 dataset in sequence.
        >>> dataset = ds.TedliumDataset(dataset_dir="/path/to/tedlium1_dataset_directory",
        ...                             release="release1", shuffle=False)
        >>>
        >>> # 2) Randomly select 10 samples from TEDLIUM_release2 dataset.
        >>> dataset = ds.TedliumDataset(dataset_dir="/path/to/tedlium2_dataset_directory",
        ...                             release="release2", num_samples=10, shuffle=True)
        >>>
        >>> # 3) Get samples from TEDLIUM_release-3 dataset for shard 0 in a 2-way distributed training.
        >>> dataset = ds.TedliumDataset(dataset_dir="/path/to/tedlium3_dataset_directory",
        ...                             release="release3", num_shards=2, shard_id=0)
        >>>
        >>> # In TEDLIUM dataset, each dictionary has keys: waveform, sample_rate, transcript, talk_id,
        >>> # speaker_id and identifier.

    About TEDLIUM_release1 dataset:

    The TED-LIUM corpus consists of English-language TED talks, with transcriptions, sampled at 16kHz.
    It contains about 118 hours of speech.

    About TEDLIUM_release2 dataset:

    This is the TED-LIUM corpus release 2, licensed under Creative Commons BY-NC-ND 3.0. All talks and text are
    property of TED Conferences LLC. The TED-LIUM corpus was made from audio talks and their transcriptions available
    on the TED website. We have prepared and filtered these data in order to train acoustic models to participate in
    the International Workshop on Spoken Language Translation 2011 (the LIUM English/French SLT system reached the
    first rank in the SLT task).

    About TEDLIUM_release-3 dataset:

    This is the TED-LIUM corpus release 3, licensed under Creative Commons BY-NC-ND 3.0. All talks and text are
    property of TED Conferences LLC. This new TED-LIUM release was made through a collaboration between the Ubiqus
    company and the LIUM (University of Le Mans, France).

    You can unzip the dataset files into the following directory structure and read them with MindSpore's API.

    The structure of TEDLIUM release2 is the same as TEDLIUM release1, only the data is different.

    .. code-block::

        .
        └──TEDLIUM_release1
            └── dev
                ├── sph
                    ├── AlGore_2009.sph
                    ├── BarrySchwartz_2005G.sph
                ├── stm
                    ├── AlGore_2009.stm
                    ├── BarrySchwartz_2005G.stm
            └── test
                ├── sph
                    ├── AimeeMullins_2009P.sph
                    ├── BillGates_2010.sph
                ├── stm
                    ├── AimeeMullins_2009P.stm
                    ├── BillGates_2010.stm
            └── train
                ├── sph
                    ├── AaronHuey_2010X.sph
                    ├── AdamGrosser_2007.sph
                ├── stm
                    ├── AaronHuey_2010X.stm
                    ├── AdamGrosser_2007.stm
            └── readme
            └── TEDLIUM.150k.dic

    The directory structure of TEDLIUM release3 is slightly different.

    .. code-block::

        .
        └──TEDLIUM_release-3
            └── data
                ├── ctl
                ├── sph
                    ├── 911Mothers_2010W.sph
                    ├── AalaElKhani.sph
                ├── stm
                    ├── 911Mothers_2010W.stm
                    ├── AalaElKhani.stm
            └── doc
            └── legacy
            └── LM
            └── speaker-adaptation
            └── readme
            └── TEDLIUM.150k.dic

    Citation:

    .. code-block::

        @article{rousseau2012tedlium,
          title={TED-LIUM: an automatic speech recognition dedicated corpus},
          author={A. Rousseau, P. Deléglise, Y. Estève},
          journal={Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC'12)},
          year={May 2012},
          biburl={https://www.openslr.org/7/}
        }

        @article{rousseau2014tedlium,
          title={Enhancing the TED-LIUM Corpus with Selected Data for Language Modeling and More TED Talks},
          author={A. Rousseau, P. Deléglise, and Y. Estève},
          journal={Proceedings of the Ninth International Conference on Language Resources and Evaluation (LREC'14)},
          year={May 2014},
          biburl={https://www.openslr.org/19/}
        }

        @article{hernandez2018tedlium,
          title={TED-LIUM 3: twice as much data and corpus repartition for experiments on speaker adaptation},
          author={François Hernandez, Vincent Nguyen, Sahar Ghannay, Natalia Tomashenko, and Yannick Estève},
          journal={the 20th International Conference on Speech and Computer (SPECOM 2018)},
          year={September 2018},
          biburl={https://www.openslr.org/51/}
        }
    """

    @check_tedlium_dataset
    def __init__(self, dataset_dir, release, usage=None, extensions=None, num_samples=None,
                 num_parallel_workers=None, shuffle=None, sampler=None, num_shards=None,
                 shard_id=None, cache=None):
        super().__init__(num_parallel_workers=num_parallel_workers, sampler=sampler, num_samples=num_samples,
                         shuffle=shuffle, num_shards=num_shards, shard_id=shard_id, cache=cache)
        self.dataset_dir = dataset_dir
        self.extensions = replace_none(extensions, ".sph")
        self.release = release
        self.usage = replace_none(usage, "all")

    def parse(self, children=None):
        return cde.TedliumNode(self.dataset_dir, self.release, self.usage, self.extensions, self.sampler)


class YesNoDataset(MappableDataset, AudioBaseDataset):
    """
    YesNo dataset.

    The generated dataset has three columns :py:obj:`[waveform, sample_rate, labels]` .
    The tensor of column :py:obj:`waveform` is a vector of the float32 type.
    The tensor of column :py:obj:`sample_rate` is a scalar of the int32 type.
    The tensor of column :py:obj:`labels` is a scalar of the int32 type.

    Args:
        dataset_dir (str): Path to the root directory that contains the dataset.
        num_samples (int, optional): The number of audio samples to be included in the dataset.
            Default: ``None`` , will read all audio samples.
        num_parallel_workers (int, optional): Number of worker threads to read the data.
            Default: ``None`` , will use the global default number of workers (8); it can be set
            by :func:`mindspore.dataset.config.set_num_parallel_workers` .
        shuffle (bool, optional): Whether or not to perform shuffle on the dataset.
            Default: ``None`` , expected order behavior shown in the table below.
        sampler (Sampler, optional): Object used to choose samples from the
            dataset. Default: ``None`` , expected order behavior shown in the table below.
        num_shards (int, optional): Number of shards that the dataset will be divided into. Default: ``None`` .
            When this argument is specified, `num_samples` reflects the maximum number of samples per shard.
        shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This argument can only
            be specified when `num_shards` is also specified.
        cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
            `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
            Default: ``None`` , which means no cache is used.

    Raises:
        RuntimeError: If `dataset_dir` does not contain data files.
        ValueError: If `num_parallel_workers` exceeds the max thread numbers.
        RuntimeError: If `sampler` and `shuffle` are specified at the same time.
        RuntimeError: If `sampler` and `num_shards`/`shard_id` are specified at the same time.
        RuntimeError: If `num_shards` is specified but `shard_id` is None.
        RuntimeError: If `shard_id` is specified but `num_shards` is None.
        ValueError: If `shard_id` is not in range of [0, `num_shards` ).

    Tutorial Examples:
        - `Load & Process Data With Dataset Pipeline
          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/dataset_gallery.html>`_

    Note:
        - The parameters `num_samples` , `shuffle` , `num_shards` , `shard_id` can be used to control the sampler
          used in the dataset, and their effects when combined with parameter `sampler` are as follows.

    .. include:: mindspore.dataset.sampler.txt

    Examples:
        >>> import mindspore.dataset as ds
        >>> yes_no_dataset_dir = "/path/to/yes_no_dataset_directory"
        >>>
        >>> # Read 3 samples from YesNo dataset
        >>> dataset = ds.YesNoDataset(dataset_dir=yes_no_dataset_dir, num_samples=3)
        >>>
        >>> # Note: In YesNo dataset, each dictionary has keys "waveform", "sample_rate", "labels"

    About YesNo dataset:

    Yesno is an audio dataset consisting of 60 recordings of one individual saying yes or no in Hebrew; each
    recording is eight words long.

    Here is the original YesNo dataset structure.
    You can unzip the dataset files into this directory structure and read them with MindSpore's API.

    .. code-block::

        .
        └── yes_no_dataset_dir
             ├── 1_1_0_0_1_1_0_0.wav
             ├── 1_0_0_0_1_1_0_0.wav
             └──....

    Citation:

    .. code-block::

        @NetworkResource{Kaldi_audio_project,
        author    = {anonymous},
        url       = "http://www.openslr.org/1/"
        }
    """

    @check_yes_no_dataset
    def __init__(self, dataset_dir, num_samples=None, num_parallel_workers=None, shuffle=None,
                 sampler=None, num_shards=None, shard_id=None, cache=None):
        super().__init__(num_parallel_workers=num_parallel_workers, sampler=sampler, num_samples=num_samples,
                         shuffle=shuffle, num_shards=num_shards, shard_id=shard_id, cache=cache)

        self.dataset_dir = dataset_dir

    def parse(self, children=None):
        return cde.YesNoNode(self.dataset_dir, self.sampler)