# Copyright 2019-2023 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
This file contains specific audio dataset loading classes. You can easily use
these classes to load prepared datasets. For example:
    LJSpeechDataset: the LJ Speech dataset.
    YesNoDataset: the YesNo dataset.
    SpeechCommandsDataset: the Speech Commands dataset.
    TedliumDataset: the TED-LIUM dataset.
    ...
After declaring the dataset object, you can further apply dataset operations
(e.g. filter, skip, concat, map, batch) on it.
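
A minimal pipeline sketch (the directory path below is illustrative, not a real
dataset location):
    >>> import mindspore.dataset as ds
    >>> dataset = ds.YesNoDataset("/path/to/yes_no_dataset_directory")
    >>> dataset = dataset.shuffle(buffer_size=4)
    >>> dataset = dataset.skip(2)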
"""
import mindspore._c_dataengine as cde

from .datasets import AudioBaseDataset, MappableDataset
from .validators import check_cmu_arctic_dataset, check_gtzan_dataset, check_libri_tts_dataset, \
    check_lj_speech_dataset, check_speech_commands_dataset, check_tedlium_dataset, check_yes_no_dataset

from ..core.validator_helpers import replace_none


class CMUArcticDataset(MappableDataset, AudioBaseDataset):
    """
    CMU Arctic dataset.

    The generated dataset has four columns: :py:obj:`[waveform, sample_rate, transcript, utterance_id]` .
    The tensor of column :py:obj:`waveform` is of the float32 type.
    The tensor of column :py:obj:`sample_rate` is a scalar of the uint32 type.
    The tensor of column :py:obj:`transcript` is a scalar of the string type.
    The tensor of column :py:obj:`utterance_id` is a scalar of the string type.

    Args:
        dataset_dir (str): Path to the root directory that contains the dataset.
        name (str, optional): Part of this dataset, can be ``'aew'``, ``'ahw'``, ``'aup'``,
            ``'awb'``, ``'axb'``, ``'bdl'``, ``'clb'``, ``'eey'``, ``'fem'``, ``'gka'``, ``'jmk'``,
            ``'ksp'``, ``'ljm'``, ``'lnh'``, ``'rms'``, ``'rxr'``, ``'slp'`` or ``'slt'``.
            Default: ``None``, which means ``'aew'``.
        num_samples (int, optional): The number of audio samples to be included in the dataset.
            Default: ``None``, will read all audio.
        num_parallel_workers (int, optional): Number of worker threads to read the data.
            Default: ``None``, will use the global default number of workers (8); it can be set
            by :func:`mindspore.dataset.config.set_num_parallel_workers` .
        shuffle (bool, optional): Whether or not to perform shuffle on the dataset.
            Default: ``None``, expected order behavior shown in the table below.
        sampler (Sampler, optional): Object used to choose samples from the
            dataset. Default: ``None``, expected order behavior shown in the table below.
        num_shards (int, optional): Number of shards that the dataset will be divided into.
            Default: ``None``, no dividing. When this argument is specified, `num_samples`
            reflects the maximum number of samples per shard.
        shard_id (int, optional): The shard ID within `num_shards` . Default: ``None``, will use ``0``. This
            argument can only be specified when `num_shards` is also specified.
        cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
            `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
            Default: ``None``, which means no cache is used.

    Raises:
        RuntimeError: If `dataset_dir` does not contain data files.
        ValueError: If `num_parallel_workers` exceeds the max thread numbers.
        RuntimeError: If `sampler` and `shuffle` are specified at the same time.
        RuntimeError: If `sampler` and `num_shards`/`shard_id` are specified at the same time.
        RuntimeError: If `num_shards` is specified but `shard_id` is None.
        RuntimeError: If `shard_id` is specified but `num_shards` is None.
        ValueError: If `shard_id` is not in range of [0, `num_shards` ).

    Tutorial Examples:
        - `Load & Process Data With Dataset Pipeline
          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/dataset_gallery.html>`_

    Note:
        - :class:`mindspore.dataset.PKSampler` is not supported by the `sampler` parameter yet.
        - The parameters `num_samples` , `shuffle` , `num_shards` , `shard_id` can be used to control the sampler
          used in the dataset, and their effects when combined with parameter `sampler` are as follows.

    .. include:: mindspore.dataset.sampler.txt

    Examples:
        >>> import mindspore.dataset as ds
        >>> cmu_arctic_dataset_directory = "/path/to/cmu_arctic_dataset_directory"
        >>>
        >>> # 1) Read 500 samples (audio files) in cmu_arctic_dataset_directory
        >>> dataset = ds.CMUArcticDataset(cmu_arctic_dataset_directory, name="ahw", num_samples=500)
        >>>
        >>> # 2) Read all samples (audio files) in cmu_arctic_dataset_directory
        >>> dataset = ds.CMUArcticDataset(cmu_arctic_dataset_directory)
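        >>>
        >>> # 3) A sketch of iterating a few samples; the dictionary keys follow the four
        >>> # column names documented above (output_numpy=True yields numpy arrays)
        >>> dataset = ds.CMUArcticDataset(cmu_arctic_dataset_directory, num_samples=3)
        >>> for item in dataset.create_dict_iterator(output_numpy=True):
        ...     waveform, transcript = item["waveform"], item["transcript"]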

    About CMUArctic dataset:

    The CMU Arctic databases are designed for the purpose of speech synthesis research.
    These single-speaker speech databases have been carefully recorded under studio conditions
    and consist of approximately 1200 phonetically balanced English utterances. In addition to wavefiles,
    the databases provide complete support for the Festival Speech Synthesis System, including pre-built
    voices that may be used as is. The entire package is distributed as free software, without restriction
    on commercial or non-commercial use.

    You can construct the following directory structure from the CMUArctic dataset and read it with
    MindSpore's API.

    .. code-block::

        .
        └── cmu_arctic_dataset_directory
            ├── cmu_us_aew_arctic
            │    ├── wav
            │    │    ├── arctic_a0001.wav
            │    │    ├── arctic_a0002.wav
            │    │    └── ...
            │    └── etc
            │         └── txt.done.data
            ├── cmu_us_ahw_arctic
            │    ├── wav
            │    │    ├── arctic_a0001.wav
            │    │    ├── arctic_a0002.wav
            │    │    └── ...
            │    └── etc
            │         └── txt.done.data
            └── ...

    Citation:

    .. code-block::

        @article{LTI2003CMUArctic,
          title = {CMU ARCTIC databases for speech synthesis},
          author = {John Kominek and Alan W Black},
          journal = {Language Technologies Institute [Online]},
          year = {2003},
          howpublished = {http://www.festvox.org/cmu_arctic/}
        }
    """

    @check_cmu_arctic_dataset
    def __init__(self, dataset_dir, name=None, num_samples=None, num_parallel_workers=None, shuffle=None,
                 sampler=None, num_shards=None, shard_id=None, cache=None):
        super().__init__(num_parallel_workers=num_parallel_workers, sampler=sampler, num_samples=num_samples,
                         shuffle=shuffle, num_shards=num_shards, shard_id=shard_id, cache=cache)

        self.dataset_dir = dataset_dir
        self.name = replace_none(name, "aew")

    def parse(self, children=None):
        return cde.CMUArcticNode(self.dataset_dir, self.name, self.sampler)


class GTZANDataset(MappableDataset, AudioBaseDataset):
    """
    GTZAN dataset.

    The generated dataset has three columns: :py:obj:`[waveform, sample_rate, label]` .
    The tensor of column :py:obj:`waveform` is of the float32 type.
    The tensor of column :py:obj:`sample_rate` is a scalar of the uint32 type.
    The tensor of column :py:obj:`label` is a scalar of the string type.

    Args:
        dataset_dir (str): Path to the root directory that contains the dataset.
        usage (str, optional): Usage of this dataset, can be ``'train'``, ``'valid'``, ``'test'`` or ``'all'``.
            Default: ``None`` , will read all samples.
        num_samples (int, optional): The number of audio samples to be included in the dataset.
            Default: ``None`` , will read all audio.
        num_parallel_workers (int, optional): Number of worker threads to read the data.
            Default: ``None`` , will use the global default number of workers (8); it can be set
            by :func:`mindspore.dataset.config.set_num_parallel_workers` .
        shuffle (bool, optional): Whether or not to perform shuffle on the dataset.
            Default: ``None`` , expected order behavior shown in the table below.
        sampler (Sampler, optional): Object used to choose samples from the
            dataset. Default: ``None`` , expected order behavior shown in the table below.
        num_shards (int, optional): Number of shards that the dataset will be divided into. Default: ``None`` .
            When this argument is specified, `num_samples` reflects the maximum number of samples per shard.
        shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
            argument can only be specified when `num_shards` is also specified.
        cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
            `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
            Default: ``None`` , which means no cache is used.

    Raises:
        RuntimeError: If `dataset_dir` does not contain data files.
        ValueError: If `num_parallel_workers` exceeds the max thread numbers.
        RuntimeError: If `sampler` and `shuffle` are specified at the same time.
        RuntimeError: If `sampler` and `num_shards`/`shard_id` are specified at the same time.
        RuntimeError: If `num_shards` is specified but `shard_id` is None.
        RuntimeError: If `shard_id` is specified but `num_shards` is None.
        ValueError: If `shard_id` is not in range of [0, `num_shards` ).

    Tutorial Examples:
        - `Load & Process Data With Dataset Pipeline
          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/dataset_gallery.html>`_

    Note:
        - :class:`mindspore.dataset.PKSampler` is not supported by the `sampler` parameter yet.
        - The parameters `num_samples` , `shuffle` , `num_shards` , `shard_id` can be used to control the sampler
          used in the dataset, and their effects when combined with parameter `sampler` are as follows.

    .. include:: mindspore.dataset.sampler.txt

    Examples:
        >>> import mindspore.dataset as ds
        >>> gtzan_dataset_directory = "/path/to/gtzan_dataset_directory"
        >>>
        >>> # 1) Read 500 samples (audio files) in gtzan_dataset_directory
        >>> dataset = ds.GTZANDataset(gtzan_dataset_directory, usage="all", num_samples=500)
        >>>
        >>> # 2) Read all samples (audio files) in gtzan_dataset_directory
        >>> dataset = ds.GTZANDataset(gtzan_dataset_directory)
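        >>>
        >>> # 3) A sketch of splitting GTZAN across two shards for distributed training
        >>> # (the shard values here are illustrative)
        >>> dataset = ds.GTZANDataset(gtzan_dataset_directory, num_shards=2, shard_id=0)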

    About GTZAN dataset:

    The GTZAN dataset appears in at least 100 published works and is the most commonly used
    public dataset for evaluation in machine listening research for music genre recognition.
    It consists of 1000 audio tracks, each of which is 30 seconds long. It contains 10 genres (blues,
    classical, country, disco, hiphop, jazz, metal, pop, reggae and rock), each of which is
    represented by 100 tracks. The tracks are all 22050 Hz mono 16-bit audio files in .wav format.

    You can construct the following directory structure from the GTZAN dataset and read it with
    MindSpore's API.

    .. code-block::

        .
        └── gtzan_dataset_directory
            ├── blues
            │    ├── blues.00000.wav
            │    ├── blues.00001.wav
            │    ├── blues.00002.wav
            │    └── ...
            ├── disco
            │    ├── disco.00000.wav
            │    ├── disco.00001.wav
            │    ├── disco.00002.wav
            │    └── ...
            └── ...

    Citation:

    .. code-block::

        @misc{tzanetakis_essl_cook_2001,
          author = "Tzanetakis, George and Essl, Georg and Cook, Perry",
          title = "Automatic Musical Genre Classification Of Audio Signals",
          url = "http://ismir2001.ismir.net/pdf/tzanetakis.pdf",
          publisher = "The International Society for Music Information Retrieval",
          year = "2001"
        }
    """

    @check_gtzan_dataset
    def __init__(self, dataset_dir, usage=None, num_samples=None, num_parallel_workers=None, shuffle=None,
                 sampler=None, num_shards=None, shard_id=None, cache=None):
        super().__init__(num_parallel_workers=num_parallel_workers, sampler=sampler, num_samples=num_samples,
                         shuffle=shuffle, num_shards=num_shards, shard_id=shard_id, cache=cache)

        self.dataset_dir = dataset_dir
        self.usage = replace_none(usage, "all")

    def parse(self, children=None):
        return cde.GTZANNode(self.dataset_dir, self.usage, self.sampler)


class LibriTTSDataset(MappableDataset, AudioBaseDataset):
    """
    LibriTTS dataset.

    The generated dataset has seven columns :py:obj:`[waveform, sample_rate, original_text, normalized_text,
    speaker_id, chapter_id, utterance_id]` .
    The tensor of column :py:obj:`waveform` is of the float32 type.
    The tensor of column :py:obj:`sample_rate` is a scalar of the uint32 type.
    The tensor of column :py:obj:`original_text` is a scalar of the string type.
    The tensor of column :py:obj:`normalized_text` is a scalar of the string type.
    The tensor of column :py:obj:`speaker_id` is a scalar of the uint32 type.
    The tensor of column :py:obj:`chapter_id` is a scalar of the uint32 type.
    The tensor of column :py:obj:`utterance_id` is a scalar of the string type.

    Args:
        dataset_dir (str): Path to the root directory that contains the dataset.
        usage (str, optional): Part of this dataset, can be ``'dev-clean'``, ``'dev-other'``, ``'test-clean'``,
            ``'test-other'``, ``'train-clean-100'``, ``'train-clean-360'``, ``'train-other-500'``, or ``'all'``.
            Default: ``None`` , which means ``'all'``.
        num_samples (int, optional): The number of audio samples to be included in the dataset.
            Default: ``None`` , will read all audio.
        num_parallel_workers (int, optional): Number of worker threads to read the data.
            Default: ``None`` , will use the global default number of workers (8); it can be set
            by :func:`mindspore.dataset.config.set_num_parallel_workers` .
        shuffle (bool, optional): Whether or not to perform shuffle on the dataset.
            Default: ``None`` , expected order behavior shown in the table below.
        sampler (Sampler, optional): Object used to choose samples from the
            dataset. Default: ``None`` , expected order behavior shown in the table below.
        num_shards (int, optional): Number of shards that the dataset will be divided into. Default: ``None`` .
            When this argument is specified, `num_samples` reflects the maximum number of samples per shard.
        shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
            argument can only be specified when `num_shards` is also specified.
        cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
            `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
            Default: ``None`` , which means no cache is used.

    Raises:
        RuntimeError: If `dataset_dir` does not contain data files.
        ValueError: If `num_parallel_workers` exceeds the max thread numbers.
        RuntimeError: If `sampler` and `shuffle` are specified at the same time.
        RuntimeError: If `sampler` and `num_shards`/`shard_id` are specified at the same time.
        RuntimeError: If `num_shards` is specified but `shard_id` is None.
        RuntimeError: If `shard_id` is specified but `num_shards` is None.
        ValueError: If `shard_id` is not in range of [0, `num_shards` ).

    Tutorial Examples:
        - `Load & Process Data With Dataset Pipeline
          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/dataset_gallery.html>`_

    Note:
        - :class:`mindspore.dataset.PKSampler` is not supported by the `sampler` parameter yet.
        - The parameters `num_samples` , `shuffle` , `num_shards` , `shard_id` can be used to control the sampler
          used in the dataset, and their effects when combined with parameter `sampler` are as follows.

    .. include:: mindspore.dataset.sampler.txt

    Examples:
        >>> import mindspore.dataset as ds
        >>> libri_tts_dataset_dir = "/path/to/libri_tts_dataset_directory"
        >>>
        >>> # 1) Read 500 samples (audio files) in libri_tts_dataset_directory
        >>> dataset = ds.LibriTTSDataset(libri_tts_dataset_dir, usage="train-clean-100", num_samples=500)
        >>>
        >>> # 2) Read all samples (audio files) in libri_tts_dataset_directory
        >>> dataset = ds.LibriTTSDataset(libri_tts_dataset_dir)
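        >>>
        >>> # 3) A sketch of reading the dev-clean part on shard 0 of 2 (values are illustrative)
        >>> dataset = ds.LibriTTSDataset(libri_tts_dataset_dir, usage="dev-clean", num_shards=2, shard_id=0)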

    About LibriTTS dataset:

    LibriTTS is a multi-speaker English corpus of approximately 585 hours of read English speech at a 24kHz
    sampling rate, prepared by Heiga Zen with the assistance of Google Speech and Google Brain team members.
    The LibriTTS corpus is designed for TTS research. It is derived from the original materials (mp3 audio
    files from LibriVox and text files from Project Gutenberg) of the LibriSpeech corpus.

    You can construct the following directory structure from the LibriTTS dataset and read it with
    MindSpore's API.

    .. code-block::

        .
        └── libri_tts_dataset_directory
            ├── dev-clean
            │    ├── 116
            │    │    ├── 288045
            │    │    │    ├── 116_288045.trans.tsv
            │    │    │    ├── 116_288045_000003_000000.wav
            │    │    │    └── ...
            │    │    ├── 288046
            │    │    │    ├── 116_288046.trans.tsv
            │    │    │    ├── 116_288046_000003_000000.wav
            │    │    │    └── ...
            │    │    └── ...
            │    ├── 1255
            │    │    ├── 138279
            │    │    │    ├── 1255_138279.trans.tsv
            │    │    │    ├── 1255_138279_000001_000000.wav
            │    │    │    └── ...
            │    │    ├── 74899
            │    │    │    ├── 1255_74899.trans.tsv
            │    │    │    ├── 1255_74899_000001_000000.wav
            │    │    │    └── ...
            │    │    └── ...
            │    └── ...
            └── ...

    Citation:

    .. code-block::

        @article{zen2019libritts,
          title = {LibriTTS: A Corpus Derived from LibriSpeech for Text-to-Speech},
          author = {Heiga Zen and Viet Dang and Rob Clark and Yu Zhang and Ron J. Weiss and
                    Ye Jia and Zhifeng Chen and Yonghui Wu},
          journal = {arXiv preprint arXiv:1904.02882},
          year = {2019},
          howpublished = {http://www.openslr.org/resources/60/},
          description = {The LibriSpeech ASR corpus (http://www.openslr.org/12/) has been used in
                         various research projects. However, as it was originally designed for ASR research,
                         there are some undesired properties when using it for TTS research}
        }
    """

    @check_libri_tts_dataset
    def __init__(self, dataset_dir, usage=None, num_samples=None, num_parallel_workers=None, shuffle=None,
                 sampler=None, num_shards=None, shard_id=None, cache=None):
        super().__init__(num_parallel_workers=num_parallel_workers, sampler=sampler, num_samples=num_samples,
                         shuffle=shuffle, num_shards=num_shards, shard_id=shard_id, cache=cache)

        self.dataset_dir = dataset_dir
        self.usage = replace_none(usage, "all")

    def parse(self, children=None):
        return cde.LibriTTSNode(self.dataset_dir, self.usage, self.sampler)


class LJSpeechDataset(MappableDataset, AudioBaseDataset):
    """
    LJSpeech dataset.

    The generated dataset has four columns :py:obj:`[waveform, sample_rate, transcription, normalized_transcript]` .
    The column :py:obj:`waveform` is a tensor of the float32 type.
    The column :py:obj:`sample_rate` is a scalar of the int32 type.
    The column :py:obj:`transcription` is a scalar of the string type.
    The column :py:obj:`normalized_transcript` is a scalar of the string type.

    Args:
        dataset_dir (str): Path to the root directory that contains the dataset.
        num_samples (int, optional): The number of audio samples to be included in the dataset.
            Default: ``None`` , all audio samples.
        num_parallel_workers (int, optional): Number of worker threads to read the data.
            Default: ``None`` , will use the global default number of workers (8); it can be set
            by :func:`mindspore.dataset.config.set_num_parallel_workers` .
        shuffle (bool, optional): Whether to perform shuffle on the dataset. Default: ``None`` , expected
            order behavior shown in the table below.
        sampler (Sampler, optional): Object used to choose samples from the dataset.
            Default: ``None`` , expected order behavior shown in the table below.
        num_shards (int, optional): Number of shards that the dataset will be divided into.
            Default: ``None`` . When this argument is specified, `num_samples` reflects
            the maximum number of samples per shard.
        shard_id (int, optional): The shard ID within `num_shards` .
            Default: ``None`` . This argument can only be specified when `num_shards` is also specified.
        cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
            `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
            Default: ``None`` , which means no cache is used.

    Raises:
        RuntimeError: If `dataset_dir` does not contain data files.
        RuntimeError: If `sampler` and `shuffle` are specified at the same time.
        RuntimeError: If `sampler` and `num_shards`/`shard_id` are specified at the same time.
        RuntimeError: If `num_shards` is specified but `shard_id` is None.
        RuntimeError: If `shard_id` is specified but `num_shards` is None.
        ValueError: If `num_parallel_workers` exceeds the max thread numbers.
        ValueError: If `shard_id` is not in range of [0, `num_shards` ).

    Tutorial Examples:
        - `Load & Process Data With Dataset Pipeline
          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/dataset_gallery.html>`_

    Note:
        - The parameters `num_samples` , `shuffle` , `num_shards` , `shard_id` can be used to control the sampler
          used in the dataset, and their effects when combined with parameter `sampler` are as follows.

    .. include:: mindspore.dataset.sampler.txt

    Examples:
        >>> import mindspore.dataset as ds
        >>> lj_speech_dataset_dir = "/path/to/lj_speech_dataset_directory"
        >>>
        >>> # 1) Get all samples from LJSPEECH dataset in sequence
        >>> dataset = ds.LJSpeechDataset(dataset_dir=lj_speech_dataset_dir, shuffle=False)
        >>>
        >>> # 2) Randomly select 350 samples from LJSPEECH dataset
        >>> dataset = ds.LJSpeechDataset(dataset_dir=lj_speech_dataset_dir, num_samples=350, shuffle=True)
        >>>
        >>> # 3) Get samples from LJSPEECH dataset for shard 0 in a 2-way distributed training
        >>> dataset = ds.LJSpeechDataset(dataset_dir=lj_speech_dataset_dir, num_shards=2, shard_id=0)
        >>>
        >>> # In LJSPEECH dataset, each dictionary has keys "waveform", "sample_rate", "transcription"
        >>> # and "normalized_transcript"
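        >>>
        >>> # 4) A sketch of reading the transcriptions from a few samples
        >>> # (keys follow the column names documented above)
        >>> dataset = ds.LJSpeechDataset(dataset_dir=lj_speech_dataset_dir, num_samples=3)
        >>> for item in dataset.create_dict_iterator(output_numpy=True):
        ...     transcription = item["transcription"]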

    About LJSPEECH dataset:

    This is a public domain speech dataset consisting of 13,100 short audio clips of a single speaker
    reading passages from 7 non-fiction books. A transcription is provided for each clip.
    Clips vary in length from 1 to 10 seconds and have a total length of approximately 24 hours.

    The texts were published between 1884 and 1964, and are in the public domain.
    The audio was recorded in 2016-17 by the LibriVox project and is also in the public domain.

    Here is the original LJSPEECH dataset structure.
    You can unzip the dataset files into the following directory structure and read them with
    MindSpore's API.

    .. code-block::

        .
        └── LJSpeech-1.1
            ├── README
            ├── metadata.csv
            └── wavs
                ├── LJ001-0001.wav
                ├── LJ001-0002.wav
                ├── LJ001-0003.wav
                ├── LJ001-0004.wav
                ├── LJ001-0005.wav
                ├── LJ001-0006.wav
                ├── LJ001-0007.wav
                ├── LJ001-0008.wav
                ...
                ├── LJ050-0277.wav
                └── LJ050-0278.wav

    Citation:

    .. code-block::

        @misc{lj_speech17,
          author = {Keith Ito and Linda Johnson},
          title = {The LJ Speech Dataset},
          howpublished = {\\url{https://keithito.com/LJ-Speech-Dataset}},
          year = 2017
        }
    """

    @check_lj_speech_dataset
    def __init__(self, dataset_dir, num_samples=None, num_parallel_workers=None, shuffle=None,
                 sampler=None, num_shards=None, shard_id=None, cache=None):
        super().__init__(num_parallel_workers=num_parallel_workers, sampler=sampler, num_samples=num_samples,
                         shuffle=shuffle, num_shards=num_shards, shard_id=shard_id, cache=cache)
        self.dataset_dir = dataset_dir

    def parse(self, children=None):
        return cde.LJSpeechNode(self.dataset_dir, self.sampler)


class SpeechCommandsDataset(MappableDataset, AudioBaseDataset):
    """
    Speech Commands dataset.

    The generated dataset has five columns :py:obj:`[waveform, sample_rate, label, speaker_id, utterance_number]` .
    The tensor of column :py:obj:`waveform` is a vector of the float32 type.
    The tensor of column :py:obj:`sample_rate` is a scalar of the int32 type.
    The tensor of column :py:obj:`label` is a scalar of the string type.
    The tensor of column :py:obj:`speaker_id` is a scalar of the string type.
    The tensor of column :py:obj:`utterance_number` is a scalar of the int32 type.

    Args:
        dataset_dir (str): Path to the root directory that contains the dataset.
        usage (str, optional): Usage of this dataset, can be ``'train'``, ``'test'``, ``'valid'`` or ``'all'``.
            ``'train'`` will read from 84,843 samples, ``'test'`` will read from 11,005 samples, ``'valid'``
            will read from 9,981 samples and ``'all'`` will read from all 105,829 samples.
            Default: ``None`` , will read all samples.
        num_samples (int, optional): The number of samples to be included in the dataset.
            Default: ``None`` , will read all samples.
        num_parallel_workers (int, optional): Number of worker threads to read the data.
            Default: ``None`` , will use the global default number of workers (8); it can be set
            by :func:`mindspore.dataset.config.set_num_parallel_workers` .
        shuffle (bool, optional): Whether or not to perform shuffle on the dataset.
            Default: ``None`` , expected order behavior shown in the table below.
        sampler (Sampler, optional): Object used to choose samples from the dataset.
            Default: ``None`` , expected order behavior shown in the table below.
        num_shards (int, optional): Number of shards that the dataset will be divided into. Default: ``None`` .
            When this argument is specified, `num_samples` reflects the maximum number of samples per shard.
        shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` .
            This argument can only be specified when `num_shards` is also specified.
        cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
            `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
            Default: ``None`` , which means no cache is used.

    Raises:
        RuntimeError: If `dataset_dir` does not contain data files.
        RuntimeError: If `sampler` and `shuffle` are specified at the same time.
        RuntimeError: If `sampler` and `num_shards`/`shard_id` are specified at the same time.
        RuntimeError: If `num_shards` is specified but `shard_id` is None.
        RuntimeError: If `shard_id` is specified but `num_shards` is None.
        ValueError: If `num_parallel_workers` exceeds the max thread numbers.
        ValueError: If `shard_id` is not in range of [0, `num_shards` ).

    Tutorial Examples:
        - `Load & Process Data With Dataset Pipeline
          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/dataset_gallery.html>`_

    Note:
        - The parameters `num_samples` , `shuffle` , `num_shards` , `shard_id` can be used to control the sampler
          used in the dataset, and their effects when combined with parameter `sampler` are as follows.

    .. include:: mindspore.dataset.sampler.txt

    Examples:
        >>> import mindspore.dataset as ds
        >>> speech_commands_dataset_dir = "/path/to/speech_commands_dataset_directory"
        >>>
        >>> # Read 3 samples from SpeechCommands dataset
        >>> dataset = ds.SpeechCommandsDataset(dataset_dir=speech_commands_dataset_dir, num_samples=3)
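        >>>
        >>> # A sketch of reading only the validation split (usage values as documented above)
        >>> dataset = ds.SpeechCommandsDataset(dataset_dir=speech_commands_dataset_dir, usage="valid")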

    About SpeechCommands dataset:

    SpeechCommands is a database for limited-vocabulary speech recognition, containing 105,829 audio samples
    in '.wav' format.

    Here is the original SpeechCommands dataset structure.
    You can unzip the dataset files into this directory structure and read them with MindSpore's API.

    .. code-block::

        .
        └── speech_commands_dataset_dir
            ├── cat
            │    ├── b433eff_nohash_0.wav
            │    ├── 5a33edf_nohash_1.wav
            │    └── ....
            ├── dog
            │    ├── b433w2w_nohash_0.wav
            │    └── ....
            ├── four
            └── ....

    Citation:

    .. code-block::

        @article{2018Speech,
          title = {Speech Commands: A Dataset for Limited-Vocabulary Speech Recognition},
          author = {Warden, P.},
          year = {2018}
        }
    """

    @check_speech_commands_dataset
    def __init__(self, dataset_dir, usage=None, num_samples=None, num_parallel_workers=None, shuffle=None,
                 sampler=None, num_shards=None, shard_id=None, cache=None):
        super().__init__(num_parallel_workers=num_parallel_workers, sampler=sampler, num_samples=num_samples,
                         shuffle=shuffle, num_shards=num_shards, shard_id=shard_id, cache=cache)

        self.dataset_dir = dataset_dir
        self.usage = replace_none(usage, "all")

    def parse(self, children=None):
        return cde.SpeechCommandsNode(self.dataset_dir, self.usage, self.sampler)


class TedliumDataset(MappableDataset, AudioBaseDataset):
    """
    Tedlium dataset.

    The columns of the generated dataset depend on the source SPH files and the corresponding STM files.

    The generated dataset has six columns :py:obj:`[waveform, sample_rate, transcript, talk_id, speaker_id,
    identifier]` .

    The data type of column `waveform` is float32, the data type of column `sample_rate` is int32,
    and the data type of columns `transcript` , `talk_id` , `speaker_id` and `identifier` is string.

    Args:
        dataset_dir (str): Path to the root directory that contains the dataset.
        release (str): Release of the dataset, can be ``'release1'``, ``'release2'`` or ``'release3'``.
        usage (str, optional): Usage of this dataset.
            For release1 or release2, can be ``'train'``, ``'test'``, ``'dev'`` or ``'all'``.
            ``'train'`` will read from train samples,
            ``'test'`` will read from test samples,
            ``'dev'`` will read from dev samples,
            ``'all'`` will read from all samples.
            For release3, it can only be ``'all'``, which reads all data samples. Default: ``None`` , all samples.
        extensions (str, optional): Extensions of the SPH files, only ``'.sph'`` is valid.
            Default: ``None`` , set to ``".sph"``.
        num_samples (int, optional): The number of audio samples to be included in the dataset.
            Default: ``None`` , all samples.
        num_parallel_workers (int, optional): Number of worker threads to read the data.
            Default: ``None`` , will use the global default number of workers (8); it can be set
            by :func:`mindspore.dataset.config.set_num_parallel_workers` .
        shuffle (bool, optional): Whether to perform shuffle on the dataset. Default: ``None`` , expected
            order behavior shown in the table below.
        sampler (Sampler, optional): Object used to choose samples from the
            dataset. Default: ``None`` , expected order behavior shown in the table below.
        num_shards (int, optional): Number of shards that the dataset will be divided
            into. Default: ``None`` . When this argument is specified, `num_samples` reflects
            the maximum number of samples per shard.
        shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
            argument can only be specified when `num_shards` is also specified.
        cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
            `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
            Default: ``None`` , which means no cache is used.

    Raises:
        RuntimeError: If `dataset_dir` does not contain stm files.
        RuntimeError: If `sampler` and `shuffle` are specified at the same time.
        RuntimeError: If `sampler` and `num_shards`/`shard_id` are specified at the same time.
        RuntimeError: If `num_shards` is specified but `shard_id` is None.
        RuntimeError: If `shard_id` is specified but `num_shards` is None.
        ValueError: If `num_parallel_workers` exceeds the max thread numbers.
        ValueError: If `shard_id` is not in range of [0, `num_shards` ).

    Tutorial Examples:
        - `Load & Process Data With Dataset Pipeline
          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/dataset_gallery.html>`_

    Note:
        - The parameters `num_samples` , `shuffle` , `num_shards` , `shard_id` can be used to control the sampler
          used in the dataset, and their effects when combined with parameter `sampler` are as follows.

    .. include:: mindspore.dataset.sampler.txt

    Examples:
        >>> import mindspore.dataset as ds
        >>> # 1) Get all train samples from TEDLIUM_release1 dataset in sequence.
        >>> dataset = ds.TedliumDataset(dataset_dir="/path/to/tedlium1_dataset_directory",
        ...                             release="release1", shuffle=False)
        >>>
        >>> # 2) Randomly select 10 samples from TEDLIUM_release2 dataset.
        >>> dataset = ds.TedliumDataset(dataset_dir="/path/to/tedlium2_dataset_directory",
        ...                             release="release2", num_samples=10, shuffle=True)
        >>>
        >>> # 3) Get samples from TEDLIUM_release-3 dataset for shard 0 in a 2-way distributed training.
        >>> dataset = ds.TedliumDataset(dataset_dir="/path/to/tedlium3_dataset_directory",
        ...                             release="release3", num_shards=2, shard_id=0)
        >>>
        >>> # In TEDLIUM dataset, each dictionary has keys: waveform, sample_rate, transcript, talk_id,
        >>> # speaker_id and identifier.
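        >>>
        >>> # 4) A sketch of iterating a few dev samples from release1
        >>> # (keys follow the column names documented above)
        >>> dataset = ds.TedliumDataset(dataset_dir="/path/to/tedlium1_dataset_directory",
        ...                             release="release1", usage="dev", num_samples=2)
        >>> for item in dataset.create_dict_iterator(output_numpy=True):
        ...     transcript = item["transcript"]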

    About TEDLIUM_release1 dataset:

    The TED-LIUM corpus consists of English-language TED talks, with transcriptions, sampled at 16kHz.
    It contains about 118 hours of speech.

    About TEDLIUM_release2 dataset:

    This is the TED-LIUM corpus release 2, licensed under Creative Commons BY-NC-ND 3.0. All talks and text are
    property of TED Conferences LLC. The TED-LIUM corpus was made from audio talks and their transcriptions
    available on the TED website. We have prepared and filtered these data in order to train acoustic models to
    participate in the International Workshop on Spoken Language Translation 2011 (the LIUM English/French SLT
    system reached the first rank in the SLT task).

    About TEDLIUM_release-3 dataset:

    This is the TED-LIUM corpus release 3, licensed under Creative Commons BY-NC-ND 3.0. All talks and text are
    property of TED Conferences LLC. This new TED-LIUM release was made through a collaboration between the
    Ubiqus company and the LIUM (University of Le Mans, France).

    You can unzip the dataset files into the following directory structure and read them with MindSpore's API.

    The structure of TEDLIUM release2 is the same as TEDLIUM release1, only the data is different.

    .. code-block::

        .
        └── TEDLIUM_release1
            ├── dev
            │    ├── sph
            │    │    ├── AlGore_2009.sph
            │    │    └── BarrySchwartz_2005G.sph
            │    └── stm
            │         ├── AlGore_2009.stm
            │         └── BarrySchwartz_2005G.stm
            ├── test
            │    ├── sph
            │    │    ├── AimeeMullins_2009P.sph
            │    │    └── BillGates_2010.sph
            │    └── stm
            │         ├── AimeeMullins_2009P.stm
            │         └── BillGates_2010.stm
            ├── train
            │    ├── sph
            │    │    ├── AaronHuey_2010X.sph
            │    │    └── AdamGrosser_2007.sph
            │    └── stm
            │         ├── AaronHuey_2010X.stm
            │         └── AdamGrosser_2007.stm
            ├── readme
            └── TEDLIUM.150k.dic

    The directory structure of TEDLIUM release3 is slightly different.

    .. code-block::

        .
        └── TEDLIUM_release-3
            ├── data
            │    ├── ctl
            │    ├── sph
            │    │    ├── 911Mothers_2010W.sph
            │    │    └── AalaElKhani.sph
            │    └── stm
            │         ├── 911Mothers_2010W.stm
            │         └── AalaElKhani.stm
            ├── doc
            ├── legacy
            ├── LM
            ├── speaker-adaptation
            ├── readme
            └── TEDLIUM.150k.dic

    Citation:

    .. code-block::

        @article{
          title = {TED-LIUM: an automatic speech recognition dedicated corpus},
          author = {A. Rousseau, P. Deléglise, Y. Estève},
          journal = {Proceedings of the Eighth International Conference on Language Resources and Evaluation
                     (LREC'12)},
          year = {May 2012},
          biburl = {https://www.openslr.org/7/}
        }

        @article{
          title = {Enhancing the TED-LIUM Corpus with Selected Data for Language Modeling and More TED Talks},
          author = {A. Rousseau, P. Deléglise, and Y. Estève},
          journal = {Proceedings of the Ninth International Conference on Language Resources and Evaluation
                     (LREC'14)},
          year = {May 2014},
          biburl = {https://www.openslr.org/19/}
        }

        @article{
          title = {TED-LIUM 3: twice as much data and corpus repartition for experiments on speaker adaptation},
          author = {François Hernandez, Vincent Nguyen, Sahar Ghannay, Natalia Tomashenko, and Yannick Estève},
          journal = {the 20th International Conference on Speech and Computer (SPECOM 2018)},
          year = {September 2018},
          biburl = {https://www.openslr.org/51/}
        }
    """

    @check_tedlium_dataset
    def __init__(self, dataset_dir, release, usage=None, extensions=None, num_samples=None,
                 num_parallel_workers=None, shuffle=None, sampler=None, num_shards=None,
                 shard_id=None, cache=None):
        super().__init__(num_parallel_workers=num_parallel_workers, sampler=sampler, num_samples=num_samples,
                         shuffle=shuffle, num_shards=num_shards, shard_id=shard_id, cache=cache)
        self.dataset_dir = dataset_dir
        self.extensions = replace_none(extensions, ".sph")
        self.release = release
        self.usage = replace_none(usage, "all")

    def parse(self, children=None):
        return cde.TedliumNode(self.dataset_dir, self.release, self.usage, self.extensions, self.sampler)


class YesNoDataset(MappableDataset, AudioBaseDataset):
    """
    YesNo dataset.

    The generated dataset has three columns :py:obj:`[waveform, sample_rate, labels]` .
    The tensor of column :py:obj:`waveform` is a vector of the float32 type.
    The tensor of column :py:obj:`sample_rate` is a scalar of the int32 type.
    The tensor of column :py:obj:`labels` is a scalar of the int32 type.

    Args:
        dataset_dir (str): Path to the root directory that contains the dataset.
        num_samples (int, optional): The number of audio samples to be included in the dataset.
            Default: ``None`` , will read all samples.
        num_parallel_workers (int, optional): Number of worker threads to read the data.
            Default: ``None`` , will use the global default number of workers (8); it can be set
            by :func:`mindspore.dataset.config.set_num_parallel_workers` .
        shuffle (bool, optional): Whether or not to perform shuffle on the dataset.
            Default: ``None`` , expected order behavior shown in the table below.
        sampler (Sampler, optional): Object used to choose samples from the
            dataset. Default: ``None`` , expected order behavior shown in the table below.
        num_shards (int, optional): Number of shards that the dataset will be divided into. Default: ``None`` .
            When this argument is specified, `num_samples` reflects the maximum number of samples per shard.
        shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This argument can only
            be specified when `num_shards` is also specified.
        cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
            `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
            Default: ``None`` , which means no cache is used.

    Raises:
        RuntimeError: If `dataset_dir` does not contain data files.
        ValueError: If `num_parallel_workers` exceeds the max thread numbers.
        RuntimeError: If `sampler` and `shuffle` are specified at the same time.
        RuntimeError: If `sampler` and `num_shards`/`shard_id` are specified at the same time.
        RuntimeError: If `num_shards` is specified but `shard_id` is None.
        RuntimeError: If `shard_id` is specified but `num_shards` is None.
        ValueError: If `shard_id` is not in range of [0, `num_shards` ).

    Tutorial Examples:
        - `Load & Process Data With Dataset Pipeline
          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/dataset_gallery.html>`_

    Note:
        - The parameters `num_samples` , `shuffle` , `num_shards` , `shard_id` can be used to control the sampler
          used in the dataset, and their effects when combined with parameter `sampler` are as follows.

    .. include:: mindspore.dataset.sampler.txt

    Examples:
        >>> import mindspore.dataset as ds
        >>> yes_no_dataset_dir = "/path/to/yes_no_dataset_directory"
        >>>
        >>> # Read 3 samples from YesNo dataset
        >>> dataset = ds.YesNoDataset(dataset_dir=yes_no_dataset_dir, num_samples=3)
        >>>
        >>> # Note: In YesNo dataset, each dictionary has keys "waveform", "sample_rate", "labels"
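        >>>
        >>> # A sketch of iterating YesNo items; "labels" is parsed from the yes/no
        >>> # pattern in each file name (keys follow the column names documented above)
        >>> dataset = ds.YesNoDataset(dataset_dir=yes_no_dataset_dir, num_samples=2)
        >>> for item in dataset.create_dict_iterator(output_numpy=True):
        ...     labels = item["labels"]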

    About YesNo dataset:

    Yesno is an audio dataset consisting of 60 recordings of one individual saying yes or no in Hebrew; each
    recording is eight words long.

    Here is the original YesNo dataset structure.
    You can unzip the dataset files into this directory structure and read them with MindSpore's API.

    .. code-block::

        .
        └── yes_no_dataset_dir
            ├── 1_1_0_0_1_1_0_0.wav
            ├── 1_0_0_0_1_1_0_0.wav
            └── ....

    Citation:

    .. code-block::

        @NetworkResource{Kaldi_audio_project,
          author = {anonymous},
          url = "http://www.openslr.org/1/"
        }
    """

    @check_yes_no_dataset
    def __init__(self, dataset_dir, num_samples=None, num_parallel_workers=None, shuffle=None,
                 sampler=None, num_shards=None, shard_id=None, cache=None):
        super().__init__(num_parallel_workers=num_parallel_workers, sampler=sampler, num_samples=num_samples,
                         shuffle=shuffle, num_shards=num_shards, shard_id=shard_id, cache=cache)

        self.dataset_dir = dataset_dir

    def parse(self, children=None):
        return cde.YesNoNode(self.dataset_dir, self.sampler)