# Copyright 2019-2023 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
This file contains specific text dataset loading classes. You can easily use
these classes to load the prepared dataset. For example:
    IMDBDataset: which is IMDB dataset.
    WikiTextDataset: which is Wiki text dataset.
    CLUEDataset: which is CLUE dataset.
    YelpReviewDataset: which is Yelp review dataset.
    ...
After declaring the dataset object, you can further apply dataset operations
(e.g. filter, skip, concat, map, batch) on it.
"""
import mindspore._c_dataengine as cde

from .datasets import TextBaseDataset, SourceDataset, MappableDataset, Shuffle
from .validators import check_imdb_dataset, check_iwslt2016_dataset, check_iwslt2017_dataset, \
    check_penn_treebank_dataset, check_ag_news_dataset, check_amazon_review_dataset, check_udpos_dataset, \
    check_wiki_text_dataset, check_conll2000_dataset, check_cluedataset, \
    check_sogou_news_dataset, check_textfiledataset, check_dbpedia_dataset, check_yelp_review_dataset, \
    check_en_wik9_dataset, check_yahoo_answers_dataset, check_multi30k_dataset, check_squad_dataset, \
    check_sst2_dataset

from ..core.validator_helpers import replace_none


class AGNewsDataset(SourceDataset, TextBaseDataset):
    """
    AG News dataset.

    The generated dataset has three columns: :py:obj:`[index, title, description]` ,
    and the data type of the three columns is string.

    Args:
        dataset_dir (str): Path to the root directory that contains the dataset.
        usage (str, optional): Acceptable usages include ``'train'`` , ``'test'`` and ``'all'`` .
            Default: ``None`` , all samples.
        num_samples (int, optional): Number of samples (rows) to read. Default: ``None`` ,
            reads the full dataset.
        num_parallel_workers (int, optional): Number of worker threads to read the data.
            Default: ``None`` , will use the global default number of workers (8), it can be set
            by :func:`mindspore.dataset.config.set_num_parallel_workers` .
        shuffle (Union[bool, Shuffle], optional): Perform reshuffling of the data every epoch.
            Both bool type and Shuffle enum are supported.
            Default: ``Shuffle.GLOBAL`` .
            If `shuffle` is ``False`` , no shuffling will be performed.
            If `shuffle` is ``True`` , it is equivalent to setting `shuffle` to
            ``mindspore.dataset.Shuffle.GLOBAL`` .
            Set the mode of data shuffling by passing in enumeration variables:

            - ``Shuffle.GLOBAL`` : Shuffle both the files and samples.

            - ``Shuffle.FILES`` : Shuffle files only.

        num_shards (int, optional): Number of shards that the dataset will be divided into.
            Default: ``None`` . When this argument is specified, `num_samples` reflects the
            maximum number of samples per shard.
        shard_id (int, optional): The shard ID within `num_shards` . This
            argument can only be specified when `num_shards` is also specified. Default: ``None`` .
        cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
            `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
            Default: ``None`` , which means no cache is used.

    Raises:
        RuntimeError: If `dataset_dir` does not contain data files.
        RuntimeError: If `num_shards` is specified but `shard_id` is None.
        RuntimeError: If `shard_id` is specified but `num_shards` is None.
        ValueError: If `num_parallel_workers` exceeds the max thread numbers.

    Tutorial Examples:
        - `Load & Process Data With Dataset Pipeline
          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/dataset_gallery.html>`_

    Examples:
        >>> import mindspore.dataset as ds
        >>> ag_news_dataset_dir = "/path/to/ag_news_dataset_file"
        >>> dataset = ds.AGNewsDataset(dataset_dir=ag_news_dataset_dir, usage='all')
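        >>>
        >>> # A minimal sketch of chaining a further operation, as the module overview
        >>> # notes; the batch size is arbitrary and for illustration only.
        >>> dataset = dataset.batch(batch_size=32)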

    About AGNews dataset:

    AG is a collection of over 1 million news articles. The news articles were collected
    by ComeToMyHead from more than 2,000 news sources over more than 1 year of activity. ComeToMyHead
    is an academic news search engine that has been in operation since July 2004.
    The dataset is provided by academics for research purposes such as data mining
    (clustering, classification, etc.), information retrieval (ranking, searching, etc.),
    xml, data compression, data streaming, and any other non-commercial activities.
    AG's news topic classification dataset was constructed by selecting the four largest
    classes from the original corpus. Each class contains 30,000 training samples and
    1,900 test samples. The total number of training samples in train.csv is 120,000
    and the number of test samples in test.csv is 7,600.

    You can unzip the dataset files into the following structure and read them with MindSpore's API:

    .. code-block::

        .
        └── ag_news_dataset_dir
            ├── classes.txt
            ├── train.csv
            ├── test.csv
            └── readme.txt

    Citation:

    .. code-block::

        @misc{zhang2015characterlevel,
        title={Character-level Convolutional Networks for Text Classification},
        author={Xiang Zhang and Junbo Zhao and Yann LeCun},
        year={2015},
        eprint={1509.01626},
        archivePrefix={arXiv},
        primaryClass={cs.LG}
        }
    """

    @check_ag_news_dataset
    def __init__(self, dataset_dir, usage=None, num_samples=None,
                 num_parallel_workers=None, shuffle=Shuffle.GLOBAL, num_shards=None, shard_id=None, cache=None):
        super().__init__(num_parallel_workers=num_parallel_workers, num_samples=num_samples, shuffle=shuffle,
                         num_shards=num_shards, shard_id=shard_id, cache=cache)
        self.dataset_dir = dataset_dir
        self.usage = replace_none(usage, "all")

    def parse(self, children=None):
        return cde.AGNewsNode(self.dataset_dir, self.usage, self.num_samples, self.shuffle_flag, self.num_shards,
                              self.shard_id)


class AmazonReviewDataset(SourceDataset, TextBaseDataset):
    """
    Amazon Review Polarity and Amazon Review Full datasets.

    The generated dataset has three columns: :py:obj:`[label, title, content]` ,
    and the data type of the three columns is string.

    Args:
        dataset_dir (str): Path to the root directory that contains the Amazon Review Polarity dataset
            or the Amazon Review Full dataset.
        usage (str, optional): Usage of this dataset, can be ``'train'`` , ``'test'`` or ``'all'`` .
            For the Polarity dataset, ``'train'`` will read from 3,600,000 train samples,
            ``'test'`` will read from 400,000 test samples,
            ``'all'`` will read from all 4,000,000 samples.
            For the Full dataset, ``'train'`` will read from 3,000,000 train samples,
            ``'test'`` will read from 650,000 test samples,
            ``'all'`` will read from all 3,650,000 samples. Default: ``None`` , all samples.
        num_samples (int, optional): Number of samples (rows) to be read. Default: ``None`` ,
            reads the full dataset.
        num_parallel_workers (int, optional): Number of worker threads to read the data.
            Default: ``None`` , will use the global default number of workers (8), it can be set
            by :func:`mindspore.dataset.config.set_num_parallel_workers` .
        shuffle (Union[bool, Shuffle], optional): Perform reshuffling of the data every epoch.
            Both bool type and Shuffle enum are supported.
            Default: ``Shuffle.GLOBAL`` .
            If `shuffle` is ``False`` , no shuffling will be performed.
            If `shuffle` is ``True`` , it is equivalent to setting `shuffle` to
            ``mindspore.dataset.Shuffle.GLOBAL`` .
            Set the mode of data shuffling by passing in enumeration variables:

            - ``Shuffle.GLOBAL`` : Shuffle both the files and samples.

            - ``Shuffle.FILES`` : Shuffle files only.

        num_shards (int, optional): Number of shards that the dataset will be divided into. Default: ``None`` .
            When this argument is specified, `num_samples` reflects the maximum number of samples per shard.
        shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
            argument can only be specified when `num_shards` is also specified.
        cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
            `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
            Default: ``None`` , which means no cache is used.

    Raises:
        RuntimeError: If `dataset_dir` does not contain data files.
        RuntimeError: If `num_shards` is specified but `shard_id` is None.
        RuntimeError: If `shard_id` is specified but `num_shards` is None.
        ValueError: If `num_parallel_workers` exceeds the max thread numbers.

    Tutorial Examples:
        - `Load & Process Data With Dataset Pipeline
          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/dataset_gallery.html>`_

    Examples:
        >>> import mindspore.dataset as ds
        >>> amazon_review_dataset_dir = "/path/to/amazon_review_dataset_dir"
        >>> dataset = ds.AmazonReviewDataset(dataset_dir=amazon_review_dataset_dir, usage='all')
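        >>>
        >>> # A minimal sketch of taking a fixed number of rows for a quick check;
        >>> # the count is arbitrary and for illustration only.
        >>> dataset = dataset.take(1000)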

    About AmazonReview Dataset:

    The Amazon Review Full dataset consists of reviews from Amazon. The data span a period of 18 years, including ~35
    million reviews up to March 2013. Reviews include product and user information, ratings, and a plaintext review.
    The dataset is mainly used for text classification: given the content and title, predict the correct star rating.

    The Amazon Review Polarity dataset is constructed by taking review scores 1 and 2 as negative, and 4 and 5 as
    positive. Samples with score 3 are ignored.

    The Amazon Review Polarity and Amazon Review Full datasets share the same directory structure.
    You can unzip the dataset files into the following structure and read them with MindSpore's API:

    .. code-block::

        .
        └── amazon_review_dir
            ├── train.csv
            ├── test.csv
            └── readme.txt

    Citation:

    .. code-block::

        @article{zhang2015character,
        title={Character-level convolutional networks for text classification},
        author={Zhang, Xiang and Zhao, Junbo and LeCun, Yann},
        journal={Advances in neural information processing systems},
        volume={28},
        pages={649--657},
        year={2015}
        }
    """

    @check_amazon_review_dataset
    def __init__(self, dataset_dir, usage=None, num_samples=None, num_parallel_workers=None, shuffle=Shuffle.GLOBAL,
                 num_shards=None, shard_id=None, cache=None):
        super().__init__(num_parallel_workers=num_parallel_workers, num_samples=num_samples, shuffle=shuffle,
                         num_shards=num_shards, shard_id=shard_id, cache=cache)
        self.dataset_dir = dataset_dir
        self.usage = replace_none(usage, 'all')

    def parse(self, children=None):
        return cde.AmazonReviewNode(self.dataset_dir, self.usage, self.num_samples, self.shuffle_flag,
                                    self.num_shards, self.shard_id)


class CLUEDataset(SourceDataset, TextBaseDataset):
    """
    CLUE (Chinese Language Understanding Evaluation) dataset.
    Supported CLUE classification tasks: ``'AFQMC'`` , ``'TNEWS'`` , ``'IFLYTEK'`` , ``'CMNLI'`` ,
    ``'WSC'`` and ``'CSL'`` .

    Args:
        dataset_files (Union[str, list[str]]): String or list of files to be read or glob strings to search for
            a pattern of files. The list will be sorted in lexicographical order.
        task (str, optional): The kind of task, one of ``'AFQMC'`` , ``'TNEWS'`` , ``'IFLYTEK'`` , ``'CMNLI'`` ,
            ``'WSC'`` and ``'CSL'`` . Default: ``'AFQMC'`` .
        usage (str, optional): Specify the ``'train'`` , ``'test'`` or ``'eval'`` part of the dataset.
            Default: ``'train'`` .
        num_samples (int, optional): The number of samples to be included in the dataset.
            Default: ``None`` , will include all samples.
        num_parallel_workers (int, optional): Number of worker threads to read the data.
            Default: ``None`` , will use the global default number of workers (8), it can be set
            by :func:`mindspore.dataset.config.set_num_parallel_workers` .
        shuffle (Union[bool, Shuffle], optional): Perform reshuffling of the data every epoch.
            Default: ``Shuffle.GLOBAL`` . Both bool type and Shuffle enum are supported.
            If `shuffle` is ``False`` , no shuffling will be performed.
            If `shuffle` is ``True`` , performs global shuffle.
            There are three levels of shuffling, desired shuffle enum defined by :class:`mindspore.dataset.Shuffle` .

            - ``Shuffle.GLOBAL`` : Shuffle both the files and samples, same as setting `shuffle` to ``True`` .

            - ``Shuffle.FILES`` : Shuffle files only.

        num_shards (int, optional): Number of shards that the dataset will be divided into. Default: ``None`` .
            When this argument is specified, `num_samples` reflects the maximum number of samples per shard.
        shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
            argument can only be specified when `num_shards` is also specified.
        cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
            `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
            Default: ``None`` , which means no cache is used.

    The generated dataset has different output columns with different `task` settings:

    +-------------------------+------------------------------+-----------------------------+
    | `task`                  | `usage`                      | Output column               |
    +=========================+==============================+=============================+
    | AFQMC                   | train                        | [sentence1, dtype=string]   |
    |                         |                              |                             |
    |                         |                              | [sentence2, dtype=string]   |
    |                         |                              |                             |
    |                         |                              | [label, dtype=string]       |
    |                         +------------------------------+-----------------------------+
    |                         | test                         | [id, dtype=uint32]          |
    |                         |                              |                             |
    |                         |                              | [sentence1, dtype=string]   |
    |                         |                              |                             |
    |                         |                              | [sentence2, dtype=string]   |
    |                         +------------------------------+-----------------------------+
    |                         | eval                         | [sentence1, dtype=string]   |
    |                         |                              |                             |
    |                         |                              | [sentence2, dtype=string]   |
    |                         |                              |                             |
    |                         |                              | [label, dtype=string]       |
    +-------------------------+------------------------------+-----------------------------+
    | TNEWS                   | train                        | [label, dtype=string]       |
    |                         |                              |                             |
    |                         |                              | [label_des, dtype=string]   |
    |                         |                              |                             |
    |                         |                              | [sentence, dtype=string]    |
    |                         |                              |                             |
    |                         |                              | [keywords, dtype=string]    |
    |                         +------------------------------+-----------------------------+
    |                         | test                         | [label, dtype=uint32]       |
    |                         |                              |                             |
    |                         |                              | [keywords, dtype=string]    |
    |                         |                              |                             |
    |                         |                              | [sentence, dtype=string]    |
    |                         +------------------------------+-----------------------------+
    |                         | eval                         | [label, dtype=string]       |
    |                         |                              |                             |
    |                         |                              | [label_des, dtype=string]   |
    |                         |                              |                             |
    |                         |                              | [sentence, dtype=string]    |
    |                         |                              |                             |
    |                         |                              | [keywords, dtype=string]    |
    +-------------------------+------------------------------+-----------------------------+
    | IFLYTEK                 | train                        | [label, dtype=string]       |
    |                         |                              |                             |
    |                         |                              | [label_des, dtype=string]   |
    |                         |                              |                             |
    |                         |                              | [sentence, dtype=string]    |
    |                         +------------------------------+-----------------------------+
    |                         | test                         | [id, dtype=uint32]          |
    |                         |                              |                             |
    |                         |                              | [sentence, dtype=string]    |
    |                         +------------------------------+-----------------------------+
    |                         | eval                         | [label, dtype=string]       |
    |                         |                              |                             |
    |                         |                              | [label_des, dtype=string]   |
    |                         |                              |                             |
    |                         |                              | [sentence, dtype=string]    |
    +-------------------------+------------------------------+-----------------------------+
    | CMNLI                   | train                        | [sentence1, dtype=string]   |
    |                         |                              |                             |
    |                         |                              | [sentence2, dtype=string]   |
    |                         |                              |                             |
    |                         |                              | [label, dtype=string]       |
    |                         +------------------------------+-----------------------------+
    |                         | test                         | [id, dtype=uint32]          |
    |                         |                              |                             |
    |                         |                              | [sentence1, dtype=string]   |
    |                         |                              |                             |
    |                         |                              | [sentence2, dtype=string]   |
    |                         +------------------------------+-----------------------------+
    |                         | eval                         | [sentence1, dtype=string]   |
    |                         |                              |                             |
    |                         |                              | [sentence2, dtype=string]   |
    |                         |                              |                             |
    |                         |                              | [label, dtype=string]       |
    +-------------------------+------------------------------+-----------------------------+
    | WSC                     | train                        | [span1_index, dtype=uint32] |
    |                         |                              |                             |
    |                         |                              | [span2_index, dtype=uint32] |
    |                         |                              |                             |
    |                         |                              | [span1_text, dtype=string]  |
    |                         |                              |                             |
    |                         |                              | [span2_text, dtype=string]  |
    |                         |                              |                             |
    |                         |                              | [idx, dtype=uint32]         |
    |                         |                              |                             |
    |                         |                              | [text, dtype=string]        |
    |                         |                              |                             |
    |                         |                              | [label, dtype=string]       |
    |                         +------------------------------+-----------------------------+
    |                         | test                         | [span1_index, dtype=uint32] |
    |                         |                              |                             |
    |                         |                              | [span2_index, dtype=uint32] |
    |                         |                              |                             |
    |                         |                              | [span1_text, dtype=string]  |
    |                         |                              |                             |
    |                         |                              | [span2_text, dtype=string]  |
    |                         |                              |                             |
    |                         |                              | [idx, dtype=uint32]         |
    |                         |                              |                             |
    |                         |                              | [text, dtype=string]        |
    |                         +------------------------------+-----------------------------+
    |                         | eval                         | [span1_index, dtype=uint32] |
    |                         |                              |                             |
    |                         |                              | [span2_index, dtype=uint32] |
    |                         |                              |                             |
    |                         |                              | [span1_text, dtype=string]  |
    |                         |                              |                             |
    |                         |                              | [span2_text, dtype=string]  |
    |                         |                              |                             |
    |                         |                              | [idx, dtype=uint32]         |
    |                         |                              |                             |
    |                         |                              | [text, dtype=string]        |
    |                         |                              |                             |
    |                         |                              | [label, dtype=string]       |
    +-------------------------+------------------------------+-----------------------------+
    | CSL                     | train                        | [id, dtype=uint32]          |
    |                         |                              |                             |
    |                         |                              | [abst, dtype=string]        |
    |                         |                              |                             |
    |                         |                              | [keyword, dtype=string]     |
    |                         |                              |                             |
    |                         |                              | [label, dtype=string]       |
    |                         +------------------------------+-----------------------------+
    |                         | test                         | [id, dtype=uint32]          |
    |                         |                              |                             |
    |                         |                              | [abst, dtype=string]        |
    |                         |                              |                             |
    |                         |                              | [keyword, dtype=string]     |
    |                         +------------------------------+-----------------------------+
    |                         | eval                         | [id, dtype=uint32]          |
    |                         |                              |                             |
    |                         |                              | [abst, dtype=string]        |
    |                         |                              |                             |
    |                         |                              | [keyword, dtype=string]     |
    |                         |                              |                             |
    |                         |                              | [label, dtype=string]       |
    +-------------------------+------------------------------+-----------------------------+

    Raises:
        ValueError: If `dataset_files` are not valid or do not exist.
        ValueError: If `task` is not ``'AFQMC'`` , ``'TNEWS'`` , ``'IFLYTEK'`` , ``'CMNLI'`` , ``'WSC'``
            or ``'CSL'`` .
        ValueError: If `usage` is not ``'train'`` , ``'test'`` or ``'eval'`` .
        ValueError: If `num_parallel_workers` exceeds the max thread numbers.
        ValueError: If `shard_id` is not in range of [0, `num_shards` ).
        RuntimeError: If `num_shards` is specified but `shard_id` is None.
        RuntimeError: If `shard_id` is specified but `num_shards` is None.

    Tutorial Examples:
        - `Load & Process Data With Dataset Pipeline
          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/dataset_gallery.html>`_

    Examples:
        >>> import mindspore.dataset as ds
        >>> clue_dataset_dir = ["/path/to/clue_dataset_file"]  # contains 1 or multiple clue files
        >>> dataset = ds.CLUEDataset(dataset_files=clue_dataset_dir, task='AFQMC', usage='train')
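        >>>
        >>> # A minimal sketch of consuming the parsed columns; the column names follow
        >>> # the task/usage table above, iterating a single epoch.
        >>> for item in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        ...     sentence1 = item["sentence1"]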

    About CLUE dataset:

    CLUE is a Chinese Language Understanding Evaluation benchmark. It contains multiple
    tasks, including single-sentence classification, sentence pair classification, and machine
    reading comprehension.

    You can unzip the dataset files into the following structure and read them with MindSpore's API,
    such as the afqmc dataset:

    .. code-block::

        .
        └── afqmc_public
            ├── train.json
            ├── test.json
            └── dev.json

    Citation:

    .. code-block::

        @article{CLUEbenchmark,
        title   = {CLUE: A Chinese Language Understanding Evaluation Benchmark},
        author  = {Liang Xu, Xuanwei Zhang, Lu Li, Hai Hu, Chenjie Cao, Weitang Liu, Junyi Li, Yudong Li,
                Kai Sun, Yechen Xu, Yiming Cui, Cong Yu, Qianqian Dong, Yin Tian, Dian Yu, Bo Shi, Jun Zeng,
                Rongzhao Wang, Weijian Xie, Yanting Li, Yina Patterson, Zuoyu Tian, Yiwen Zhang, He Zhou,
                Shaoweihua Liu, Qipeng Zhao, Cong Yue, Xinrui Zhang, Zhengliang Yang, Zhenzhong Lan},
        journal = {arXiv preprint arXiv:2004.05986},
        year    = {2020},
        howpublished = {https://github.com/CLUEbenchmark/CLUE}
        }
    """

    @check_cluedataset
    def __init__(self, dataset_files, task='AFQMC', usage='train', num_samples=None, num_parallel_workers=None,
                 shuffle=Shuffle.GLOBAL, num_shards=None, shard_id=None, cache=None):
        super().__init__(num_parallel_workers=num_parallel_workers, num_samples=num_samples, shuffle=shuffle,
                         num_shards=num_shards, shard_id=shard_id, cache=cache)
        self.dataset_files = self._find_files(dataset_files)
        self.usage = replace_none(usage, 'train')
        self.task = replace_none(task, 'AFQMC')

    def parse(self, children=None):
        return cde.CLUENode(self.dataset_files, self.task, self.usage, self.num_samples, self.shuffle_flag,
                            self.num_shards, self.shard_id)


class CoNLL2000Dataset(SourceDataset, TextBaseDataset):
    """
    CoNLL-2000 (Conference on Computational Natural Language Learning) chunking dataset.

    The generated dataset has three columns: :py:obj:`[word, pos_tag, chunk_tag]` .
    The tensors of column :py:obj:`word` , column :py:obj:`pos_tag` ,
    and column :py:obj:`chunk_tag` are of the string type.

    Args:
        dataset_dir (str): Path to the root directory that contains the CoNLL2000 chunking dataset.
        usage (str, optional): Usage of dataset, can be ``'train'`` , ``'test'`` , or ``'all'`` .
            ``'train'`` will read from 8,936 train samples,
            ``'test'`` will read from 2,012 test samples,
            ``'all'`` will read from all 10,948 samples. Default: ``None`` , read all samples.
        num_samples (int, optional): Number of samples (rows) to be read. Default: ``None`` ,
            read the full dataset.
        shuffle (Union[bool, Shuffle], optional): Perform reshuffling of the data every epoch.
            Default: ``Shuffle.GLOBAL`` .
            If `shuffle` is ``False`` , no shuffling will be performed.
            If `shuffle` is ``True`` , performs global shuffle.
            There are three levels of shuffling, desired shuffle enum defined by
            :class:`mindspore.dataset.Shuffle` .

            - ``Shuffle.GLOBAL`` : Shuffle both the files and samples, same as setting `shuffle` to ``True`` .
            - ``Shuffle.FILES`` : Shuffle files only.

        num_shards (int, optional): Number of shards that the dataset will be divided into.
            When this argument is specified, `num_samples` reflects the maximum number of samples per shard.
            Default: ``None`` .
        shard_id (int, optional): The shard ID within `num_shards` . This
            argument can only be specified when `num_shards` is also specified. Default: ``None`` .
        num_parallel_workers (int, optional): Number of worker threads to read the data.
            Default: ``None`` , will use the global default number of workers (8), it can be set
            by :func:`mindspore.dataset.config.set_num_parallel_workers` .
        cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
            `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
            Default: ``None`` , which means no cache is used.

    Raises:
        RuntimeError: If `dataset_dir` does not contain data files.
        ValueError: If `num_parallel_workers` exceeds the max thread numbers.
        RuntimeError: If `num_shards` is specified but `shard_id` is None.
        RuntimeError: If `shard_id` is specified but `num_shards` is None.

    Tutorial Examples:
        - `Load & Process Data With Dataset Pipeline
          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/dataset_gallery.html>`_

    Examples:
        >>> import mindspore.dataset as ds
        >>> conll2000_dataset_dir = "/path/to/conll2000_dataset_dir"
        >>> dataset = ds.CoNLL2000Dataset(dataset_dir=conll2000_dataset_dir, usage='all')
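        >>>
        >>> # A minimal sketch of chaining a further operation from the module overview;
        >>> # skipping the first 100 samples is an arbitrary, illustrative choice.
        >>> dataset = dataset.skip(100)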

    About CoNLL2000 Dataset:

    The CoNLL2000 chunking dataset consists of the text from sections 15-20 of the Wall Street Journal corpus.
    Texts are chunked using IOB notation, and the chunk types include NP, VP, PP, ADJP and ADVP.
    The dataset consists of three columns separated by spaces. The first column contains the current word,
    the second is the part-of-speech tag as derived by the Brill tagger and the third is the chunk tag as derived
    from the WSJ corpus. Text chunking consists of dividing a text into syntactically correlated parts of words.

    You can unzip the dataset files into the following structure and read them with MindSpore's API:

    .. code-block::

        .
        └── conll2000_dataset_dir
            ├── train.txt
            ├── test.txt
            └── readme.txt

    Citation:

    .. code-block::

        @inproceedings{tksbuchholz2000conll,
        author     = {Tjong Kim Sang, Erik F. and Sabine Buchholz},
        title      = {Introduction to the CoNLL-2000 Shared Task: Chunking},
        editor     = {Claire Cardie and Walter Daelemans and Claire Nedellec and Tjong Kim Sang, Erik},
        booktitle  = {Proceedings of CoNLL-2000 and LLL-2000},
        publisher  = {Lisbon, Portugal},
        pages      = {127--132},
        year       = {2000}
        }
    """

    @check_conll2000_dataset
    def __init__(self, dataset_dir, usage=None, num_samples=None, shuffle=Shuffle.GLOBAL, num_shards=None,
                 shard_id=None, num_parallel_workers=None, cache=None):
        super().__init__(num_parallel_workers=num_parallel_workers, num_samples=num_samples, shuffle=shuffle,
                         num_shards=num_shards, shard_id=shard_id, cache=cache)
        self.dataset_dir = dataset_dir
        self.usage = replace_none(usage, 'all')

    def parse(self, children=None):
        return cde.CoNLL2000Node(self.dataset_dir, self.usage, self.num_samples, self.shuffle_flag, self.num_shards,
                                 self.shard_id)


class DBpediaDataset(SourceDataset, TextBaseDataset):
    """
    DBpedia dataset.

    The generated dataset has three columns :py:obj:`[class, title, content]` ,
    and the data type of the three columns is string.

    Args:
        dataset_dir (str): Path to the root directory that contains the dataset.
        usage (str, optional): Usage of this dataset, can be ``'train'`` , ``'test'`` or ``'all'`` .
            ``'train'`` will read from 560,000 train samples,
            ``'test'`` will read from 70,000 test samples,
            ``'all'`` will read from all 630,000 samples. Default: ``None`` , all samples.
        num_samples (int, optional): The number of samples to be included in the dataset.
            Default: ``None`` , will include all text.
        num_parallel_workers (int, optional): Number of worker threads to read the data.
            Default: ``None`` , will use the global default number of workers (8), it can be set
            by :func:`mindspore.dataset.config.set_num_parallel_workers` .
        shuffle (Union[bool, Shuffle], optional): Perform reshuffling of the data every epoch.
            Both bool type and Shuffle enum are supported.
            Default: ``Shuffle.GLOBAL`` .
            If `shuffle` is ``False`` , no shuffling will be performed.
            If `shuffle` is ``True`` , it is equivalent to setting `shuffle` to
            ``mindspore.dataset.Shuffle.GLOBAL`` .
            Set the mode of data shuffling by passing in enumeration variables:

            - ``Shuffle.GLOBAL`` : Shuffle both the files and samples.

            - ``Shuffle.FILES`` : Shuffle files only.

        num_shards (int, optional): Number of shards that the dataset will be divided into. Default: ``None`` .
            When this argument is specified, `num_samples` reflects the maximum number of samples per shard.
        shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
            argument can only be specified when `num_shards` is also specified.
        cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
            `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
            Default: ``None`` , which means no cache is used.

    Raises:
        RuntimeError: If `dataset_dir` does not contain data files.
        RuntimeError: If `num_shards` is specified but `shard_id` is None.
        RuntimeError: If `shard_id` is specified but `num_shards` is None.
        ValueError: If `num_parallel_workers` exceeds the max thread numbers.
        ValueError: If `shard_id` is not in range of [0, `num_shards` ).

    Tutorial Examples:
        - `Load & Process Data With Dataset Pipeline
          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/dataset_gallery.html>`_

    Examples:
        >>> import mindspore.dataset as ds
        >>> dbpedia_dataset_dir = "/path/to/dbpedia_dataset_directory"
        >>>
        >>> # 1) Read 3 samples from DBpedia dataset
        >>> dataset = ds.DBpediaDataset(dataset_dir=dbpedia_dataset_dir, num_samples=3)
        >>>
        >>> # 2) Read train samples from DBpedia dataset
        >>> dataset = ds.DBpediaDataset(dataset_dir=dbpedia_dataset_dir, usage="train")
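        >>>
        >>> # 3) A minimal sketch of batching the result, as the module overview notes;
        >>> # the batch size is arbitrary and for illustration only.
        >>> dataset = dataset.batch(batch_size=16, drop_remainder=True)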

    About DBpedia dataset:

    The DBpedia dataset consists of 630,000 text samples in 14 classes: there are 560,000 samples in train.csv
    and 70,000 samples in test.csv.
    The 14 classes are Company, EducationalInstitution, Artist, Athlete, OfficeHolder,
    MeanOfTransportation, Building, NaturalPlace, Village, Animal, Plant, Album, Film and WrittenWork.

    Here is the original DBpedia dataset structure.
    You can unzip the dataset files into this directory structure and read them with MindSpore's API.

    .. code-block::

        .
        └── dbpedia_dataset_dir
            ├── train.csv
            ├── test.csv
            ├── classes.txt
            └── readme.txt

    Citation:

    .. code-block::

        @article{DBpedia,
        title = {DBPedia Ontology Classification Dataset},
        author = {Jens Lehmann, Robert Isele, Max Jakob, Anja Jentzsch, Dimitris Kontokostas,
                Pablo N. Mendes, Sebastian Hellmann, Mohamed Morsey, Patrick van Kleef,
                Sören Auer, Christian Bizer},
        year = {2015},
        howpublished = {http://dbpedia.org}
        }
    """

    @check_dbpedia_dataset
    def __init__(self, dataset_dir, usage=None, num_samples=None, num_parallel_workers=None, shuffle=Shuffle.GLOBAL,
                 num_shards=None, shard_id=None, cache=None):
        super().__init__(num_parallel_workers=num_parallel_workers, num_samples=num_samples, shuffle=shuffle,
                         num_shards=num_shards, shard_id=shard_id, cache=cache)
        self.dataset_dir = dataset_dir
        self.usage = replace_none(usage, "all")

    def parse(self, children=None):
        return cde.DBpediaNode(self.dataset_dir, self.usage, self.num_samples, self.shuffle_flag, self.num_shards,
                               self.shard_id)


class EnWik9Dataset(SourceDataset, TextBaseDataset):
    """
    EnWik9 dataset.

    The generated dataset has one column :py:obj:`[text]` with type string.

    Args:
        dataset_dir (str): Path to the root directory that contains the dataset.
        num_samples (int, optional): The number of samples to be included in the dataset.
            Default: ``None`` , will include all samples.
        num_parallel_workers (int, optional): Number of worker threads to read the data.
            Default: ``None`` , will use the global default number of workers (8), it can be set
            by :func:`mindspore.dataset.config.set_num_parallel_workers` .
        shuffle (Union[bool, Shuffle], optional): Perform reshuffling of the data every epoch.
            Both bool type and Shuffle enum are supported. Default: ``True`` .
            If `shuffle` is ``False`` , no shuffling will be performed.
            If `shuffle` is ``True`` , it is equivalent to setting `shuffle` to
            ``mindspore.dataset.Shuffle.GLOBAL`` .
            Set the mode of data shuffling by passing in enumeration variables:

            - ``Shuffle.GLOBAL`` : Shuffle both the files and samples.

            - ``Shuffle.FILES`` : Shuffle files only.

        num_shards (int, optional): Number of shards that the dataset will be divided into. Default: ``None`` .
            When this argument is specified, `num_samples` reflects the maximum number of samples per shard.
        shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
            argument can only be specified when `num_shards` is also specified.
        cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
            `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
            Default: ``None`` , which means no cache is used.

    Raises:
        RuntimeError: If `dataset_dir` does not contain data files.
        RuntimeError: If `num_shards` is specified but `shard_id` is None.
        RuntimeError: If `shard_id` is specified but `num_shards` is None.
        ValueError: If `num_parallel_workers` exceeds the max thread numbers.

    Tutorial Examples:
        - `Load & Process Data With Dataset Pipeline
          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/dataset_gallery.html>`_

    Examples:
        >>> import mindspore.dataset as ds
        >>> en_wik9_dataset_dir = "/path/to/en_wik9_dataset"
        >>> dataset = ds.EnWik9Dataset(dataset_dir=en_wik9_dataset_dir, num_samples=2,
        ...                            shuffle=True)
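        >>>
        >>> # A minimal sketch of inspecting the pipeline; get_dataset_size() returns
        >>> # the number of rows the pipeline will produce.
        >>> size = dataset.get_dataset_size()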

    About EnWik9 dataset:

    The data of EnWik9 is UTF-8 encoded XML consisting primarily of English text. It contains 243,426 article
    titles, of which 85,560 are #REDIRECT to fix broken links, and the rest are regular articles.

    The data is UTF-8 clean. All characters are in the range U+0000 to U+10FFFF with valid encodings of 1 to
    4 bytes. The byte values 0xC0, 0xC1, and 0xF5-0xFF never occur. Also, in the Wikipedia dumps,
    there are no control characters in the range 0x00-0x1F except for 0x09 (tab) and 0x0A (linefeed).
    Line breaks occur only on paragraph boundaries, so they always have a semantic purpose.

    You can unzip the dataset files into the following directory structure and read them with MindSpore's API.

    .. code-block::

        .
        └── EnWik9
            ├── enwik9

    Citation:

    .. code-block::

        @NetworkResource{Hutter_prize,
        author = {English Wikipedia},
        url = "https://cs.fit.edu/~mmahoney/compression/textdata.html",
        month = {March},
        year = {2006}
        }
    """

    @check_en_wik9_dataset
    def __init__(self, dataset_dir, num_samples=None, num_parallel_workers=None, shuffle=True,
                 num_shards=None, shard_id=None, cache=None):
        super().__init__(num_parallel_workers=num_parallel_workers, num_samples=num_samples, shuffle=shuffle,
                         num_shards=num_shards, shard_id=shard_id, cache=cache)
        self.dataset_dir = dataset_dir

    def parse(self, children=None):
        return cde.EnWik9Node(self.dataset_dir, self.num_samples, self.shuffle_flag, self.num_shards,
                              self.shard_id)


class IMDBDataset(MappableDataset, TextBaseDataset):
    """
    IMDb (Internet Movie Database) dataset.

    The generated dataset has two columns: :py:obj:`[text, label]` .
    The tensor of column :py:obj:`text` is of the string type.
    The column :py:obj:`label` is a scalar of the uint32 type.

    Args:
        dataset_dir (str): Path to the root directory that contains the dataset.
        usage (str, optional): Usage of this dataset, can be ``'train'`` , ``'test'`` or ``'all'`` .
            Default: ``None`` , will read all samples.
        num_samples (int, optional): The number of samples to be included in the dataset.
            Default: ``None`` , will include all samples.
        num_parallel_workers (int, optional): Number of worker threads to read the data.
            Default: ``None`` , will use the global default number of workers (8), it can be set
            by :func:`mindspore.dataset.config.set_num_parallel_workers` .
        shuffle (bool, optional): Whether or not to perform shuffle on the dataset.
            Default: ``None`` , expected order behavior shown in the table below.
        sampler (Sampler, optional): Object used to choose samples from the dataset.
            Default: ``None`` , expected order behavior shown in the table below.
        num_shards (int, optional): Number of shards that the dataset will be divided
            into. Default: ``None`` . When this argument is specified, `num_samples` reflects
            the maximum number of samples per shard.
        shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
            argument can only be specified when `num_shards` is also specified.
        cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
            `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
            Default: ``None`` , which means no cache is used.

    Raises:
        RuntimeError: If `dataset_dir` does not contain data files.
        RuntimeError: If `sampler` and `shuffle` are specified at the same time.
        RuntimeError: If `sampler` and `num_shards`/`shard_id` are specified at the same time.
        RuntimeError: If `num_shards` is specified but `shard_id` is None.
        RuntimeError: If `shard_id` is specified but `num_shards` is None.
        ValueError: If `num_parallel_workers` exceeds the max thread numbers.
        ValueError: If `shard_id` is not in range of [0, `num_shards` ).

    Tutorial Examples:
        - `Load & Process Data With Dataset Pipeline
          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/dataset_gallery.html>`_

    Note:
        - The parameters `num_samples` , `shuffle` , `num_shards` , `shard_id` can be used to control the sampler
          used in the dataset, and their effects when combined with parameter `sampler` are as follows.

    .. include:: mindspore.dataset.sampler.txt

    Examples:
        >>> import mindspore.dataset as ds
        >>> imdb_dataset_dir = "/path/to/imdb_dataset_directory"
        >>>
        >>> # 1) Read all samples (text files) in imdb_dataset_dir with 8 threads
        >>> dataset = ds.IMDBDataset(dataset_dir=imdb_dataset_dir, num_parallel_workers=8)
        >>>
        >>> # 2) Read train samples (text files).
        >>> dataset = ds.IMDBDataset(dataset_dir=imdb_dataset_dir, usage="train")
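        >>>
        >>> # 3) A minimal sketch of filtering by label, per the module overview;
        >>> # labels are uint32, 0 for negative and 1 for positive (see below).
        >>> dataset = dataset.filter(predicate=lambda label: label == 1, input_columns=["label"])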

    About IMDBDataset:

    The IMDB dataset contains 50,000 highly polarized reviews from the Internet Movie Database (IMDB). The dataset
    is divided into 25,000 reviews for training and 25,000 reviews for testing, with both the training set and test
    set containing 50% positive and 50% negative reviews. Train labels and test labels are lists of 0s and 1s, where
    0 stands for negative and 1 for positive.

    You can unzip the dataset files into this directory structure and read them with MindSpore's API.

    .. code-block::

        .
        └── imdb_dataset_directory
            ├── train
            │    ├── pos
            │    │    ├── 0_9.txt
            │    │    ├── 1_7.txt
            │    │    ├── ...
            │    ├── neg
            │    │    ├── 0_3.txt
            │    │    ├── 1_1.txt
            │    │    ├── ...
            ├── test
            │    ├── pos
            │    │    ├── 0_10.txt
            │    │    ├── 1_10.txt
            │    │    ├── ...
            │    ├── neg
            │    │    ├── 0_2.txt
            │    │    ├── 1_3.txt
            │    │    ├── ...

    Citation:

    .. code-block::

        @InProceedings{maas-EtAl:2011:ACL-HLT2011,
        author    = {Maas, Andrew L. and Daly, Raymond E. and Pham, Peter T. and Huang, Dan
                    and Ng, Andrew Y. and Potts, Christopher},
        title     = {Learning Word Vectors for Sentiment Analysis},
        booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics:
                    Human Language Technologies},
        month     = {June},
        year      = {2011},
        address   = {Portland, Oregon, USA},
        publisher = {Association for Computational Linguistics},
        pages     = {142--150},
        url       = {http://www.aclweb.org/anthology/P11-1015}
        }
    """

    @check_imdb_dataset
    def __init__(self, dataset_dir, usage=None, num_samples=None, num_parallel_workers=None, shuffle=None,
                 sampler=None, num_shards=None, shard_id=None, cache=None):
        super().__init__(num_parallel_workers=num_parallel_workers, sampler=sampler, num_samples=num_samples,
                         shuffle=shuffle, num_shards=num_shards, shard_id=shard_id, cache=cache)

        self.dataset_dir = dataset_dir
        self.usage = replace_none(usage, "all")

    def parse(self, children=None):
        return cde.IMDBNode(self.dataset_dir, self.usage, self.sampler)


class IWSLT2016Dataset(SourceDataset, TextBaseDataset):
    """
    IWSLT2016 (International Workshop on Spoken Language Translation) dataset.

    The generated dataset has two columns: :py:obj:`[text, translation]` .
    The tensors of column :py:obj:`text` and column :py:obj:`translation` are of the string type.

    Args:
        dataset_dir (str): Path to the root directory that contains the dataset.
        usage (str, optional): Acceptable usages include ``'train'`` , ``'valid'`` , ``'test'`` and ``'all'`` .
            Default: ``None`` , all samples.
        language_pair (sequence, optional): Sequence containing source and target language, supported values are
            ``('en', 'fr')`` , ``('en', 'de')`` , ``('en', 'cs')`` , ``('en', 'ar')`` , ``('fr', 'en')`` ,
            ``('de', 'en')`` , ``('cs', 'en')`` , ``('ar', 'en')`` . Default: ``None`` , set to ``('de', 'en')`` .
        valid_set (str, optional): A string to identify the validation set. When `usage` is ``'valid'`` or
            ``'all'`` , the validation set of `valid_set` type will be read. Supported values are ``'dev2010'`` ,
            ``'tst2010'`` , ``'tst2011'`` , ``'tst2012'`` , ``'tst2013'`` and ``'tst2014'`` .
            Default: ``None`` , set to ``'tst2013'`` .
        test_set (str, optional): A string to identify the test set. When `usage` is ``'test'`` or ``'all'`` ,
            the test set of `test_set` type will be read. Supported values are ``'dev2010'`` , ``'tst2010'`` ,
            ``'tst2011'`` , ``'tst2012'`` , ``'tst2013'`` and ``'tst2014'`` .
            Default: ``None`` , set to ``'tst2014'`` .
        num_samples (int, optional): Number of samples (rows) to read. Default: ``None`` , reads the full dataset.
        shuffle (Union[bool, Shuffle], optional): Perform reshuffling of the data every epoch.
            Both bool type and Shuffle enum are supported.
            Default: ``Shuffle.GLOBAL`` .
            If `shuffle` is ``False`` , no shuffling will be performed.
            If `shuffle` is ``True`` , it is equivalent to setting `shuffle` to
            ``mindspore.dataset.Shuffle.GLOBAL`` .
            Set the mode of data shuffling by passing in enumeration variables:

            - ``Shuffle.GLOBAL`` : Shuffle both the files and samples.

            - ``Shuffle.FILES`` : Shuffle files only.

        num_shards (int, optional): Number of shards that the dataset will be divided into. Default: ``None`` .
            When this argument is specified, `num_samples` reflects the maximum number of samples per shard.
        shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
            argument can only be specified when `num_shards` is also specified.
        num_parallel_workers (int, optional): Number of worker threads to read the data.
            Default: ``None`` , will use the global default number of workers (8), it can be set
            by :func:`mindspore.dataset.config.set_num_parallel_workers` .
        cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
            `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
            Default: ``None`` , which means no cache is used.

    Raises:
        RuntimeError: If `dataset_dir` does not contain data files.
        RuntimeError: If `num_shards` is specified but `shard_id` is None.
        RuntimeError: If `shard_id` is specified but `num_shards` is None.
        ValueError: If `num_parallel_workers` exceeds the max thread numbers.

    Tutorial Examples:
        - `Load & Process Data With Dataset Pipeline
          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/dataset_gallery.html>`_

    Examples:
        >>> import mindspore.dataset as ds
        >>> iwslt2016_dataset_dir = "/path/to/iwslt2016_dataset_dir"
        >>> dataset = ds.IWSLT2016Dataset(dataset_dir=iwslt2016_dataset_dir, usage='all',
        ...                               language_pair=('de', 'en'), valid_set='tst2013', test_set='tst2014')
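        >>>
        >>> # A minimal sketch of batching the sentence pairs, per the module overview;
        >>> # the batch size is arbitrary and for illustration only.
        >>> dataset = dataset.batch(batch_size=8)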

    About IWSLT2016 dataset:

    IWSLT is an international workshop on spoken language translation, a major annual scientific conference
    dedicated to all aspects of spoken language translation. The MT task of the IWSLT evaluation campaign
    constitutes a dataset, which can be publicly obtained through the WIT3 website `wit3 <https://wit3.fbk.eu>`_ .
    The IWSLT2016 dataset includes translations from English to Arabic, Czech, French, and German, and translations
    from Arabic, Czech, French, and German to English.

    You can unzip the original IWSLT2016 dataset files into this directory structure and read them with MindSpore's
    API. After decompression, you also need to decompress the dataset to be read in the specified folder. For
    example, if you want to read the de-en dataset, you need to unzip the tgz file in the de/en directory; the
    dataset is in the unzipped folder.

    .. code-block::

        .
        └── iwslt2016_dataset_directory
            ├── subeval_files
            └── texts
                ├── ar
                │    └── en
                │        └── ar-en
                ├── cs
                │    └── en
                │        └── cs-en
                ├── de
                │    └── en
                │        └── de-en
                │            ├── IWSLT16.TED.dev2010.de-en.de.xml
                │            ├── train.tags.de-en.de
                │            ├── ...
                ├── en
                │    ├── ar
                │    │    └── en-ar
                │    ├── cs
                │    │    └── en-cs
                │    ├── de
                │    │    └── en-de
                │    └── fr
                │        └── en-fr
                └── fr
                    └── en
                        └── fr-en

    Citation:

    .. code-block::

        @inproceedings{cettoloEtAl:EAMT2012,
        Address = {Trento, Italy},
        Author = {Mauro Cettolo and Christian Girardi and Marcello Federico},
        Booktitle = {Proceedings of the 16$^{th}$ Conference of the European Association for Machine Translation
                    (EAMT)},
        Date = {28-30},
        Month = {May},
        Pages = {261--268},
        Title = {WIT$^3$: Web Inventory of Transcribed and Translated Talks},
        Year = {2012}}
    """

    @check_iwslt2016_dataset
    def __init__(self, dataset_dir, usage=None, language_pair=None, valid_set=None, test_set=None,
                 num_samples=None, shuffle=Shuffle.GLOBAL, num_shards=None, shard_id=None, num_parallel_workers=None,
                 cache=None):
        super().__init__(num_parallel_workers=num_parallel_workers, num_samples=num_samples, shuffle=shuffle,
                         num_shards=num_shards, shard_id=shard_id, cache=cache)
        self.dataset_dir = dataset_dir
        self.usage = replace_none(usage, 'all')
        self.language_pair = replace_none(language_pair, ["de", "en"])
        self.valid_set = replace_none(valid_set, 'tst2013')
        self.test_set = replace_none(test_set, 'tst2014')

    def parse(self, children=None):
        return cde.IWSLT2016Node(self.dataset_dir, self.usage, self.language_pair, self.valid_set, self.test_set,
                                 self.num_samples, self.shuffle_flag, self.num_shards, self.shard_id)


class IWSLT2017Dataset(SourceDataset, TextBaseDataset):
    """
    IWSLT2017 (International Workshop on Spoken Language Translation) dataset.

    The generated dataset has two columns: :py:obj:`[text, translation]` .
    The tensors of column :py:obj:`text` and :py:obj:`translation` are of the string type.

    Args:
        dataset_dir (str): Path to the root directory that contains the dataset.
        usage (str, optional): Acceptable usages include ``'train'`` , ``'valid'`` , ``'test'`` and ``'all'`` .
            Default: ``None`` , all samples.
        language_pair (sequence, optional): Sequence containing source and target language, supported values are
            ``('en', 'nl')`` , ``('en', 'de')`` , ``('en', 'it')`` , ``('en', 'ro')`` , ``('nl', 'en')`` ,
            ``('nl', 'de')`` , ``('nl', 'it')`` , ``('nl', 'ro')`` , ``('de', 'en')`` , ``('de', 'nl')`` ,
            ``('de', 'it')`` , ``('de', 'ro')`` , ``('it', 'en')`` , ``('it', 'nl')`` , ``('it', 'de')`` ,
            ``('it', 'ro')`` , ``('ro', 'en')`` , ``('ro', 'nl')`` , ``('ro', 'de')`` , ``('ro', 'it')`` .
            Default: ``None`` , set to ``('de', 'en')`` .
        num_samples (int, optional): Number of samples (rows) to read. Default: ``None`` , reads the full dataset.
        shuffle (Union[bool, Shuffle], optional): Perform reshuffling of the data every epoch.
            Both bool type and Shuffle enum are supported.
            Default: ``Shuffle.GLOBAL`` .
            If `shuffle` is ``False`` , no shuffling will be performed.
            If `shuffle` is ``True`` , it is equivalent to setting `shuffle` to
            ``mindspore.dataset.Shuffle.GLOBAL`` .
            Set the mode of data shuffling by passing in enumeration variables:

            - ``Shuffle.GLOBAL`` : Shuffle both the files and samples.

            - ``Shuffle.FILES`` : Shuffle files only.

        num_shards (int, optional): Number of shards that the dataset will be divided into. Default: ``None`` .
            When this argument is specified, `num_samples` reflects the maximum number of samples per shard.
        shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
            argument can only be specified when `num_shards` is also specified.
        num_parallel_workers (int, optional): Number of worker threads to read the data.
            Default: ``None`` , will use the global default number of workers (8), it can be set
            by :func:`mindspore.dataset.config.set_num_parallel_workers` .
        cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
            `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
            Default: ``None`` , which means no cache is used.

    Raises:
        RuntimeError: If `dataset_dir` does not contain data files.
        RuntimeError: If `num_shards` is specified but `shard_id` is None.
        RuntimeError: If `shard_id` is specified but `num_shards` is None.
        ValueError: If `num_parallel_workers` exceeds the max thread numbers.

    Tutorial Examples:
        - `Load & Process Data With Dataset Pipeline
          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/dataset_gallery.html>`_

    Examples:
        >>> import mindspore.dataset as ds
        >>> iwslt2017_dataset_dir = "/path/to/iwslt2017_dataset_dir"
        >>> dataset = ds.IWSLT2017Dataset(dataset_dir=iwslt2017_dataset_dir, usage='all', language_pair=('de', 'en'))
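        >>>
        >>> # A minimal sketch of taking a fixed number of pairs for a quick check;
        >>> # the count is arbitrary and for illustration only.
        >>> dataset = dataset.take(100)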

    About IWSLT2017 dataset:

    IWSLT is an international workshop on spoken language translation, a major annual scientific conference
    dedicated to all aspects of spoken language translation. The MT task of the IWSLT evaluation campaign
    constitutes a dataset, which can be publicly obtained through the WIT3 website `wit3 <https://wit3.fbk.eu>`_ .
    The IWSLT2017 dataset involves German, English, Italian, Dutch, and Romanian, and includes translations
    between any two of these languages.

    You can unzip the original IWSLT2017 dataset files into this directory structure and read them with MindSpore's
    API. You need to decompress the dataset package in the texts/DeEnItNlRo/DeEnItNlRo directory to get the
    DeEnItNlRo-DeEnItNlRo subdirectory.

    .. code-block::

        .
        └── iwslt2017_dataset_directory
            └── DeEnItNlRo
                └── DeEnItNlRo
                    └── DeEnItNlRo-DeEnItNlRo
                        ├── IWSLT17.TED.dev2010.de-en.de.xml
                        ├── train.tags.de-en.de
                        ├── ...

    Citation:

    .. code-block::

        @inproceedings{cettoloEtAl:EAMT2012,
        Address = {Trento, Italy},
        Author = {Mauro Cettolo and Christian Girardi and Marcello Federico},
        Booktitle = {Proceedings of the 16$^{th}$ Conference of the European Association for Machine Translation
                    (EAMT)},
        Date = {28-30},
        Month = {May},
        Pages = {261--268},
        Title = {WIT$^3$: Web Inventory of Transcribed and Translated Talks},
        Year = {2012}}
    """

    @check_iwslt2017_dataset
    def __init__(self, dataset_dir, usage=None, language_pair=None, num_samples=None, shuffle=Shuffle.GLOBAL,
                 num_shards=None, shard_id=None, num_parallel_workers=None, cache=None):
        super().__init__(num_parallel_workers=num_parallel_workers, num_samples=num_samples, shuffle=shuffle,
                         num_shards=num_shards, shard_id=shard_id, cache=cache)
        self.dataset_dir = dataset_dir
        self.usage = replace_none(usage, 'all')
        self.language_pair = replace_none(language_pair, ["de", "en"])

    def parse(self, children=None):
        return cde.IWSLT2017Node(self.dataset_dir, self.usage, self.language_pair, self.num_samples,
                                 self.shuffle_flag, self.num_shards, self.shard_id)


class Multi30kDataset(SourceDataset, TextBaseDataset):
    """
    Multi30k dataset.

    The generated dataset has two columns :py:obj:`[text, translation]` .
    The tensors of column :py:obj:`text` and column :py:obj:`translation` are of the string type.

    Args:
        dataset_dir (str): Path to the root directory that contains the dataset.
        usage (str, optional): Acceptable usages include ``'train'`` , ``'test'`` , ``'valid'`` or ``'all'`` .
            Default: ``None`` , will read all samples.
        language_pair (Sequence[str, str], optional): Acceptable language pairs include ``['en', 'de']`` and
            ``['de', 'en']`` . Default: ``None`` , means ``['en', 'de']`` .
        num_samples (int, optional): The number of samples to be included in the dataset.
            Default: ``None`` , will read all samples.
        num_parallel_workers (int, optional): Number of worker threads to read the data.
            Default: ``None`` , will use the global default number of workers (8), it can be set
            by :func:`mindspore.dataset.config.set_num_parallel_workers` .
        shuffle (Union[bool, Shuffle], optional): Whether to shuffle the dataset. Default: ``None`` ,
            means ``mindspore.dataset.Shuffle.GLOBAL`` .
            If ``False`` is provided, no shuffling will be performed.
            If ``True`` is provided, it is the same as setting it to
            ``mindspore.dataset.Shuffle.GLOBAL`` .
            If Shuffle enum is provided, the effect is as follows:

            - ``Shuffle.GLOBAL`` : Shuffle both the files and samples.
            - ``Shuffle.FILES`` : Shuffle files only.

        num_shards (int, optional): Number of shards that the dataset will be divided
            into. Default: ``None`` . When this argument is specified, `num_samples` reflects
            the maximum number of samples per shard.
        shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
            argument can only be specified when `num_shards` is also specified.
        cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
            `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
            Default: ``None`` , which means no cache is used.

    Raises:
        RuntimeError: If `dataset_dir` does not contain data files.
        ValueError: If `usage` is not ``'train'`` , ``'test'`` , ``'valid'`` or ``'all'`` .
        TypeError: If `language_pair` is not of type Sequence[str, str].
        RuntimeError: If `num_samples` is less than 0.
        RuntimeError: If `num_parallel_workers` exceeds the max thread numbers.
        RuntimeError: If `num_shards` is specified but `shard_id` is None.
        RuntimeError: If `shard_id` is specified but `num_shards` is None.
        ValueError: If `shard_id` is not in range of [0, `num_shards` ).

    Tutorial Examples:
        - `Load & Process Data With Dataset Pipeline
          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/dataset_gallery.html>`_

    Examples:
        >>> import mindspore.dataset as ds
        >>> multi30k_dataset_dir = "/path/to/multi30k_dataset_directory"
        >>> data = ds.Multi30kDataset(dataset_dir=multi30k_dataset_dir, usage='all', language_pair=['de', 'en'])
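        >>>
        >>> # A minimal sketch of iterating the two string columns for one epoch;
        >>> # the tuple order follows the column order [text, translation].
        >>> for text, translation in data.create_tuple_iterator(num_epochs=1, output_numpy=True):
        ...     pass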

    About Multi30k dataset:

    Multi30K is a multilingual dataset that features approximately 31,000 standardized images
    described in multiple languages. The images are sourced from Flickr and each image comes
    with sentence descriptions in both English and German, as well as descriptions in other
    languages. Multi30k is used primarily for training and testing in tasks such as image
    captioning, machine translation, and visual question answering.

    You can unzip the dataset files into the following directory structure and read them with MindSpore's API.

    .. code-block::

        └── multi30k_dataset_directory
              ├── training
              │    ├── train.de
              │    └── train.en
              ├── validation
              │    ├── val.de
              │    └── val.en
              └── mmt16_task1_test
                  ├── val.de
                  └── val.en

    Citation:

    .. code-block::

        @article{elliott-EtAl:2016:VL16,
        author    = {{Elliott}, D. and {Frank}, S. and {Sima'an}, K. and {Specia}, L.},
        title     = {Multi30K: Multilingual English-German Image Descriptions},
        booktitle = {Proceedings of the 5th Workshop on Vision and Language},
        year      = {2016},
        pages     = {70--74}
        }
    """

    @check_multi30k_dataset
    def __init__(self, dataset_dir, usage=None, language_pair=None, num_samples=None,
                 num_parallel_workers=None, shuffle=None, num_shards=None, shard_id=None, cache=None):
        super().__init__(num_parallel_workers=num_parallel_workers, num_samples=num_samples, shuffle=shuffle,
                         num_shards=num_shards, shard_id=shard_id, cache=cache)
        self.dataset_dir = dataset_dir
        self.usage = replace_none(usage, 'all')
        self.language_pair = replace_none(language_pair, ["en", "de"])
        self.shuffle = replace_none(shuffle, Shuffle.GLOBAL)

    def parse(self, children=None):
        return cde.Multi30kNode(self.dataset_dir, self.usage, self.language_pair, self.num_samples,
                                self.shuffle_flag, self.num_shards, self.shard_id)


class PennTreebankDataset(SourceDataset, TextBaseDataset):
    """
    PennTreebank dataset.

    The generated dataset has one column :py:obj:`[text]` .
    The tensor of column :py:obj:`text` is of the string type.

    Args:
        dataset_dir (str): Path to the root directory that contains the dataset.
        usage (str, optional): Acceptable usages include ``'train'`` , ``'test'`` , ``'valid'`` and ``'all'`` .
            ``'train'`` will read from 42,068 train samples of string type,
            ``'test'`` will read from 3,370 test samples of string type,
            ``'valid'`` will read from 3,761 validation samples of string type,
            ``'all'`` will read from all 49,199 samples of string type. Default: ``None`` , all samples.
        num_samples (int, optional): Number of samples (rows) to read. Default: ``None`` , reads the full dataset.
        num_parallel_workers (int, optional): Number of worker threads to read the data.
            Default: ``None`` , will use the global default number of workers (8), it can be set
            by :func:`mindspore.dataset.config.set_num_parallel_workers` .
        shuffle (Union[bool, Shuffle], optional): Perform reshuffling of the data every epoch.
            Both bool type and Shuffle enum are supported.
            Default: ``Shuffle.GLOBAL`` .
            If `shuffle` is ``False`` , no shuffling will be performed.
            If `shuffle` is ``True`` , it is equivalent to setting `shuffle` to
            ``mindspore.dataset.Shuffle.GLOBAL`` .
            Set the mode of data shuffling by passing in enumeration variables:

            - ``Shuffle.GLOBAL`` : Shuffle both the files and samples.

            - ``Shuffle.FILES`` : Shuffle files only.

        num_shards (int, optional): Number of shards that the dataset will be divided into. Default: ``None`` .
            When this argument is specified, `num_samples` reflects the maximum number of samples per shard.
        shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
            argument can only be specified when `num_shards` is also specified.
        cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
            `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
            Default: ``None`` , which means no cache is used.

    Raises:
        RuntimeError: If `dataset_dir` does not contain data files.
        RuntimeError: If `num_shards` is specified but `shard_id` is None.
        RuntimeError: If `shard_id` is specified but `num_shards` is None.
        ValueError: If `num_parallel_workers` exceeds the max thread numbers.

    Tutorial Examples:
        - `Load & Process Data With Dataset Pipeline
          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/dataset_gallery.html>`_

    Examples:
        >>> import mindspore.dataset as ds
        >>> penn_treebank_dataset_dir = "/path/to/penn_treebank_dataset_directory"
        >>> dataset = ds.PennTreebankDataset(dataset_dir=penn_treebank_dataset_dir, usage='all')
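        >>>
        >>> # A minimal sketch of repeating the data for multiple passes within the
        >>> # pipeline; the repeat count is arbitrary and for illustration only.
        >>> dataset = dataset.repeat(2)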

    About PennTreebank dataset:

    The Penn Treebank (PTB) dataset is widely used in machine learning for NLP (Natural Language Processing)
    research. Word-level PTB does not contain capital letters, numbers, or punctuation, and the vocabulary
    is capped at 10k unique words, which is relatively small in comparison to most modern datasets and
    can result in a larger number of out-of-vocabulary tokens.

    Here is the original PennTreebank dataset structure.
    You can unzip the dataset files into this directory structure and read them with MindSpore's API.

    .. code-block::

        .
        └── PennTreebank_dataset_dir
            ├── ptb.test.txt
            ├── ptb.train.txt
            └── ptb.valid.txt

    Citation:

    .. code-block::

        @techreport{Santorini1990,
        added-at = {2014-03-26T23:25:56.000+0100},
        author = {Santorini, Beatrice},
        biburl = {https://www.bibsonomy.org/bibtex/234cdf6ddadd89376090e7dada2fc18ec/butonic},
        file = {:Santorini - Penn Treebank tag definitions.pdf:PDF},
        institution = {Department of Computer and Information Science, University of Pennsylvania},
        interhash = {818e72efd9e4b5fae3e51e88848100a0},
        intrahash = {34cdf6ddadd89376090e7dada2fc18ec},
        keywords = {dis pos tagging treebank},
        number = {MS-CIS-90-47},
        timestamp = {2014-03-26T23:25:56.000+0100},
        title = {Part-of-speech tagging guidelines for the {P}enn {T}reebank {P}roject},
        url = {ftp://ftp.cis.upenn.edu/pub/treebank/doc/tagguide.ps.gz},
        year = 1990
        }
    """

    @check_penn_treebank_dataset
    def __init__(self, dataset_dir, usage=None, num_samples=None, num_parallel_workers=None, shuffle=Shuffle.GLOBAL,
                 num_shards=None, shard_id=None, cache=None):
        super().__init__(num_parallel_workers=num_parallel_workers, num_samples=num_samples, shuffle=shuffle,
                         num_shards=num_shards, shard_id=shard_id, cache=cache)
        self.dataset_dir = dataset_dir
        self.usage = replace_none(usage, "all")

    def parse(self, children=None):
        return cde.PennTreebankNode(self.dataset_dir, self.usage, self.num_samples, self.shuffle_flag,
                                    self.num_shards, self.shard_id)


class SogouNewsDataset(SourceDataset, TextBaseDataset):
    r"""
    Sogou News dataset.

    The generated dataset has three columns: :py:obj:`[index, title, content]` ,
    and the data type of the three columns is string.

    Args:
        dataset_dir (str): Path to the root directory that contains the dataset.
        usage (str, optional): Usage of this dataset, can be ``'train'`` , ``'test'`` or ``'all'`` .
            ``'train'`` will read from 450,000 train samples, ``'test'`` will read from 60,000 test samples,
            ``'all'`` will read from all 510,000 samples. Default: ``None`` , all samples.

    About Multi30k dataset:

    Multi30K is a multilingual dataset that features approximately 31,000 standardized images
    described in multiple languages. The images are sourced from Flickr and each image comes
    with sentence descriptions in both English and German, as well as descriptions in other
    languages. Multi30k is used primarily for training and testing in tasks such as image
    captioning, machine translation, and visual question answering.

    You can unzip the dataset files into the following directory structure and read by MindSpore's API.

    .. code-block::

        └── multi30k_dataset_directory
            ├── training
            │   ├── train.de
            │   └── train.en
            ├── validation
            │   ├── val.de
            │   └── val.en
            └── mmt16_task1_test
                ├── val.de
                └── val.en

    Citation:

    .. code-block::

        @inproceedings{elliott-EtAl:2016:VL16,
        author = {{Elliott}, D. and {Frank}, S. and {Sima'an}, K. and {Specia}, L.},
        title = {Multi30K: Multilingual English-German Image Descriptions},
        booktitle = {Proceedings of the 5th Workshop on Vision and Language},
        year = {2016},
        pages = {70--74}
        }
    """

    @check_multi30k_dataset
    def __init__(self, dataset_dir, usage=None, language_pair=None, num_samples=None,
                 num_parallel_workers=None, shuffle=None, num_shards=None, shard_id=None, cache=None):
        super().__init__(num_parallel_workers=num_parallel_workers, num_samples=num_samples, shuffle=shuffle,
                         num_shards=num_shards, shard_id=shard_id, cache=cache)
        self.dataset_dir = dataset_dir
        self.usage = replace_none(usage, 'all')
        self.language_pair = replace_none(language_pair, ["en", "de"])
        self.shuffle = replace_none(shuffle, Shuffle.GLOBAL)

    def parse(self, children=None):
        return cde.Multi30kNode(self.dataset_dir, self.usage, self.language_pair, self.num_samples,
                                self.shuffle_flag, self.num_shards, self.shard_id)


class PennTreebankDataset(SourceDataset, TextBaseDataset):
    """
    PennTreebank dataset.

    The generated dataset has one column :py:obj:`[text]` .
    The tensor of column :py:obj:`text` is of the string type.

    Args:
        dataset_dir (str): Path to the root directory that contains the dataset.
        usage (str, optional): Acceptable usages include ``'train'``, ``'test'``, ``'valid'`` and ``'all'``.
            ``'train'`` will read from 42,068 train samples of string type,
            ``'test'`` will read from 3,370 test samples of string type,
            ``'valid'`` will read from 3,761 valid samples of string type,
            ``'all'`` will read from all 49,199 samples of string type. Default: ``None`` , all samples.
        num_samples (int, optional): Number of samples (rows) to read. Default: ``None`` , reads the full dataset.
        num_parallel_workers (int, optional): Number of worker threads to read the data.
            Default: ``None`` , will use the global default number of workers (8), which can be set
            by :func:`mindspore.dataset.config.set_num_parallel_workers` .
        shuffle (Union[bool, Shuffle], optional): Perform reshuffling of the data every epoch.
            Bool type and Shuffle enum are both supported to pass in.
            Default: ``Shuffle.GLOBAL`` .
            If `shuffle` is ``False`` , no shuffling will be performed.
            If `shuffle` is ``True`` , it is equivalent to setting `shuffle` to
            ``mindspore.dataset.Shuffle.GLOBAL`` .
            Set the mode of data shuffling by passing in enumeration variables:

            - ``Shuffle.GLOBAL`` : Shuffle both the files and samples.

            - ``Shuffle.FILES`` : Shuffle files only.

        num_shards (int, optional): Number of shards that the dataset will be divided into. Default: ``None`` .
            When this argument is specified, `num_samples` reflects the maximum number of samples per shard.
        shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
            argument can only be specified when `num_shards` is also specified.
        cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
            `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
            Default: ``None`` , which means no cache is used.

    Raises:
        RuntimeError: If `dataset_dir` does not contain data files.
        RuntimeError: If `num_shards` is specified but `shard_id` is None.
        RuntimeError: If `shard_id` is specified but `num_shards` is None.
        ValueError: If `num_parallel_workers` exceeds the maximum number of threads.

    Tutorial Examples:
        - `Load & Process Data With Dataset Pipeline
          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/dataset_gallery.html>`_

    Examples:
        >>> import mindspore.dataset as ds
        >>> penn_treebank_dataset_dir = "/path/to/penn_treebank_dataset_directory"
        >>> dataset = ds.PennTreebankDataset(dataset_dir=penn_treebank_dataset_dir, usage='all')
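        >>>
        >>> # A hedged follow-up sketch, not part of the original example: tokenize the
        >>> # text column and build a capped vocabulary, mirroring the 10k-word cap
        >>> # described below. WhitespaceTokenizer and top_k are illustrative choices.
        >>> import mindspore.dataset.text as text
        >>> dataset = dataset.map(operations=text.WhitespaceTokenizer(), input_columns=["text"])
        >>> vocab = text.Vocab.from_dataset(dataset, columns=["text"], top_k=10000)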

    About PennTreebank dataset:

    The Penn Treebank (PTB) dataset is widely used in machine learning for NLP (Natural Language Processing)
    research. Word-level PTB does not contain capital letters, numbers, or punctuation, and its vocabulary
    is capped at 10k unique words, which is relatively small in comparison to most modern datasets and
    can result in a larger number of out-of-vocabulary tokens.

    Here is the original PennTreebank dataset structure.
    You can unzip the dataset files into this directory structure and read by MindSpore's API.

    .. code-block::

        .
        └── PennTreebank_dataset_dir
            ├── ptb.test.txt
            ├── ptb.train.txt
            └── ptb.valid.txt

    Citation:

    .. code-block::

        @techreport{Santorini1990,
        added-at = {2014-03-26T23:25:56.000+0100},
        author = {Santorini, Beatrice},
        biburl = {https://www.bibsonomy.org/bibtex/234cdf6ddadd89376090e7dada2fc18ec/butonic},
        file = {:Santorini - Penn Treebank tag definitions.pdf:PDF},
        institution = {Department of Computer and Information Science, University of Pennsylvania},
        interhash = {818e72efd9e4b5fae3e51e88848100a0},
        intrahash = {34cdf6ddadd89376090e7dada2fc18ec},
        keywords = {dis pos tagging treebank},
        number = {MS-CIS-90-47},
        timestamp = {2014-03-26T23:25:56.000+0100},
        title = {Part-of-speech tagging guidelines for the {P}enn {T}reebank {P}roject},
        url = {ftp://ftp.cis.upenn.edu/pub/treebank/doc/tagguide.ps.gz},
        year = 1990
        }
    """

    @check_penn_treebank_dataset
    def __init__(self, dataset_dir, usage=None, num_samples=None, num_parallel_workers=None, shuffle=Shuffle.GLOBAL,
                 num_shards=None, shard_id=None, cache=None):
        super().__init__(num_parallel_workers=num_parallel_workers, num_samples=num_samples, shuffle=shuffle,
                         num_shards=num_shards, shard_id=shard_id, cache=cache)
        self.dataset_dir = dataset_dir
        self.usage = replace_none(usage, "all")

    def parse(self, children=None):
        return cde.PennTreebankNode(self.dataset_dir, self.usage, self.num_samples, self.shuffle_flag, self.num_shards,
                                    self.shard_id)


class SogouNewsDataset(SourceDataset, TextBaseDataset):
    r"""
    Sogou News dataset.

    The generated dataset has three columns: :py:obj:`[index, title, content]` ,
    and the data type of the three columns is string.

    Args:
        dataset_dir (str): Path to the root directory that contains the dataset.
        usage (str, optional): Usage of this dataset, can be ``'train'`` , ``'test'`` or ``'all'`` .
            ``'train'`` will read from 450,000 train samples, ``'test'`` will read from 60,000 test samples,
            ``'all'`` will read from all 510,000 samples. Default: ``None`` , all samples.
        num_samples (int, optional): Number of samples (rows) to read. Default: ``None`` , read all samples.
        shuffle (Union[bool, Shuffle], optional): Perform reshuffling of the data every epoch.
            Bool type and Shuffle enum are both supported to pass in.
            Default: ``Shuffle.GLOBAL`` .
            If `shuffle` is ``False`` , no shuffling will be performed.
            If `shuffle` is ``True`` , it is equivalent to setting `shuffle` to
            ``mindspore.dataset.Shuffle.GLOBAL`` .
            Set the mode of data shuffling by passing in enumeration variables:

            - ``Shuffle.GLOBAL`` : Shuffle both the files and samples, same as setting `shuffle` to True.

            - ``Shuffle.FILES`` : Shuffle files only.

        num_shards (int, optional): Number of shards that the dataset will be divided into. Default: ``None`` .
            When this argument is specified, `num_samples` reflects the maximum number of samples per shard.
        shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
            argument can only be specified when `num_shards` is also specified.
        num_parallel_workers (int, optional): Number of worker threads to read the data.
            Default: ``None`` , will use the global default number of workers (8), which can be set
            by :func:`mindspore.dataset.config.set_num_parallel_workers` .
        cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
            `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
            Default: ``None`` , which means no cache is used.

    Raises:
        RuntimeError: If `dataset_dir` does not contain data files.
        RuntimeError: If `num_shards` is specified but `shard_id` is None.
        RuntimeError: If `shard_id` is specified but `num_shards` is None.
        ValueError: If `num_parallel_workers` exceeds the maximum number of threads.

    Tutorial Examples:
        - `Load & Process Data With Dataset Pipeline
          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/dataset_gallery.html>`_
    Examples:
        >>> import mindspore.dataset as ds
        >>> sogou_news_dataset_dir = "/path/to/sogou_news_dataset_dir"
        >>> dataset = ds.SogouNewsDataset(dataset_dir=sogou_news_dataset_dir, usage='all')
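        >>>
        >>> # A small sketch: report how many rows the chosen usage yields and peek at
        >>> # one title; both methods are standard on dataset objects.
        >>> print(dataset.get_dataset_size())
        >>> for row in dataset.create_dict_iterator(output_numpy=True, num_epochs=1):
        ...     print(row["title"])
        ...     break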

    About SogouNews Dataset:

    The SogouNews dataset includes 3 columns, corresponding to class index (1 to 5), title and content. The title and
    content are escaped using double quotes ("), and any internal double quote is escaped by 2 double quotes ("").
    New lines are escaped by a backslash followed by an "n" character, that is "\n".

    You can unzip the dataset files into the following structure and read by MindSpore's API:

    .. code-block::

        .
        └── sogou_news_dir
            ├── classes.txt
            ├── readme.txt
            ├── test.csv
            └── train.csv

    Citation:

    .. code-block::

        @misc{zhang2015characterlevel,
        title={Character-level Convolutional Networks for Text Classification},
        author={Xiang Zhang and Junbo Zhao and Yann LeCun},
        year={2015},
        eprint={1509.01626},
        archivePrefix={arXiv},
        primaryClass={cs.LG}
        }
    """

    @check_sogou_news_dataset
    def __init__(self, dataset_dir, usage=None, num_samples=None, shuffle=Shuffle.GLOBAL, num_shards=None,
                 shard_id=None, num_parallel_workers=None, cache=None):
        super().__init__(num_parallel_workers=num_parallel_workers, num_samples=num_samples, shuffle=shuffle,
                         num_shards=num_shards, shard_id=shard_id, cache=cache)
        self.dataset_dir = dataset_dir
        self.usage = replace_none(usage, 'all')

    def parse(self, children=None):
        return cde.SogouNewsNode(self.dataset_dir, self.usage, self.num_samples, self.shuffle_flag,
                                 self.num_shards, self.shard_id)


class SQuADDataset(SourceDataset, TextBaseDataset):
    """
    SQuAD 1.1 and SQuAD 2.0 datasets.

    The generated dataset with different versions and usages has the same output columns:
    :py:obj:`[context, question, text, answer_start]` .
    The tensor of column :py:obj:`context` is of the string type.
    The tensor of column :py:obj:`question` is of the string type.
    The tensor of column :py:obj:`text` is the answer within the context, of the string type.
    The tensor of column :py:obj:`answer_start` is the start index of the answer in the context,
    which is of the uint32 type.

    Args:
        dataset_dir (str): Path to the root directory that contains the dataset.
        usage (str, optional): Specify the ``'train'``, ``'dev'`` or ``'all'`` part of dataset.
            Default: ``None`` , all samples.
        num_samples (int, optional): The number of samples to be included in the dataset.
            Default: ``None`` , will include all samples.
        num_parallel_workers (int, optional): Number of worker threads to read the data.
            Default: ``None`` , will use the global default number of workers (8), which can be set
            by :func:`mindspore.dataset.config.set_num_parallel_workers` .
        shuffle (Union[bool, Shuffle], optional): Whether to shuffle the dataset.
            Default: ``Shuffle.GLOBAL`` .
            If ``False`` is provided, no shuffling will be performed.
            If ``True`` is provided, it is the same as setting to
            ``mindspore.dataset.Shuffle.GLOBAL`` .
            If Shuffle is provided, the effect is as follows:

            - ``Shuffle.GLOBAL`` : Shuffle both the files and samples.
            - ``Shuffle.FILES`` : Shuffle files only.

        num_shards (int, optional): Number of shards that the dataset will be divided into. Default: ``None`` .
            When this argument is specified, `num_samples` reflects the maximum number of samples per shard.
        shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
            argument can only be specified when `num_shards` is also specified.
        cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
            `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
            Default: ``None`` , which means no cache is used.

    Raises:
        RuntimeError: If `dataset_dir` does not contain data files.
        ValueError: If `num_parallel_workers` exceeds the maximum number of threads.
        RuntimeError: If `num_shards` is specified but `shard_id` is None.
        RuntimeError: If `shard_id` is specified but `num_shards` is None.
        ValueError: If `shard_id` is not in range of [0, `num_shards` ).

    Tutorial Examples:
        - `Load & Process Data With Dataset Pipeline
          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/dataset_gallery.html>`_

    Examples:
        >>> import mindspore.dataset as ds
        >>> squad_dataset_dir = "/path/to/squad_dataset_file"
        >>> dataset = ds.SQuADDataset(dataset_dir=squad_dataset_dir, usage='all')
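        >>>
        >>> # A hedged sketch: recover the answer span from the context using the uint32
        >>> # answer_start column described above (illustrative slicing only).
        >>> for row in dataset.create_dict_iterator(output_numpy=True, num_epochs=1):
        ...     context, answer = str(row["context"]), str(row["text"])
        ...     start = int(row["answer_start"])
        ...     print(context[start:start + len(answer)] == answer)
        ...     break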

    About SQuAD dataset:

    SQuAD (Stanford Question Answering Dataset) is a reading comprehension dataset, consisting of questions posed by
    crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span,
    from the corresponding reading passage, or the question might be unanswerable.

    SQuAD 1.1, the previous version of the SQuAD dataset, contains 100,000+ question-answer pairs on 500+ articles.
    SQuAD 2.0 combines the 100,000 questions in SQuAD 1.1 with over 50,000 unanswerable questions written adversarially
    by crowdworkers to look similar to answerable ones. To do well on SQuAD 2.0, systems must not only answer questions
    when possible, but also determine when no answer is supported by the paragraph and abstain from answering.

    You can organize the dataset files into the following structure and read them with MindSpore's API.

    For SQuAD 1.1:

    .. code-block::

        .
        └── SQuAD1
            ├── train-v1.1.json
            └── dev-v1.1.json

    For SQuAD 2.0:

    .. code-block::

        .
        └── SQuAD2
            ├── train-v2.0.json
            └── dev-v2.0.json

    Citation:

    .. code-block::

        @misc{rajpurkar2016squad,
        title = {SQuAD: 100,000+ Questions for Machine Comprehension of Text},
        author = {Pranav Rajpurkar and Jian Zhang and Konstantin Lopyrev and Percy Liang},
        year = {2016},
        eprint = {1606.05250},
        archivePrefix = {arXiv},
        primaryClass = {cs.CL}
        }

        @misc{rajpurkar2018know,
        title = {Know What You Don't Know: Unanswerable Questions for SQuAD},
        author = {Pranav Rajpurkar and Robin Jia and Percy Liang},
        year = {2018},
        eprint = {1806.03822},
        archivePrefix = {arXiv},
        primaryClass = {cs.CL}
        }
    """

    @check_squad_dataset
    def __init__(self, dataset_dir, usage=None, num_samples=None, num_parallel_workers=None,
                 shuffle=Shuffle.GLOBAL, num_shards=None, shard_id=None, cache=None):
        super().__init__(num_parallel_workers=num_parallel_workers, num_samples=num_samples, shuffle=shuffle,
                         num_shards=num_shards, shard_id=shard_id, cache=cache)
        self.dataset_dir = dataset_dir
        self.usage = replace_none(usage, 'all')

    def parse(self, children=None):
        return cde.SQuADNode(self.dataset_dir, self.usage, self.num_samples, self.shuffle_flag,
                             self.num_shards, self.shard_id)


class SST2Dataset(SourceDataset, TextBaseDataset):
    """
    SST2 (Stanford Sentiment Treebank v2) dataset.

    The generated dataset's train.tsv and dev.tsv have two columns :py:obj:`[sentence, label]` .
    The generated dataset's test.tsv has one column :py:obj:`[sentence]` .
    The tensors of columns :py:obj:`sentence` and :py:obj:`label` are of the string type.

    Args:
        dataset_dir (str): Path to the root directory that contains the dataset.
        usage (str, optional): Usage of this dataset, can be ``"train"``, ``"test"`` or ``"dev"``.
            ``"train"`` will read from 67,349 train samples, ``"test"`` will read from 1,821 test samples,
            ``"dev"`` will read from 872 dev samples. Default: ``None`` , will read train samples.
        num_samples (int, optional): The number of samples to be included in the dataset.
            Default: ``None`` , will include all text.
        num_parallel_workers (int, optional): Number of worker threads to read the data.
            Default: ``None`` , will use the global default number of workers (8), which can be set
            by :func:`mindspore.dataset.config.set_num_parallel_workers` .
        shuffle (Union[bool, Shuffle], optional): Perform reshuffling of the data every epoch.
            Bool type and Shuffle enum are both supported to pass in.
            Default: ``Shuffle.GLOBAL`` .
            If `shuffle` is ``False`` , no shuffling will be performed;
            If `shuffle` is ``True`` , it is equivalent to setting `shuffle` to
            ``mindspore.dataset.Shuffle.GLOBAL`` .
            Set the mode of data shuffling by passing in enumeration variables:

            - ``Shuffle.GLOBAL`` : Shuffle the samples.

        num_shards (int, optional): Number of shards that the dataset will be divided into. Default: ``None`` .
            When this argument is specified, `num_samples` reflects the maximum number of samples per shard.
        shard_id (int, optional): The shard ID within `num_shards` . This argument can only be specified when
            `num_shards` is also specified. Default: ``None`` .
        cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
            `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
            Default: ``None`` , which means no cache is used.

    Raises:
        RuntimeError: If `dataset_dir` does not contain data files.
        ValueError: If `num_parallel_workers` exceeds the maximum number of threads.
        RuntimeError: If `num_shards` is specified but `shard_id` is None.
        RuntimeError: If `shard_id` is specified but `num_shards` is None.
        ValueError: If `shard_id` is not in range of [0, `num_shards` ).

    Tutorial Examples:
        - `Load & Process Data With Dataset Pipeline
          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/dataset_gallery.html>`_

    Examples:
        >>> import mindspore.dataset as ds
        >>> sst2_dataset_dir = "/path/to/sst2_dataset_directory"
        >>>
        >>> # 1) Read 3 samples from SST2 dataset
        >>> dataset = ds.SST2Dataset(dataset_dir=sst2_dataset_dir, num_samples=3)
        >>>
        >>> # 2) Read train samples from SST2 dataset
        >>> dataset = ds.SST2Dataset(dataset_dir=sst2_dataset_dir, usage="train")
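        >>>
        >>> # 3) A minimal follow-up sketch: shuffling is on by default (Shuffle.GLOBAL),
        >>> # so the train split can simply be batched; the batch size is illustrative.
        >>> dataset = dataset.batch(32)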

    About SST2 dataset:

    The Stanford Sentiment Treebank is a corpus with fully labeled parse trees that allows for a complete
    analysis of the compositional effects of sentiment in language. The corpus is based on the dataset introduced
    by Pang and Lee (2005) and consists of 11,855 single sentences extracted from movie reviews. It was parsed
    with the Stanford parser and includes a total of 215,154 unique phrases from those parse trees, each
    annotated by 3 human judges.

    Here is the original SST2 dataset structure.
    You can unzip the dataset files into this directory structure and read by MindSpore's API.

    .. code-block::

        .
        └── sst2_dataset_dir
            ├── train.tsv
            ├── test.tsv
            ├── dev.tsv
            └── original

    Citation:

    .. code-block::

        @inproceedings{socher-etal-2013-recursive,
        title = {Recursive Deep Models for Semantic Compositionality Over a Sentiment Treebank},
        author = {Socher, Richard and Perelygin, Alex and Wu, Jean and Chuang, Jason and Manning,
        Christopher D. and Ng, Andrew and Potts, Christopher},
        booktitle = {Proceedings of the 2013 Conference on Empirical Methods in Natural Language Processing},
        month = oct,
        year = {2013},
        address = {Seattle, Washington, USA},
        publisher = {Association for Computational Linguistics},
        url = {https://www.aclweb.org/anthology/D13-1170},
        pages = {1631--1642},
        }
    """

    @check_sst2_dataset
    def __init__(self, dataset_dir, usage=None, num_samples=None, num_parallel_workers=None, shuffle=Shuffle.GLOBAL,
                 num_shards=None, shard_id=None, cache=None):
        super().__init__(num_parallel_workers=num_parallel_workers, num_samples=num_samples, shuffle=shuffle,
                         num_shards=num_shards, shard_id=shard_id, cache=cache)
        self.dataset_dir = dataset_dir
        self.usage = replace_none(usage, "train")

    def parse(self, children=None):
        return cde.SST2Node(self.dataset_dir, self.usage, self.num_samples, self.shuffle_flag,
                            self.num_shards, self.shard_id)


class TextFileDataset(SourceDataset, TextBaseDataset):
    """
    A source dataset that reads and parses datasets stored on disk in text format.
    The generated dataset has one column :py:obj:`[text]` with type string.

    Args:
        dataset_files (Union[str, list[str]]): String or list of files to be read, or glob strings to search for a
            pattern of files. The list will be sorted in lexicographical order.
        num_samples (int, optional): The number of samples to be included in the dataset.
            Default: ``None`` , will include all samples.
        num_parallel_workers (int, optional): Number of worker threads to read the data.
            Default: ``None`` , will use the global default number of workers (8), which can be set
            by :func:`mindspore.dataset.config.set_num_parallel_workers` .
        shuffle (Union[bool, Shuffle], optional): Perform reshuffling of the data every epoch.
            Default: ``Shuffle.GLOBAL`` .
            Bool type and Shuffle enum are both supported to pass in.
            If `shuffle` is ``False`` , no shuffling will be performed.
            If `shuffle` is ``True`` , performs global shuffle.
            The desired shuffle level is defined by :class:`mindspore.dataset.Shuffle` :

            - ``Shuffle.GLOBAL`` : Shuffle both the files and samples, same as setting `shuffle` to True.

            - ``Shuffle.FILES`` : Shuffle files only.

        num_shards (int, optional): Number of shards that the dataset will be divided into. Default: ``None`` .
            When this argument is specified, `num_samples` reflects the maximum number of samples per shard.
        shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
            argument can only be specified when `num_shards` is also specified.
        cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
            `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
            Default: ``None`` , which means no cache is used.

    Raises:
        ValueError: If `dataset_files` are not valid or do not exist.
        ValueError: If `num_parallel_workers` exceeds the maximum number of threads.
        RuntimeError: If `num_shards` is specified but `shard_id` is None.
        RuntimeError: If `shard_id` is specified but `num_shards` is None.
        ValueError: If `shard_id` is not in range of [0, `num_shards` ).

    Tutorial Examples:
        - `Load & Process Data With Dataset Pipeline
          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/dataset_gallery.html>`_

    Examples:
        >>> import mindspore.dataset as ds
        >>> text_file_list = ["/path/to/text_file_dataset_file"] # contains 1 or multiple text files
        >>> dataset = ds.TextFileDataset(dataset_files=text_file_list)
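        >>>
        >>> # A hedged sketch: a glob pattern is also accepted for dataset_files, and the
        >>> # Shuffle enum limits shuffling to file order (the path is a placeholder).
        >>> dataset = ds.TextFileDataset(dataset_files="/path/to/corpus_dir/*.txt",
        ...                              shuffle=ds.Shuffle.FILES)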
    """

    @check_textfiledataset
    def __init__(self, dataset_files, num_samples=None, num_parallel_workers=None, shuffle=Shuffle.GLOBAL,
                 num_shards=None, shard_id=None, cache=None):
        super().__init__(num_parallel_workers=num_parallel_workers, num_samples=num_samples, shuffle=shuffle,
                         num_shards=num_shards, shard_id=shard_id, cache=cache)
        self.dataset_files = self._find_files(dataset_files)
        self.dataset_files.sort()

    def parse(self, children=None):
        return cde.TextFileNode(self.dataset_files, self.num_samples, self.shuffle_flag, self.num_shards,
                                self.shard_id)


class UDPOSDataset(SourceDataset, TextBaseDataset):
    """
    UDPOS (Universal Dependencies dataset for Part of Speech) dataset.

    The generated dataset has three columns: :py:obj:`[word, universal, stanford]` ,
    and the data type of the three columns is string.

    Args:
        dataset_dir (str): Path to the root directory that contains the dataset.
        usage (str, optional): Usage of this dataset, can be ``'train'``, ``'test'``, ``'valid'`` or ``'all'``.
            ``'train'`` will read from 12,543 train samples, ``'test'`` will read from 2,077 test samples,
            ``'valid'`` will read from 2,002 valid samples, ``'all'`` will read from all 16,622 samples.
            Default: ``None`` , all samples.
        num_samples (int, optional): Number of samples (rows) to read. Default: ``None`` , reads the full dataset.
        shuffle (Union[bool, Shuffle], optional): Perform reshuffling of the data every epoch.
            Bool type and Shuffle enum are both supported to pass in.
            Default: ``Shuffle.GLOBAL`` .
            If `shuffle` is ``False`` , no shuffling will be performed.
            If `shuffle` is ``True`` , it is equivalent to setting `shuffle` to
            ``mindspore.dataset.Shuffle.GLOBAL`` .
            Set the mode of data shuffling by passing in enumeration variables:

            - ``Shuffle.GLOBAL`` : Shuffle both the files and samples.

            - ``Shuffle.FILES`` : Shuffle files only.

        num_shards (int, optional): Number of shards that the dataset will be divided into. Default: ``None`` .
            When this argument is specified, `num_samples` reflects the maximum number of samples per shard.
        shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
            argument can only be specified when `num_shards` is also specified.
        num_parallel_workers (int, optional): Number of worker threads to read the data.
            Default: ``None`` , will use the global default number of workers (8), which can be set
            by :func:`mindspore.dataset.config.set_num_parallel_workers` .
        cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
            `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
            Default: ``None`` , which means no cache is used.

    Raises:
        RuntimeError: If `dataset_dir` does not contain data files.
        RuntimeError: If `num_shards` is specified but `shard_id` is None.
        RuntimeError: If `shard_id` is specified but `num_shards` is None.
        ValueError: If `num_parallel_workers` exceeds the maximum number of threads.

    Tutorial Examples:
        - `Load & Process Data With Dataset Pipeline
          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/dataset_gallery.html>`_

    Examples:
        >>> import mindspore.dataset as ds
        >>> udpos_dataset_dir = "/path/to/udpos_dataset_dir"
        >>> dataset = ds.UDPOSDataset(dataset_dir=udpos_dataset_dir, usage='all')
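        >>>
        >>> # A sketch of sharded loading for distributed training: split the train split
        >>> # into 4 non-overlapping shards and read shard 0; num_samples, if given,
        >>> # would then cap each shard, per the Args above.
        >>> dataset = ds.UDPOSDataset(dataset_dir=udpos_dataset_dir, usage='train',
        ...                           num_shards=4, shard_id=0)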

    About UDPOS dataset:

    A text corpus dataset annotated to clarify syntactic or semantic sentence structure.
    The corpus comprises 254,830 words and 16,622 sentences, taken from various web media including
    weblogs, newsgroups, emails and reviews.

    Citation:

    .. code-block::

        @inproceedings{silveira14gold,
        year = {2014},
        author = {Natalia Silveira and Timothy Dozat and Marie-Catherine de Marneffe and Samuel Bowman
        and Miriam Connor and John Bauer and Christopher D. Manning},
        title = {A Gold Standard Dependency Corpus for {E}nglish},
        booktitle = {Proceedings of the Ninth International Conference on Language
        Resources and Evaluation (LREC-2014)}
        }
    """

    @check_udpos_dataset
    def __init__(self, dataset_dir, usage=None, num_samples=None, shuffle=Shuffle.GLOBAL, num_shards=None,
                 shard_id=None, num_parallel_workers=None, cache=None):
        super().__init__(num_parallel_workers=num_parallel_workers, num_samples=num_samples, shuffle=shuffle,
                         num_shards=num_shards, shard_id=shard_id, cache=cache)
        self.dataset_dir = dataset_dir
        self.usage = replace_none(usage, 'all')

    def parse(self, children=None):
        return cde.UDPOSNode(self.dataset_dir, self.usage, self.num_samples, self.shuffle_flag, self.num_shards,
                             self.shard_id)


class WikiTextDataset(SourceDataset, TextBaseDataset):
    """
    WikiText2 and WikiText103 datasets.

    The generated dataset has one column :py:obj:`[text]` , and
    the tensor of column `text` is of the string type.

    Args:
        dataset_dir (str): Path to the root directory that contains the dataset.
        usage (str, optional): Acceptable usages include ``'train'``, ``'test'``, ``'valid'`` and ``'all'``.
            Default: ``None`` , all samples.
        num_samples (int, optional): Number of samples (rows) to read. Default: ``None`` , reads the full dataset.
        num_parallel_workers (int, optional): Number of worker threads to read the data.
            Default: ``None`` , will use the global default number of workers (8), which can be set
            by :func:`mindspore.dataset.config.set_num_parallel_workers` .
        shuffle (Union[bool, Shuffle], optional): Perform reshuffling of the data every epoch.
            Bool type and Shuffle enum are both supported to pass in.
            Default: ``Shuffle.GLOBAL`` .
            If `shuffle` is ``False`` , no shuffling will be performed.
            If `shuffle` is ``True`` , it is equivalent to setting `shuffle` to
            ``mindspore.dataset.Shuffle.GLOBAL`` .
            Set the mode of data shuffling by passing in enumeration variables:

            - ``Shuffle.GLOBAL`` : Shuffle both the files and samples.

            - ``Shuffle.FILES`` : Shuffle files only.

        num_shards (int, optional): Number of shards that the dataset will be divided into. Default: ``None`` .
            When this argument is specified, `num_samples` reflects the maximum number of samples per shard.
        shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
            argument can only be specified when `num_shards` is also specified.
        cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
            `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
            Default: ``None`` , which means no cache is used.

    Raises:
        RuntimeError: If `dataset_dir` does not contain data files or is invalid.
        RuntimeError: If `num_shards` is specified but `shard_id` is None.
        RuntimeError: If `shard_id` is specified but `num_shards` is None.
        ValueError: If `shard_id` is not in range of [0, `num_shards` ).
        ValueError: If `num_samples` is invalid (< 0).
        ValueError: If `num_parallel_workers` exceeds the maximum number of threads.

    Tutorial Examples:
        - `Load & Process Data With Dataset Pipeline
          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/dataset_gallery.html>`_

    About WikiText dataset:

    The WikiText Long Term Dependency Language Modeling Dataset is an English corpus containing over
    100 million words, drawn from Wikipedia's verified Good and Featured articles. It is released in the
    WikiText2 and WikiText103 versions. WikiText2 has 36,718 lines in wiki.train.tokens, 4,358 lines in
    wiki.test.tokens and 3,760 lines in wiki.valid.tokens. WikiText103 has 1,801,350 lines in
    wiki.train.tokens, 4,358 lines in wiki.test.tokens and 3,760 lines in wiki.valid.tokens.

    Here is the original WikiText dataset structure.
    You can unzip the dataset files into this directory structure and read by MindSpore's API.

    .. code-block::

        .
        └── WikiText2/WikiText103
            ├── wiki.train.tokens
            ├── wiki.test.tokens
            └── wiki.valid.tokens

    Citation:

    .. code-block::

        @article{merity2016pointer,
        title={Pointer sentinel mixture models},
        author={Merity, Stephen and Xiong, Caiming and Bradbury, James and Socher, Richard},
        journal={arXiv preprint arXiv:1609.07843},
        year={2016}
        }

    Examples:
        >>> import mindspore.dataset as ds
        >>> wiki_text_dataset_dir = "/path/to/wiki_text_dataset_directory"
        >>> dataset = ds.WikiTextDataset(dataset_dir=wiki_text_dataset_dir, usage='all')
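        >>>
        >>> # A minimal sketch: the split sizes can be checked against the line counts
        >>> # quoted in the description above.
        >>> train = ds.WikiTextDataset(dataset_dir=wiki_text_dataset_dir, usage='train')
        >>> print(train.get_dataset_size())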
    """

    @check_wiki_text_dataset
    def __init__(self, dataset_dir, usage=None, num_samples=None, num_parallel_workers=None, shuffle=Shuffle.GLOBAL,
                 num_shards=None, shard_id=None, cache=None):
        super().__init__(num_parallel_workers=num_parallel_workers, num_samples=num_samples, shuffle=shuffle,
                         num_shards=num_shards, shard_id=shard_id, cache=cache)
        self.dataset_dir = dataset_dir
        self.usage = replace_none(usage, "all")

    def parse(self, children=None):
        return cde.WikiTextNode(self.dataset_dir, self.usage, self.num_samples, self.shuffle_flag, self.num_shards,
                                self.shard_id)


class YahooAnswersDataset(SourceDataset, TextBaseDataset):
    """
    YahooAnswers dataset.

    The generated dataset has four columns :py:obj:`[class, title, content, answer]` , whose data type is string.

    Args:
        dataset_dir (str): Path to the root directory that contains the dataset.
        usage (str, optional): Usage of this dataset, can be ``'train'`` , ``'test'`` or ``'all'`` .
            ``'train'`` will read from 1,400,000 train samples, ``'test'`` will read from 60,000 test
            samples, ``'all'`` will read from all 1,460,000 samples. Default: ``None`` , all samples.
        num_samples (int, optional): The number of samples to be included in the dataset.
            Default: ``None`` , will include all text.
        num_parallel_workers (int, optional): Number of worker threads to read the data.
            Default: ``None`` , will use the global default number of workers (8), which can be set
            by :func:`mindspore.dataset.config.set_num_parallel_workers` .
        shuffle (Union[bool, Shuffle], optional): Perform reshuffling of the data every epoch.
            Bool type and Shuffle enum are both supported to pass in.
            Default: ``Shuffle.GLOBAL`` .
            If `shuffle` is ``False`` , no shuffling will be performed.
            If `shuffle` is ``True`` , it is equivalent to setting `shuffle` to
            ``mindspore.dataset.Shuffle.GLOBAL`` .
            Set the mode of data shuffling by passing in enumeration variables:

            - ``Shuffle.GLOBAL`` : Shuffle both the files and samples.

            - ``Shuffle.FILES`` : Shuffle files only.

        num_shards (int, optional): Number of shards that the dataset will be divided into. Default: ``None`` .
            When this argument is specified, `num_samples` reflects the maximum number of samples per shard.
        shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
            argument can only be specified when `num_shards` is also specified.
        cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
            `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
            Default: ``None`` , which means no cache is used.

    Raises:
        RuntimeError: If `dataset_dir` does not contain data files.
        RuntimeError: If `num_shards` is specified but `shard_id` is None.
        RuntimeError: If `shard_id` is specified but `num_shards` is None.
        ValueError: If `shard_id` is not in range of [0, `num_shards` ).
        ValueError: If `num_parallel_workers` exceeds the maximum number of threads.

    Tutorial Examples:
        - `Load & Process Data With Dataset Pipeline
          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/dataset_gallery.html>`_

    Examples:
        >>> import mindspore.dataset as ds
        >>> yahoo_answers_dataset_dir = "/path/to/yahoo_answers_dataset_directory"
        >>>
        >>> # 1) Read 3 samples from YahooAnswers dataset
        >>> dataset = ds.YahooAnswersDataset(dataset_dir=yahoo_answers_dataset_dir, num_samples=3)
        >>>
        >>> # 2) Read train samples from YahooAnswers dataset
        >>> dataset = ds.YahooAnswersDataset(dataset_dir=yahoo_answers_dataset_dir, usage="train")
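        >>>
        >>> # 3) A hedged sketch: disable shuffling for a deterministic preview of the
        >>> # four string columns listed above.
        >>> preview = ds.YahooAnswersDataset(dataset_dir=yahoo_answers_dataset_dir,
        ...                                  num_samples=2, shuffle=False)
        >>> for row in preview.create_dict_iterator(output_numpy=True, num_epochs=1):
        ...     print(row["class"], row["title"])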

    About YahooAnswers dataset:

    The YahooAnswers dataset consists of 1,460,000 text samples in 10 classes:
    there are 1,400,000 samples in train.csv and 60,000 samples in test.csv.
    The 10 different classes represent Society & Culture, Science & Mathematics, Health, Education & Reference,
    Computers & Internet, Sports, Business & Finance, Entertainment & Music, Family & Relationships,
    Politics & Government.

    Here is the original YahooAnswers dataset structure.
    You can unzip the dataset files into this directory structure and read by MindSpore's API.

    .. code-block::

        .
        └── yahoo_answers_dataset_dir
            ├── train.csv
            ├── test.csv
            ├── classes.txt
            └── readme.txt

    Citation:

    .. code-block::

        @misc{YahooAnswers,
        title = {Yahoo! Answers Topic Classification Dataset},
        author = {Xiang Zhang},
        year = {2015}
        }
    """

    @check_yahoo_answers_dataset
    def __init__(self, dataset_dir, usage=None, num_samples=None, num_parallel_workers=None, shuffle=Shuffle.GLOBAL,
                 num_shards=None, shard_id=None, cache=None):
        super().__init__(num_parallel_workers=num_parallel_workers, num_samples=num_samples, shuffle=shuffle,
                         num_shards=num_shards, shard_id=shard_id, cache=cache)
        self.dataset_dir = dataset_dir
        self.usage = replace_none(usage, "all")

    def parse(self, children=None):
        return cde.YahooAnswersNode(self.dataset_dir, self.usage, self.num_samples, self.shuffle_flag,
                                    self.num_shards, self.shard_id)


class YelpReviewDataset(SourceDataset, TextBaseDataset):
    """
    Yelp Review Polarity and Yelp Review Full datasets.

    The generated dataset has two columns: :py:obj:`[label, text]` , and the data type of the two columns is string.

    Args:
        dataset_dir (str): Path to the root directory that contains the dataset.
        usage (str, optional): Usage of this dataset, can be ``'train'`` , ``'test'`` or ``'all'`` .
            For Polarity, ``'train'`` will read from 560,000 train samples,
            ``'test'`` will read from 38,000 test samples,
            ``'all'`` will read from all 598,000 samples.
            For Full, ``'train'`` will read from 650,000 train samples, ``'test'`` will read from 50,000 test samples,
            ``'all'`` will read from all 700,000 samples. Default: ``None`` , all samples.
        num_samples (int, optional): Number of samples (rows) to read. Default: ``None`` , reads all samples.
        shuffle (Union[bool, Shuffle], optional): Perform reshuffling of the data every epoch.
            Bool type and Shuffle enum are both supported to pass in.
            Default: ``Shuffle.GLOBAL`` .
            If `shuffle` is ``False`` , no shuffling will be performed.
            If `shuffle` is ``True`` , it is equivalent to setting `shuffle` to
            ``mindspore.dataset.Shuffle.GLOBAL`` .
            Set the mode of data shuffling by passing in enumeration variables:

            - ``Shuffle.GLOBAL`` : Shuffle both the files and samples.

            - ``Shuffle.FILES`` : Shuffle files only.

        num_shards (int, optional): Number of shards that the dataset will be divided into. Default: ``None`` .
            When this argument is specified, `num_samples` reflects the maximum number of samples per shard.
        shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
            argument can only be specified when `num_shards` is also specified.
        num_parallel_workers (int, optional): Number of worker threads to read the data.
            Default: ``None`` , will use the global default number of workers (8), which can be set
            by :func:`mindspore.dataset.config.set_num_parallel_workers` .
        cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
            `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
            Default: ``None`` , which means no cache is used.

    Raises:
        RuntimeError: If `dataset_dir` does not contain data files.
        RuntimeError: If `num_shards` is specified but `shard_id` is None.
        RuntimeError: If `shard_id` is specified but `num_shards` is None.
        ValueError: If `num_parallel_workers` exceeds the maximum number of threads.

    Tutorial Examples:
        - `Load & Process Data With Dataset Pipeline
          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/dataset_gallery.html>`_

    Examples:
        >>> import mindspore.dataset as ds
        >>> yelp_review_dataset_dir = "/path/to/yelp_review_dataset_dir"
        >>> dataset = ds.YelpReviewDataset(dataset_dir=yelp_review_dataset_dir, usage='all')
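        >>>
        >>> # A hedged sketch: both columns are strings, so a small pyfunc map can turn
        >>> # the label column into an int32 value for training (illustrative only).
        >>> import numpy as np
        >>> dataset = dataset.map(operations=lambda label: np.array(int(str(label)), dtype=np.int32),
        ...                       input_columns=["label"])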

    About YelpReview Dataset:

    The Yelp Review Full dataset consists of reviews from Yelp. It is extracted from the Yelp Dataset Challenge 2015
    data, and it is mainly used for text classification.

    The Yelp Review Polarity dataset is constructed from the above dataset, by considering stars 1 and 2 negative,
    and stars 3 and 4 positive.

    The directory structures of these two datasets are the same.
    You can unzip the dataset files into the following structure and read by MindSpore's API:

    .. code-block::

        .
        └── yelp_review_dir
            ├── train.csv
            ├── test.csv
            └── readme.txt

    Citation (for both Yelp Review Polarity and Yelp Review Full):

    .. code-block::

        @article{zhangCharacterlevelConvolutionalNetworks2015,
        archivePrefix = {arXiv},
        eprinttype = {arxiv},
        eprint = {1509.01626},
        primaryClass = {cs},
        title = {Character-Level {{Convolutional Networks}} for {{Text Classification}}},
        abstract = {This article offers an empirical exploration on the use of character-level convolutional networks
        (ConvNets) for text classification. We constructed several large-scale datasets to show that
        character-level convolutional networks could achieve state-of-the-art or competitive results.
        Comparisons are offered against traditional models such as bag of words, n-grams and their TFIDF
        variants, and deep learning models such as word-based ConvNets and recurrent neural networks.},
        journal = {arXiv:1509.01626 [cs]},
        author = {Zhang, Xiang and Zhao, Junbo and LeCun, Yann},
        month = sep,
        year = {2015},
        }
    """

    @check_yelp_review_dataset
    def __init__(self, dataset_dir, usage=None, num_samples=None, shuffle=Shuffle.GLOBAL, num_shards=None,
                 shard_id=None, num_parallel_workers=None, cache=None):
        super().__init__(num_parallel_workers=num_parallel_workers, num_samples=num_samples, shuffle=shuffle,
                         num_shards=num_shards, shard_id=shard_id, cache=cache)
        self.dataset_dir = dataset_dir
        self.usage = replace_none(usage, 'all')

    def parse(self, children=None):
        return cde.YelpReviewNode(self.dataset_dir, self.usage, self.num_samples, self.shuffle_flag,
                                  self.num_shards, self.shard_id)