# Copyright 2019-2023 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
This file contains specific text dataset loading classes. You can easily use
these classes to load prepared datasets. For example:
    IMDBDataset: loads the IMDB dataset.
    WikiTextDataset: loads the WikiText dataset.
    CLUEDataset: loads the CLUE dataset.
    YelpReviewDataset: loads the Yelp Review dataset.
    ...
After declaring the dataset object, you can further apply dataset operations
(e.g. filter, skip, concat, map, batch) on it.
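
A minimal pipeline sketch (the dataset path is a placeholder):

    import mindspore.dataset as ds

    dataset = ds.IMDBDataset(dataset_dir="/path/to/imdb_dataset_directory")
    dataset = dataset.skip(10)   # skip the first 10 samples
    dataset = dataset.batch(32)  # group the remaining samples into batches of 32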
"""
import mindspore._c_dataengine as cde

from .datasets import TextBaseDataset, SourceDataset, MappableDataset, Shuffle
from .validators import check_imdb_dataset, check_iwslt2016_dataset, check_iwslt2017_dataset, \
    check_penn_treebank_dataset, check_ag_news_dataset, check_amazon_review_dataset, check_udpos_dataset, \
    check_wiki_text_dataset, check_conll2000_dataset, check_cluedataset, \
    check_sogou_news_dataset, check_textfiledataset, check_dbpedia_dataset, check_yelp_review_dataset, \
    check_en_wik9_dataset, check_yahoo_answers_dataset, check_multi30k_dataset, check_squad_dataset, \
    check_sst2_dataset

from ..core.validator_helpers import replace_none


class AGNewsDataset(SourceDataset, TextBaseDataset):
    """
    AG News dataset.

    The generated dataset has three columns: :py:obj:`[index, title, description]` ,
    and the data type of all three columns is string.

    Args:
        dataset_dir (str): Path to the root directory that contains the dataset.
        usage (str, optional): Acceptable usages include ``'train'`` , ``'test'`` and ``'all'`` .
            Default: ``None`` , all samples.
        num_samples (int, optional): Number of samples (rows) to read. Default: ``None`` ,
            reads the full dataset.
        num_parallel_workers (int, optional): Number of worker threads to read the data.
            Default: ``None`` , will use the global default number of workers (8), which can be set
            by :func:`mindspore.dataset.config.set_num_parallel_workers` .
        shuffle (Union[bool, Shuffle], optional): Perform reshuffling of the data every epoch.
            Both bool and Shuffle enum values can be passed in.
            Default: ``Shuffle.GLOBAL`` .
            If `shuffle` is ``False``, no shuffling will be performed.
            If `shuffle` is ``True``, it is equivalent to setting `shuffle` to
            ``mindspore.dataset.Shuffle.GLOBAL`` .
            Set the mode of data shuffling by passing in one of the enumeration values:

            - ``Shuffle.GLOBAL``: Shuffle both the files and samples.

            - ``Shuffle.FILES``: Shuffle files only.

        num_shards (int, optional): Number of shards that the dataset will be divided into.
            Default: ``None``. When this argument is specified, `num_samples` reflects the
            maximum number of samples per shard.
        shard_id (int, optional): The shard ID within `num_shards` . This
            argument can only be specified when `num_shards` is also specified. Default: ``None``.
        cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
            `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
            Default: ``None``, which means no cache is used.

    Raises:
        RuntimeError: If `dataset_dir` does not contain data files.
        RuntimeError: If `num_shards` is specified but `shard_id` is None.
        RuntimeError: If `shard_id` is specified but `num_shards` is None.
        ValueError: If `num_parallel_workers` exceeds the maximum number of threads.

    Tutorial Examples:
        - `Load & Process Data With Dataset Pipeline
          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/dataset_gallery.html>`_

    Examples:
        >>> import mindspore.dataset as ds
        >>> ag_news_dataset_dir = "/path/to/ag_news_dataset_file"
        >>> dataset = ds.AGNewsDataset(dataset_dir=ag_news_dataset_dir, usage='all')
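        >>>
        >>> # A hedged follow-up sketch: iterate a few samples from the pipeline.
        >>> # The path above is a placeholder; column names follow the docstring.
        >>> for item in dataset.take(3).create_dict_iterator(num_epochs=1, output_numpy=True):
        ...     print(item["title"])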

    About AGNews dataset:

    AG is a collection of over 1 million news articles. The news articles were collected
    by ComeToMyHead from over 2,000 news sources in over 1 year of activity. ComeToMyHead
    is an academic news search engine that has been in operation since July 2004.
    The dataset is provided by academics for research purposes such as data mining
    (clustering, classification, etc.), information retrieval (ranking, searching, etc.),
    XML, data compression, data streaming, and any other non-commercial activities.
    AG's news topic classification dataset was constructed by selecting the four largest
    classes from the original corpus. Each class contains 30,000 training samples and
    1,900 test samples. The total number of training samples in train.csv is 120,000
    and the number of test samples in test.csv is 7,600.

    You can unzip the dataset files into the following structure and read them with MindSpore's API:

    .. code-block::

        .
        └── ag_news_dataset_dir
            ├── classes.txt
            ├── train.csv
            ├── test.csv
            └── readme.txt

    Citation:

    .. code-block::

        @misc{zhang2015characterlevel,
        title={Character-level Convolutional Networks for Text Classification},
        author={Xiang Zhang and Junbo Zhao and Yann LeCun},
        year={2015},
        eprint={1509.01626},
        archivePrefix={arXiv},
        primaryClass={cs.LG}
        }
    """

    @check_ag_news_dataset
    def __init__(self, dataset_dir, usage=None, num_samples=None,
                 num_parallel_workers=None, shuffle=Shuffle.GLOBAL, num_shards=None, shard_id=None, cache=None):
        super().__init__(num_parallel_workers=num_parallel_workers, num_samples=num_samples, shuffle=shuffle,
                         num_shards=num_shards, shard_id=shard_id, cache=cache)
        self.dataset_dir = dataset_dir
        self.usage = replace_none(usage, "all")

    def parse(self, children=None):
        return cde.AGNewsNode(self.dataset_dir, self.usage, self.num_samples, self.shuffle_flag, self.num_shards,
                              self.shard_id)


class AmazonReviewDataset(SourceDataset, TextBaseDataset):
    """
    Amazon Review Polarity and Amazon Review Full datasets.

    The generated dataset has three columns: :py:obj:`[label, title, content]` ,
    and the data type of all three columns is string.

    Args:
        dataset_dir (str): Path to the root directory that contains the Amazon Review Polarity dataset
            or the Amazon Review Full dataset.
        usage (str, optional): Usage of this dataset, can be ``'train'`` , ``'test'`` or ``'all'`` .
            For the Polarity dataset, ``'train'`` will read from 3,600,000 train samples,
            ``'test'`` will read from 400,000 test samples,
            ``'all'`` will read from all 4,000,000 samples.
            For the Full dataset, ``'train'`` will read from 3,000,000 train samples,
            ``'test'`` will read from 650,000 test samples,
            ``'all'`` will read from all 3,650,000 samples. Default: ``None``, all samples.
        num_samples (int, optional): Number of samples (rows) to be read. Default: ``None``,
            reads the full dataset.
        num_parallel_workers (int, optional): Number of worker threads to read the data.
            Default: ``None`` , will use the global default number of workers (8), which can be set
            by :func:`mindspore.dataset.config.set_num_parallel_workers` .
        shuffle (Union[bool, Shuffle], optional): Perform reshuffling of the data every epoch.
            Both bool and Shuffle enum values can be passed in.
            Default: ``Shuffle.GLOBAL`` .
            If `shuffle` is ``False``, no shuffling will be performed.
            If `shuffle` is ``True``, it is equivalent to setting `shuffle` to
            ``mindspore.dataset.Shuffle.GLOBAL``.
            Set the mode of data shuffling by passing in one of the enumeration values:

            - ``Shuffle.GLOBAL`` : Shuffle both the files and samples.

            - ``Shuffle.FILES`` : Shuffle files only.

        num_shards (int, optional): Number of shards that the dataset will be divided into. Default: ``None`` .
            When this argument is specified, `num_samples` reflects the maximum number of samples per shard.
        shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
            argument can only be specified when `num_shards` is also specified.
        cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
            `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
            Default: ``None`` , which means no cache is used.

    Raises:
        RuntimeError: If `dataset_dir` does not contain data files.
        RuntimeError: If `num_shards` is specified but `shard_id` is None.
        RuntimeError: If `shard_id` is specified but `num_shards` is None.
        ValueError: If `num_parallel_workers` exceeds the maximum number of threads.

    Tutorial Examples:
        - `Load & Process Data With Dataset Pipeline
          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/dataset_gallery.html>`_

    Examples:
        >>> import mindspore.dataset as ds
        >>> amazon_review_dataset_dir = "/path/to/amazon_review_dataset_dir"
        >>> dataset = ds.AmazonReviewDataset(dataset_dir=amazon_review_dataset_dir, usage='all')
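        >>>
        >>> # A hedged sketch of a sharded read for distributed training:
        >>> # each of the 4 shards sees a disjoint subset (path is a placeholder).
        >>> dataset_shard = ds.AmazonReviewDataset(dataset_dir=amazon_review_dataset_dir,
        ...                                        usage='train', num_shards=4, shard_id=0)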

    About AmazonReview Dataset:

    The Amazon reviews full dataset consists of reviews from Amazon. The data span a period of 18 years, including ~35
    million reviews up to March 2013. Reviews include product and user information, ratings, and a plaintext review.
    The dataset is mainly used for text classification: given the content and title, predict the correct star rating.

    The Amazon reviews polarity dataset is constructed by taking review scores 1 and 2 as negative, and 4 and 5 as
    positive. Samples with score 3 are ignored.

    The Amazon Reviews Polarity and Amazon Reviews Full datasets have the same directory structure.
    You can unzip the dataset files into the following structure and read them with MindSpore's API:

    .. code-block::

        .
        └── amazon_review_dir
             ├── train.csv
             ├── test.csv
             └── readme.txt

    Citation:

    .. code-block::

        @article{zhang2015character,
          title={Character-level convolutional networks for text classification},
          author={Zhang, Xiang and Zhao, Junbo and LeCun, Yann},
          journal={Advances in neural information processing systems},
          volume={28},
          pages={649--657},
          year={2015}
        }
    """

    @check_amazon_review_dataset
    def __init__(self, dataset_dir, usage=None, num_samples=None, num_parallel_workers=None, shuffle=Shuffle.GLOBAL,
                 num_shards=None, shard_id=None, cache=None):
        super().__init__(num_parallel_workers=num_parallel_workers, num_samples=num_samples, shuffle=shuffle,
                         num_shards=num_shards, shard_id=shard_id, cache=cache)
        self.dataset_dir = dataset_dir
        self.usage = replace_none(usage, 'all')

    def parse(self, children=None):
        return cde.AmazonReviewNode(self.dataset_dir, self.usage, self.num_samples, self.shuffle_flag, self.num_shards,
                                    self.shard_id)


class CLUEDataset(SourceDataset, TextBaseDataset):
    """
    CLUE(Chinese Language Understanding Evaluation) dataset.
    Supported CLUE classification tasks: ``'AFQMC'`` , ``'TNEWS'``, ``'IFLYTEK'``, ``'CMNLI'``,
    ``'WSC'`` and ``'CSL'``.

    Args:
        dataset_files (Union[str, list[str]]): String or list of files to be read, or a glob string to search for
            a pattern of files. The list will be sorted in lexicographical order.
        task (str, optional): The kind of task, one of ``'AFQMC'`` , ``'TNEWS'``, ``'IFLYTEK'``, ``'CMNLI'``,
            ``'WSC'`` and ``'CSL'``. Default: ``'AFQMC'`` .
        usage (str, optional): Specify the ``'train'``, ``'test'`` or ``'eval'`` part of the dataset.
            Default: ``'train'``.
        num_samples (int, optional): The number of samples to be included in the dataset.
            Default: ``None`` , will include all samples.
        num_parallel_workers (int, optional): Number of worker threads to read the data.
            Default: ``None`` , will use the global default number of workers (8), which can be set
            by :func:`mindspore.dataset.config.set_num_parallel_workers` .
        shuffle (Union[bool, Shuffle], optional): Perform reshuffling of the data every epoch.
            Default: ``Shuffle.GLOBAL`` . Both bool and Shuffle enum values can be passed in.
            If `shuffle` is ``False``, no shuffling will be performed.
            If `shuffle` is ``True``, performs a global shuffle.
            There are three levels of shuffling; the desired shuffle enum is defined by
            :class:`mindspore.dataset.Shuffle` .

            - ``Shuffle.GLOBAL`` : Shuffle both the files and samples, same as setting `shuffle` to ``True``.

            - ``Shuffle.FILES`` : Shuffle files only.

        num_shards (int, optional): Number of shards that the dataset will be divided into. Default: ``None`` .
            When this argument is specified, `num_samples` reflects the maximum number of samples per shard.
        shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
            argument can only be specified when `num_shards` is also specified.
        cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
            `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
            Default: ``None`` , which means no cache is used.

    The generated dataset has different output columns depending on the `task` setting:

    +-------------------------+------------------------------+-----------------------------+
    | `task`                  |   `usage`                    |   Output column             |
    +=========================+==============================+=============================+
    | AFQMC                   |   train                      |   [sentence1, dtype=string] |
    |                         |                              |                             |
    |                         |                              |   [sentence2, dtype=string] |
    |                         |                              |                             |
    |                         |                              |   [label, dtype=string]     |
    |                         +------------------------------+-----------------------------+
    |                         |   test                       |   [id, dtype=uint32]        |
    |                         |                              |                             |
    |                         |                              |   [sentence1, dtype=string] |
    |                         |                              |                             |
    |                         |                              |   [sentence2, dtype=string] |
    |                         +------------------------------+-----------------------------+
    |                         |   eval                       |   [sentence1, dtype=string] |
    |                         |                              |                             |
    |                         |                              |   [sentence2, dtype=string] |
    |                         |                              |                             |
    |                         |                              |   [label, dtype=string]     |
    +-------------------------+------------------------------+-----------------------------+
    | TNEWS                   |   train                      |   [label, dtype=string]     |
    |                         |                              |                             |
    |                         |                              |   [label_des, dtype=string] |
    |                         |                              |                             |
    |                         |                              |   [sentence, dtype=string]  |
    |                         |                              |                             |
    |                         |                              |   [keywords, dtype=string]  |
    |                         +------------------------------+-----------------------------+
    |                         |   test                       |   [label, dtype=uint32]     |
    |                         |                              |                             |
    |                         |                              |   [keywords, dtype=string]  |
    |                         |                              |                             |
    |                         |                              |   [sentence, dtype=string]  |
    |                         +------------------------------+-----------------------------+
    |                         |   eval                       |   [label, dtype=string]     |
    |                         |                              |                             |
    |                         |                              |   [label_des, dtype=string] |
    |                         |                              |                             |
    |                         |                              |   [sentence, dtype=string]  |
    |                         |                              |                             |
    |                         |                              |   [keywords, dtype=string]  |
    +-------------------------+------------------------------+-----------------------------+
    | IFLYTEK                 |   train                      |   [label, dtype=string]     |
    |                         |                              |                             |
    |                         |                              |   [label_des, dtype=string] |
    |                         |                              |                             |
    |                         |                              |   [sentence, dtype=string]  |
    |                         +------------------------------+-----------------------------+
    |                         |   test                       |   [id, dtype=uint32]        |
    |                         |                              |                             |
    |                         |                              |   [sentence, dtype=string]  |
    |                         +------------------------------+-----------------------------+
    |                         |   eval                       |   [label, dtype=string]     |
    |                         |                              |                             |
    |                         |                              |   [label_des, dtype=string] |
    |                         |                              |                             |
    |                         |                              |   [sentence, dtype=string]  |
    +-------------------------+------------------------------+-----------------------------+
    | CMNLI                   |   train                      |   [sentence1, dtype=string] |
    |                         |                              |                             |
    |                         |                              |   [sentence2, dtype=string] |
    |                         |                              |                             |
    |                         |                              |   [label, dtype=string]     |
    |                         +------------------------------+-----------------------------+
    |                         |   test                       |   [id, dtype=uint32]        |
    |                         |                              |                             |
    |                         |                              |   [sentence1, dtype=string] |
    |                         |                              |                             |
    |                         |                              |   [sentence2, dtype=string] |
    |                         +------------------------------+-----------------------------+
    |                         |   eval                       |   [sentence1, dtype=string] |
    |                         |                              |                             |
    |                         |                              |   [sentence2, dtype=string] |
    |                         |                              |                             |
    |                         |                              |   [label, dtype=string]     |
    +-------------------------+------------------------------+-----------------------------+
    | WSC                     |   train                      |  [span1_index, dtype=uint32]|
    |                         |                              |                             |
    |                         |                              |  [span2_index, dtype=uint32]|
    |                         |                              |                             |
    |                         |                              |  [span1_text, dtype=string] |
    |                         |                              |                             |
    |                         |                              |  [span2_text, dtype=string] |
    |                         |                              |                             |
    |                         |                              |  [idx, dtype=uint32]        |
    |                         |                              |                             |
    |                         |                              |  [text, dtype=string]       |
    |                         |                              |                             |
    |                         |                              |  [label, dtype=string]      |
    |                         +------------------------------+-----------------------------+
    |                         |   test                       |  [span1_index, dtype=uint32]|
    |                         |                              |                             |
    |                         |                              |  [span2_index, dtype=uint32]|
    |                         |                              |                             |
    |                         |                              |  [span1_text, dtype=string] |
    |                         |                              |                             |
    |                         |                              |  [span2_text, dtype=string] |
    |                         |                              |                             |
    |                         |                              |  [idx, dtype=uint32]        |
    |                         |                              |                             |
    |                         |                              |  [text, dtype=string]       |
    |                         +------------------------------+-----------------------------+
    |                         |   eval                       |  [span1_index, dtype=uint32]|
    |                         |                              |                             |
    |                         |                              |  [span2_index, dtype=uint32]|
    |                         |                              |                             |
    |                         |                              |  [span1_text, dtype=string] |
    |                         |                              |                             |
    |                         |                              |  [span2_text, dtype=string] |
    |                         |                              |                             |
    |                         |                              |  [idx, dtype=uint32]        |
    |                         |                              |                             |
    |                         |                              |  [text, dtype=string]       |
    |                         |                              |                             |
    |                         |                              |  [label, dtype=string]      |
    +-------------------------+------------------------------+-----------------------------+
    | CSL                     |   train                      |   [id, dtype=uint32]        |
    |                         |                              |                             |
    |                         |                              |   [abst, dtype=string]      |
    |                         |                              |                             |
    |                         |                              |   [keyword, dtype=string]   |
    |                         |                              |                             |
    |                         |                              |   [label, dtype=string]     |
    |                         +------------------------------+-----------------------------+
    |                         |   test                       |   [id, dtype=uint32]        |
    |                         |                              |                             |
    |                         |                              |   [abst, dtype=string]      |
    |                         |                              |                             |
    |                         |                              |   [keyword, dtype=string]   |
    |                         +------------------------------+-----------------------------+
    |                         |   eval                       |   [id, dtype=uint32]        |
    |                         |                              |                             |
    |                         |                              |   [abst, dtype=string]      |
    |                         |                              |                             |
    |                         |                              |   [keyword, dtype=string]   |
    |                         |                              |                             |
    |                         |                              |   [label, dtype=string]     |
    +-------------------------+------------------------------+-----------------------------+

    Raises:
        ValueError: If `dataset_files` are not valid or do not exist.
        ValueError: If `task` is not ``'AFQMC'`` , ``'TNEWS'``, ``'IFLYTEK'``, ``'CMNLI'``, ``'WSC'``
            or ``'CSL'``.
        ValueError: If `usage` is not ``'train'``, ``'test'`` or ``'eval'``.
        ValueError: If `num_parallel_workers` exceeds the maximum number of threads.
        ValueError: If `shard_id` is not in range of [0, `num_shards` ).
        RuntimeError: If `num_shards` is specified but `shard_id` is None.
        RuntimeError: If `shard_id` is specified but `num_shards` is None.

    Tutorial Examples:
        - `Load & Process Data With Dataset Pipeline
          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/dataset_gallery.html>`_

    Examples:
        >>> import mindspore.dataset as ds
        >>> clue_dataset_dir = ["/path/to/clue_dataset_file"] # contains 1 or multiple clue files
        >>> dataset = ds.CLUEDataset(dataset_files=clue_dataset_dir, task='AFQMC', usage='train')
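        >>>
        >>> # A hedged sketch: check how many rows the pipeline will produce.
        >>> # The file list above is a placeholder.
        >>> print(dataset.get_dataset_size())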

    About CLUE dataset:

    CLUE is a Chinese Language Understanding Evaluation benchmark. It contains multiple
    tasks, including single-sentence classification, sentence pair classification, and machine
    reading comprehension.

    You can unzip the dataset files into the following structure and read them with MindSpore's API,
    taking the AFQMC dataset as an example:

    .. code-block::

        .
        └── afqmc_public
             ├── train.json
             ├── test.json
             └── dev.json

    Citation:

    .. code-block::

        @article{CLUEbenchmark,
        title   = {CLUE: A Chinese Language Understanding Evaluation Benchmark},
        author  = {Liang Xu, Xuanwei Zhang, Lu Li, Hai Hu, Chenjie Cao, Weitang Liu, Junyi Li, Yudong Li,
                Kai Sun, Yechen Xu, Yiming Cui, Cong Yu, Qianqian Dong, Yin Tian, Dian Yu, Bo Shi, Jun Zeng,
                Rongzhao Wang, Weijian Xie, Yanting Li, Yina Patterson, Zuoyu Tian, Yiwen Zhang, He Zhou,
                Shaoweihua Liu, Qipeng Zhao, Cong Yue, Xinrui Zhang, Zhengliang Yang, Zhenzhong Lan},
        journal = {arXiv preprint arXiv:2004.05986},
        year    = {2020},
        howpublished = {https://github.com/CLUEbenchmark/CLUE}
        }
    """

    @check_cluedataset
    def __init__(self, dataset_files, task='AFQMC', usage='train', num_samples=None, num_parallel_workers=None,
                 shuffle=Shuffle.GLOBAL, num_shards=None, shard_id=None, cache=None):
        super().__init__(num_parallel_workers=num_parallel_workers, num_samples=num_samples, shuffle=shuffle,
                         num_shards=num_shards, shard_id=shard_id, cache=cache)
        self.dataset_files = self._find_files(dataset_files)
        self.usage = replace_none(usage, 'train')
        self.task = replace_none(task, 'AFQMC')

    def parse(self, children=None):
        return cde.CLUENode(self.dataset_files, self.task, self.usage, self.num_samples, self.shuffle_flag,
                            self.num_shards, self.shard_id)


class CoNLL2000Dataset(SourceDataset, TextBaseDataset):
    """
    CoNLL-2000(Conference on Computational Natural Language Learning) chunking dataset.

    The generated dataset has three columns: :py:obj:`[word, pos_tag, chunk_tag]` .
    The tensors of column :py:obj:`word` , column :py:obj:`pos_tag` ,
    and column :py:obj:`chunk_tag` are of the string type.

    Args:
        dataset_dir (str): Path to the root directory that contains the CoNLL2000 chunking dataset.
        usage (str, optional): Usage of dataset, can be ``'train'`` , ``'test'`` , or ``'all'`` .
            ``'train'`` will read from 8,936 train samples,
            ``'test'`` will read from 2,012 test samples,
            ``'all'`` will read from all 10,948 samples. Default: ``None`` , read all samples.
        num_samples (int, optional): Number of samples (rows) to be read. Default: ``None`` ,
            read the full dataset.
        shuffle (Union[bool, Shuffle], optional): Perform reshuffling of the data every epoch.
            Default: ``Shuffle.GLOBAL`` .
            If `shuffle` is ``False`` , no shuffling will be performed.
            If `shuffle` is ``True`` , performs a global shuffle.
            There are three levels of shuffling; the desired shuffle enum is defined by
            :class:`mindspore.dataset.Shuffle` .

            - ``Shuffle.GLOBAL`` : Shuffle both the files and samples, same as setting `shuffle` to ``True``.
            - ``Shuffle.FILES`` : Shuffle files only.

        num_shards (int, optional): Number of shards that the dataset will be divided into.
            When this argument is specified, `num_samples` reflects the maximum number of samples per shard.
            Default: ``None`` .
        shard_id (int, optional): The shard ID within `num_shards` . This
            argument can only be specified when `num_shards` is also specified. Default: ``None`` .
        num_parallel_workers (int, optional): Number of worker threads to read the data.
            Default: ``None`` , will use the global default number of workers (8), which can be set
            by :func:`mindspore.dataset.config.set_num_parallel_workers` .
        cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
            `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
            Default: ``None`` , which means no cache is used.

    Raises:
        RuntimeError: If `dataset_dir` does not contain data files.
        ValueError: If `num_parallel_workers` exceeds the maximum number of threads.
        RuntimeError: If `num_shards` is specified but `shard_id` is None.
        RuntimeError: If `shard_id` is specified but `num_shards` is None.

    Tutorial Examples:
        - `Load & Process Data With Dataset Pipeline
          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/dataset_gallery.html>`_

    Examples:
        >>> import mindspore.dataset as ds
        >>> conll2000_dataset_dir = "/path/to/conll2000_dataset_dir"
        >>> dataset = ds.CoNLL2000Dataset(dataset_dir=conll2000_dataset_dir, usage='all')
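        >>>
        >>> # A hedged sketch: peek at the first few word/tag triples
        >>> # (the path above is a placeholder).
        >>> for item in dataset.take(2).create_dict_iterator(num_epochs=1, output_numpy=True):
        ...     print(item["word"], item["pos_tag"], item["chunk_tag"])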

    About CoNLL2000 Dataset:

    The CoNLL2000 chunking dataset consists of the text from sections 15-20 of the Wall Street Journal corpus.
    Texts are chunked using IOB notation, and the chunk types include NP, VP, PP, ADJP and ADVP.
    The dataset consists of three columns separated by spaces. The first column contains the current word,
    the second is the part-of-speech tag as derived by the Brill tagger, and the third is the chunk tag as derived
    from the WSJ corpus. Text chunking consists of dividing a text into syntactically correlated parts of words.

    You can unzip the dataset files into the following structure and read them with MindSpore's API:

    .. code-block::

        .
        └── conll2000_dataset_dir
             ├── train.txt
             ├── test.txt
             └── readme.txt

    Citation:

    .. code-block::

        @inproceedings{tksbuchholz2000conll,
        author     = {Tjong Kim Sang, Erik F. and Sabine Buchholz},
        title      = {Introduction to the CoNLL-2000 Shared Task: Chunking},
        editor     = {Claire Cardie and Walter Daelemans and Claire Nedellec and Tjong Kim Sang, Erik},
        booktitle  = {Proceedings of CoNLL-2000 and LLL-2000},
        publisher  = {Lisbon, Portugal},
        pages      = {127--132},
        year       = {2000}
        }
    """

    @check_conll2000_dataset
    def __init__(self, dataset_dir, usage=None, num_samples=None, shuffle=Shuffle.GLOBAL, num_shards=None,
                 shard_id=None, num_parallel_workers=None, cache=None):
        super().__init__(num_parallel_workers=num_parallel_workers, num_samples=num_samples, shuffle=shuffle,
                         num_shards=num_shards, shard_id=shard_id, cache=cache)
        self.dataset_dir = dataset_dir
        self.usage = replace_none(usage, 'all')

    def parse(self, children=None):
        return cde.CoNLL2000Node(self.dataset_dir, self.usage, self.num_samples, self.shuffle_flag, self.num_shards,
                                 self.shard_id)


class DBpediaDataset(SourceDataset, TextBaseDataset):
    """
    DBpedia dataset.

    The generated dataset has three columns :py:obj:`[class, title, content]` ,
    and the data type of all three columns is string.

    Args:
        dataset_dir (str): Path to the root directory that contains the dataset.
        usage (str, optional): Usage of this dataset, can be ``'train'`` , ``'test'`` or ``'all'`` .
            ``'train'`` will read from 560,000 train samples,
            ``'test'`` will read from 70,000 test samples,
            ``'all'`` will read from all 630,000 samples. Default: ``None`` , all samples.
        num_samples (int, optional): The number of samples to be included in the dataset.
            Default: ``None`` , will include all text.
        num_parallel_workers (int, optional): Number of worker threads to read the data.
            Default: ``None`` , will use the global default number of workers (8), which can be set
            by :func:`mindspore.dataset.config.set_num_parallel_workers` .
        shuffle (Union[bool, Shuffle], optional): Perform reshuffling of the data every epoch.
            Both bool and Shuffle enum values can be passed in.
            Default: ``Shuffle.GLOBAL`` .
            If `shuffle` is ``False`` , no shuffling will be performed.
            If `shuffle` is ``True`` , it is equivalent to setting `shuffle` to
            ``mindspore.dataset.Shuffle.GLOBAL`` .
            Set the mode of data shuffling by passing in one of the enumeration values:

            - ``Shuffle.GLOBAL`` : Shuffle both the files and samples.

            - ``Shuffle.FILES`` : Shuffle files only.

        num_shards (int, optional): Number of shards that the dataset will be divided into. Default: ``None`` .
            When this argument is specified, `num_samples` reflects the maximum number of samples per shard.
        shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
            argument can only be specified when `num_shards` is also specified.
        cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
            `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
            Default: ``None`` , which means no cache is used.

    Raises:
        RuntimeError: If `dataset_dir` does not contain data files.
        RuntimeError: If `num_shards` is specified but `shard_id` is None.
        RuntimeError: If `shard_id` is specified but `num_shards` is None.
        ValueError: If `num_parallel_workers` exceeds the maximum number of threads.
        ValueError: If `shard_id` is not in range of [0, `num_shards` ).

    Tutorial Examples:
        - `Load & Process Data With Dataset Pipeline
          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/dataset_gallery.html>`_

    Examples:
        >>> import mindspore.dataset as ds
        >>> dbpedia_dataset_dir = "/path/to/dbpedia_dataset_directory"
        >>>
        >>> # 1) Read 3 samples from DBpedia dataset
        >>> dataset = ds.DBpediaDataset(dataset_dir=dbpedia_dataset_dir, num_samples=3)
        >>>
        >>> # 2) Read train samples from DBpedia dataset
        >>> dataset = ds.DBpediaDataset(dataset_dir=dbpedia_dataset_dir, usage="train")
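        >>>
        >>> # 3) A hedged sketch: group the train samples into batches of 16
        >>> # (the path above is a placeholder).
        >>> dataset = dataset.batch(batch_size=16)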

    About DBpedia dataset:

    The DBpedia dataset consists of 630,000 text samples in 14 classes: there are 560,000 samples in train.csv
    and 70,000 samples in test.csv.
    The 14 different classes represent Company, EducationalInstitution, Artist, Athlete, OfficeHolder,
    MeanOfTransportation, Building, NaturalPlace, Village, Animal, Plant, Album, Film, WrittenWork.

    Here is the original DBpedia dataset structure.
    You can unzip the dataset files into this directory structure and read them with MindSpore's API.

    .. code-block::

        .
        └── dbpedia_dataset_dir
            ├── train.csv
            ├── test.csv
            ├── classes.txt
            └── readme.txt

    Citation:

    .. code-block::

        @article{DBpedia,
        title   = {DBPedia Ontology Classification Dataset},
        author  = {Jens Lehmann, Robert Isele, Max Jakob, Anja Jentzsch, Dimitris Kontokostas,
                Pablo N. Mendes, Sebastian Hellmann, Mohamed Morsey, Patrick van Kleef,
                    Sören Auer, Christian Bizer},
        year    = {2015},
        howpublished = {http://dbpedia.org}
        }
    """

    @check_dbpedia_dataset
    def __init__(self, dataset_dir, usage=None, num_samples=None, num_parallel_workers=None, shuffle=Shuffle.GLOBAL,
                 num_shards=None, shard_id=None, cache=None):
        super().__init__(num_parallel_workers=num_parallel_workers, num_samples=num_samples, shuffle=shuffle,
                         num_shards=num_shards, shard_id=shard_id, cache=cache)
        self.dataset_dir = dataset_dir
        self.usage = replace_none(usage, "all")

    def parse(self, children=None):
        return cde.DBpediaNode(self.dataset_dir, self.usage, self.num_samples, self.shuffle_flag, self.num_shards,
                               self.shard_id)


class EnWik9Dataset(SourceDataset, TextBaseDataset):
    """
    EnWik9 dataset.

    The generated dataset has one column :py:obj:`[text]` with type string.

    Args:
        dataset_dir (str): Path to the root directory that contains the dataset.
        num_samples (int, optional): The number of samples to be included in the dataset.
            Default: ``None`` , will include all samples.
        num_parallel_workers (int, optional): Number of worker threads to read the data.
            Default: ``None`` , will use the global default number of workers (8), which can be set
            by :func:`mindspore.dataset.config.set_num_parallel_workers` .
        shuffle (Union[bool, Shuffle], optional): Perform reshuffling of the data every epoch.
            Both bool and Shuffle enum values can be passed in. Default: ``True``.
            If `shuffle` is ``False`` , no shuffling will be performed.
            If `shuffle` is ``True`` , it is equivalent to setting `shuffle` to
            ``mindspore.dataset.Shuffle.GLOBAL`` .
            Set the mode of data shuffling by passing in one of the enumeration values:

            - ``Shuffle.GLOBAL`` : Shuffle both the files and samples.

            - ``Shuffle.FILES`` : Shuffle files only.

        num_shards (int, optional): Number of shards that the dataset will be divided into. Default: ``None`` .
            When this argument is specified, `num_samples` reflects the maximum number of samples per shard.
        shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
            argument can only be specified when `num_shards` is also specified.
        cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
            `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
            Default: ``None`` , which means no cache is used.

    Raises:
        RuntimeError: If `dataset_dir` does not contain data files.
        RuntimeError: If `num_shards` is specified but `shard_id` is None.
        RuntimeError: If `shard_id` is specified but `num_shards` is None.
        ValueError: If `num_parallel_workers` exceeds the maximum number of threads.

    Tutorial Examples:
        - `Load & Process Data With Dataset Pipeline
          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/dataset_gallery.html>`_

    Examples:
        >>> import mindspore.dataset as ds
        >>> en_wik9_dataset_dir = "/path/to/en_wik9_dataset"
        >>> dataset = ds.EnWik9Dataset(dataset_dir=en_wik9_dataset_dir, num_samples=2,
        ...                            shuffle=True)
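        >>>
        >>> # A hedged sketch: print the two sampled text chunks
        >>> # (the path above is a placeholder).
        >>> for item in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        ...     print(item["text"])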

    About EnWik9 dataset:

    The data of EnWik9 is UTF-8 encoded XML consisting primarily of English text. It contains 243,426 article titles,
    of which 85,560 are #REDIRECT entries that fix broken links, and the rest are regular articles.

    The data is UTF-8 clean. All characters are in the range U+0000 to U+10FFFF with valid encodings of 1 to
    4 bytes. The byte values 0xC0, 0xC1, and 0xF5-0xFF never occur. Also, in the Wikipedia dumps,
    there are no control characters in the range 0x00-0x1F except for 0x09 (tab) and 0x0A (linefeed).
    Line breaks occur only on paragraph boundaries, so they always have a semantic purpose.

    You can unzip the dataset files into the following directory structure and read them with MindSpore's API.

    .. code-block::

        .
        └── EnWik9
             ├── enwik9

    Citation:

    .. code-block::

        @NetworkResource{Hutter_prize,
        author    = {English Wikipedia},
        url       = "https://cs.fit.edu/~mmahoney/compression/textdata.html",
        month     = {March},
        year      = {2006}
        }
    """

    @check_en_wik9_dataset
    def __init__(self, dataset_dir, num_samples=None, num_parallel_workers=None, shuffle=True,
                 num_shards=None, shard_id=None, cache=None):
        super().__init__(num_parallel_workers=num_parallel_workers, num_samples=num_samples, shuffle=shuffle,
                         num_shards=num_shards, shard_id=shard_id, cache=cache)
        self.dataset_dir = dataset_dir

    def parse(self, children=None):
        return cde.EnWik9Node(self.dataset_dir, self.num_samples, self.shuffle_flag, self.num_shards,
                              self.shard_id)


class IMDBDataset(MappableDataset, TextBaseDataset):
    """
    IMDb(Internet Movie Database) dataset.

    The generated dataset has two columns: :py:obj:`[text, label]` .
    The tensor of column :py:obj:`text` is of the string type.
    The column :py:obj:`label` is a scalar of uint32 type.

    Args:
        dataset_dir (str): Path to the root directory that contains the dataset.
        usage (str, optional): Usage of this dataset, can be ``'train'`` , ``'test'`` or ``'all'`` .
            Default: ``None`` , will read all samples.
        num_samples (int, optional): The number of samples to be included in the dataset.
            Default: ``None`` , will include all samples.
        num_parallel_workers (int, optional): Number of worker threads to read the data.
            Default: ``None`` , will use the global default number of workers (8), which can be set
            by :func:`mindspore.dataset.config.set_num_parallel_workers` .
        shuffle (bool, optional): Whether or not to perform shuffle on the dataset.
            Default: ``None`` , expected order behavior shown in the table below.
        sampler (Sampler, optional): Object used to choose samples from the dataset.
            Default: ``None`` , expected order behavior shown in the table below.
        num_shards (int, optional): Number of shards that the dataset will be divided
            into. Default: ``None`` . When this argument is specified, `num_samples` reflects
            the maximum number of samples per shard.
        shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
            argument can only be specified when `num_shards` is also specified.
        cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
            `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
            Default: ``None`` , which means no cache is used.

    Raises:
        RuntimeError: If `dataset_dir` does not contain data files.
        RuntimeError: If `sampler` and `shuffle` are specified at the same time.
        RuntimeError: If `sampler` and `num_shards`/`shard_id` are specified at the same time.
        RuntimeError: If `num_shards` is specified but `shard_id` is None.
        RuntimeError: If `shard_id` is specified but `num_shards` is None.
        ValueError: If `num_parallel_workers` exceeds the maximum number of threads.
        ValueError: If `shard_id` is not in range of [0, `num_shards` ).

    Tutorial Examples:
        - `Load & Process Data With Dataset Pipeline
          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/dataset_gallery.html>`_

    Note:
        - The shape of the test column.
        - The parameters `num_samples` , `shuffle` , `num_shards` , `shard_id` can be used to control the sampler
          used in the dataset, and their effects when combined with parameter `sampler` are as follows.

    .. include:: mindspore.dataset.sampler.txt

    Examples:
        >>> import mindspore.dataset as ds
        >>> imdb_dataset_dir = "/path/to/imdb_dataset_directory"
        >>>
        >>> # 1) Read all samples (text files) in imdb_dataset_dir with 8 threads
        >>> dataset = ds.IMDBDataset(dataset_dir=imdb_dataset_dir, num_parallel_workers=8)
        >>>
        >>> # 2) Read train samples (text files).
        >>> dataset = ds.IMDBDataset(dataset_dir=imdb_dataset_dir, usage="train")
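        >>>
        >>> # 3) A hedged sketch: since IMDBDataset is mappable, a sampler can be
        >>> # used instead of shuffle (the path above is a placeholder).
        >>> sampler = ds.RandomSampler(num_samples=4)
        >>> dataset = ds.IMDBDataset(dataset_dir=imdb_dataset_dir, sampler=sampler)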

    About IMDBDataset:

    The IMDB dataset contains 50,000 highly polarized reviews from the Internet Movie Database (IMDB). The dataset
    is divided into 25,000 reviews for training and 25,000 reviews for testing, with both the training set and test
    set containing 50% positive and 50% negative reviews. Train labels and test labels are all lists of 0 and 1, where
    0 stands for negative and 1 for positive.

    You can unzip the dataset files into this directory structure and read them with MindSpore's API.

    .. code-block::

        .
        └── imdb_dataset_directory
             ├── train
             │    ├── pos
             │    │    ├── 0_9.txt
             │    │    ├── 1_7.txt
             │    │    ├── ...
             │    ├── neg
             │    │    ├── 0_3.txt
             │    │    ├── 1_1.txt
             │    │    ├── ...
             ├── test
             │    ├── pos
             │    │    ├── 0_10.txt
             │    │    ├── 1_10.txt
             │    │    ├── ...
             │    ├── neg
             │    │    ├── 0_2.txt
             │    │    ├── 1_3.txt
             │    │    ├── ...

    Citation:

    .. code-block::

        @InProceedings{maas-EtAl:2011:ACL-HLT2011,
          author    = {Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and  Huang, Dan
                        and  Ng, Andrew Y.  and  Potts, Christopher},
          title     = {Learning Word Vectors for Sentiment Analysis},
          booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics:
                        Human Language Technologies},
          month     = {June},
          year      = {2011},
          address   = {Portland, Oregon, USA},
          publisher = {Association for Computational Linguistics},
          pages     = {142--150},
          url       = {http://www.aclweb.org/anthology/P11-1015}
        }
    """

    @check_imdb_dataset
    def __init__(self, dataset_dir, usage=None, num_samples=None, num_parallel_workers=None, shuffle=None,
                 sampler=None, num_shards=None, shard_id=None, cache=None):
        super().__init__(num_parallel_workers=num_parallel_workers, sampler=sampler, num_samples=num_samples,
                         shuffle=shuffle, num_shards=num_shards, shard_id=shard_id, cache=cache)

        self.dataset_dir = dataset_dir
        self.usage = replace_none(usage, "all")

    def parse(self, children=None):
        return cde.IMDBNode(self.dataset_dir, self.usage, self.sampler)


908class IWSLT2016Dataset(SourceDataset, TextBaseDataset):
909    """
910    IWSLT2016(International Workshop on Spoken Language Translation) dataset.
911
912    The generated dataset has two columns: :py:obj:`[text, translation]` .
913    The tensor of column :py:obj: `text` is of the string type.
914    The column :py:obj: `translation` is of the string type.
915
916    Args:
917        dataset_dir (str): Path to the root directory that contains the dataset.
918        usage (str, optional): Acceptable usages include 'train', 'valid', 'test' and 'all'. Default: ``None`` ,
919            all samples.
920        language_pair (sequence, optional): Sequence containing source and target language, supported values are
921            ``('en', 'fr')``, ``('en', 'de')``, ``('en', 'cs')``, ``('en', 'ar')``, ``('fr', 'en')``,
922            ``('de', 'en')``, ``('cs', 'en')``, ``('ar', 'en')``. Default: ``None``, set to ``('de', 'en')``.
923        valid_set (str, optional): A string to identify validation set, when usage is valid or all, the validation set
924            of `valid_set` type will be read, supported values are ``'dev2010'``, ``'tst2010'``, ``'tst2011'``,
925            ``'tst2012'``, ``'tst2013'`` and ``'tst2014'``. Default: ``None``, set to ``'tst2013'``.
926        test_set (str, optional): A string to identify test set, when usage is test or all, the test set of `test_set`
927            type will be read, supported values are ``'dev2010'``, ``'tst2010'``, ``'tst2011'``, ``'tst2012'``,
928            ``'tst2013'`` and ``'tst2014'``. Default: ``None``, set to ``'tst2014'``.
929        num_samples (int, optional): Number of samples (rows) to read. Default: ``None`` , reads the full dataset.
930        shuffle (Union[bool, Shuffle], optional): Perform reshuffling of the data every epoch.
931            Bool type and Shuffle enum are both supported to pass in.
932            Default: ``Shuffle.GLOBAL`` .
933            If `shuffle` is ``False``, no shuffling will be performed.
934            If `shuffle` is ``True``, it is equivalent to setting `shuffle` to
935            ``mindspore.dataset.Shuffle.GLOBAL`` .
936            Set the mode of data shuffling by passing in enumeration variables:
937
938            - ``Shuffle.GLOBAL`` : Shuffle both the files and samples.
939
940            - ``Shuffle.FILES`` : Shuffle files only.
941
942        num_shards (int, optional): Number of shards that the dataset will be divided into. Default: ``None`` .
943            When this argument is specified, `num_samples` reflects the max sample number of per shard.
944        shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
945            argument can only be specified when `num_shards` is also specified.
946        num_parallel_workers (int, optional): Number of worker threads to read the data.
947            Default: ``None`` , will use global default workers(8), it can be set
948            by :func:`mindspore.dataset.config.set_num_parallel_workers` .
949        cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
950            `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
951            Default: ``None`` , which means no cache is used.
952
953    Raises:
954        RuntimeError: If `dataset_dir` does not contain data files.
955        RuntimeError: If `num_shards` is specified but `shard_id` is None.
956        RuntimeError: If `shard_id` is specified but `num_shards` is None.
        ValueError: If `num_parallel_workers` exceeds the maximum number of threads.
958
959    Tutorial Examples:
960        - `Load & Process Data With Dataset Pipeline
961          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/dataset_gallery.html>`_
962
963    Examples:
964        >>> import mindspore.dataset as ds
965        >>> iwslt2016_dataset_dir = "/path/to/iwslt2016_dataset_dir"
966        >>> dataset = ds.IWSLT2016Dataset(dataset_dir=iwslt2016_dataset_dir, usage='all',
967        ...                               language_pair=('de', 'en'), valid_set='tst2013', test_set='tst2014')
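        >>>
        >>> # A minimal sketch of consuming the dataset, assuming valid IWSLT2016 files
        >>> # exist at the placeholder path above: take a few rows and read both columns.
        >>> for item in dataset.take(3).create_dict_iterator(output_numpy=True, num_epochs=1):
        ...     source, target = item["text"], item["translation"]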
968
969    About IWSLT2016 dataset:
970
    IWSLT is an international workshop on spoken language translation, a major annual scientific conference dedicated
    to all aspects of spoken language translation. The MT tasks of the IWSLT evaluation campaigns constitute a
    dataset, which is publicly available through the WIT3 website `wit3 <https://wit3.fbk.eu>`_ . The IWSLT2016
    dataset includes translations from English to Arabic, Czech, French, and German, and from Arabic, Czech, French,
    and German to English.
975
    You can unzip the original IWSLT2016 dataset files into the directory structure below and read them with
    MindSpore's API. After decompression, you also need to decompress the subset to be read within the specified
    folder. For example, to read the de-en subset, unzip the tgz file under the de/en directory; the dataset is in
    the unzipped folder.
980
981    .. code-block::
982
983        .
984        └── iwslt2016_dataset_directory
985             ├── subeval_files
986             └── texts
987                  ├── ar
988                  │    └── en
989                  │        └── ar-en
990                  ├── cs
991                  │    └── en
992                  │        └── cs-en
993                  ├── de
994                  │    └── en
995                  │        └── de-en
996                  │            ├── IWSLT16.TED.dev2010.de-en.de.xml
997                  │            ├── train.tags.de-en.de
998                  │            ├── ...
999                  ├── en
1000                  │    ├── ar
1001                  │    │   └── en-ar
1002                  │    ├── cs
1003                  │    │   └── en-cs
1004                  │    ├── de
1005                  │    │   └── en-de
1006                  │    └── fr
1007                  │        └── en-fr
1008                  └── fr
1009                       └── en
1010                           └── fr-en
1011
1012    Citation:
1013
1014    .. code-block::
1015
1016        @inproceedings{cettoloEtAl:EAMT2012,
1017        Address = {Trento, Italy},
1018        Author = {Mauro Cettolo and Christian Girardi and Marcello Federico},
1019        Booktitle = {Proceedings of the 16$^{th}$ Conference of the European Association for Machine Translation
1020                     (EAMT)},
1021        Date = {28-30},
1022        Month = {May},
1023        Pages = {261--268},
1024        Title = {WIT$^3$: Web Inventory of Transcribed and Translated Talks},
1025        Year = {2012}}
1026    """
1027
1028    @check_iwslt2016_dataset
1029    def __init__(self, dataset_dir, usage=None, language_pair=None, valid_set=None, test_set=None,
1030                 num_samples=None, shuffle=Shuffle.GLOBAL, num_shards=None, shard_id=None, num_parallel_workers=None,
1031                 cache=None):
1032        super().__init__(num_parallel_workers=num_parallel_workers, num_samples=num_samples, shuffle=shuffle,
1033                         num_shards=num_shards, shard_id=shard_id, cache=cache)
1034        self.dataset_dir = dataset_dir
1035        self.usage = replace_none(usage, 'all')
1036        self.language_pair = replace_none(language_pair, ["de", "en"])
1037        self.valid_set = replace_none(valid_set, 'tst2013')
1038        self.test_set = replace_none(test_set, 'tst2014')
1039
1040    def parse(self, children=None):
1041        return cde.IWSLT2016Node(self.dataset_dir, self.usage, self.language_pair, self.valid_set, self.test_set,
1042                                 self.num_samples, self.shuffle_flag, self.num_shards, self.shard_id)
1043
1044
1045class IWSLT2017Dataset(SourceDataset, TextBaseDataset):
1046    """
    IWSLT2017 (International Workshop on Spoken Language Translation) dataset.
1048
1049    The generated dataset has two columns: :py:obj:`[text, translation]` .
    The tensors of columns :py:obj:`text` and :py:obj:`translation` are of the string type.
1051
1052    Args:
1053        dataset_dir (str): Path to the root directory that contains the dataset.
        usage (str, optional): Acceptable usages include ``'train'``, ``'valid'``, ``'test'`` and ``'all'``.
            Default: ``None`` , all samples.
        language_pair (sequence, optional): Sequence containing the source and target languages. Supported values
            are ``('en', 'nl')``,
1057            ``('en', 'de')``, ``('en', 'it')``, ``('en', 'ro')``, ``('nl', 'en')``, ``('nl', 'de')``, ``('nl', 'it')``,
1058            ``('nl', 'ro')``, ``('de', 'en')``, ``('de', 'nl')``, ``('de', 'it')``, ``('de', 'ro')``, ``('it', 'en')``,
1059            ``('it', 'nl')``, ``('it', 'de')``, ``('it', 'ro')``, ``('ro', 'en')``, ``('ro', 'nl')``, ``('ro', 'de')``,
1060            ``('ro', 'it')``. Default: ``None``, set to ``('de', 'en')``.
1061        num_samples (int, optional): Number of samples (rows) to read. Default: ``None`` , reads the full dataset.
1062        shuffle (Union[bool, Shuffle], optional): Perform reshuffling of the data every epoch.
1063            Bool type and Shuffle enum are both supported to pass in.
1064            Default: ``Shuffle.GLOBAL`` .
1065            If `shuffle` is ``False`` , no shuffling will be performed.
1066            If `shuffle` is ``True`` , it is equivalent to setting `shuffle` to
1067            ``mindspore.dataset.Shuffle.GLOBAL`` .
1068            Set the mode of data shuffling by passing in enumeration variables:
1069
1070            - ``Shuffle.GLOBAL`` : Shuffle both the files and samples.
1071
1072            - ``Shuffle.FILES`` : Shuffle files only.
1073
1074        num_shards (int, optional): Number of shards that the dataset will be divided into. Default: ``None`` .
            When this argument is specified, `num_samples` reflects the maximum number of samples per shard.
1076        shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
1077            argument can only be specified when `num_shards` is also specified.
1078        num_parallel_workers (int, optional): Number of worker threads to read the data.
            Default: ``None`` , will use the global default number of workers (8). It can be set
            by :func:`mindspore.dataset.config.set_num_parallel_workers` .
1081        cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
1082            `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
1083            Default: ``None`` , which means no cache is used.
1084
1085    Raises:
1086        RuntimeError: If `dataset_dir` does not contain data files.
1087        RuntimeError: If `num_shards` is specified but `shard_id` is None.
1088        RuntimeError: If `shard_id` is specified but `num_shards` is None.
        ValueError: If `num_parallel_workers` exceeds the maximum number of threads.
1090
1091    Tutorial Examples:
1092        - `Load & Process Data With Dataset Pipeline
1093          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/dataset_gallery.html>`_
1094
1095    Examples:
1096        >>> import mindspore.dataset as ds
1097        >>> iwslt2017_dataset_dir = "/path/to/iwslt2017_dataset_dir"
1098        >>> dataset = ds.IWSLT2017Dataset(dataset_dir=iwslt2017_dataset_dir, usage='all', language_pair=('de', 'en'))
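        >>>
        >>> # A minimal sketch of sharded reading, assuming valid IWSLT2017 files at the
        >>> # placeholder path above: shard 0 of 2 sees a disjoint half of the samples.
        >>> dataset = ds.IWSLT2017Dataset(dataset_dir=iwslt2017_dataset_dir, usage='train',
        ...                               num_shards=2, shard_id=0)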
1099
1100    About IWSLT2017 dataset:
1101
    IWSLT is an international workshop on spoken language translation, a major annual scientific conference dedicated
    to all aspects of spoken language translation. The MT tasks of the IWSLT evaluation campaigns constitute a
    dataset, which is publicly available through the WIT3 website `wit3 <https://wit3.fbk.eu>`_ . The IWSLT2017
    dataset involves German, English, Italian, Dutch, and Romanian, and includes translations between any two of
    these languages.
1106
    You can unzip the original IWSLT2017 dataset files into the directory structure below and read them with
    MindSpore's API. You need to decompress the dataset package in the texts/DeEnItNlRo/DeEnItNlRo directory to get
    the DeEnItNlRo-DeEnItNlRo subdirectory.
1110
1111    .. code-block::
1112
1113        .
1114        └── iwslt2017_dataset_directory
1115            └── DeEnItNlRo
1116                └── DeEnItNlRo
1117                    └── DeEnItNlRo-DeEnItNlRo
1118                        ├── IWSLT17.TED.dev2010.de-en.de.xml
1119                        ├── train.tags.de-en.de
1120                        ├── ...
1121
1122    Citation:
1123
1124    .. code-block::
1125
1126        @inproceedings{cettoloEtAl:EAMT2012,
1127        Address = {Trento, Italy},
1128        Author = {Mauro Cettolo and Christian Girardi and Marcello Federico},
1129        Booktitle = {Proceedings of the 16$^{th}$ Conference of the European Association for Machine Translation
1130                     (EAMT)},
1131        Date = {28-30},
1132        Month = {May},
1133        Pages = {261--268},
1134        Title = {WIT$^3$: Web Inventory of Transcribed and Translated Talks},
1135        Year = {2012}}
1136    """
1137
1138    @check_iwslt2017_dataset
1139    def __init__(self, dataset_dir, usage=None, language_pair=None, num_samples=None, shuffle=Shuffle.GLOBAL,
1140                 num_shards=None, shard_id=None, num_parallel_workers=None, cache=None):
1141        super().__init__(num_parallel_workers=num_parallel_workers, num_samples=num_samples, shuffle=shuffle,
1142                         num_shards=num_shards, shard_id=shard_id, cache=cache)
1143        self.dataset_dir = dataset_dir
1144        self.usage = replace_none(usage, 'all')
1145        self.language_pair = replace_none(language_pair, ["de", "en"])
1146
1147    def parse(self, children=None):
1148        return cde.IWSLT2017Node(self.dataset_dir, self.usage, self.language_pair, self.num_samples,
1149                                 self.shuffle_flag, self.num_shards, self.shard_id)
1150
1151
1152class Multi30kDataset(SourceDataset, TextBaseDataset):
1153    """
1154    Multi30k dataset.
1155
    The generated dataset has two columns: :py:obj:`[text, translation]` .
    The tensors of columns :py:obj:`text` and :py:obj:`translation` are of the string type.
1159
1160    Args:
1161        dataset_dir (str): Path to the root directory that contains the dataset.
1162        usage (str, optional): Acceptable usages include ``'train'``, ``'test'``, ``'valid'`` or ``'all'``.
1163            Default: ``None`` , will read all samples.
1164        language_pair (Sequence[str, str], optional): Acceptable language_pair include ``['en', 'de']``,
1165            ``['de', 'en']``. Default: ``None`` , means ``['en', 'de']``.
        num_samples (int, optional): The number of samples to be included in the dataset.
1167            Default: ``None`` , will read all samples.
1168        num_parallel_workers (int, optional): Number of worker threads to read the data.
            Default: ``None`` , will use the global default number of workers (8). It can be set
            by :func:`mindspore.dataset.config.set_num_parallel_workers` .
1171        shuffle (Union[bool, Shuffle], optional): Whether to shuffle the dataset. Default: ``None`` ,
1172            means ``mindspore.dataset.Shuffle.GLOBAL`` .
1173            If ``False`` is provided, no shuffling will be performed.
1174            If ``True`` is provided, it is the same as setting to
1175            ``mindspore.dataset.Shuffle.GLOBAL`` .
1176            If Shuffle is provided, the effect is as follows:
1177
1178            - ``Shuffle.GLOBAL`` : Shuffle both the files and samples.
1179            - ``Shuffle.FILES`` : Shuffle files only.
1180
1181        num_shards (int, optional): Number of shards that the dataset will be divided
1182            into. Default: ``None`` . When this argument is specified, `num_samples` reflects
            the maximum number of samples per shard.
1184        shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
1185            argument can only be specified when `num_shards` is also specified.
1186        cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
1187            `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
1188            Default: ``None`` , which means no cache is used.
1189
1190    Raises:
1191        RuntimeError: If `dataset_dir` does not contain data files.
1192        ValueError: If `usage` is not ``'train'``, ``'test'``, ``'valid'`` or ``'all'``.
1193        TypeError: If `language_pair` is not of type Sequence[str, str].
        RuntimeError: If `num_samples` is less than 0.
        RuntimeError: If `num_parallel_workers` exceeds the maximum number of threads.
1196        RuntimeError: If `num_shards` is specified but `shard_id` is None.
1197        RuntimeError: If `shard_id` is specified but `num_shards` is None.
1198        ValueError: If `shard_id` is not in range of [0, `num_shards` ).
1199
1200    Tutorial Examples:
1201        - `Load & Process Data With Dataset Pipeline
1202          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/dataset_gallery.html>`_
1203
1204    Examples:
1205        >>> import mindspore.dataset as ds
1206        >>> multi30k_dataset_dir = "/path/to/multi30k_dataset_directory"
1207        >>> data = ds.Multi30kDataset(dataset_dir=multi30k_dataset_dir, usage='all', language_pair=['de', 'en'])
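        >>>
        >>> # A minimal sketch, assuming valid Multi30k files at the placeholder path above:
        >>> # with language_pair=['de', 'en'], 'text' holds German and 'translation' English.
        >>> for item in data.take(3).create_dict_iterator(output_numpy=True, num_epochs=1):
        ...     german, english = item["text"], item["translation"]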
1208
1209    About Multi30k dataset:
1210
    Multi30K is a multilingual dataset that features approximately 31,000 standardized images
    described in multiple languages. The images are sourced from Flickr, and each image comes
    with sentence descriptions in both English and German, as well as descriptions in other
    languages. Multi30k is used primarily for training and testing in tasks such as image
    captioning, machine translation, and visual question answering.
1216
    You can unzip the dataset files into the following directory structure and read them with MindSpore's API.
1218
1219    .. code-block::
1220
1221        └── multi30k_dataset_directory
1222              ├── training
1223              │    ├── train.de
1224              │    └── train.en
1225              ├── validation
1226              │    ├── val.de
1227              │    └── val.en
1228              └── mmt16_task1_test
1229                   ├── val.de
1230                   └── val.en
1231
1232    Citation:
1233
1234    .. code-block::
1235
1236        @article{elliott-EtAl:2016:VL16,
1237        author    = {{Elliott}, D. and {Frank}, S. and {Sima'an}, K. and {Specia}, L.},
1238        title     = {Multi30K: Multilingual English-German Image Descriptions},
1239        booktitle = {Proceedings of the 5th Workshop on Vision and Language},
1240        year      = {2016},
        pages     = {70--74}
        }
1244    """
1245
1246    @check_multi30k_dataset
1247    def __init__(self, dataset_dir, usage=None, language_pair=None, num_samples=None,
1248                 num_parallel_workers=None, shuffle=None, num_shards=None, shard_id=None, cache=None):
1249        super().__init__(num_parallel_workers=num_parallel_workers, num_samples=num_samples, shuffle=shuffle,
1250                         num_shards=num_shards, shard_id=shard_id, cache=cache)
1251        self.dataset_dir = dataset_dir
1252        self.usage = replace_none(usage, 'all')
1253        self.language_pair = replace_none(language_pair, ["en", "de"])
1254        self.shuffle = replace_none(shuffle, Shuffle.GLOBAL)
1255
1256    def parse(self, children=None):
1257        return cde.Multi30kNode(self.dataset_dir, self.usage, self.language_pair, self.num_samples,
1258                                self.shuffle_flag, self.num_shards, self.shard_id)
1259
1260
1261class PennTreebankDataset(SourceDataset, TextBaseDataset):
1262    """
1263    PennTreebank dataset.
1264
1265    The generated dataset has one column :py:obj:`[text]` .
1266    The tensor of column :py:obj:`text` is of the string type.
1267
1268    Args:
1269        dataset_dir (str): Path to the root directory that contains the dataset.
1270        usage (str, optional): Acceptable usages include ``'train'``, ``'test'``, ``'valid'`` and ``'all'``.
1271            ``'train'`` will read from 42,068 train samples of string type,
1272            ``'test'`` will read from 3,370 test samples of string type,
            ``'valid'`` will read from 3,761 validation samples of string type,
1274            ``'all'`` will read from all 49,199 samples of string type. Default: ``None`` , all samples.
1275        num_samples (int, optional): Number of samples (rows) to read. Default: ``None`` , reads the full dataset.
1276        num_parallel_workers (int, optional): Number of worker threads to read the data.
            Default: ``None`` , will use the global default number of workers (8). It can be set
            by :func:`mindspore.dataset.config.set_num_parallel_workers` .
1279        shuffle (Union[bool, Shuffle], optional): Perform reshuffling of the data every epoch.
1280            Bool type and Shuffle enum are both supported to pass in.
1281            Default: ``Shuffle.GLOBAL`` .
1282            If `shuffle` is ``False`` , no shuffling will be performed.
1283            If `shuffle` is ``True`` , it is equivalent to setting `shuffle` to
1284            ``mindspore.dataset.Shuffle.GLOBAL`` .
1285            Set the mode of data shuffling by passing in enumeration variables:
1286
1287            - ``Shuffle.GLOBAL`` : Shuffle both the files and samples.
1288
1289            - ``Shuffle.FILES`` : Shuffle files only.
1290
1291        num_shards (int, optional): Number of shards that the dataset will be divided into. Default: ``None`` .
            When this argument is specified, `num_samples` reflects the maximum number of samples per shard.
1293        shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
1294            argument can only be specified when `num_shards` is also specified.
1295        cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
1296            `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
1297            Default: ``None`` , which means no cache is used.
1298
1299    Raises:
1300        RuntimeError: If `dataset_dir` does not contain data files.
1301        RuntimeError: If `num_shards` is specified but `shard_id` is None.
1302        RuntimeError: If `shard_id` is specified but `num_shards` is None.
        ValueError: If `num_parallel_workers` exceeds the maximum number of threads.
1304
1305    Tutorial Examples:
1306        - `Load & Process Data With Dataset Pipeline
1307          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/dataset_gallery.html>`_
1308
1309    Examples:
1310        >>> import mindspore.dataset as ds
1311        >>> penn_treebank_dataset_dir = "/path/to/penn_treebank_dataset_directory"
1312        >>> dataset = ds.PennTreebankDataset(dataset_dir=penn_treebank_dataset_dir, usage='all')
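        >>>
        >>> # A minimal sketch, assuming valid PTB files at the placeholder path above:
        >>> # read only the first 100 training lines, unshuffled, e.g. for a quick smoke test.
        >>> dataset = ds.PennTreebankDataset(dataset_dir=penn_treebank_dataset_dir,
        ...                                  usage='train', num_samples=100, shuffle=False)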
1313
1314    About PennTreebank dataset:
1315
    The Penn Treebank (PTB) dataset is widely used in machine learning research on NLP (Natural Language Processing).
    Word-level PTB does not contain capital letters, numbers, or punctuation, and its vocabulary is capped at 10k
    unique words, which is small compared to most modern datasets and can result in a larger number of
    out-of-vocabulary tokens.
1320
1321    Here is the original PennTreebank dataset structure.
1322    You can unzip the dataset files into this directory structure and read by MindSpore's API.
1323
1324    .. code-block::
1325
1326        .
1327        └── PennTreebank_dataset_dir
1328             ├── ptb.test.txt
1329             ├── ptb.train.txt
1330             └── ptb.valid.txt
1331
1332    Citation:
1333
1334    .. code-block::
1335
1336        @techreport{Santorini1990,
1337          added-at = {2014-03-26T23:25:56.000+0100},
1338          author = {Santorini, Beatrice},
1339          biburl = {https://www.bibsonomy.org/bibtex/234cdf6ddadd89376090e7dada2fc18ec/butonic},
1340          file = {:Santorini - Penn Treebank tag definitions.pdf:PDF},
1341          institution = {Department of Computer and Information Science, University of Pennsylvania},
1342          interhash = {818e72efd9e4b5fae3e51e88848100a0},
1343          intrahash = {34cdf6ddadd89376090e7dada2fc18ec},
1344          keywords = {dis pos tagging treebank},
1345          number = {MS-CIS-90-47},
1346          timestamp = {2014-03-26T23:25:56.000+0100},
1347          title = {Part-of-speech tagging guidelines for the {P}enn {T}reebank {P}roject},
1348          url = {ftp://ftp.cis.upenn.edu/pub/treebank/doc/tagguide.ps.gz},
1349          year = 1990
1350        }
1351    """
1352
1353    @check_penn_treebank_dataset
1354    def __init__(self, dataset_dir, usage=None, num_samples=None, num_parallel_workers=None, shuffle=Shuffle.GLOBAL,
1355                 num_shards=None, shard_id=None, cache=None):
1356        super().__init__(num_parallel_workers=num_parallel_workers, num_samples=num_samples, shuffle=shuffle,
1357                         num_shards=num_shards, shard_id=shard_id, cache=cache)
1358        self.dataset_dir = dataset_dir
1359        self.usage = replace_none(usage, "all")
1360
1361    def parse(self, children=None):
1362        return cde.PennTreebankNode(self.dataset_dir, self.usage, self.num_samples, self.shuffle_flag, self.num_shards,
1363                                    self.shard_id)
1364
1365
1366class SogouNewsDataset(SourceDataset, TextBaseDataset):
1367    r"""
1368    Sogou News dataset.
1369
1370    The generated dataset has three columns: :py:obj:`[index, title, content]` ,
1371    and the data type of three columns is string.
1372
1373    Args:
1374        dataset_dir (str): Path to the root directory that contains the dataset.
1375        usage (str, optional): Usage of this dataset, can be ``'train'`` , ``'test'`` or ``'all'`` .
1376            ``'train'`` will read from 450,000 train samples, ``'test'`` will read from 60,000 test samples,
1377            ``'all'`` will read from all 510,000 samples. Default: ``None`` , all samples.
1378        num_samples (int, optional): Number of samples (rows) to read. Default: ``None`` , read all samples.
1379        shuffle (Union[bool, Shuffle], optional): Perform reshuffling of the data every epoch.
1380            Bool type and Shuffle enum are both supported to pass in.
1381            Default: ``Shuffle.GLOBAL`` .
1382            If `shuffle` is ``False`` , no shuffling will be performed.
1383            If `shuffle` is ``True`` , it is equivalent to setting `shuffle` to
1384            ``mindspore.dataset.Shuffle.GLOBAL`` .
1385            Set the mode of data shuffling by passing in enumeration variables:
1386
1387            - ``Shuffle.GLOBAL`` : Shuffle both the files and samples, same as setting shuffle to True.
1388
            - ``Shuffle.FILES`` : Shuffle files only.

        num_shards (int, optional): Number of shards that the dataset will be divided into. Default: ``None`` .
            When this argument is specified, `num_samples` reflects the maximum number of samples per shard.
1392        shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
1393            argument can only be specified when `num_shards` is also specified.
1394        num_parallel_workers (int, optional): Number of worker threads to read the data.
            Default: ``None`` , will use the global default number of workers (8). It can be set
            by :func:`mindspore.dataset.config.set_num_parallel_workers` .
1397        cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
1398            `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
1399            Default: ``None`` , which means no cache is used.
1400
1401    Raises:
1402        RuntimeError: If `dataset_dir` does not contain data files.
1403        RuntimeError: If `num_shards` is specified but `shard_id` is None.
1404        RuntimeError: If `shard_id` is specified but `num_shards` is None.
        ValueError: If `num_parallel_workers` exceeds the maximum number of threads.
1406
1407    Tutorial Examples:
1408        - `Load & Process Data With Dataset Pipeline
1409          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/dataset_gallery.html>`_
1410
1411    Examples:
1412        >>> import mindspore.dataset as ds
1413        >>> sogou_news_dataset_dir = "/path/to/sogou_news_dataset_dir"
1414        >>> dataset = ds.SogouNewsDataset(dataset_dir=sogou_news_dataset_dir, usage='all')
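        >>>
        >>> # A minimal sketch, assuming valid SogouNews files at the placeholder path above:
        >>> # each row carries the class index, title and content as strings.
        >>> for item in dataset.take(3).create_dict_iterator(output_numpy=True, num_epochs=1):
        ...     index, title, content = item["index"], item["title"], item["content"]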
1415
1416    About SogouNews Dataset:
1417
1418    SogouNews dataset includes 3 columns, corresponding to class index (1 to 5), title and content. The title and
1419    content are escaped using double quotes ("), and any internal double quote is escaped by 2 double quotes ("").
    New lines are escaped by a backslash followed by an "n" character, that is "\n".
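
    For illustration, a row in train.csv could look like the following hypothetical example (not taken from the
    actual dataset), with internal quotes doubled and line breaks encoded as "\n":

    .. code-block::

        "3","Sample ""quoted"" title","First line of content\nSecond line of content"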
1421
1422    You can unzip the dataset files into the following structure and read by MindSpore's API:
1423
1424    .. code-block::
1425
1426        .
1427        └── sogou_news_dir
1428             ├── classes.txt
1429             ├── readme.txt
1430             ├── test.csv
1431             └── train.csv
1432
1433    Citation:
1434
1435    .. code-block::
1436
1437        @misc{zhang2015characterlevel,
1438            title={Character-level Convolutional Networks for Text Classification},
1439            author={Xiang Zhang and Junbo Zhao and Yann LeCun},
1440            year={2015},
1441            eprint={1509.01626},
1442            archivePrefix={arXiv},
1443            primaryClass={cs.LG}
1444        }
1445    """
1446
1447    @check_sogou_news_dataset
1448    def __init__(self, dataset_dir, usage=None, num_samples=None, shuffle=Shuffle.GLOBAL, num_shards=None,
1449                 shard_id=None, num_parallel_workers=None, cache=None):
1450        super().__init__(num_parallel_workers=num_parallel_workers, num_samples=num_samples, shuffle=shuffle,
1451                         num_shards=num_shards, shard_id=shard_id, cache=cache)
1452        self.dataset_dir = dataset_dir
1453        self.usage = replace_none(usage, 'all')
1454
1455    def parse(self, children=None):
1456        return cde.SogouNewsNode(self.dataset_dir, self.usage, self.num_samples, self.shuffle_flag,
1457                                 self.num_shards, self.shard_id)
1458
1459
1460class SQuADDataset(SourceDataset, TextBaseDataset):
1461    """
1462    SQuAD 1.1 and SQuAD 2.0 datasets.
1463
1464    The generated dataset with different versions and usages has the same output columns:
1465    :py:obj:`[context, question, text, answer_start]` .
1466    The tensor of column :py:obj:`context` is of the string type.
1467    The tensor of column :py:obj:`question` is of the string type.
    The tensor of column :py:obj:`text` is the answer segment within the context and is of the string type.
    The tensor of column :py:obj:`answer_start` is the start index of the answer in the context,
    which is of the uint32 type.
1471
1472    Args:
1473        dataset_dir (str): Path to the root directory that contains the dataset.
1474        usage (str, optional): Specify the ``'train'``, ``'dev'`` or ``'all'`` part of dataset.
1475            Default: ``None`` , all samples.
1476        num_samples (int, optional): The number of samples to be included in the dataset.
1477            Default: ``None`` , will include all samples.
1478        num_parallel_workers (int, optional): Number of worker threads to read the data.
            Default: ``None`` , will use the global default number of workers (8). It can be set
            by :func:`mindspore.dataset.config.set_num_parallel_workers` .
1481        shuffle (Union[bool, Shuffle], optional): Whether to shuffle the dataset.
1482            Default: ``Shuffle.GLOBAL`` .
1483            If ``False`` is provided, no shuffling will be performed.
1484            If ``True`` is provided, it is the same as setting to
1485            ``mindspore.dataset.Shuffle.GLOBAL`` .
1486            If Shuffle is provided, the effect is as follows:
1487
1488            - ``Shuffle.GLOBAL`` : Shuffle both the files and samples.
1489            - ``Shuffle.FILES`` : Shuffle files only.
1490
1491        num_shards (int, optional): Number of shards that the dataset will be divided into. Default: ``None`` .
            When this argument is specified, `num_samples` reflects the maximum number of samples per shard.
1493        shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
1494            argument can only be specified when `num_shards` is also specified.
1495        cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
1496            `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
1497            Default: ``None`` , which means no cache is used.
1498
1499    Raises:
1500        RuntimeError: If `dataset_dir` does not contain data files.
        ValueError: If `num_parallel_workers` exceeds the maximum number of threads.
1502        RuntimeError: If `num_shards` is specified but `shard_id` is None.
1503        RuntimeError: If `shard_id` is specified but `num_shards` is None.
1504        ValueError: If `shard_id` is not in range of [0, `num_shards` ).
1505
1506    Tutorial Examples:
1507        - `Load & Process Data With Dataset Pipeline
1508          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/dataset_gallery.html>`_
1509
1510    Examples:
1511        >>> import mindspore.dataset as ds
1512        >>> squad_dataset_dir = "/path/to/squad_dataset_file"
1513        >>> dataset = ds.SQuADDataset(dataset_dir=squad_dataset_dir, usage='all')
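        >>>
        >>> # A minimal sketch, assuming valid SQuAD files at the placeholder path above:
        >>> # each row pairs a context and question with the answer text and its uint32
        >>> # start offset inside the context.
        >>> for item in dataset.take(3).create_dict_iterator(output_numpy=True, num_epochs=1):
        ...     context, question = item["context"], item["question"]
        ...     answer, start = item["text"], item["answer_start"]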
1514
1515    About SQuAD dataset:
1516
1517    SQuAD (Stanford Question Answering Dataset) is a reading comprehension dataset, consisting of questions posed by
1518    crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span,
1519    from the corresponding reading passage, or the question might be unanswerable.
1520
1521    SQuAD 1.1, the previous version of the SQuAD dataset, contains 100,000+ question-answer pairs on 500+ articles.
1522    SQuAD 2.0 combines the 100,000 questions in SQuAD 1.1 with over 50,000 unanswerable questions written adversarially
1523    by crowdworkers to look similar to answerable ones. To do well on SQuAD 2.0, systems must not only answer questions
1524    when possible, but also determine when no answer is supported by the paragraph and abstain from answering.
1525
    You can place the dataset files in the following directory structure and read them with MindSpore's API.
1527
1528    For SQuAD 1.1:
1529
1530    .. code-block::
1531
1532        .
1533        └── SQuAD1
1534             ├── train-v1.1.json
1535             └── dev-v1.1.json
1536
1537    For SQuAD 2.0:
1538
1539    .. code-block::
1540
1541        .
1542        └── SQuAD2
1543             ├── train-v2.0.json
1544             └── dev-v2.0.json
1545
1546    Citation:
1547
1548    .. code-block::
1549
1550        @misc{rajpurkar2016squad,
1551            title         = {SQuAD: 100,000+ Questions for Machine Comprehension of Text},
1552            author        = {Pranav Rajpurkar and Jian Zhang and Konstantin Lopyrev and Percy Liang},
1553            year          = {2016},
1554            eprint        = {1606.05250},
1555            archivePrefix = {arXiv},
1556            primaryClass  = {cs.CL}
1557        }
1558
1559        @misc{rajpurkar2018know,
1560            title         = {Know What You Don't Know: Unanswerable Questions for SQuAD},
1561            author        = {Pranav Rajpurkar and Robin Jia and Percy Liang},
1562            year          = {2018},
1563            eprint        = {1806.03822},
1564            archivePrefix = {arXiv},
1565            primaryClass  = {cs.CL}
1566        }
1567    """
1568
1569    @check_squad_dataset
1570    def __init__(self, dataset_dir, usage=None, num_samples=None, num_parallel_workers=None,
1571                 shuffle=Shuffle.GLOBAL, num_shards=None, shard_id=None, cache=None):
1572        super().__init__(num_parallel_workers=num_parallel_workers, num_samples=num_samples, shuffle=shuffle,
1573                         num_shards=num_shards, shard_id=shard_id, cache=cache)
1574        self.dataset_dir = dataset_dir
1575        self.usage = replace_none(usage, 'all')
1576
1577    def parse(self, children=None):
1578        return cde.SQuADNode(self.dataset_dir, self.usage, self.num_samples, self.shuffle_flag,
1579                             self.num_shards, self.shard_id)
1580
1581
1582class SST2Dataset(SourceDataset, TextBaseDataset):
1583    """
1584    SST2(Stanford Sentiment Treebank v2) dataset.
1585
    The generated dataset has two columns :py:obj:`[sentence, label]` when reading train.tsv or dev.tsv,
    and one column :py:obj:`[sentence]` when reading test.tsv.
    The tensors of columns :py:obj:`sentence` and :py:obj:`label` are of the string type.
1589
1590    Args:
1591        dataset_dir (str): Path to the root directory that contains the dataset.
        usage (str, optional): Usage of this dataset, can be ``"train"``, ``"test"`` or ``"dev"``.
            ``"train"`` will read from 67,349 train samples, ``"test"`` will read from 1,821 test samples,
            ``"dev"`` will read from 872 dev samples. Default: ``None`` , will read train samples.
1595        num_samples (int, optional): The number of samples to be included in the dataset.
1596            Default: ``None`` , will include all text.
1597        num_parallel_workers (int, optional): Number of worker threads to read the data.
            Default: ``None`` , will use the global default number of workers (8). It can be set
            by :func:`mindspore.dataset.config.set_num_parallel_workers` .
1600        shuffle (Union[bool, Shuffle], optional): Perform reshuffling of the data every epoch.
1601            Bool type and Shuffle enum are both supported to pass in.
1602            Default: ``Shuffle.GLOBAL`` .
            If `shuffle` is ``False`` , no shuffling will be performed.
            If `shuffle` is ``True`` , it is equivalent to setting `shuffle` to ``mindspore.dataset.Shuffle.GLOBAL`` .
1605            Set the mode of data shuffling by passing in enumeration variables:
1606
1607            - ``Shuffle.GLOBAL`` : Shuffle the samples.
1608
1609        num_shards (int, optional): Number of shards that the dataset will be divided into. Default: ``None`` .
            When this argument is specified, `num_samples` reflects the maximum number of samples per shard.
1611        shard_id (int, optional): The shard ID within `num_shards`. This argument can only be specified when
1612            `num_shards` is also specified. Default: ``None`` .
1613        cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
1614            `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
1615            Default: ``None`` , which means no cache is used.
1616
1617    Raises:
1618        RuntimeError: If `dataset_dir` does not contain data files.
        ValueError: If `num_parallel_workers` exceeds the maximum number of threads.
        RuntimeError: If `num_shards` is specified but `shard_id` is None.
1621        RuntimeError: If `shard_id` is specified but `num_shards` is None.
1622        ValueError: If `shard_id` is not in range of [0, `num_shards` ).
1623
1624    Tutorial Examples:
1625        - `Load & Process Data With Dataset Pipeline
1626          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/dataset_gallery.html>`_
1627
1628    Examples:
1629        >>> import mindspore.dataset as ds
1630        >>> sst2_dataset_dir = "/path/to/sst2_dataset_directory"
1631        >>>
1632        >>> # 1) Read 3 samples from SST2 dataset
1633        >>> dataset = ds.SST2Dataset(dataset_dir=sst2_dataset_dir, num_samples=3)
1634        >>>
1635        >>> # 2) Read train samples from SST2 dataset
1636        >>> dataset = ds.SST2Dataset(dataset_dir=sst2_dataset_dir, usage="train")
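        >>>
        >>> # 3) A minimal sketch, assuming valid SST2 files at the placeholder path above:
        >>> # batch the sentence/label pairs from the train split read in 2).
        >>> dataset = dataset.batch(32, drop_remainder=True)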
1637
    About SST2 dataset:

    The Stanford Sentiment Treebank is a corpus with fully labeled parse trees that allows for a complete
1640    analysis of the compositional effects of sentiment in language. The corpus is based on the dataset introduced
1641    by Pang and Lee (2005) and consists of 11,855 single sentences extracted from movie reviews. It was parsed
1642    with the Stanford parser and includes a total of 215,154 unique phrases from those parse trees, each
1643    annotated by 3 human judges.
1644
1645    Here is the original SST2 dataset structure.
    You can unzip the dataset files into this directory structure and read them with MindSpore's API.
1647
1648    .. code-block::
1649
1650        .
1651        └── sst2_dataset_dir
1652            ├── train.tsv
1653            ├── test.tsv
1654            ├── dev.tsv
1655            └── original
1656
1657    Citation:
1658
1659    .. code-block::
1660
1661        @inproceedings{socher-etal-2013-recursive,
1662            title     = {Recursive Deep Models for Semantic Compositionality Over a Sentiment Treebank},
1663            author    = {Socher, Richard and Perelygin, Alex and Wu, Jean and Chuang, Jason and Manning,
1664                          Christopher D. and Ng, Andrew and Potts, Christopher},
1665            booktitle = {Proceedings of the 2013 Conference on Empirical Methods in Natural Language Processing},
1666            month     = oct,
1667            year      = {2013},
1668            address   = {Seattle, Washington, USA},
1669            publisher = {Association for Computational Linguistics},
1670            url       = {https://www.aclweb.org/anthology/D13-1170},
1671            pages     = {1631--1642},
1672        }
1673    """
1674
1675    @check_sst2_dataset
1676    def __init__(self, dataset_dir, usage=None, num_samples=None, num_parallel_workers=None, shuffle=Shuffle.GLOBAL,
1677                 num_shards=None, shard_id=None, cache=None):
1678        super().__init__(num_parallel_workers=num_parallel_workers, num_samples=num_samples, shuffle=shuffle,
1679                         num_shards=num_shards, shard_id=shard_id, cache=cache)
1680        self.dataset_dir = dataset_dir
1681        self.usage = replace_none(usage, "train")
1682
1683    def parse(self, children=None):
1684        return cde.SST2Node(self.dataset_dir, self.usage, self.num_samples, self.shuffle_flag,
1685                            self.num_shards, self.shard_id)
1686
1687
1688class TextFileDataset(SourceDataset, TextBaseDataset):
1689    """
1690    A source dataset that reads and parses datasets stored on disk in text format.
1691    The generated dataset has one column :py:obj:`[text]` with type string.
1692
1693    Args:
        dataset_files (Union[str, list[str]]): String or list of files to be read, or glob strings to search for a
            pattern of files. The list will be sorted in lexicographical order.
1696        num_samples (int, optional): The number of samples to be included in the dataset.
            Default: ``None`` , will include all samples.
1698        num_parallel_workers (int, optional): Number of worker threads to read the data.
            Default: ``None`` , will use the global default number of workers (8). It can be set
            by :func:`mindspore.dataset.config.set_num_parallel_workers` .
1701        shuffle (Union[bool, Shuffle], optional): Perform reshuffling of the data every epoch.
1702            Default: ``Shuffle.GLOBAL`` .
1703            Bool type and Shuffle enum are both supported to pass in.
1704            If `shuffle` is ``False`` , no shuffling will be performed.
            If `shuffle` is ``True`` , a global shuffle is performed.
            There are three levels of shuffling; the desired level is set with the
            :class:`mindspore.dataset.Shuffle` enum.
1707
1708            - ``Shuffle.GLOBAL`` : Shuffle both the files and samples, same as setting shuffle to True.
1709
1710            - ``Shuffle.FILES`` : Shuffle files only.
1711
1712        num_shards (int, optional): Number of shards that the dataset will be divided into. Default: ``None`` .
            When this argument is specified, `num_samples` reflects the maximum number of samples per shard.
1714        shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
1715            argument can only be specified when `num_shards` is also specified.
1716        cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
1717            `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
1718            Default: ``None`` , which means no cache is used.
1719
1720    Raises:
        ValueError: If `dataset_files` are not valid or do not exist.
        ValueError: If `num_parallel_workers` exceeds the maximum number of threads.
1723        RuntimeError: If `num_shards` is specified but `shard_id` is None.
1724        RuntimeError: If `shard_id` is specified but `num_shards` is None.
1725        ValueError: If `shard_id` is not in range of [0, `num_shards` ).
1726
1727    Tutorial Examples:
1728        - `Load & Process Data With Dataset Pipeline
1729          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/dataset_gallery.html>`_
1730
1731    Examples:
1732        >>> import mindspore.dataset as ds
1733        >>> text_file_list = ["/path/to/text_file_dataset_file"] # contains 1 or multiple text files
1734        >>> dataset = ds.TextFileDataset(dataset_files=text_file_list)
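        >>>
        >>> # A minimal sketch with a hypothetical glob pattern: read every matching file
        >>> # with shuffle=False so rows keep their on-disk order.
        >>> dataset = ds.TextFileDataset(dataset_files="/path/to/text_files/*.txt", shuffle=False)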
1735    """
1736
1737    @check_textfiledataset
1738    def __init__(self, dataset_files, num_samples=None, num_parallel_workers=None, shuffle=Shuffle.GLOBAL,
1739                 num_shards=None, shard_id=None, cache=None):
1740        super().__init__(num_parallel_workers=num_parallel_workers, num_samples=num_samples, shuffle=shuffle,
1741                         num_shards=num_shards, shard_id=shard_id, cache=cache)
1742        self.dataset_files = self._find_files(dataset_files)
1743        self.dataset_files.sort()
1744
1745    def parse(self, children=None):
1746        return cde.TextFileNode(self.dataset_files, self.num_samples, self.shuffle_flag, self.num_shards,
1747                                self.shard_id)
1748
1749
1750class UDPOSDataset(SourceDataset, TextBaseDataset):
1751    """
1752    UDPOS(Universal Dependencies dataset for Part of Speech) dataset.
1753
1754    The generated dataset has three columns: :py:obj:`[word, universal, stanford]` ,
1755    and the data type of three columns is string.
1756
1757    Args:
1758        dataset_dir (str): Path to the root directory that contains the dataset.
1759        usage (str, optional): Usage of this dataset, can be ``'train'``, ``'test'``, ``'valid'`` or ``'all'``.
1760            ``'train'`` will read from 12,543 train samples, ``'test'`` will read from 2,077 test samples,
            ``'valid'`` will read from 2,002 validation samples, ``'all'`` will read from all 16,622 samples.
1762            Default: ``None`` , all samples.
1763        num_samples (int, optional): Number of samples (rows) to read. Default: ``None`` , reads the full dataset.
1764        shuffle (Union[bool, Shuffle], optional): Perform reshuffling of the data every epoch.
1765            Bool type and Shuffle enum are both supported to pass in.
1766            Default: ``Shuffle.GLOBAL`` .
1767            If `shuffle` is ``False`` , no shuffling will be performed.
1768            If `shuffle` is ``True`` , it is equivalent to setting `shuffle` to
1769            ``mindspore.dataset.Shuffle.GLOBAL`` .
1770            Set the mode of data shuffling by passing in enumeration variables:
1771
1772            - ``Shuffle.GLOBAL`` : Shuffle both the files and samples.
1773
1774            - ``Shuffle.FILES`` : Shuffle files only.
1775
1776        num_shards (int, optional): Number of shards that the dataset will be divided into. Default: ``None`` .
            When this argument is specified, `num_samples` reflects the maximum number of samples per shard.
1778        shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
1779            argument can only be specified when `num_shards` is also specified.
1780        num_parallel_workers (int, optional): Number of worker threads to read the data.
            Default: ``None`` , will use the global default number of workers (8). It can be set
            by :func:`mindspore.dataset.config.set_num_parallel_workers` .
1783        cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
1784            `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
1785            Default: ``None`` , which means no cache is used.
1786
1787    Raises:
1788        RuntimeError: If `dataset_dir` does not contain data files.
1789        RuntimeError: If `num_shards` is specified but `shard_id` is None.
1790        RuntimeError: If `shard_id` is specified but `num_shards` is None.
        ValueError: If `num_parallel_workers` exceeds the maximum number of threads.
1792
1793    Tutorial Examples:
1794        - `Load & Process Data With Dataset Pipeline
1795          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/dataset_gallery.html>`_
1796
1797    Examples:
1798        >>> import mindspore.dataset as ds
1799        >>> udpos_dataset_dir = "/path/to/udpos_dataset_dir"
1800        >>> dataset = ds.UDPOSDataset(dataset_dir=udpos_dataset_dir, usage='all')
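        >>>
        >>> # A minimal sketch, assuming valid UDPOS files at the placeholder path above:
        >>> # each row pairs a word column with its universal and Stanford POS tags.
        >>> for item in dataset.take(3).create_dict_iterator(output_numpy=True, num_epochs=1):
        ...     word, universal, stanford = item["word"], item["universal"], item["stanford"]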
1801
1802    About UDPOS dataset:
1803
    UDPOS is a text corpus annotated to clarify syntactic or semantic sentence structure.
    The corpus comprises 254,830 words and 16,622 sentences taken from various web media, including
    weblogs, newsgroups, emails and reviews.
1807
1808    Citation:
1809
1810    .. code-block::
1811
1812        @inproceedings{silveira14gold,
1813          year = {2014},
1814          author = {Natalia Silveira and Timothy Dozat and Marie-Catherine de Marneffe and Samuel Bowman
1815            and Miriam Connor and John Bauer and Christopher D. Manning},
1816          title = {A Gold Standard Dependency Corpus for {E}nglish},
1817          booktitle = {Proceedings of the Ninth International Conference on Language
1818            Resources and Evaluation (LREC-2014)}
1819        }
1820    """
1821
1822    @check_udpos_dataset
1823    def __init__(self, dataset_dir, usage=None, num_samples=None, shuffle=Shuffle.GLOBAL, num_shards=None,
1824                 shard_id=None, num_parallel_workers=None, cache=None):
1825        super().__init__(num_parallel_workers=num_parallel_workers, num_samples=num_samples, shuffle=shuffle,
1826                         num_shards=num_shards, shard_id=shard_id, cache=cache)
1827        self.dataset_dir = dataset_dir
1828        self.usage = replace_none(usage, 'all')
1829
1830    def parse(self, children=None):
1831        return cde.UDPOSNode(self.dataset_dir, self.usage, self.num_samples, self.shuffle_flag, self.num_shards,
1832                             self.shard_id)
1833
1834
1835class WikiTextDataset(SourceDataset, TextBaseDataset):
1836    """
1837    WikiText2 and WikiText103 datasets.
1838
    The generated dataset has one column :py:obj:`[text]` , and
    the tensor of column :py:obj:`text` is of the string type.
1841
1842    Args:
1843        dataset_dir (str): Path to the root directory that contains the dataset.
1844        usage (str, optional): Acceptable usages include ``'train'``, ``'test'``, ``'valid'`` and ``'all'``.
1845            Default: ``None`` , all samples.
1846        num_samples (int, optional): Number of samples (rows) to read. Default: ``None`` , reads the full dataset.
1847        num_parallel_workers (int, optional): Number of worker threads to read the data.
            Default: ``None`` , will use the global default number of workers (8). It can be set
            by :func:`mindspore.dataset.config.set_num_parallel_workers` .
1850        shuffle (Union[bool, Shuffle], optional): Perform reshuffling of the data every epoch.
1851            Bool type and Shuffle enum are both supported to pass in.
1852            Default: ``Shuffle.GLOBAL`` .
1853            If `shuffle` is ``False`` , no shuffling will be performed.
1854            If `shuffle` is ``True`` , it is equivalent to setting `shuffle` to
1855            ``mindspore.dataset.Shuffle.GLOBAL`` .
1856            Set the mode of data shuffling by passing in enumeration variables:
1857
1858            - ``Shuffle.GLOBAL`` : Shuffle both the files and samples.
1859
1860            - ``Shuffle.FILES`` : Shuffle files only.
1861
1862        num_shards (int, optional): Number of shards that the dataset will be divided into. Default: ``None`` .
            When this argument is specified, `num_samples` reflects the maximum number of samples per shard.
1864        shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
1865            argument can only be specified when `num_shards` is also specified.
1866        cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
1867            `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
1868            Default: ``None`` , which means no cache is used.
1869
1870    Raises:
        RuntimeError: If `dataset_dir` does not contain data files or is invalid.
1872        RuntimeError: If `num_shards` is specified but `shard_id` is None.
1873        RuntimeError: If `shard_id` is specified but `num_shards` is None.
1874        ValueError: If `shard_id` is not in range of [0, `num_shards` ).
1875        ValueError: If `num_samples` is invalid (< 0).
        ValueError: If `num_parallel_workers` exceeds the maximum number of threads.
1877
1878    Tutorial Examples:
1879        - `Load & Process Data With Dataset Pipeline
1880          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/dataset_gallery.html>`_
1881
    About WikiText dataset:
1883
    The WikiText Long Term Dependency Language Modeling Dataset is an English corpus containing over 100 million
    words, drawn from Wikipedia's verified Good and Featured articles. It is released in two versions, WikiText2
    and WikiText103. WikiText2 has 36,718 lines in wiki.train.tokens, 4,358 lines in wiki.test.tokens and 3,760
    lines in wiki.valid.tokens. WikiText103 has 1,801,350 lines in wiki.train.tokens, 4,358 lines in
    wiki.test.tokens and 3,760 lines in wiki.valid.tokens.
1889
1890    Here is the original WikiText dataset structure.
1891    You can unzip the dataset files into this directory structure and read by MindSpore's API.
1892
1893    .. code-block::
1894
1895        .
1896        └── WikiText2/WikiText103
1897             ├── wiki.train.tokens
1898             ├── wiki.test.tokens
             └── wiki.valid.tokens
1900
1901    Citation:
1902
1903    .. code-block::
1904
1905        @article{merity2016pointer,
1906          title={Pointer sentinel mixture models},
1907          author={Merity, Stephen and Xiong, Caiming and Bradbury, James and Socher, Richard},
1908          journal={arXiv preprint arXiv:1609.07843},
1909          year={2016}
1910        }
1911
1912    Examples:
1913        >>> import mindspore.dataset as ds
1914        >>> wiki_text_dataset_dir = "/path/to/wiki_text_dataset_directory"
1915        >>> dataset = ds.WikiTextDataset(dataset_dir=wiki_text_dataset_dir, usage='all')
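        >>>
        >>> # A minimal sketch, assuming valid WikiText files at the placeholder path above:
        >>> # query how many lines the chosen split will yield.
        >>> size = dataset.get_dataset_size()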
1916    """
1917
1918    @check_wiki_text_dataset
1919    def __init__(self, dataset_dir, usage=None, num_samples=None, num_parallel_workers=None, shuffle=Shuffle.GLOBAL,
1920                 num_shards=None, shard_id=None, cache=None):
1921        super().__init__(num_parallel_workers=num_parallel_workers, num_samples=num_samples, shuffle=shuffle,
1922                         num_shards=num_shards, shard_id=shard_id, cache=cache)
1923        self.dataset_dir = dataset_dir
1924        self.usage = replace_none(usage, "all")
1925
1926    def parse(self, children=None):
1927        return cde.WikiTextNode(self.dataset_dir, self.usage, self.num_samples, self.shuffle_flag, self.num_shards,
1928                                self.shard_id)
1929
1930
1931class YahooAnswersDataset(SourceDataset, TextBaseDataset):
1932    """
1933    YahooAnswers dataset.
1934
1935    The generated dataset has four columns :py:obj:`[class, title, content, answer]` , whose data type is string.
1936
1937    Args:
1938        dataset_dir (str): Path to the root directory that contains the dataset.
1939        usage (str, optional): Usage of this dataset, can be ``'train'`` , ``'test'`` or ``'all'`` .
1940            ``'train'`` will read from 1,400,000 train samples, ``'test'`` will read from 60,000 test
1941            samples, ``'all'`` will read from all 1,460,000 samples. Default: ``None`` , all samples.
1942        num_samples (int, optional): The number of samples to be included in the dataset.
1943            Default: ``None`` , will include all text.
1944        num_parallel_workers (int, optional): Number of worker threads to read the data.
            Default: ``None`` , will use the global default number of workers (8). It can be set
            by :func:`mindspore.dataset.config.set_num_parallel_workers` .
1947        shuffle (Union[bool, Shuffle], optional): Perform reshuffling of the data every epoch.
1948            Bool type and Shuffle enum are both supported to pass in.
1949            Default: ``Shuffle.GLOBAL`` .
1950            If `shuffle` is ``False`` , no shuffling will be performed.
1951            If `shuffle` is ``True`` , it is equivalent to setting `shuffle` to
1952            ``mindspore.dataset.Shuffle.GLOBAL`` .
1953            Set the mode of data shuffling by passing in enumeration variables:
1954
1955            - ``Shuffle.GLOBAL`` : Shuffle both the files and samples.
1956
1957            - ``Shuffle.FILES`` : Shuffle files only.
1958
1959        num_shards (int, optional): Number of shards that the dataset will be divided into. Default: ``None`` .
            When this argument is specified, `num_samples` reflects the maximum number of samples per shard.
        shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
            argument can only be specified when `num_shards` is also specified.
        cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
            `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
            Default: ``None`` , which means no cache is used.

    Raises:
        RuntimeError: If `dataset_dir` does not contain data files.
        RuntimeError: If `num_shards` is specified but `shard_id` is None.
        RuntimeError: If `shard_id` is specified but `num_shards` is None.
        ValueError: If `shard_id` is not in range of [0, `num_shards` ).
        ValueError: If `num_parallel_workers` exceeds the maximum number of threads.

    Tutorial Examples:
        - `Load & Process Data With Dataset Pipeline
          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/dataset_gallery.html>`_

    Examples:
        >>> import mindspore.dataset as ds
        >>> yahoo_answers_dataset_dir = "/path/to/yahoo_answers_dataset_directory"
        >>>
        >>> # 1) Read 3 samples from YahooAnswers dataset
        >>> dataset = ds.YahooAnswersDataset(dataset_dir=yahoo_answers_dataset_dir, num_samples=3)
        >>>
        >>> # 2) Read train samples from YahooAnswers dataset
        >>> dataset = ds.YahooAnswersDataset(dataset_dir=yahoo_answers_dataset_dir, usage="train")
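        >>>
        >>> # 3) A sketch of sharded reading with the num_shards/shard_id
        >>> # parameters documented above (assuming 2 shards; shard 0 here
        >>> # reads a disjoint subset of the train samples)
        >>> dataset = ds.YahooAnswersDataset(dataset_dir=yahoo_answers_dataset_dir,
        ...                                  usage="train", num_shards=2, shard_id=0)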

    About YahooAnswers dataset:

    The YahooAnswers dataset consists of 1,460,000 text samples in 10 classes.
    There are 1,400,000 samples in train.csv and 60,000 samples in test.csv.
    The 10 different classes represent Society & Culture, Science & Mathematics, Health, Education & Reference,
    Computers & Internet, Sports, Business & Finance, Entertainment & Music, Family & Relationships,
    Politics & Government.

    Here is the original YahooAnswers dataset structure.
    You can unzip the dataset files into this directory structure and read by MindSpore's API.

    .. code-block::

        .
        └── yahoo_answers_dataset_dir
            ├── train.csv
            ├── test.csv
            ├── classes.txt
            └── readme.txt

    Citation:

    .. code-block::

        @article{YahooAnswers,
        title   = {Yahoo! Answers Topic Classification Dataset},
        author  = {Xiang Zhang},
        year    = {2015},
        howpublished = {}
        }
    """

    @check_yahoo_answers_dataset
    def __init__(self, dataset_dir, usage=None, num_samples=None, num_parallel_workers=None, shuffle=Shuffle.GLOBAL,
                 num_shards=None, shard_id=None, cache=None):
        super().__init__(num_parallel_workers=num_parallel_workers, num_samples=num_samples, shuffle=shuffle,
                         num_shards=num_shards, shard_id=shard_id, cache=cache)
        self.dataset_dir = dataset_dir
        self.usage = replace_none(usage, "all")

    def parse(self, children=None):
        return cde.YahooAnswersNode(self.dataset_dir, self.usage, self.num_samples, self.shuffle_flag,
                                    self.num_shards, self.shard_id)


class YelpReviewDataset(SourceDataset, TextBaseDataset):
    """
    Yelp Review Polarity and Yelp Review Full datasets.

    The generated dataset has two columns: :py:obj:`[label, text]` , and the data type of two columns is string.

    Args:
        dataset_dir (str): Path to the root directory that contains the dataset.
        usage (str, optional): Usage of this dataset, can be ``'train'`` , ``'test'`` or ``'all'`` .
            For Polarity, ``'train'`` will read from 560,000 train samples,
            ``'test'`` will read from 38,000 test samples,
            ``'all'`` will read from all 598,000 samples.
            For Full, ``'train'`` will read from 650,000 train samples, ``'test'`` will read from 50,000 test samples,
            ``'all'`` will read from all 700,000 samples. Default: ``None`` , all samples.
        num_samples (int, optional): Number of samples (rows) to read. Default: ``None`` , reads all samples.
        shuffle (Union[bool, Shuffle], optional): Perform reshuffling of the data every epoch.
            Both bool and Shuffle enum values can be passed in.
            Default: ``Shuffle.GLOBAL`` .
            If `shuffle` is ``False`` , no shuffling will be performed.
            If `shuffle` is ``True`` , it is equivalent to setting `shuffle` to
            ``mindspore.dataset.Shuffle.GLOBAL`` .
            Set the mode of data shuffling by passing in enumeration variables:

            - ``Shuffle.GLOBAL`` : Shuffle both the files and samples.

            - ``Shuffle.FILES`` : Shuffle files only.

        num_shards (int, optional): Number of shards that the dataset will be divided into. Default: ``None`` .
            When this argument is specified, `num_samples` reflects the maximum number of samples per shard.
        shard_id (int, optional): The shard ID within `num_shards` . Default: ``None`` . This
            argument can only be specified when `num_shards` is also specified.
        num_parallel_workers (int, optional): Number of worker threads to read the data.
            Default: ``None`` , will use the global default number of workers (8); it can be set
            by :func:`mindspore.dataset.config.set_num_parallel_workers` .
        cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. More details:
            `Single-Node Data Cache <https://www.mindspore.cn/tutorials/experts/en/master/dataset/cache.html>`_ .
            Default: ``None`` , which means no cache is used.

    Raises:
        RuntimeError: If `dataset_dir` does not contain data files.
        RuntimeError: If `num_shards` is specified but `shard_id` is None.
        RuntimeError: If `shard_id` is specified but `num_shards` is None.
        ValueError: If `num_parallel_workers` exceeds the maximum number of threads.

    Tutorial Examples:
        - `Load & Process Data With Dataset Pipeline
          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/dataset_gallery.html>`_

    Examples:
        >>> import mindspore.dataset as ds
        >>> yelp_review_dataset_dir = "/path/to/yelp_review_dataset_dir"
        >>> dataset = ds.YelpReviewDataset(dataset_dir=yelp_review_dataset_dir, usage='all')
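        >>>
        >>> # A sketch of the shuffle modes documented above: Shuffle.FILES
        >>> # randomizes file order only, keeping sample order within each file
        >>> dataset = ds.YelpReviewDataset(dataset_dir=yelp_review_dataset_dir,
        ...                                usage='test', shuffle=ds.Shuffle.FILES)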

    About YelpReview Dataset:

    The Yelp Review Full dataset consists of reviews from Yelp. It is extracted from the Yelp Dataset Challenge 2015
    data, and it is mainly used for text classification.

    The Yelp Review Polarity dataset is constructed from the above dataset, by considering stars 1 and 2 negative,
    and 3 and 4 positive.

    The directory structures of these two datasets are the same.
    You can unzip the dataset files into the following structure and read by MindSpore's API:

    .. code-block::

        .
        └── yelp_review_dir
             ├── train.csv
             ├── test.csv
             └── readme.txt

    Citation:

    For both Yelp Review Polarity and Yelp Review Full:

    .. code-block::

        @article{zhangCharacterlevelConvolutionalNetworks2015,
          archivePrefix = {arXiv},
          eprinttype = {arxiv},
          eprint = {1509.01626},
          primaryClass = {cs},
          title = {Character-Level {{Convolutional Networks}} for {{Text Classification}}},
          abstract = {This article offers an empirical exploration on the use of character-level convolutional networks
                      (ConvNets) for text classification. We constructed several large-scale datasets to show that
                      character-level convolutional networks could achieve state-of-the-art or competitive results.
                      Comparisons are offered against traditional models such as bag of words, n-grams and their TFIDF
                      variants, and deep learning models such as word-based ConvNets and recurrent neural networks.},
          journal = {arXiv:1509.01626 [cs]},
          author = {Zhang, Xiang and Zhao, Junbo and LeCun, Yann},
          month = sep,
          year = {2015},
        }
    """

    @check_yelp_review_dataset
    def __init__(self, dataset_dir, usage=None, num_samples=None, shuffle=Shuffle.GLOBAL, num_shards=None,
                 shard_id=None, num_parallel_workers=None, cache=None):
        super().__init__(num_parallel_workers=num_parallel_workers, num_samples=num_samples, shuffle=shuffle,
                         num_shards=num_shards, shard_id=shard_id, cache=cache)
        self.dataset_dir = dataset_dir
        self.usage = replace_none(usage, 'all')

    def parse(self, children=None):
        return cde.YelpReviewNode(self.dataset_dir, self.usage, self.num_samples, self.shuffle_flag,
                                  self.num_shards, self.shard_id)