# Copyright 2020-2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
The module text.transforms is backed by _c_dataengine and is implemented
in C++ on top of ICU4C and cppjieba.
It is a high-performance module for processing NLP text.
Users can use Vocab to build their own dictionary,
use appropriate tokenizers to split sentences into different tokens,
and use Lookup to find the index of tokens in Vocab.

.. Note::
    The constructor arguments of every class in this module must be saved into the
    class attributes (self.xxx) to support save() and load(), as illustrated in the
    sketch after the TextTensorOperation base class below.

Examples:
    >>> import mindspore.dataset as ds
    >>> import mindspore.dataset.text as text
    >>>
    >>> # Create a dataset for text sentences saved as line data in a file
    >>> text_file_list = ["/path/to/text_file_dataset_file"] # contains one or multiple text files
    >>> text_file_dataset = ds.TextFileDataset(dataset_files=text_file_list, shuffle=False)
    >>>
    >>> # Tokenize sentences to Unicode characters
    >>> tokenizer = text.UnicodeCharTokenizer()
    >>> # Load vocabulary from list
    >>> vocab = text.Vocab.from_list(word_list=['深', '圳', '欢', '迎', '您'])
    >>> # Use Lookup operation to map tokens to ids
    >>> lookup = text.Lookup(vocab=vocab)
    >>> text_file_dataset = text_file_dataset.map(operations=[tokenizer, lookup])
    >>> # if the text line in dataset_file is:
    >>> # 深圳欢迎您
    >>> # then the output will be:
    >>> # {'text': array([0, 1, 2, 3, 4], dtype=int32)}
"""
import json
import os
import re
import platform
import numpy as np

import mindspore._c_dataengine as cde
from mindspore.common import dtype as mstype

from .utils import JiebaMode, NormalizeForm, to_str, SPieceTokenizerOutType, SPieceTokenizerLoadType, SentencePieceVocab
from .validators import check_add_token, check_lookup, check_jieba_add_dict, check_to_vectors, \
    check_jieba_add_word, check_jieba_init, check_with_offsets, check_unicode_script_tokenizer, \
    check_wordpiece_tokenizer, check_regex_replace, check_regex_tokenizer, check_basic_tokenizer, check_ngram, \
    check_pair_truncate, check_to_number, check_bert_tokenizer, check_python_tokenizer, check_slidingwindow, \
    check_sentence_piece_tokenizer, check_truncate
from ..core.datatypes import mstype_to_detype
from ..core.validator_helpers import replace_none
from ..transforms.py_transforms_util import Implementation
from ..transforms.transforms import TensorOperation
from ..transforms.validators import invalidate_callable


class TextTensorOperation(TensorOperation):
    """
    Base class of Text Tensor Ops
    """

    def __init__(self):
        super().__init__()
        self.implementation = Implementation.C

    def parse(self):
        raise NotImplementedError("TextTensorOperation has to implement parse() method.")

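# A minimal sketch (illustrative, not part of the module) of the convention noted in
# the module docstring: a text operation stores every constructor argument on `self`
# so that save() and load() can rebuild it, and parse() maps the Python object to its
# C++ counterpart. The class name is hypothetical; the pattern mirrors Truncate below.
#
#     class MyTruncate(TextTensorOperation):
#         def __init__(self, max_seq_len):
#             super().__init__()
#             self.max_seq_len = max_seq_len      # saved as a class attribute (self.xxx)
#
#         def parse(self):
#             return cde.TruncateOperation(self.max_seq_len)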

DE_C_INTER_JIEBA_MODE = {
    JiebaMode.MIX: cde.JiebaMode.DE_JIEBA_MIX,
    JiebaMode.MP: cde.JiebaMode.DE_JIEBA_MP,
    JiebaMode.HMM: cde.JiebaMode.DE_JIEBA_HMM
}

DE_C_INTER_SENTENCEPIECE_LOADTYPE = {
    SPieceTokenizerLoadType.FILE: cde.SPieceTokenizerLoadType.DE_SPIECE_TOKENIZER_LOAD_KFILE,
    SPieceTokenizerLoadType.MODEL: cde.SPieceTokenizerLoadType.DE_SPIECE_TOKENIZER_LOAD_KMODEL
}

DE_C_INTER_SENTENCEPIECE_OUTTYPE = {
    SPieceTokenizerOutType.STRING: cde.SPieceTokenizerOutType.DE_SPIECE_TOKENIZER_OUTTYPE_KString,
    SPieceTokenizerOutType.INT: cde.SPieceTokenizerOutType.DE_SPIECE_TOKENIZER_OUTTYPE_KINT
}


class AddToken(TextTensorOperation):
    """
    Add a token to the beginning or end of the sequence.

    Args:
        token (str): The token to be added.
        begin (bool, optional): Choose the position where the token is inserted. If True,
            the token will be inserted at the beginning of the sequence. Otherwise, it will
            be inserted at the end of the sequence. Default: ``True``.

    Raises:
        TypeError: If `token` is not of type string.
        TypeError: If `begin` is not of type bool.

    Supported Platforms:
        ``CPU``

    Examples:
        >>> import mindspore.dataset as ds
        >>> import mindspore.dataset.text as text
        >>>
        >>> # Use the transform in dataset pipeline mode
        >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=[['a', 'b', 'c', 'd', 'e']], column_names=["text"])
        >>> # Data before
        >>> # |           text            |
        >>> # +---------------------------+
        >>> # | ['a', 'b', 'c', 'd', 'e'] |
        >>> # +---------------------------+
        >>> add_token_op = text.AddToken(token='TOKEN', begin=True)
        >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=add_token_op)
        >>> for item in numpy_slices_dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        ...     print(item["text"])
        ['TOKEN' 'a' 'b' 'c' 'd' 'e']
        >>> # Data after
        >>> # |                text                |
        >>> # +------------------------------------+
        >>> # | ['TOKEN', 'a', 'b', 'c', 'd', 'e'] |
        >>> # +------------------------------------+
        >>>
        >>> # Use the transform in eager mode
        >>> data = ["happy", "birthday", "to", "you"]
        >>> output = text.AddToken(token='TOKEN', begin=True)(data)
        >>> print(output)
        ['TOKEN' 'happy' 'birthday' 'to' 'you']

    Tutorial Examples:
        - `Illustration of text transforms
          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/text_gallery.html>`_
    """

    @check_add_token
    def __init__(self, token, begin=True):
        super().__init__()
        self.token = token
        self.begin = begin

    def parse(self):
        return cde.AddTokenOperation(self.token, self.begin)


class JiebaTokenizer(TextTensorOperation):
    """
    Use Jieba tokenizer to tokenize Chinese strings.

    Note:
        The dictionary files used by Hidden Markov Model segment and Max Probability segment can be
        obtained through the `cppjieba GitHub <https://github.com/yanyiwu/cppjieba/tree/master/dict>`_ .
        Please ensure the validity and integrity of these files.

    Args:
        hmm_path (str): Path to the dictionary file used by Hidden Markov Model segment.
        mp_path (str): Path to the dictionary file used by Max Probability segment.
        mode (JiebaMode, optional): The desired segment algorithms. See :class:`~.text.JiebaMode`
            for details on optional values. Default: ``JiebaMode.MIX`` .
        with_offsets (bool, optional): Whether to output the start and end offsets of each
            token in the original string. Default: ``False`` .

    Raises:
        TypeError: If `hmm_path` is not of type str.
        TypeError: If `mp_path` is not of type str.
        TypeError: If `mode` is not of type :class:`~.text.JiebaMode` .
        TypeError: If `with_offsets` is not of type bool.

    Supported Platforms:
        ``CPU``

    Examples:
        >>> import mindspore.dataset as ds
        >>> import mindspore.dataset.text as text
        >>> from mindspore.dataset.text import JiebaMode
        >>>
        >>> # Use the transform in dataset pipeline mode
        >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=["床前明月光"], column_names=["text"])
        >>>
        >>> # 1) If with_offsets=False, return one data column {["text", dtype=str]}
        >>> # The jieba_hmm_file and jieba_mp_file can be downloaded directly from the mindspore repository.
        >>> # Refer to https://gitee.com/mindspore/mindspore/blob/master/tests/ut/data/dataset/jiebadict/hmm_model.utf8
        >>> # and https://gitee.com/mindspore/mindspore/blob/master/tests/ut/data/dataset/jiebadict/jieba.dict.utf8
        >>> jieba_hmm_file = "tests/ut/data/dataset/jiebadict/hmm_model.utf8"
        >>> jieba_mp_file = "tests/ut/data/dataset/jiebadict/jieba.dict.utf8"
        >>> tokenizer_op = text.JiebaTokenizer(jieba_hmm_file, jieba_mp_file, mode=JiebaMode.MP, with_offsets=False)
        >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=tokenizer_op)
        >>> for item in numpy_slices_dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        ...     print(item["text"])
        ['床' '前' '明月光']
        >>>
        >>> # 2) If with_offsets=True, return three columns {["token", dtype=str], ["offsets_start", dtype=uint32],
        >>> #                                                ["offsets_limit", dtype=uint32]}
        >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=["床前明月光"], column_names=["text"])
        >>> tokenizer_op = text.JiebaTokenizer(jieba_hmm_file, jieba_mp_file, mode=JiebaMode.MP, with_offsets=True)
        >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=tokenizer_op, input_columns=["text"],
        ...                                                 output_columns=["token", "offsets_start", "offsets_limit"])
        >>> for item in numpy_slices_dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        ...     print(item["token"], item["offsets_start"], item["offsets_limit"])
        ['床' '前' '明月光'] [0 3 6] [ 3  6 15]
        >>>
        >>> # Use the transform in eager mode
        >>> data = "床前明月光"
        >>> output = text.JiebaTokenizer(jieba_hmm_file, jieba_mp_file, mode=JiebaMode.MP)(data)
        >>> print(output)
        ['床' '前' '明月光']

    Tutorial Examples:
        - `Illustration of text transforms
          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/text_gallery.html>`_
    """

    @check_jieba_init
    def __init__(self, hmm_path, mp_path, mode=JiebaMode.MIX, with_offsets=False):
        super().__init__()
        if not isinstance(mode, JiebaMode):
            raise TypeError("Wrong input type for mode, should be JiebaMode.")

        self.mode = mode
        self.__check_path__(hmm_path)
        self.hmm_path = hmm_path
        self.__check_path__(mp_path)
        self.mp_path = mp_path
        self.with_offsets = with_offsets
        self.words = []

    def __check_path__(self, model_path):
        """check model path"""
        if not os.path.exists(os.path.realpath(model_path)):
            raise ValueError(
                "jieba model file {} does not exist.".format(model_path))

    def parse(self):
        jieba_tokenizer = cde.JiebaTokenizerOperation(self.hmm_path, self.mp_path,
                                                      DE_C_INTER_JIEBA_MODE.get(self.mode),
                                                      self.with_offsets)
        for word in self.words:
            jieba_tokenizer.add_word(word[0], word[1])
        return jieba_tokenizer

    @invalidate_callable
    @check_jieba_add_word
    def add_word(self, word, freq=None):
        """
        Add a specified word mapping to the Vocab of the tokenizer.

        Args:
            word (str): The word to be added to the Vocab.
            freq (int, optional): The frequency of the word to be added. The higher the word frequency,
                the greater the chance that the word will be cut out as a single token. Default: ``None``,
                using the default word frequency.

        Examples:
            >>> import mindspore.dataset as ds
            >>> import mindspore.dataset.text as text
            >>> from mindspore.dataset.text import JiebaMode
            >>>
            >>> jieba_hmm_file = "/path/to/jieba/hmm/file"
            >>> jieba_mp_file = "/path/to/jieba/mp/file"
            >>> jieba_op = text.JiebaTokenizer(jieba_hmm_file, jieba_mp_file, mode=JiebaMode.MP)
            >>> sentence_piece_vocab_file = "/path/to/sentence/piece/vocab/file"
            >>> with open(sentence_piece_vocab_file, 'r') as f:
            ...     for line in f:
            ...         word = line.split(',')[0]
            ...         jieba_op.add_word(word)
            >>>
            >>> text_file_list = ["/path/to/text_file_dataset_file"]
            >>> text_file_dataset = ds.TextFileDataset(dataset_files=text_file_list)
            >>> text_file_dataset = text_file_dataset.map(operations=jieba_op, input_columns=["text"])
        """

        if freq is None:
            self.words.append((word, 0))
        else:
            self.words.append((word, freq))

    @invalidate_callable
    @check_jieba_add_dict
    def add_dict(self, user_dict):
        """
        Add the specified word mappings to the Vocab of the tokenizer.

        Args:
            user_dict (Union[str, dict[str, int]]): The word mappings to be added to the Vocab.
                If the input type is str, it means the path of the file storing the word mappings to be added.
                Each line of the file should contain two fields separated by a space, where the first field
                indicates the word itself and the second field should be a number indicating the word frequency.
                Invalid lines will be ignored and no error or warning will be returned.
                If the input type is dict[str, int], it means the dictionary storing the word mappings to be added,
                where the key name is the word itself and the key value is the word frequency.

        Examples:
            >>> import mindspore.dataset as ds
            >>> import mindspore.dataset.text as text
            >>> from mindspore.dataset.text import JiebaMode
            >>>
            >>> jieba_hmm_file = "/path/to/jieba/hmm/file"
            >>> jieba_mp_file = "/path/to/jieba/mp/file"
            >>> user_dict = {"男默女泪": 10}
            >>> jieba_op = text.JiebaTokenizer(jieba_hmm_file, jieba_mp_file, mode=JiebaMode.MP)
            >>> jieba_op.add_dict(user_dict)
            >>>
            >>> text_file_list = ["/path/to/text_file_dataset_file"]
            >>> text_file_dataset = ds.TextFileDataset(dataset_files=text_file_list)
            >>> text_file_dataset = text_file_dataset.map(operations=jieba_op, input_columns=["text"])
        """

        if isinstance(user_dict, str):
            self.__add_dict_py_file(user_dict)
        elif isinstance(user_dict, dict):
            for k, v in user_dict.items():
                self.add_word(k, v)
        else:
            raise TypeError("The type of user_dict must be str or dict.")

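    # A minimal sketch (file name and contents are placeholders) of the user dict file
    # format accepted by add_dict() when a path is passed: one word per line, optionally
    # followed by an integer frequency, separated by whitespace; lines that do not match
    # this pattern are silently skipped.
    #
    #     user_dict.txt:
    #         深圳欢迎您 10
    #         男默女泪
    #
    #     jieba_op.add_dict("user_dict.txt")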
    def __add_dict_py_file(self, file_path):
        """Add user-defined words from a file."""
        words_list = self.__parser_file(file_path)
        for data in words_list:
            if data[1] is None:
                freq = 0
            else:
                freq = int(data[1])
            self.add_word(data[0], freq)

    def __decode(self, data):
        """Decode the dict file content to UTF-8."""
        try:
            data = data.decode('utf-8')
        except UnicodeDecodeError:
            raise ValueError("user dict file must be in utf8 format.")
        return data.lstrip('\ufeff')

    def __parser_file(self, file_path):
        """Parse user-defined words from a file."""
        if not os.path.exists(file_path):
            raise ValueError(
                "user dict file {} does not exist.".format(file_path))
        real_file_path = os.path.realpath(file_path)
        data_re = re.compile('^\\s*([^\\s*]+?)\\s*([0-9]+)?\\s*$', re.U)
        words_list = []
        # Read the dict file line by line and keep only lines that match the
        # "<word> [frequency]" pattern.
        with open(real_file_path, "r") as file_dict:
            for item in file_dict:
                data = item.strip()
                if not isinstance(data, str):
                    data = self.__decode(data)
                tmp = data_re.match(data)
                if not tmp:
                    continue
                words = tmp.groups()
                words_list.append(words)
        return words_list


class Lookup(TextTensorOperation):
    """
    Look up a word and return its id according to the input vocabulary table.

    Args:
        vocab (Vocab): A vocabulary object.
        unknown_token (str, optional): Word used when the target word is out of vocabulary (OOV).
            An OOV word will be mapped to the id of `unknown_token` instead. If `unknown_token`
            is not specified, or it is itself OOV, a runtime error will be raised when an OOV word
            is looked up. Default: ``None``, means no unknown_token is specified.
        data_type (mindspore.dtype, optional): The data type that lookup operation maps
            string to. Default: ``mstype.int32``.

    Raises:
        TypeError: If `vocab` is not of type text.Vocab.
        TypeError: If `unknown_token` is not of type string.
        TypeError: If `data_type` is not of type mindspore.dtype.

    Supported Platforms:
        ``CPU``

    Examples:
        >>> import mindspore.dataset as ds
        >>> import mindspore.dataset.text as text
        >>>
        >>> # Use the transform in dataset pipeline mode
        >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=["with"], column_names=["text"])
        >>> # Load vocabulary from list
        >>> vocab = text.Vocab.from_list(["?", "##", "with", "the", "test", "符号"])
        >>> # Use Lookup operation to map tokens to ids
        >>> lookup = text.Lookup(vocab)
        >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=[lookup])
        >>> for item in numpy_slices_dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        ...     print(item["text"])
        2
        >>>
        >>> # Use the transform in eager mode
        >>> vocab = text.Vocab.from_list(["?", "##", "with", "the", "test", "符号"])
        >>> data = "with"
        >>> output = text.Lookup(vocab=vocab, unknown_token="test")(data)
        >>> print(output)
        2

    Tutorial Examples:
        - `Illustration of text transforms
          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/text_gallery.html>`_
    """

    @check_lookup
    def __init__(self, vocab, unknown_token=None, data_type=mstype.int32):
        super().__init__()
        self.vocab = vocab
        self.unknown_token = unknown_token
        self.data_type = data_type

    def parse(self):
        return cde.LookupOperation(self.vocab.c_vocab, self.unknown_token, str(mstype_to_detype(self.data_type)))

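# A brief sketch (illustrative values) of how Lookup handles out-of-vocabulary tokens:
# with `unknown_token` set, an OOV word is mapped to the id of that token; without it,
# looking up an OOV word raises a runtime error.
#
#     vocab = text.Vocab.from_list(["<unk>", "deep", "learning"])
#     text.Lookup(vocab, unknown_token="<unk>")("graph")   # -> 0, the id of "<unk>"
#     text.Lookup(vocab)("graph")                          # -> RuntimeError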

class Ngram(TextTensorOperation):
    """
    Generate n-gram from a 1-D string Tensor.

    Refer to `N-gram <https://en.wikipedia.org/wiki/N-gram#Examples>`_
    for an overview of what n-gram is and how it works.

    Args:
        n (list[int]): n in n-gram, which is a list of positive integers. For example, if n=[4, 3], then the result
            would be a 4-gram followed by a 3-gram in the same tensor. If the number of words is not enough to make up
            an n-gram, an empty string will be returned. For example, applying 3-gram to ["mindspore", "best"] will
            produce an empty string.
        left_pad (tuple, optional): Padding performed on left side of the sequence shaped like ("pad_token", pad_width).
            `pad_width` will be capped at n-1. For example, specifying left_pad=("_", 2) would pad left side of the
            sequence with "__". Default: ``('', 0)``.
        right_pad (tuple, optional): Padding performed on right side of the sequence shaped like
            ("pad_token", pad_width). `pad_width` will be capped at n-1. For example, specifying right_pad=("_", 2)
            would pad right side of the sequence with "__". Default: ``('', 0)``.
        separator (str, optional): Symbol used to join strings together. For example, if 2-gram is
            ["mindspore", "amazing"] with separator ``"-"``, the result would be ["mindspore-amazing"].
            Default: ``' '``, which will use whitespace as separator.

    Raises:
        TypeError: If values of `n` are not of type int.
        ValueError: If values of `n` are not positive.
        ValueError: If `left_pad` is not a tuple of length 2.
        ValueError: If `right_pad` is not a tuple of length 2.
        TypeError: If `separator` is not of type string.

    Supported Platforms:
        ``CPU``

    Examples:
        >>> import numpy as np
        >>> import mindspore.dataset as ds
        >>> import mindspore.dataset.text as text
        >>>
        >>> # Use the transform in dataset pipeline mode
        >>> def gen(texts):
        ...     for line in texts:
        ...         yield(np.array(line.split(" "), dtype=str),)
        >>> data = ["WildRose Country", "Canada's Ocean Playground", "Land of Living Skies"]
        >>> generator_dataset = ds.GeneratorDataset(gen(data), ["text"])
        >>> ngram_op = text.Ngram(3, separator="-")
        >>> generator_dataset = generator_dataset.map(operations=ngram_op)
        >>> for item in generator_dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        ...     print(item["text"])
        ...     break
        ['']
        >>>
        >>> # Use the transform in eager mode
        >>> output = ngram_op(data)
        >>> print(output)
        ["WildRose Country-Canada's Ocean Playground-Land of Living Skies"]

    Tutorial Examples:
        - `Illustration of text transforms
          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/text_gallery.html>`_
    """

    @check_ngram
    def __init__(self, n, left_pad=("", 0), right_pad=("", 0), separator=" "):
        super().__init__()
        self.ngrams = n
        self.left_pad = left_pad
        self.right_pad = right_pad
        self.separator = separator

    def parse(self):
        return cde.NgramOperation(self.ngrams, self.left_pad, self.right_pad, self.separator)

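# A short sketch (illustrative values) of how left_pad interacts with n: pad_width is
# capped at n - 1, and pad tokens are joined with the separator like regular tokens.
#
#     ngram_op = text.Ngram([3], left_pad=("_", 2), separator=" ")
#     ngram_op(["WildRose", "Country"])
#     # yields the 3-grams '_ _ WildRose' and '_ WildRose Country'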

class PythonTokenizer:
    """
    Class that applies a user-defined string tokenizer to the input string.

    Args:
        tokenizer (Callable): Python function that takes a `str` and returns a list of `str` as tokens.

    Raises:
        TypeError: If `tokenizer` is not a callable Python function.

    Supported Platforms:
        ``CPU``

    Examples:
        >>> import numpy as np
        >>> import mindspore.dataset as ds
        >>> import mindspore.dataset.text as text
        >>>
        >>> # Use the transform in dataset pipeline mode
        >>> def my_tokenizer(line):
        ...     return line.split()
        >>>
        >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=['Hello world'], column_names=["text"])
        >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=text.PythonTokenizer(my_tokenizer))
        >>> for item in numpy_slices_dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        ...     print(item["text"])
        ['Hello' 'world']
        >>>
        >>> # Use the transform in eager mode
        >>> data = np.array('Hello world'.encode())
        >>> output = text.PythonTokenizer(my_tokenizer)(data)
        >>> print(output)
        ['Hello' 'world']

    Tutorial Examples:
        - `Illustration of text transforms
          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/text_gallery.html>`_
    """

    @check_python_tokenizer
    def __init__(self, tokenizer):
        self.pyfunc = tokenizer
        self.tokenizer = np.vectorize(lambda x: np.array(tokenizer(x), dtype='U'), signature='()->(n)')
        self.random = False

    def __call__(self, in_array):
        if not isinstance(in_array, np.ndarray):
            raise TypeError("input should be a NumPy array. Got {}.".format(type(in_array)))
        if in_array.dtype.type is np.bytes_:
            in_array = to_str(in_array)
        try:
            tokens = self.tokenizer(in_array)
        except Exception as e:
            raise RuntimeError("Error occurred in Pyfunc [" + str(self.pyfunc.__name__) + "], error message: " + str(e))
        return tokens

    def to_json(self):
        json_obj = {}
        json_obj["tensor_op_name"] = self.pyfunc.__name__
        json_obj["python_module"] = self.__class__.__module__
        return json.dumps(json_obj)


class SentencePieceTokenizer(TextTensorOperation):
    """
    Tokenize a scalar token or 1-D tokens into tokens with SentencePiece.

    Args:
        mode (Union[str, SentencePieceVocab]): SentencePiece model.
            If the input parameter is a string, it represents the path of the SentencePiece model to be loaded.
            If the input parameter is a SentencePieceVocab object, it should be constructed in advance.
        out_type (SPieceTokenizerOutType): The type of the output, which can be ``SPieceTokenizerOutType.STRING``
            or ``SPieceTokenizerOutType.INT``.

            - ``SPieceTokenizerOutType.STRING``, means the output type of the SentencePiece tokenizer is string.
            - ``SPieceTokenizerOutType.INT``, means the output type of the SentencePiece tokenizer is int.

    Raises:
        TypeError: If `mode` is not of type string or SentencePieceVocab.
        TypeError: If `out_type` is not of type SPieceTokenizerOutType.

    Supported Platforms:
        ``CPU``

    Examples:
        >>> import mindspore.dataset as ds
        >>> import mindspore.dataset.text as text
        >>> from mindspore.dataset.text import SentencePieceModel, SPieceTokenizerOutType
        >>>
        >>> # Use the transform in dataset pipeline mode
        >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=['Hello world'], column_names=["text"])
        >>> # The sentence_piece_vocab_file can be downloaded directly from the mindspore repository. Refer to
        >>> # https://gitee.com/mindspore/mindspore/blob/master/tests/ut/data/dataset/test_sentencepiece/vocab.txt
        >>> sentence_piece_vocab_file = "tests/ut/data/dataset/test_sentencepiece/vocab.txt"
        >>> vocab = text.SentencePieceVocab.from_file([sentence_piece_vocab_file], 512, 0.9995,
        ...                                            SentencePieceModel.UNIGRAM, {})
        >>> tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
        >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=tokenizer)
        >>> for item in numpy_slices_dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        ...     print(item["text"])
        ['▁H' 'e' 'l' 'lo' '▁w' 'o' 'r' 'l' 'd']
        >>>
        >>> # Use the transform in eager mode
        >>> data = "Hello world"
        >>> vocab = text.SentencePieceVocab.from_file([sentence_piece_vocab_file], 100, 0.9995,
        ...                                           SentencePieceModel.UNIGRAM, {})
        >>> output = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)(data)
        >>> print(output)
        ['▁' 'H' 'e' 'l' 'l' 'o' '▁' 'w' 'o' 'r' 'l' 'd']

    Tutorial Examples:
        - `Illustration of text transforms
          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/text_gallery.html>`_
    """

    @check_sentence_piece_tokenizer
    def __init__(self, mode, out_type):
        super().__init__()
        self.mode = mode
        self.out_type = out_type

    def parse(self):
        self.mode = self.mode.c_sentence_piece_vocab if isinstance(self.mode, SentencePieceVocab) else self.mode
        return cde.SentencePieceTokenizerOperation(self.mode, DE_C_INTER_SENTENCEPIECE_OUTTYPE.get(self.out_type))

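# A minimal sketch of the file-path form of `mode` (the path below is a placeholder):
# passing the path of a trained SentencePiece model file loads the model from disk
# instead of using an in-memory SentencePieceVocab object.
#
#     tokenizer = text.SentencePieceTokenizer("/path/to/sentencepiece.model",
#                                             out_type=SPieceTokenizerOutType.STRING)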

class SlidingWindow(TextTensorOperation):
    """
    Construct a tensor from given data (only support 1-D for now), where each element in the dimension axis
    is a slice of data starting at the corresponding position, with a specified width.

    Args:
        width (int): The width of the window. It must be an integer and greater than zero.
        axis (int, optional): The axis along which the sliding window is computed. Default: ``0``.

    Raises:
        TypeError: If `width` is not of type int.
        ValueError: If value of `width` is not positive.
        TypeError: If `axis` is not of type int.

    Supported Platforms:
        ``CPU``

    Examples:
        >>> import mindspore.dataset as ds
        >>> import mindspore.dataset.text as text
        >>>
        >>> # Use the transform in dataset pipeline mode
        >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=[[1, 2, 3, 4, 5]], column_names=["col1"])
        >>> # Data before
        >>> # |      col1       |
        >>> # +-----------------+
        >>> # | [1, 2, 3, 4, 5] |
        >>> # +-----------------+
        >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=text.SlidingWindow(3, 0))
        >>> for item in numpy_slices_dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        ...     print(item["col1"])
        [[1 2 3] [2 3 4] [3 4 5]]
        >>> # Data after
        >>> # |     col1     |
        >>> # +--------------+
        >>> # |  [[1, 2, 3], |
        >>> # |   [2, 3, 4], |
        >>> # |   [3, 4, 5]] |
        >>> # +--------------+
        >>>
        >>> # Use the transform in eager mode
        >>> data = ["happy", "birthday", "to", "you"]
        >>> output = text.SlidingWindow(2, 0)(data)
        >>> print(output)
        [['happy' 'birthday'] ['birthday' 'to'] ['to' 'you']]

    Tutorial Examples:
        - `Illustration of text transforms
          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/text_gallery.html>`_
    """

    @check_slidingwindow
    def __init__(self, width, axis=0):
        super().__init__()
        self.width = width
        self.axis = axis

    def parse(self):
        return cde.SlidingWindowOperation(self.width, self.axis)


class ToNumber(TextTensorOperation):
    """
    Tensor operation to convert every element of a string tensor to a number.

    Strings are cast according to the rules specified in the following links, except that any string that represents
    a negative number cannot be cast to an unsigned integer type:
    https://en.cppreference.com/w/cpp/string/basic_string/stof,
    https://en.cppreference.com/w/cpp/string/basic_string/stoul.

    Args:
        data_type (mindspore.dtype): Type to be cast to. Must be a numeric type in mindspore.dtype.

    Raises:
        TypeError: If `data_type` is not of type mindspore.dtype.
        RuntimeError: If strings are invalid to cast, or are out of range after being cast.

    Supported Platforms:
        ``CPU``

    Examples:
        >>> import mindspore.dataset as ds
        >>> import mindspore.dataset.text as text
        >>> from mindspore import dtype as mstype
        >>>
        >>> # Use the transform in dataset pipeline mode
        >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=[["1", "2", "3"]], column_names=["text"])
        >>> to_number_op = text.ToNumber(mstype.int8)
        >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=to_number_op)
        >>> for item in numpy_slices_dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        ...     print(item["text"])
        [1 2 3]
        >>>
        >>> # Use the transform in eager mode
        >>> data = ["1", "2", "3"]
        >>> output = text.ToNumber(mstype.uint32)(data)
        >>> print(output)
        [1 2 3]

    Tutorial Examples:
        - `Illustration of text transforms
          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/text_gallery.html>`_
    """

    @check_to_number
    def __init__(self, data_type):
        super().__init__()
        data_type = mstype_to_detype(data_type)
        self.data_type = str(data_type)

    def parse(self):
        return cde.ToNumberOperation(self.data_type)


class ToVectors(TextTensorOperation):
    """
    Look up a token and map it to a vector according to the input vector table.

    Args:
        vectors (Vectors): A vectors object.
        unk_init (sequence, optional): Sequence used to initialize out-of-vectors (OOV) token.
            Default: ``None``, initialize with zero vectors.
        lower_case_backup (bool, optional): Whether to look up the token in the lower case. If ``False``,
            each token in the original case will be looked up; if ``True``, each token in the original
            case will be looked up first, and if it is not found in the keys of the property stoi, the
            token in the lower case will be looked up. Default: ``False``.

    Raises:
        TypeError: If `unk_init` is not of type sequence.
        TypeError: If elements of `unk_init` are not of type float or int.
        TypeError: If `lower_case_backup` is not of type bool.

    Supported Platforms:
        ``CPU``

    Examples:
        >>> import mindspore.dataset as ds
        >>> import mindspore.dataset.text as text
        >>>
        >>> # Use the transform in dataset pipeline mode
        >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=["happy", "birthday", "to", "you"], column_names=["text"])
        >>> # Load vectors from file
        >>> # The vectors_file can be downloaded directly from the mindspore repository. Refer to
        >>> # https://gitee.com/mindspore/mindspore/blob/master/tests/ut/data/dataset/testVectors/vectors.txt
        >>> vectors_file = "tests/ut/data/dataset/testVectors/vectors.txt"
        >>> vectors = text.Vectors.from_file(vectors_file)
        >>> # Use ToVectors operation to map tokens to vectors
        >>> to_vectors = text.ToVectors(vectors)
        >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=[to_vectors])
        >>> for item in numpy_slices_dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        ...     print(item["text"])
        ...     break
        [0. 0. 0. 0. 0. 0.]
        >>>
        >>> # Use the transform in eager mode
        >>> data = ["happy"]
        >>> output = text.ToVectors(vectors)(data)
        >>> print(output)
        [0. 0. 0. 0. 0. 0.]

    Tutorial Examples:
        - `Illustration of text transforms
          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/text_gallery.html>`_
    """

    @check_to_vectors
    def __init__(self, vectors, unk_init=None, lower_case_backup=False):
        super().__init__()
        self.vectors = vectors
        self.unk_init = unk_init if unk_init is not None else []
        self.lower_case_backup = lower_case_backup

    def parse(self):
        return cde.ToVectorsOperation(self.vectors, self.unk_init, self.lower_case_backup)


class Truncate(TextTensorOperation):
    """
    Truncate the input sequence so that it does not exceed the maximum length.

    Args:
        max_seq_len (int): Maximum allowable length.

    Raises:
        TypeError: If `max_seq_len` is not of type int.
        ValueError: If value of `max_seq_len` is not greater than or equal to 0.
        RuntimeError: If the input tensor is not of dtype bool, int, float, double or str.

    Supported Platforms:
        ``CPU``

    Examples:
        >>> import mindspore.dataset as ds
        >>> import mindspore.dataset.text as text
        >>>
        >>> # Use the transform in dataset pipeline mode
        >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=[['a', 'b', 'c', 'd', 'e']], column_names=["text"],
        ...                                              shuffle=False)
        >>> # Data before
        >>> # |           text            |
        >>> # +---------------------------+
        >>> # | ['a', 'b', 'c', 'd', 'e'] |
        >>> # +---------------------------+
        >>> truncate = text.Truncate(4)
        >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=truncate, input_columns=["text"])
        >>> for item in numpy_slices_dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        ...     print(item["text"])
        ['a' 'b' 'c' 'd']
        >>> # Data after
        >>> # |          text          |
        >>> # +------------------------+
        >>> # |  ['a', 'b', 'c', 'd']  |
        >>> # +------------------------+
        >>>
        >>> # Use the transform in eager mode
        >>> data = ["happy", "birthday", "to", "you"]
        >>> output = text.Truncate(2)(data)
        >>> print(output)
        ['happy' 'birthday']

    Tutorial Examples:
        - `Illustration of text transforms
          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/text_gallery.html>`_
    """

    @check_truncate
    def __init__(self, max_seq_len):
        super().__init__()
        self.max_seq_len = max_seq_len

    def parse(self):
        return cde.TruncateOperation(self.max_seq_len)


class TruncateSequencePair(TextTensorOperation):
    """
    Truncate a pair of 1-D inputs so that their total length does not exceed the specified length.

    Args:
        max_length (int): The maximum total length of the output sequences. If it is no less than the
            total length of the original pair of sequences, no truncation is performed; otherwise, the
            longer of the two input sequences is truncated until their total length equals this value.

    Raises:
        TypeError: If `max_length` is not of type int.

    Supported Platforms:
        ``CPU``

    Examples:
        >>> import mindspore.dataset as ds
        >>> import mindspore.dataset.text as text
        >>>
        >>> # Use the transform in dataset pipeline mode
        >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=([[1, 2, 3]], [[4, 5]]), column_names=["col1", "col2"])
        >>> # Data before
        >>> # |   col1    |   col2    |
        >>> # +-----------+-----------+
        >>> # | [1, 2, 3] |  [4, 5]   |
        >>> # +-----------+-----------+
        >>> truncate_sequence_pair_op = text.TruncateSequencePair(max_length=4)
        >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=truncate_sequence_pair_op,
        ...                                                 input_columns=["col1", "col2"])
        >>> for item in numpy_slices_dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        ...     print(item["col1"], item["col2"])
        [1 2] [4 5]
        >>> # Data after
        >>> # |   col1    |   col2    |
        >>> # +-----------+-----------+
        >>> # |  [1, 2]   |  [4, 5]   |
        >>> # +-----------+-----------+
        >>>
        >>> # Use the transform in eager mode
        >>> data = [["1", "2", "3"], ["4", "5"]]
        >>> output = text.TruncateSequencePair(4)(*data)
        >>> print(output)
        (array(['1', '2'], dtype='<U1'), array(['4', '5'], dtype='<U1'))

    Tutorial Examples:
        - `Illustration of text transforms
          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/text_gallery.html>`_
    """

    @check_pair_truncate
    def __init__(self, max_length):
        super().__init__()
        self.max_length = max_length

    def parse(self):
        return cde.TruncateSequencePairOperation(self.max_length)


class UnicodeCharTokenizer(TextTensorOperation):
    """
    Unpack the Unicode characters in the input strings.

    Args:
        with_offsets (bool, optional): Whether to output the start and end offsets of each
            token in the original string. Default: ``False`` .

    Raises:
        TypeError: If `with_offsets` is not of type bool.

    Supported Platforms:
        ``CPU``

    Examples:
        >>> import mindspore.dataset as ds
        >>> import mindspore.dataset.text as text
        >>>
        >>> # Use the transform in dataset pipeline mode
        >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=['Welcome     To   BeiJing!'], column_names=["text"])
        >>>
        >>> # If with_offsets=False, default output one column {["text", dtype=str]}
        >>> tokenizer_op = text.UnicodeCharTokenizer(with_offsets=False)
        >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=tokenizer_op)
        >>> for item in numpy_slices_dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        ...     print(item["text"])
        ...     break
        ['W' 'e' 'l' 'c' 'o' 'm' 'e' ' ' ' ' ' ' ' ' ' ' 'T' 'o' ' ' ' ' ' ' 'B' 'e' 'i' 'J' 'i' 'n' 'g' '!']
        >>>
        >>> # If with_offsets=True, then output three columns {["token", dtype=str], ["offsets_start", dtype=uint32],
        >>> #                                                  ["offsets_limit", dtype=uint32]}
        >>> tokenizer_op = text.UnicodeCharTokenizer(with_offsets=True)
        >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=['Welcome     To   BeiJing!'], column_names=["text"])
        >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=tokenizer_op, input_columns=["text"],
        ...                                                 output_columns=["token", "offsets_start", "offsets_limit"])
        >>> for item in numpy_slices_dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        ...     print(item["token"], item["offsets_start"], item["offsets_limit"])
        ['W' 'e' 'l' 'c' 'o' 'm' 'e' ' ' ' ' ' ' ' ' ' ' 'T' 'o' ' ' ' ' ' ' 'B' 'e' 'i' 'J' 'i' 'n' 'g' '!'] [ 0  1  2
        3  4  5  6  7  8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
        16 17 18 19 20 21 22 23 24 25]
        >>>
        >>> # Use the transform in eager mode
        >>> data = 'Welcome     To   BeiJing!'
        >>> output = text.UnicodeCharTokenizer(with_offsets=True)(data)
        >>> print(output)
        (array(['W', 'e', 'l', 'c', 'o', 'm', 'e', ' ', ' ', ' ', ' ', ' ', 'T', 'o', ' ', ' ', ' ', 'B', 'e', 'i', 'J',
        'i', 'n', 'g', '!'], dtype='<U1'), array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24], dtype=uint32), array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
        15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25], dtype=uint32))

    Tutorial Examples:
        - `Illustration of text transforms
          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/text_gallery.html>`_
    """

    @check_with_offsets
    def __init__(self, with_offsets=False):
        super().__init__()
        self.with_offsets = with_offsets

    def parse(self):
        return cde.UnicodeCharTokenizerOperation(self.with_offsets)


class WordpieceTokenizer(TextTensorOperation):
    """
    Tokenize the input text to subword tokens.

    Args:
        vocab (Vocab): Vocabulary used to look up words.
        suffix_indicator (str, optional): Prefix flags used to indicate subword suffixes. Default: ``'##'``.
        max_bytes_per_token (int, optional): The maximum length of tokenization, words exceeding this length will
                not be split. Default: ``100``.
        unknown_token (str, optional): The output for unknown words. When set to an empty string, the corresponding
                unknown word will be directly returned as the output. Otherwise, the set string will be returned as the
                output. Default: ``'[UNK]'``.
        with_offsets (bool, optional): Whether to output the start and end offsets of each
            token in the original string. Default: ``False`` .

    Raises:
        TypeError: If `vocab` is not of type :class:`mindspore.dataset.text.Vocab` .
        TypeError: If `suffix_indicator` is not of type str.
        TypeError: If `max_bytes_per_token` is not of type int.
        TypeError: If `unknown_token` is not of type str.
        TypeError: If `with_offsets` is not of type bool.
        ValueError: If `max_bytes_per_token` is negative.

    Supported Platforms:
        ``CPU``

    Examples:
        >>> import mindspore.dataset as ds
        >>> import mindspore.dataset.text as text
        >>>
        >>> # Use the transform in dataset pipeline mode
        >>> seed = ds.config.get_seed()
        >>> ds.config.set_seed(12345)
        >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=["happy", "birthday", "to", "you"], column_names=["text"])
        >>>
        >>> vocab_list = ["book", "cholera", "era", "favor", "##ite", "my", "is", "love", "dur", "##ing", "the"]
        >>> vocab = text.Vocab.from_list(vocab_list)
        >>>
        >>> # If with_offsets=False, default output one column {["text", dtype=str]}
        >>> tokenizer_op = text.WordpieceTokenizer(vocab=vocab, unknown_token='[UNK]',
        ...                                        max_bytes_per_token=100, with_offsets=False)
        >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=tokenizer_op)
        >>> for item in numpy_slices_dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        ...     print(item["text"])
        ...     break
        ['[UNK]']
        >>>
        >>> # If with_offsets=True, then output three columns {["token", dtype=str], ["offsets_start", dtype=uint32],
        >>> #                                                  ["offsets_limit", dtype=uint32]}
        >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=["happy", "birthday", "to", "you"], column_names=["text"])
        >>> tokenizer_op = text.WordpieceTokenizer(vocab=vocab, unknown_token='[UNK]',
        ...                                        max_bytes_per_token=100, with_offsets=True)
        >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=tokenizer_op, input_columns=["text"],
        ...                                                 output_columns=["token", "offsets_start", "offsets_limit"])
        >>> for item in numpy_slices_dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        ...     print(item["token"], item["offsets_start"], item["offsets_limit"])
        ...     break
        ['[UNK]'] [0] [5]
        >>>
        >>> # Use the transform in eager mode
        >>> data = ["happy", "birthday", "to", "you"]
        >>> vocab_list = ["book", "cholera", "era", "favor", "**ite", "my", "is", "love", "dur", "**ing", "the"]
        >>> vocab = text.Vocab.from_list(vocab_list)
        >>> output = text.WordpieceTokenizer(vocab=vocab, suffix_indicator="y", unknown_token='[UNK]')(data)
        >>> print(output)
        ['[UNK]' '[UNK]' '[UNK]' '[UNK]']
        >>> ds.config.set_seed(seed)

    Tutorial Examples:
        - `Illustration of text transforms
          <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/text_gallery.html>`_
    """

    @check_wordpiece_tokenizer
    def __init__(self, vocab, suffix_indicator='##', max_bytes_per_token=100, unknown_token='[UNK]',
                 with_offsets=False):
        super().__init__()
        self.vocab = vocab
        self.suffix_indicator = suffix_indicator
        self.max_bytes_per_token = max_bytes_per_token
        self.unknown_token = unknown_token
        self.with_offsets = with_offsets

    def parse(self):
        return cde.WordpieceTokenizerOperation(self.vocab.c_vocab, self.suffix_indicator, self.max_bytes_per_token,
                                               self.unknown_token, self.with_offsets)

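# A short sketch (illustrative values) of how the default suffix_indicator '##' appears
# in the output: a word is greedily matched against vocabulary pieces, and every piece
# after the first is prefixed with '##'.
#
#     vocab = text.Vocab.from_list(["favor", "##ite", "[UNK]"])
#     text.WordpieceTokenizer(vocab=vocab, unknown_token='[UNK]')(["favorite"])
#     # -> ['favor' '##ite']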

if platform.system().lower() != 'windows':
    DE_C_INTER_NORMALIZE_FORM = {
        NormalizeForm.NONE: cde.NormalizeForm.DE_NORMALIZE_NONE,
        NormalizeForm.NFC: cde.NormalizeForm.DE_NORMALIZE_NFC,
        NormalizeForm.NFKC: cde.NormalizeForm.DE_NORMALIZE_NFKC,
        NormalizeForm.NFD: cde.NormalizeForm.DE_NORMALIZE_NFD,
        NormalizeForm.NFKD: cde.NormalizeForm.DE_NORMALIZE_NFKD
    }


    class BasicTokenizer(TextTensorOperation):
        """
        Tokenize the input UTF-8 encoded string by specific rules.

        Note:
            `BasicTokenizer` is not supported on Windows platform yet.

        Args:
            lower_case (bool, optional): Whether to perform lowercase processing on the text. If True, will fold the
                text to lower case and strip accented characters. If False, will only perform normalization on the
                text, with mode specified by `normalization_form` . Default: ``False``.
            keep_whitespace (bool, optional): If True, the whitespace will be kept in the output. Default: ``False``.
            normalization_form (NormalizeForm, optional): The desired normalization form.
                See :class:`~.text.NormalizeForm` for details on optional values.
                Default: ``NormalizeForm.NONE`` .
            preserve_unused_token (bool, optional): Whether to preserve special tokens. If True, will not split special
                tokens like '[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]'. Default: ``True``.
            with_offsets (bool, optional): Whether to output the start and end offsets of each
                token in the original string. Default: ``False`` .

        Raises:
            TypeError: If `lower_case` is not of type bool.
            TypeError: If `keep_whitespace` is not of type bool.
            TypeError: If `normalization_form` is not of type :class:`~.text.NormalizeForm` .
            TypeError: If `preserve_unused_token` is not of type bool.
            TypeError: If `with_offsets` is not of type bool.
            RuntimeError: If dtype of input Tensor is not str.

        Supported Platforms:
            ``CPU``

        Examples:
            >>> import mindspore.dataset as ds
            >>> import mindspore.dataset.text as text
            >>> from mindspore.dataset.text import NormalizeForm
            >>>
            >>> # Use the transform in dataset pipeline mode
            >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=['Welcome     To   BeiJing!'], column_names=["text"])
            >>>
            >>> # 1) If with_offsets=False, default output one column {["text", dtype=str]}
            >>> tokenizer_op = text.BasicTokenizer(lower_case=False,
            ...                                    keep_whitespace=False,
            ...                                    normalization_form=NormalizeForm.NONE,
            ...                                    preserve_unused_token=True,
            ...                                    with_offsets=False)
            >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=tokenizer_op)
            >>> for item in numpy_slices_dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
            ...     print(item["text"])
            ['Welcome' 'To' 'BeiJing' '!']
            >>>
            >>> # 2) If with_offsets=True, then output three columns {["token", dtype=str],
            >>> #                                                     ["offsets_start", dtype=uint32],
            >>> #                                                     ["offsets_limit", dtype=uint32]}
            >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=['Welcome     To   BeiJing!'], column_names=["text"])
            >>> tokenizer_op = text.BasicTokenizer(lower_case=False,
            ...                                    keep_whitespace=False,
            ...                                    normalization_form=NormalizeForm.NONE,
            ...                                    preserve_unused_token=True,
            ...                                    with_offsets=True)
            >>> numpy_slices_dataset = numpy_slices_dataset.map(
            ...     operations=tokenizer_op, input_columns=["text"],
            ...     output_columns=["token", "offsets_start", "offsets_limit"])
            >>> for item in numpy_slices_dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
            ...     print(item["token"], item["offsets_start"], item["offsets_limit"])
            ['Welcome' 'To' 'BeiJing' '!'] [ 0 12 17 24] [ 7 14 24 25]
            >>>
            >>> # Use the transform in eager mode
            >>> data = 'Welcome     To   BeiJing!'
            >>> output = text.BasicTokenizer()(data)
            >>> print(output)
            ['Welcome' 'To' 'BeiJing' '!']

        Tutorial Examples:
            - `Illustration of text transforms
              <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/text_gallery.html>`_
        """

        @check_basic_tokenizer
        def __init__(self, lower_case=False, keep_whitespace=False, normalization_form=NormalizeForm.NONE,
                     preserve_unused_token=True, with_offsets=False):
            super().__init__()
            if not isinstance(normalization_form, NormalizeForm):
                raise TypeError("Wrong input type for normalization_form, should be enum of 'NormalizeForm'.")

            self.lower_case = lower_case
            self.keep_whitespace = keep_whitespace
            self.normalization_form = DE_C_INTER_NORMALIZE_FORM.get(normalization_form)
            self.preserve_unused_token = preserve_unused_token
            self.with_offsets = with_offsets

        def parse(self):
            return cde.BasicTokenizerOperation(self.lower_case, self.keep_whitespace, self.normalization_form,
                                               self.preserve_unused_token, self.with_offsets)

1172
1173    class BertTokenizer(TextTensorOperation):
1174        """
1175        Tokenizer used for Bert text process.
1176
1177        Note:
1178            `BertTokenizer` is not supported on Windows platform yet.
1179
1180        Args:
1181            vocab (Vocab): Vocabulary used to look up words.
1182            suffix_indicator (str, optional): Prefix flags used to indicate subword suffixes. Default: ``'##'``.
1183            max_bytes_per_token (int, optional): The maximum number of bytes allowed per token. Words exceeding
1184                this length will not be split. Default: ``100``.
1185            unknown_token (str, optional): The output for unknown words. When set to an empty string, the corresponding
1186                unknown word will be directly returned as the output. Otherwise, the set string will be returned as the
1187                output. Default: ``'[UNK]'``.
1188            lower_case (bool, optional): Whether to perform lowercase processing on the text. If ``True``, will fold the
1189                text to lower case and strip accented characters. If ``False``, will only perform normalization on the
1190                text, with mode specified by `normalization_form` . Default: ``False``.
1191            keep_whitespace (bool, optional): If ``True``, the whitespace will be kept in the output.
1192                Default: ``False``.
1193            normalization_form (NormalizeForm, optional): The desired normalization form.
1194                See :class:`~.text.NormalizeForm` for details on optional values.
1195                Default: ``NormalizeForm.NFKC`` .
1196            preserve_unused_token (bool, optional): Whether to preserve special tokens. If ``True``,
1197                will not split special tokens like '[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]'.
1198                Default: ``True``.
1199            with_offsets (bool, optional): Whether to output the start and end offsets of each
1200                token in the original string. Default: ``False`` .
1201
1202        Raises:
1203            TypeError: If `vocab` is not of type :class:`mindspore.dataset.text.Vocab` .
1204            TypeError: If `suffix_indicator` is not of type str.
1205            TypeError: If `max_bytes_per_token` is not of type int.
1206            ValueError: If `max_bytes_per_token` is negative.
1207            TypeError: If `unknown_token` is not of type str.
1208            TypeError: If `lower_case` is not of type bool.
1209            TypeError: If `keep_whitespace` is not of type bool.
1210            TypeError: If `normalization_form` is not of type :class:`~.text.NormalizeForm` .
1211            TypeError: If `preserve_unused_token` is not of type bool.
1212            TypeError: If `with_offsets` is not of type bool.
1213
1214        Supported Platforms:
1215            ``CPU``
1216
1217        Examples:
1218            >>> import numpy as np
1219            >>> import mindspore.dataset as ds
1220            >>> import mindspore.dataset.text as text
1221            >>> from mindspore.dataset.text import NormalizeForm
1222            >>>
1223            >>> # Use the transform in dataset pipeline mode
1224            >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=["床前明月光"], column_names=["text"])
1225            >>>
1226            >>> # 1) If with_offsets=False, default output one column {["text", dtype=str]}
1227            >>> vocab_list = ["床", "前", "明", "月", "光", "疑", "是", "地", "上", "霜", "举", "头", "望", "低",
1228            ...               "思", "故", "乡", "繁", "體", "字", "嘿", "哈", "大", "笑", "嘻", "i", "am", "mak",
1229            ...               "make", "small", "mistake", "##s", "during", "work", "##ing", "hour", "+", "/",
1230            ...               "-", "=", "12", "28", "40", "16", " ", "I", "[CLS]", "[SEP]", "[UNK]", "[PAD]", "[MASK]"]
1231            >>> vocab = text.Vocab.from_list(vocab_list)
1232            >>> tokenizer_op = text.BertTokenizer(vocab=vocab, suffix_indicator='##', max_bytes_per_token=100,
1233            ...                                   unknown_token='[UNK]', lower_case=False, keep_whitespace=False,
1234            ...                                   normalization_form=NormalizeForm.NONE, preserve_unused_token=True,
1235            ...                                   with_offsets=False)
1236            >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=tokenizer_op)
1237            >>> for item in numpy_slices_dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
1238            ...     print(item["text"])
1239            ['床' '前' '明' '月' '光']
1240            >>>
1241            >>> # 2) If with_offsets=True, then output three columns {["token", dtype=str],
1242            >>> #                                                     ["offsets_start", dtype=uint32],
1243            >>> #                                                     ["offsets_limit", dtype=uint32]}
1244            >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=["床前明月光"], column_names=["text"])
1245            >>> tokenizer_op = text.BertTokenizer(vocab=vocab, suffix_indicator='##', max_bytes_per_token=100,
1246            ...                                   unknown_token='[UNK]', lower_case=False, keep_whitespace=False,
1247            ...                                   normalization_form=NormalizeForm.NONE, preserve_unused_token=True,
1248            ...                                   with_offsets=True)
1249            >>> numpy_slices_dataset = numpy_slices_dataset.map(
1250            ...     operations=tokenizer_op,
1251            ...     input_columns=["text"],
1252            ...     output_columns=["token", "offsets_start", "offsets_limit"])
1253            >>> for item in numpy_slices_dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
1254            ...     print(item["token"], item["offsets_start"], item["offsets_limit"])
1255            ['床' '前' '明' '月' '光'] [ 0  3  6  9 12] [ 3  6  9 12 15]
1256            >>>
1257            >>> # Use the transform in eager mode
1258            >>> data = "床前明月光"
1259            >>> vocab = text.Vocab.from_list(vocab_list)
1260            >>> tokenizer_op = text.BertTokenizer(vocab=vocab)
1261            >>> output = tokenizer_op(data)
1262            >>> print(output)
1263            ['床' '前' '明' '月' '光']
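            >>>
            >>> # A hedged sketch of subword splitting (not from the official docs): because vocab_list
            >>> # above contains both 'make' and the suffix piece '##s', an input such as 'makes' would
            >>> # be expected to split into ['make', '##s'], while a word with no matching pieces would
            >>> # be replaced by the '[UNK]' unknown_token.
            >>> output = text.BertTokenizer(vocab=vocab)('makes')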
1264
1265        Tutorial Examples:
1266            - `Illustration of text transforms
1267              <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/text_gallery.html>`_
1268        """
1269
1270        @check_bert_tokenizer
1271        def __init__(self, vocab, suffix_indicator='##', max_bytes_per_token=100, unknown_token='[UNK]',
1272                     lower_case=False, keep_whitespace=False, normalization_form=NormalizeForm.NONE,
1273                     preserve_unused_token=True, with_offsets=False):
1274            super().__init__()
1275            if not isinstance(normalization_form, NormalizeForm):
1276                raise TypeError("Wrong input type for normalization_form, should be enum of 'NormalizeForm'.")
1277
1278            self.vocab = vocab
1279            self.suffix_indicator = suffix_indicator
1280            self.max_bytes_per_token = max_bytes_per_token
1281            self.unknown_token = unknown_token
1282            self.lower_case = lower_case
1283            self.keep_whitespace = keep_whitespace
1284            self.normalization_form = DE_C_INTER_NORMALIZE_FORM.get(normalization_form)
1285            self.preserve_unused_token = preserve_unused_token
1286            self.with_offsets = with_offsets
1287
1288        def parse(self):
1289            return cde.BertTokenizerOperation(self.vocab.c_vocab, self.suffix_indicator, self.max_bytes_per_token,
1290                                              self.unknown_token, self.lower_case, self.keep_whitespace,
1291                                              self.normalization_form, self.preserve_unused_token, self.with_offsets)
1292
1293
1294    class CaseFold(TextTensorOperation):
1295        """
1296        Apply case fold operation on UTF-8 string tensor. Case folding is more aggressive than :code:`str.lower`
1297        and can convert more characters into lower case. For supported normalization forms, please refer to
1298        `ICU_Normalizer2 <https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/classicu_1_1Normalizer2.html>`_ .
1299
1300        Note:
1301            CaseFold is not supported on Windows platform yet.
1302
1303        Supported Platforms:
1304            ``CPU``
1305
1306        Examples:
1307            >>> import mindspore.dataset as ds
1308            >>> import mindspore.dataset.text as text
1309            >>>
1310            >>> # Use the transform in dataset pipeline mode
1311            >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=['Welcome     To   BeiJing!'], column_names=["text"])
1312            >>> case_op = text.CaseFold()
1313            >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=case_op)
1314            >>> for item in numpy_slices_dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
1315            ...     print(item["text"])
1316            welcome     to   beijing!
1317            >>>
1318            >>> # Use the transform in eager mode
1319            >>> data = 'Welcome     To   BeiJing!'
1320            >>> output = text.CaseFold()(data)
1321            >>> print(output)
1322            welcome     to   beijing!
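            >>>
            >>> # A hedged illustration of why case folding is more aggressive than str.lower()
            >>> # (not from the official docs): the German sharp s 'ß' would be expected to fold
            >>> # to 'ss', whereas 'ß'.lower() leaves it unchanged.
            >>> output = text.CaseFold()('Straße')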
1323
1324        Tutorial Examples:
1325            - `Illustration of text transforms
1326              <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/text_gallery.html>`_
1327        """
1328
1329        def parse(self):
1330            return cde.CaseFoldOperation()
1331
1332
1333    class FilterWikipediaXML(TextTensorOperation):
1334        """
1335        Filter Wikipedia XML dumps to "clean" text consisting only of lowercase letters (a-z, converted from A-Z),
1336        and spaces (never consecutive).
1337
1338        Note:
1339            FilterWikipediaXML is not supported on Windows platform yet.
1340
1341        Supported Platforms:
1342            ``CPU``
1343
1344        Examples:
1345            >>> import mindspore.dataset as ds
1346            >>> import mindspore.dataset.text as text
1347            >>>
1348            >>> # Use the transform in dataset pipeline mode
1349            >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=["Welcome    to    China", "!!!", "ABC"],
1350            ...                                              column_names=["text"], shuffle=False)
1351            >>> replace_op = text.FilterWikipediaXML()
1352            >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=replace_op)
1353            >>> for item in numpy_slices_dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
1354            ...     print(item["text"])
1355            ...     break
1356            welcome to china
1357            >>>
1358            >>> # Use the transform in eager mode
1359            >>> data = "Welcome    to    China"
1360            >>> output = replace_op(data)
1361            >>> print(output)
1362            welcome to china
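            >>>
            >>> # Based on the behaviour described above (A-Z is mapped to a-z), applying the same
            >>> # transform to the remaining rows would be expected to turn 'ABC' into 'abc'.
            >>> output = replace_op("ABC")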
1363
1364        Tutorial Examples:
1365            - `Illustration of text transforms
1366              <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/text_gallery.html>`_
1367        """
1368
1369        def parse(self):
1370            return cde.FilterWikipediaXMLOperation()
1371
1372
1373    class NormalizeUTF8(TextTensorOperation):
1374        """
1375        Normalize the input UTF-8 encoded strings.
1376
1377        Note:
1378            NormalizeUTF8 is not supported on Windows platform yet.
1379
1380        Args:
1381            normalize_form (NormalizeForm, optional): The desired normalization form.
1382                See :class:`~.text.NormalizeForm` for details on optional values.
1383                Default: ``NormalizeForm.NFKC`` .
1384
1385        Raises:
1386            TypeError: If `normalize_form` is not of type :class:`~.text.NormalizeForm`.
1387
1388        Supported Platforms:
1389            ``CPU``
1390
1391        Examples:
1392            >>> import mindspore.dataset as ds
1393            >>> import mindspore.dataset.text as text
1394            >>> from mindspore.dataset.text import NormalizeForm
1395            >>>
1396            >>> # Use the transform in dataset pipeline mode
1397            >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=["ṩ", "ḍ̇", "q̇", "fi", "2⁵", "ẛ"],
1398            ...                                              column_names=["text"], shuffle=False)
1399            >>> normalize_op = text.NormalizeUTF8(normalize_form=NormalizeForm.NFC)
1400            >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=normalize_op)
1401            >>> for item in numpy_slices_dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
1402            ...     print(item["text"])
1403            ...     break
1404            ṩ
1405            >>>
1406            >>> # Use the transform in eager mode
1407            >>> data = ["ṩ", "ḍ̇", "q̇", "fi", "2⁵", "ẛ"]
1408            >>> output = text.NormalizeUTF8(NormalizeForm.NFKC)(data)
1409            >>> print(output)
1410            ['ṩ' 'ḍ̇' 'q̇' 'fi' '25' 'ṡ']
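            >>>
            >>> # A brief note (assumption, not from the official docs): unlike NFKC above, NFC only
            >>> # applies canonical composition, so compatibility characters such as the ligature 'fi'
            >>> # and the superscript in '2⁵' would be expected to pass through unchanged.
            >>> output_nfc = text.NormalizeUTF8(NormalizeForm.NFC)(data)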
1411
1412        Tutorial Examples:
1413            - `Illustration of text transforms
1414              <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/text_gallery.html>`_
1415        """
1416
1417        def __init__(self, normalize_form=NormalizeForm.NFKC):
1418            super().__init__()
1419            if not isinstance(normalize_form, NormalizeForm):
1420                raise TypeError("Wrong input type for normalize_form, should be enum of 'NormalizeForm'.")
1421
1422            normalize_form = replace_none(normalize_form, NormalizeForm.NFKC)
1423            self.normalize_form = DE_C_INTER_NORMALIZE_FORM.get(normalize_form)
1424
1425        def parse(self):
1426            return cde.NormalizeUTF8Operation(self.normalize_form)
1427
1428
1429    class RegexReplace(TextTensorOperation):
1430        """
1431        Replace part of the input UTF-8 string with a different text string using regular expressions.
1432
1433        Note:
1434            RegexReplace is not supported on Windows platform yet.
1435
1436        Args:
1437            pattern (str): The regular expression to match against, written in the standard
1438                regular expression syntax.
1439            replace (str): The string used to replace the matched elements.
1440            replace_all (bool, optional): Whether to replace all matched elements. If ``False``, only the
1441                first matched element will be replaced; otherwise, all matched elements will be replaced.
1442                Default: ``True``.
1443
1444        Raises:
1445            TypeError: If `pattern` is not of type str.
1446            TypeError: If `replace` is not of type str.
1447            TypeError: If `replace_all` is not of type bool.
1448
1449        Supported Platforms:
1450            ``CPU``
1451
1452        Examples:
1453            >>> import mindspore.dataset as ds
1454            >>> import mindspore.dataset.text as text
1455            >>>
1456            >>> # Use the transform in dataset pipeline mode
1457            >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=['apple orange apple orange apple'],
1458            ...                                              column_names=["text"])
1459            >>> regex_replace = text.RegexReplace('apple', 'orange')
1460            >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=regex_replace)
1461            >>> for item in numpy_slices_dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
1462            ...     print(item["text"])
1463            orange orange orange orange orange
1464            >>>
1465            >>> # Use the transform in eager mode
1466            >>> data = 'onetwoonetwoone'
1467            >>> output = text.RegexReplace(pattern="one", replace="two", replace_all=True)(data)
1468            >>> print(output)
1469            twotwotwotwotwo
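            >>>
            >>> # A hedged sketch of replace_all=False (not from the official docs): only the first
            >>> # match would be replaced, e.g. 'onetwoonetwoone' -> 'twotwoonetwoone'.
            >>> output = text.RegexReplace(pattern="one", replace="two", replace_all=False)(data)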
1470
1471        Tutorial Examples:
1472            - `Illustration of text transforms
1473              <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/text_gallery.html>`_
1474        """
1475
1476        @check_regex_replace
1477        def __init__(self, pattern, replace, replace_all=True):
1478            super().__init__()
1479            self.pattern = pattern
1480            self.replace = replace
1481            self.replace_all = replace_all
1482
1483        def parse(self):
1484            return cde.RegexReplaceOperation(self.pattern, self.replace, self.replace_all)
1485
1486
1487    class RegexTokenizer(TextTensorOperation):
1488        """
1489        Tokenize a scalar tensor of UTF-8 string by a regular expression pattern.
1490
1491        See https://unicode-org.github.io/icu/userguide/strings/regexp.html for supported regex patterns.
1492
1493        Note:
1494            RegexTokenizer is not supported on Windows platform yet.
1495
1496        Args:
1497            delim_pattern (str): The pattern of regex delimiters.
1498                The original string will be split by matched elements.
1499            keep_delim_pattern (str, optional): Delimiters matched by `delim_pattern` are kept as tokens
1500                if they also match `keep_delim_pattern` . The default is an empty string,
1501                which means that delimiters will not be kept as output tokens. Default: ``''``.
1502            with_offsets (bool, optional): Whether to output the start and end offsets of each
1503                token in the original string. Default: ``False`` .
1504
1505        Raises:
1506            TypeError: If `delim_pattern` is not of type string.
1507            TypeError: If `keep_delim_pattern` is not of type string.
1508            TypeError: If `with_offsets` is not of type bool.
1509
1510        Supported Platforms:
1511            ``CPU``
1512
1513        Examples:
1514            >>> import mindspore.dataset as ds
1515            >>> import mindspore.dataset.text as text
1516            >>>
1517            >>> # Use the transform in dataset pipeline mode
1518            >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=['Welcome  |,  To  |,  BeiJing!'],
1519            ...                                              column_names=["text"])
1520            >>>
1521            >>> # 1) If with_offsets=False, default output is one column {["text", dtype=str]}
1522            >>> delim_pattern = r"[ |,]"
1523            >>> tokenizer_op = text.RegexTokenizer(delim_pattern, with_offsets=False)
1524            >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=tokenizer_op)
1525            >>> for item in numpy_slices_dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
1526            ...     print(item["text"])
1527            ['Welcome' 'To' 'BeiJing!']
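            >>> # Note on the example above (explanatory, not from the official docs): the character
            >>> # class r"[ |,]" matches any single space, '|' or ',' character, so the runs of
            >>> # delimiters between words produce no tokens and only 'Welcome', 'To' and 'BeiJing!' remain.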
1528            >>>
1529            >>> # 2) If with_offsets=True, then output three columns {["token", dtype=str],
1530            >>> #                                                     ["offsets_start", dtype=uint32],
1531            >>> #                                                     ["offsets_limit", dtype=uint32]}
1532            >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=['Welcome  |,  To  |,  BeiJing!'],
1533            ...                                              column_names=["text"])
1534            >>> tokenizer_op = text.RegexTokenizer(delim_pattern, with_offsets=True)
1535            >>> numpy_slices_dataset = numpy_slices_dataset.map(
1536            ...     operations=tokenizer_op,
1537            ...     input_columns=["text"],
1538            ...     output_columns=["token", "offsets_start", "offsets_limit"])
1539            >>> for item in numpy_slices_dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
1540            ...     print(item["token"], item["offsets_start"], item["offsets_limit"])
1541            ['Welcome' 'To' 'BeiJing!'] [ 0 13 21] [ 7 15 29]
1542            >>>
1543            >>> # Use the transform in eager mode
1544            >>> data = 'Welcome     To   BeiJing!'
1545            >>> output = text.RegexTokenizer(delim_pattern="To", keep_delim_pattern="To", with_offsets=True)(data)
1546            >>> print(output)
1547            (array(['Welcome     ', 'To', '   BeiJing!'], dtype='<U12'),
1548            array([ 0, 12, 14], dtype=uint32), array([12, 14, 25], dtype=uint32))
1549
1550        Tutorial Examples:
1551            - `Illustration of text transforms
1552              <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/text_gallery.html>`_
1553        """
1554
1555        @check_regex_tokenizer
1556        def __init__(self, delim_pattern, keep_delim_pattern='', with_offsets=False):
1557            super().__init__()
1558            self.delim_pattern = delim_pattern
1559            self.keep_delim_pattern = keep_delim_pattern
1560            self.with_offsets = with_offsets
1561
1562        def parse(self):
1563            return cde.RegexTokenizerOperation(self.delim_pattern, self.keep_delim_pattern, self.with_offsets)
1564
1565
1566    class UnicodeScriptTokenizer(TextTensorOperation):
1567        """
1568        Tokenize a scalar tensor of UTF-8 string based on Unicode script boundaries.
1569
1570        Note:
1571            UnicodeScriptTokenizer is not supported on Windows platform yet.
1572
1573        Args:
1574            keep_whitespace (bool, optional): Whether or not to emit whitespace tokens. Default: ``False``.
1575            with_offsets (bool, optional): Whether to output the start and end offsets of each
1576                token in the original string. Default: ``False`` .
1577
1578        Raises:
1579            TypeError: If `keep_whitespace` is not of type bool.
1580            TypeError: If `with_offsets` is not of type bool.
1581
1582        Supported Platforms:
1583            ``CPU``
1584
1585        Examples:
1586            >>> import mindspore.dataset as ds
1587            >>> import mindspore.dataset.text as text
1588            >>>
1589            >>> # Use the transform in dataset pipeline mode
1590            >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=["北 京", "123", "欢 迎", "你"],
1591            ...                                              column_names=["text"], shuffle=False)
1592            >>>
1593            >>> # 1) If with_offsets=False, default output one column {["text", dtype=str]}
1594            >>> tokenizer_op = text.UnicodeScriptTokenizer(keep_whitespace=True, with_offsets=False)
1595            >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=tokenizer_op)
1596            >>> for item in numpy_slices_dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
1597            ...     print(item["text"])
1598            ...     break
1599            ['北' ' ' '京']
1600            >>>
1601            >>> # 2) If with_offsets=True, then output three columns {["token", dtype=str],
1602            >>> #                                                     ["offsets_start", dtype=uint32],
1603            >>> #                                                     ["offsets_limit", dtype=uint32]}
1604            >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=["北 京", "123", "欢 迎", "你"],
1605            ...                                              column_names=["text"], shuffle=False)
1606            >>> tokenizer_op = text.UnicodeScriptTokenizer(keep_whitespace=True, with_offsets=True)
1607            >>> numpy_slices_dataset = numpy_slices_dataset.map(
1608            ...     operations=tokenizer_op,
1609            ...     input_columns=["text"],
1610            ...     output_columns=["token", "offsets_start", "offsets_limit"])
1611            >>> for item in numpy_slices_dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
1612            ...     print(item["token"], item["offsets_start"], item["offsets_limit"])
1613            ...     break
1614            ['北' ' ' '京'] [0 3 4] [3 4 7]
1615            >>>
1616            >>> # Use the transform in eager mode
1617            >>> data = "北 京"
1618            >>> unicode_script_tokenizer_op = text.UnicodeScriptTokenizer(keep_whitespace=True, with_offsets=False)
1619            >>> output = unicode_script_tokenizer_op(data)
1620            >>> print(output)
1621            ['北' ' ' '京']
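            >>>
            >>> # A hedged illustration of script boundaries (not from the official docs): text that
            >>> # mixes scripts without separating whitespace, e.g. 'Hello世界', would be expected to
            >>> # split where the script changes, roughly into ['Hello', '世界'].
            >>> output = text.UnicodeScriptTokenizer()('Hello世界')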
1622
1623        Tutorial Examples:
1624            - `Illustration of text transforms
1625              <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/text_gallery.html>`_
1626
1627        """
1628
1629        @check_unicode_script_tokenizer
1630        def __init__(self, keep_whitespace=False, with_offsets=False):
1631            super().__init__()
1632            keep_whitespace = replace_none(keep_whitespace, False)
1633            with_offsets = replace_none(with_offsets, False)
1634            self.keep_whitespace = keep_whitespace
1635            self.with_offsets = with_offsets
1636
1637        def parse(self):
1638            return cde.UnicodeScriptTokenizerOperation(self.keep_whitespace, self.with_offsets)
1639
1640
1641    class WhitespaceTokenizer(TextTensorOperation):
1642        """
1643        Tokenize a scalar tensor of UTF-8 string on ICU4C-defined whitespace characters, such as ' ', '\\\\t', '\\\\r' and '\\\\n'.
1644
1645        Note:
1646            WhitespaceTokenizer is not supported on Windows platform yet.
1647
1648        Args:
1649            with_offsets (bool, optional): Whether to output the start and end offsets of each
1650                token in the original string. Default: ``False`` .
1651
1652        Raises:
1653            TypeError: If `with_offsets` is not of type bool.
1654
1655        Supported Platforms:
1656            ``CPU``
1657
1658        Examples:
1659            >>> import mindspore.dataset as ds
1660            >>> import mindspore.dataset.text as text
1661            >>>
1662            >>> # Use the transform in dataset pipeline mode
1663            >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=['Welcome     To   BeiJing!'], column_names=["text"])
1664            >>>
1665            >>> # 1) If with_offsets=False, default output one column {["text", dtype=str]}
1666            >>> tokenizer_op = text.WhitespaceTokenizer(with_offsets=False)
1667            >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=tokenizer_op)
1668            >>> for item in numpy_slices_dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
1669            ...     print(item["text"])
1670            ['Welcome' 'To' 'BeiJing!']
1671            >>>
1672            >>> # 2) If with_offsets=True, then output three columns {["token", dtype=str],
1673            >>> #                                                     ["offsets_start", dtype=uint32],
1674            >>> #                                                     ["offsets_limit", dtype=uint32]}
1675            >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=['Welcome     To   BeiJing!'], column_names=["text"])
1676            >>> tokenizer_op = text.WhitespaceTokenizer(with_offsets=True)
1677            >>> numpy_slices_dataset = numpy_slices_dataset.map(
1678            ...     operations=tokenizer_op,
1679            ...     input_columns=["text"],
1680            ...     output_columns=["token", "offsets_start", "offsets_limit"])
1681            >>> for item in numpy_slices_dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
1682            ...     print(item["token"], item["offsets_start"], item["offsets_limit"])
1683            ['Welcome' 'To' 'BeiJing!'] [ 0 12 17] [ 7 14 25]
1684            >>>
1685            >>> # Use the transform in eager mode
1686            >>> data = 'Welcome     To   BeiJing!'
1687            >>> output = text.WhitespaceTokenizer(with_offsets=True)(data)
1688            >>> print(output)
1689            (array(['Welcome', 'To', 'BeiJing!'], dtype='<U8'), array([ 0, 12, 17], dtype=uint32),
1690            array([ 7, 14, 25], dtype=uint32))
1691
1692        Tutorial Examples:
1693            - `Illustration of text transforms
1694              <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/text_gallery.html>`_
1695        """
1696
1697        @check_with_offsets
1698        def __init__(self, with_offsets=False):
1699            super().__init__()
1700            self.with_offsets = with_offsets
1701
1702        def parse(self):
1703            return cde.WhitespaceTokenizerOperation(self.with_offsets)
1704