# Copyright 2020-2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
The module text.transforms is inherited from _c_dataengine
and is implemented based on ICU4C and cppjieba in C++.
It's a high performance module to process NLP text.
Users can use Vocab to build their own dictionary,
use appropriate tokenizers to split sentences into different tokens,
and use Lookup to find the index of tokens in Vocab.

.. Note::
    A constructor's arguments for every class in this module must be saved into the
    class attributes (self.xxx) to support save() and load().

Examples:
    >>> text_file_dataset_dir = ["/path/to/text_file_dataset_file"] # contains 1 or multiple text files
    >>> # Create a dataset for text sentences saved as line data in a file
    >>> text_file_dataset = ds.TextFileDataset(dataset_files=text_file_dataset_dir, shuffle=False)
    >>> # Tokenize sentences to unicode characters
    >>> tokenizer = text.UnicodeCharTokenizer()
    >>> # Load vocabulary from list
    >>> vocab = text.Vocab.from_list(word_list=['深', '圳', '欢', '迎', '您'])
    >>> # Use Lookup operator to map tokens to ids
    >>> lookup = text.Lookup(vocab=vocab)
    >>> text_file_dataset = text_file_dataset.map(operations=[tokenizer, lookup])
    >>> # if text line in dataset_file is:
    >>> # 深圳欢迎您
    >>> # then the output will be:
    >>> # {'text': array([0, 1, 2, 3, 4], dtype=int32)}
"""
import os
import re
import platform
import numpy as np

import mindspore._c_dataengine as cde
from mindspore import dtype as mstype

from .utils import JiebaMode, NormalizeForm, to_str, SPieceTokenizerOutType, SPieceTokenizerLoadType
from .validators import check_lookup, check_jieba_add_dict, \
    check_jieba_add_word, check_jieba_init, check_with_offsets, check_unicode_script_tokenizer, \
    check_wordpiece_tokenizer, check_regex_replace, check_regex_tokenizer, check_basic_tokenizer, check_ngram, \
    check_pair_truncate, check_to_number, check_bert_tokenizer, check_python_tokenizer, check_slidingwindow, \
    check_sentence_piece_tokenizer
from ..core.datatypes import mstype_to_detype
from ..core.validator_helpers import replace_none
from ..transforms.c_transforms import TensorOperation


class TextTensorOperation(TensorOperation):
    """
    Base class of Text Tensor Ops
    """

    def parse(self):
        raise NotImplementedError("TextTensorOperation has to implement parse() method.")


DE_C_INTER_JIEBA_MODE = {
    JiebaMode.MIX: cde.JiebaMode.DE_JIEBA_MIX,
    JiebaMode.MP: cde.JiebaMode.DE_JIEBA_MP,
    JiebaMode.HMM: cde.JiebaMode.DE_JIEBA_HMM
}

DE_C_INTER_SENTENCEPIECE_LOADTYPE = {
    SPieceTokenizerLoadType.FILE: cde.SPieceTokenizerLoadType.DE_SPIECE_TOKENIZER_LOAD_KFILE,
    SPieceTokenizerLoadType.MODEL: cde.SPieceTokenizerLoadType.DE_SPIECE_TOKENIZER_LOAD_KMODEL
}

DE_C_INTER_SENTENCEPIECE_OUTTYPE = {
    SPieceTokenizerOutType.STRING: cde.SPieceTokenizerOutType.DE_SPIECE_TOKENIZER_OUTTYPE_KString,
    SPieceTokenizerOutType.INT: cde.SPieceTokenizerOutType.DE_SPIECE_TOKENIZER_OUTTYPE_KINT
}


class JiebaTokenizer(TextTensorOperation):
    """
    Tokenize a Chinese string into words based on the dictionary.

    Note:
        The integrity of the HMMSegment algorithm and MPSegment algorithm files must be confirmed.

    Args:
        hmm_path (str): Dictionary file used by the HMMSegment algorithm.
            The dictionary can be obtained on the official website of cppjieba.
        mp_path (str): Dictionary file used by the MPSegment algorithm.
            The dictionary can be obtained on the official website of cppjieba.
        mode (JiebaMode, optional): Valid values can be any of [JiebaMode.MP, JiebaMode.HMM,
            JiebaMode.MIX] (default=JiebaMode.MIX).

            - JiebaMode.MP, tokenize with MPSegment algorithm.
            - JiebaMode.HMM, tokenize with Hidden Markov Model Segment algorithm.
            - JiebaMode.MIX, tokenize with a mix of MPSegment and HMMSegment algorithms.
        with_offsets (bool, optional): Whether or not output offsets of tokens (default=False).

    Examples:
        >>> from mindspore.dataset.text import JiebaMode
        >>> # If with_offsets=False, default output one column {["text", dtype=str]}
        >>> jieba_hmm_file = "/path/to/jieba/hmm/file"
        >>> jieba_mp_file = "/path/to/jieba/mp/file"
        >>> tokenizer_op = text.JiebaTokenizer(jieba_hmm_file, jieba_mp_file, mode=JiebaMode.MP, with_offsets=False)
        >>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op)
        >>> # If with_offsets=True, then output three columns {["token", dtype=str], ["offsets_start", dtype=uint32],
        >>> #                                                   ["offsets_limit", dtype=uint32]}
        >>> tokenizer_op = text.JiebaTokenizer(jieba_hmm_file, jieba_mp_file, mode=JiebaMode.MP, with_offsets=True)
        >>> text_file_dataset_1 = text_file_dataset_1.map(operations=tokenizer_op, input_columns=["text"],
        ...                                               output_columns=["token", "offsets_start", "offsets_limit"],
        ...                                               column_order=["token", "offsets_start", "offsets_limit"])
    """

    @check_jieba_init
    def __init__(self, hmm_path, mp_path, mode=JiebaMode.MIX, with_offsets=False):
        if not isinstance(mode, JiebaMode):
            raise TypeError("Wrong input type for mode, should be JiebaMode.")

        self.mode = mode
        self.__check_path__(hmm_path)
        self.hmm_path = hmm_path
        self.__check_path__(mp_path)
        self.mp_path = mp_path
        self.with_offsets = with_offsets
        self.words = []

    def parse(self):
        jieba_tokenizer = cde.JiebaTokenizerOperation(self.hmm_path, self.mp_path,
                                                      DE_C_INTER_JIEBA_MODE[self.mode],
                                                      self.with_offsets)
        for word in self.words:
            jieba_tokenizer.add_word(word[0], word[1])
        return jieba_tokenizer

    @check_jieba_add_word
    def add_word(self, word, freq=None):
        """
        Add a user defined word to JiebaTokenizer's dictionary.

        Args:
            word (str): The word to be added to the JiebaTokenizer instance.
                The added word will not be written into the built-in dictionary on disk.
            freq (int, optional): The frequency of the word to be added. The higher the frequency,
                the better chance the word will be tokenized (default=None, use default frequency).

        Examples:
            >>> from mindspore.dataset.text import JiebaMode
            >>> jieba_hmm_file = "/path/to/jieba/hmm/file"
            >>> jieba_mp_file = "/path/to/jieba/mp/file"
            >>> jieba_op = text.JiebaTokenizer(jieba_hmm_file, jieba_mp_file, mode=JiebaMode.MP)
            >>> sentence_piece_vocab_file = "/path/to/sentence/piece/vocab/file"
            >>> with open(sentence_piece_vocab_file, 'r') as f:
            ...     for line in f:
            ...         word = line.split(',')[0]
            ...         jieba_op.add_word(word)
            >>> text_file_dataset = text_file_dataset.map(operations=jieba_op, input_columns=["text"])
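            >>> # A hedged sketch (made-up word and frequency): an explicit freq gives the
            >>> # word a better chance of being kept as a single token.
            >>> jieba_op.add_word("深度学习", freq=10)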
        """

        if freq is None:
            self.words.append((word, 0))
        else:
            self.words.append((word, freq))

    @check_jieba_add_dict
    def add_dict(self, user_dict):
        """
        Add a user defined dictionary of word-freq pairs to JiebaTokenizer's dictionary.

        Args:
            user_dict (Union[str, dict]): Either the file path (str) of a user dictionary in the Jieba
                dictionary format, or a Python dictionary (dict) in the format {word1: freq1, word2: freq2, ...}.
                The Jieba dictionary format is: word (required), freq (optional), for example:

                .. code-block::

                    word1 freq1
                    word2 None
                    word3 freq3

                Only valid word-freq pairs in the user provided file will be added into the dictionary.
                Rows containing invalid input will be ignored; no error or warning status is returned.

        Examples:
            >>> from mindspore.dataset.text import JiebaMode
            >>> jieba_hmm_file = "/path/to/jieba/hmm/file"
            >>> jieba_mp_file = "/path/to/jieba/mp/file"
            >>> user_dict = {"男默女泪": 10}
            >>> jieba_op = text.JiebaTokenizer(jieba_hmm_file, jieba_mp_file, mode=JiebaMode.MP)
            >>> jieba_op.add_dict(user_dict)
            >>> text_file_dataset = text_file_dataset.map(operations=jieba_op, input_columns=["text"])
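            >>> # A hedged sketch (hypothetical placeholder path): the same word-freq pairs could
            >>> # instead be loaded from a file that follows the word/freq format shown above.
            >>> jieba_op.add_dict("/path/to/user/dict/file")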
        """

        if isinstance(user_dict, str):
            self.__add_dict_py_file(user_dict)
        elif isinstance(user_dict, dict):
            for k, v in user_dict.items():
                self.add_word(k, v)
        else:
            raise TypeError("The type of user_dict must be str or dict.")

    def __add_dict_py_file(self, file_path):
        """Add user defined words from a file"""
        words_list = self.__parser_file(file_path)
        for data in words_list:
            if data[1] is None:
                freq = 0
            else:
                freq = int(data[1])
            self.add_word(data[0], freq)

    def __parser_file(self, file_path):
        """Parse user defined words from a file"""
        if not os.path.exists(file_path):
            raise ValueError(
                "user dict file {} does not exist.".format(file_path))
        real_file_path = os.path.realpath(file_path)
        file_dict = open(real_file_path)
        data_re = re.compile('^\\s*([^\\s*]+?)\\s*([0-9]+)?\\s*$', re.U)
        words_list = []
        for item in file_dict:
            data = item.strip()
            if not isinstance(data, str):
                data = self.__decode(data)
            tmp = data_re.match(data)
            if not tmp:
                continue
            words = tmp.groups()
            words_list.append(words)
        file_dict.close()
        return words_list

    def __decode(self, data):
        """Decode the dict file to utf8"""
        try:
            data = data.decode('utf-8')
        except UnicodeDecodeError:
            raise ValueError("user dict file must be in utf8 format.")
        return data.lstrip('\ufeff')

    def __check_path__(self, model_path):
        """check model path"""
        if not os.path.exists(os.path.realpath(model_path)):
            raise ValueError(
                "jieba model file {} does not exist.".format(model_path))


class Lookup(TextTensorOperation):
    """
    Look up a word to its id according to the input vocabulary table.

    Args:
        vocab (Vocab): A vocabulary object.
        unknown_token (str, optional): Word used for lookup when the input word is out of vocabulary (OOV);
            the lookup result will then be the id of unknown_token. If unknown_token is not specified, or the
            specified unknown_token is itself OOV, a runtime error will be thrown when an OOV word is
            encountered (default=None, meaning no unknown_token is specified).
        data_type (mindspore.dtype, optional): The data type that lookup operation maps
            string to (default=mindspore.int32).

    Examples:
        >>> # Load vocabulary from list
        >>> vocab = text.Vocab.from_list(['深', '圳', '欢', '迎', '您'])
        >>> # Use Lookup operator to map tokens to ids
        >>> lookup = text.Lookup(vocab)
        >>> text_file_dataset = text_file_dataset.map(operations=[lookup])
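        >>> # A hedged sketch: passing an in-vocabulary unknown_token (here '您') makes OOV words
        >>> # map to that token's id instead of raising a runtime error.
        >>> lookup_unk = text.Lookup(vocab, unknown_token='您')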
    """

    @check_lookup
    def __init__(self, vocab, unknown_token=None, data_type=mstype.int32):
        self.vocab = vocab
        self.unknown_token = unknown_token
        self.data_type = data_type

    def parse(self):
        return cde.LookupOperation(self.vocab, self.unknown_token, str(mstype_to_detype(self.data_type)))


class Ngram(TextTensorOperation):
    """
    TensorOp to generate n-gram from a 1-D string Tensor.

    Refer to https://en.wikipedia.org/wiki/N-gram#Examples for an overview of what n-gram is and how it works.

    Args:
        n (list[int]): n in n-gram, which is a list of positive integers. For example, if n=[4, 3], then the result
            would be a 4-gram followed by a 3-gram in the same tensor. If the number of words is not enough to make
            up an n-gram, an empty string will be returned. For example, 3 grams on ["mindspore", "best"] will
            result in an empty string.
        left_pad (tuple, optional): Padding performed on left side of the sequence shaped like ("pad_token", pad_width).
            `pad_width` will be capped at n-1. For example, specifying left_pad=("_", 2) would pad left side of the
            sequence with "__" (default=("", 0), no padding).
        right_pad (tuple, optional): Padding performed on right side of the sequence shaped like
            ("pad_token", pad_width). `pad_width` will be capped at n-1. For example, specifying right_pad=("_", 2)
            would pad right side of the sequence with "__" (default=("", 0), no padding).
        separator (str, optional): Symbol used to join strings together. For example, if 2-gram is
            ["mindspore", "amazing"] with separator="-", the result would be ["mindspore-amazing"]
            (default=" ", which joins tokens with a single space).

    Examples:
        >>> ngram_op = text.Ngram(3, separator="-")
        >>> output = ngram_op(["WildRose Country", "Canada's Ocean Playground", "Land of Living Skies"])
        >>> # output
        >>> # ["WildRose Country-Canada's Ocean Playground-Land of Living Skies"]
        >>> # same ngram_op called through map
        >>> text_file_dataset = text_file_dataset.map(operations=ngram_op)
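        >>> # A hedged sketch of padding (values chosen for illustration): with n=[2],
        >>> # left_pad=("_", 1) and right_pad=("_", 1), the input ["mindspore", "amazing"] would
        >>> # produce ["_ mindspore", "mindspore amazing", "amazing _"] with the default separator.
        >>> padded_ngram_op = text.Ngram([2], left_pad=("_", 1), right_pad=("_", 1))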
    """

    @check_ngram
    def __init__(self, n, left_pad=("", 0), right_pad=("", 0), separator=" "):
        self.ngrams = n
        self.left_pad = left_pad
        self.right_pad = right_pad
        self.separator = separator

    def parse(self):
        return cde.NgramOperation(self.ngrams, self.left_pad, self.right_pad, self.separator)


class SentencePieceTokenizer(TextTensorOperation):
    """
    Tokenize a scalar token or 1-D tokens to tokens by sentencepiece.

    Args:
        mode (Union[str, SentencePieceVocab]): If the input parameter is a file path, its type should be string.
            If the input parameter is a SentencePieceVocab object, its type should be SentencePieceVocab.
        out_type (SPieceTokenizerOutType): The type of output, it can be any of [SPieceTokenizerOutType.STRING,
            SPieceTokenizerOutType.INT].

            - SPieceTokenizerOutType.STRING, means output type of SentencePiece Tokenizer is string.
            - SPieceTokenizerOutType.INT, means output type of SentencePiece Tokenizer is int.

    Examples:
        >>> from mindspore.dataset.text import SentencePieceModel, SPieceTokenizerOutType
        >>> sentence_piece_vocab_file = "/path/to/sentence/piece/vocab/file"
        >>> vocab = text.SentencePieceVocab.from_file([sentence_piece_vocab_file], 5000, 0.9995,
        ...                                           SentencePieceModel.UNIGRAM, {})
        >>> tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
        >>> text_file_dataset = text_file_dataset.map(operations=tokenizer)
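        >>> # A hedged note: with out_type=SPieceTokenizerOutType.INT the output column would hold
        >>> # subword ids (integers) instead of subword strings.
        >>> tokenizer_int = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.INT)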
    """
    @check_sentence_piece_tokenizer
    def __init__(self, mode, out_type):
        self.mode = mode
        self.out_type = out_type

    def parse(self):
        return cde.SentencePieceTokenizerOperation(self.mode, DE_C_INTER_SENTENCEPIECE_OUTTYPE[self.out_type])


class SlidingWindow(TextTensorOperation):
    """
    Construct a tensor from given data (only supports 1-D for now), where each element in the dimension axis
    is a slice of data starting at the corresponding position, with a specified width.

    Args:
        width (int): The width of the window. It must be an integer and greater than zero.
        axis (int, optional): The axis along which the sliding window is computed (default=0).

    Examples:
        >>> dataset = ds.NumpySlicesDataset(data=[[1, 2, 3, 4, 5]], column_names="col1")
        >>> # Data before
        >>> # |     col1     |
        >>> # +--------------+
        >>> # | [[1, 2, 3, 4, 5]] |
        >>> # +--------------+
        >>> dataset = dataset.map(operations=text.SlidingWindow(3, 0))
        >>> # Data after
        >>> # |     col1     |
        >>> # +--------------+
        >>> # |  [[1, 2, 3], |
        >>> # |   [2, 3, 4], |
        >>> # |   [3, 4, 5]] |
        >>> # +--------------+
    """

    @check_slidingwindow
    def __init__(self, width, axis=0):
        self.width = width
        self.axis = axis

    def parse(self):
        return cde.SlidingWindowOperation(self.width, self.axis)


class ToNumber(TextTensorOperation):
    """
    Tensor operation to convert every element of a string tensor to a number.

    Strings are cast according to the rules specified in the following links, except that any strings which represent
    negative numbers cannot be cast to an unsigned integer type:
    https://en.cppreference.com/w/cpp/string/basic_string/stof,
    https://en.cppreference.com/w/cpp/string/basic_string/stoul.

    Args:
        data_type (mindspore.dtype): Type to be cast to. Must be a numeric type in mindspore.dtype.

    Raises:
        RuntimeError: If strings are invalid to cast, or are out of range after being cast.

    Examples:
        >>> from mindspore import dtype as mstype
        >>> data = [["1", "2", "3"]]
        >>> dataset = ds.NumpySlicesDataset(data)
        >>> to_number_op = text.ToNumber(mstype.int8)
        >>> dataset = dataset.map(operations=to_number_op)
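        >>> # A hedged illustration of the failure mode described above: a value such as "128"
        >>> # is out of range for a signed 8-bit integer, so casting it with mstype.int8 would
        >>> # raise RuntimeError.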
    """

    @check_to_number
    def __init__(self, data_type):
        data_type = mstype_to_detype(data_type)
        self.data_type = str(data_type)

    def parse(self):
        return cde.ToNumberOperation(self.data_type)


class TruncateSequencePair(TextTensorOperation):
    """
    Truncate a pair of rank-1 tensors such that the total length is no more than max_length.

    This operation takes two input tensors and returns two output Tensors.

    Args:
        max_length (int): Maximum length required.

    Examples:
        >>> dataset = ds.NumpySlicesDataset(data={"col1": [[1, 2, 3]], "col2": [[4, 5]]})
        >>> # Data before
        >>> # |   col1    |   col2    |
        >>> # +-----------+-----------+
        >>> # | [1, 2, 3] |  [4, 5]   |
        >>> # +-----------+-----------+
        >>> truncate_sequence_pair_op = text.TruncateSequencePair(max_length=4)
        >>> dataset = dataset.map(operations=truncate_sequence_pair_op)
        >>> # Data after
        >>> # |   col1    |   col2    |
        >>> # +-----------+-----------+
        >>> # |  [1, 2]   |  [4, 5]   |
        >>> # +-----------+-----------+
    """

    @check_pair_truncate
    def __init__(self, max_length):
        self.max_length = max_length

    def parse(self):
        return cde.TruncateSequencePairOperation(self.max_length)


class UnicodeCharTokenizer(TextTensorOperation):
    """
    Tokenize a scalar tensor of UTF-8 string to Unicode characters.

    Args:
        with_offsets (bool, optional): Whether or not output offsets of tokens (default=False).

    Examples:
        >>> # If with_offsets=False, default output one column {["text", dtype=str]}
        >>> tokenizer_op = text.UnicodeCharTokenizer(with_offsets=False)
        >>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op)
        >>> # If with_offsets=True, then output three columns {["token", dtype=str], ["offsets_start", dtype=uint32],
        >>> #                                                   ["offsets_limit", dtype=uint32]}
        >>> tokenizer_op = text.UnicodeCharTokenizer(with_offsets=True)
        >>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op, input_columns=["text"],
        ...                                           output_columns=["token", "offsets_start", "offsets_limit"],
        ...                                           column_order=["token", "offsets_start", "offsets_limit"])
    """

    @check_with_offsets
    def __init__(self, with_offsets=False):
        self.with_offsets = with_offsets

    def parse(self):
        return cde.UnicodeCharTokenizerOperation(self.with_offsets)


class WordpieceTokenizer(TextTensorOperation):
    """
    Tokenize scalar token or 1-D tokens to 1-D subword tokens.

    Args:
        vocab (Vocab): A vocabulary object.
        suffix_indicator (str, optional): Used to show that the subword is a continuation of a word rather
            than its first piece (default='##').
        max_bytes_per_token (int, optional): Tokens exceeding this length will not be further split (default=100).
        unknown_token (str, optional): When a token cannot be found: if 'unknown_token' is empty string,
            return the token directly, else return 'unknown_token' (default='[UNK]').
        with_offsets (bool, optional): Whether or not output offsets of tokens (default=False).

    Examples:
        >>> vocab_list = ["book", "cholera", "era", "favor", "##ite", "my", "is", "love", "dur", "##ing", "the"]
        >>> vocab = text.Vocab.from_list(vocab_list)
        >>> # If with_offsets=False, default output one column {["text", dtype=str]}
        >>> tokenizer_op = text.WordpieceTokenizer(vocab=vocab, unknown_token='[UNK]',
        ...                                        max_bytes_per_token=100, with_offsets=False)
        >>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op)
        >>> # If with_offsets=True, then output three columns {["token", dtype=str], ["offsets_start", dtype=uint32],
        >>> #                                                   ["offsets_limit", dtype=uint32]}
        >>> tokenizer_op = text.WordpieceTokenizer(vocab=vocab, unknown_token='[UNK]',
        ...                                        max_bytes_per_token=100, with_offsets=True)
        >>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op, input_columns=["text"],
        ...                                           output_columns=["token", "offsets_start", "offsets_limit"],
        ...                                           column_order=["token", "offsets_start", "offsets_limit"])
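        >>> # A hedged sketch of the expected behaviour (input tokens assumed for illustration):
        >>> # with the vocab above, the tokens ["my", "favorite", "book"] would come out as
        >>> # ["my", "favor", "##ite", "book"], and a token not covered by the vocab becomes '[UNK]'.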
    """

    @check_wordpiece_tokenizer
    def __init__(self, vocab, suffix_indicator='##', max_bytes_per_token=100,
                 unknown_token='[UNK]', with_offsets=False):
        self.vocab = vocab
        self.suffix_indicator = suffix_indicator
        self.max_bytes_per_token = max_bytes_per_token
        self.unknown_token = unknown_token
        self.with_offsets = with_offsets

    def parse(self):
        return cde.WordpieceTokenizerOperation(self.vocab, self.suffix_indicator, self.max_bytes_per_token,
                                               self.unknown_token, self.with_offsets)


class PythonTokenizer:
    """
    Class that applies a user-defined string tokenizer to the input string.

    Args:
        tokenizer (Callable): Python function that takes a `str` and returns a list of `str` as tokens.

    Examples:
        >>> def my_tokenizer(line):
        ...     return line.split()
        >>> text_file_dataset = text_file_dataset.map(operations=text.PythonTokenizer(my_tokenizer))
    """

    @check_python_tokenizer
    def __init__(self, tokenizer):
        self.pyfunc = tokenizer
        self.tokenizer = np.vectorize(lambda x: np.array(tokenizer(x), dtype='U'), signature='()->(n)')
        self.random = False

    def __call__(self, in_array):
        if not isinstance(in_array, np.ndarray):
            raise TypeError("input should be a NumPy array. Got {}.".format(type(in_array)))
        if in_array.dtype.type is np.bytes_:
            in_array = to_str(in_array)
        try:
            tokens = self.tokenizer(in_array)
        except Exception as e:
            raise RuntimeError("Error occurred in Pyfunc [" + str(self.pyfunc.__name__) + "], error message: " + str(e))
        return tokens


if platform.system().lower() != 'windows':
    DE_C_INTER_NORMALIZE_FORM = {
        NormalizeForm.NONE: cde.NormalizeForm.DE_NORMALIZE_NONE,
        NormalizeForm.NFC: cde.NormalizeForm.DE_NORMALIZE_NFC,
        NormalizeForm.NFKC: cde.NormalizeForm.DE_NORMALIZE_NFKC,
        NormalizeForm.NFD: cde.NormalizeForm.DE_NORMALIZE_NFD,
        NormalizeForm.NFKD: cde.NormalizeForm.DE_NORMALIZE_NFKD
    }


    class BasicTokenizer(TextTensorOperation):
        """
        Tokenize a scalar tensor of UTF-8 string by specific rules.

        Note:
            BasicTokenizer is not supported on Windows platform yet.

        Args:
            lower_case (bool, optional): If True, apply CaseFold, NormalizeUTF8 with `NFD` mode, RegexReplace operation
                on input text to fold the text to lower case and strip accented characters. If False, only apply
                NormalizeUTF8 operation with the specified mode on input text (default=False).
            keep_whitespace (bool, optional): If True, the whitespace will be kept in output tokens (default=False).
            normalization_form (NormalizeForm, optional): Used to specify a specific normalize mode
                (default=NormalizeForm.NONE). This is only effective when `lower_case` is False. It can be any of
                [NormalizeForm.NONE, NormalizeForm.NFC, NormalizeForm.NFKC, NormalizeForm.NFD, NormalizeForm.NFKD].

                - NormalizeForm.NONE, do nothing for input string tensor.
                - NormalizeForm.NFC, normalize with Normalization Form C.
                - NormalizeForm.NFKC, normalize with Normalization Form KC.
                - NormalizeForm.NFD, normalize with Normalization Form D.
                - NormalizeForm.NFKD, normalize with Normalization Form KD.

            preserve_unused_token (bool, optional): If True, do not split special tokens like
                '[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]' (default=True).
            with_offsets (bool, optional): Whether or not output offsets of tokens (default=False).

        Examples:
            >>> from mindspore.dataset.text import NormalizeForm
            >>>
            >>> # If with_offsets=False, default output one column {["text", dtype=str]}
            >>> tokenizer_op = text.BasicTokenizer(lower_case=False,
            ...                                    keep_whitespace=False,
            ...                                    normalization_form=NormalizeForm.NONE,
            ...                                    preserve_unused_token=True,
            ...                                    with_offsets=False)
            >>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op)
            >>> # If with_offsets=True, then output three columns {["token", dtype=str],
            >>> #                                                   ["offsets_start", dtype=uint32],
            >>> #                                                   ["offsets_limit", dtype=uint32]}
            >>> tokenizer_op = text.BasicTokenizer(lower_case=False,
            ...                                    keep_whitespace=False,
            ...                                    normalization_form=NormalizeForm.NONE,
            ...                                    preserve_unused_token=True,
            ...                                    with_offsets=True)
            >>> text_file_dataset_1 = text_file_dataset_1.map(operations=tokenizer_op, input_columns=["text"],
            ...                                               output_columns=["token", "offsets_start",
            ...                                                               "offsets_limit"],
            ...                                               column_order=["token", "offsets_start",
            ...                                                             "offsets_limit"])

        """

        @check_basic_tokenizer
        def __init__(self, lower_case=False, keep_whitespace=False, normalization_form=NormalizeForm.NONE,
                     preserve_unused_token=True, with_offsets=False):
            if not isinstance(normalization_form, NormalizeForm):
                raise TypeError("Wrong input type for normalization_form, should be enum of 'NormalizeForm'.")

            self.lower_case = lower_case
            self.keep_whitespace = keep_whitespace
            self.normalization_form = DE_C_INTER_NORMALIZE_FORM[normalization_form]
            self.preserve_unused_token = preserve_unused_token
            self.with_offsets = with_offsets

        def parse(self):
            return cde.BasicTokenizerOperation(self.lower_case, self.keep_whitespace, self.normalization_form,
                                               self.preserve_unused_token, self.with_offsets)


    class BertTokenizer(TextTensorOperation):
        """
        Tokenizer used for Bert text process.

        Note:
            BertTokenizer is not supported on Windows platform yet.

        Args:
            vocab (Vocab): A vocabulary object.
            suffix_indicator (str, optional): Used to show that the subword is a continuation of a word rather
                than its first piece (default='##').
            max_bytes_per_token (int, optional): Tokens exceeding this length will not be further
                split (default=100).
            unknown_token (str, optional): When an unknown token is found, return the token directly if `unknown_token`
                is an empty string, else return `unknown_token` instead (default='[UNK]').
            lower_case (bool, optional): If True, apply CaseFold, NormalizeUTF8 with `NFD` mode, RegexReplace operation
                on input text to fold the text to lower case and strip accented characters. If False, only apply
                NormalizeUTF8 operation with the specified mode on input text (default=False).
            keep_whitespace (bool, optional): If True, the whitespace will be kept in output tokens (default=False).
            normalization_form (NormalizeForm, optional): This parameter is used to specify a specific normalize mode,
                only effective when `lower_case` is False. See NormalizeUTF8 for details (default=NormalizeForm.NONE).
            preserve_unused_token (bool, optional): If True, do not split special tokens like
                '[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]' (default=True).
            with_offsets (bool, optional): Whether or not output offsets of tokens (default=False).

        Examples:
            >>> from mindspore.dataset.text import NormalizeForm
            >>>
            >>> # If with_offsets=False, default output one column {["text", dtype=str]}
            >>> vocab_list = ["床", "前", "明", "月", "光", "疑", "是", "地", "上", "霜", "举", "头", "望", "低",
            ...               "思", "故", "乡", "繁", "體", "字", "嘿", "哈", "大", "笑", "嘻", "i", "am", "mak",
            ...               "make", "small", "mistake", "##s", "during", "work", "##ing", "hour", "😀", "😃",
            ...               "😄", "😁", "+", "/", "-", "=", "12", "28", "40", "16", " ", "I", "[CLS]", "[SEP]",
            ...               "[UNK]", "[PAD]", "[MASK]", "[unused1]", "[unused10]"]
            >>> vocab = text.Vocab.from_list(vocab_list)
            >>> tokenizer_op = text.BertTokenizer(vocab=vocab, suffix_indicator='##', max_bytes_per_token=100,
            ...                                   unknown_token='[UNK]', lower_case=False, keep_whitespace=False,
            ...                                   normalization_form=NormalizeForm.NONE, preserve_unused_token=True,
            ...                                   with_offsets=False)
            >>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op)
            >>> # If with_offsets=True, then output three columns {["token", dtype=str],
            >>> #                                                   ["offsets_start", dtype=uint32],
            >>> #                                                   ["offsets_limit", dtype=uint32]}
            >>> tokenizer_op = text.BertTokenizer(vocab=vocab, suffix_indicator='##', max_bytes_per_token=100,
            ...                                   unknown_token='[UNK]', lower_case=False, keep_whitespace=False,
            ...                                   normalization_form=NormalizeForm.NONE, preserve_unused_token=True,
            ...                                   with_offsets=True)
            >>> text_file_dataset_1 = text_file_dataset_1.map(operations=tokenizer_op, input_columns=["text"],
            ...                                               output_columns=["token", "offsets_start",
            ...                                                               "offsets_limit"],
            ...                                               column_order=["token", "offsets_start",
            ...                                                             "offsets_limit"])

        """

        @check_bert_tokenizer
        def __init__(self, vocab, suffix_indicator='##', max_bytes_per_token=100, unknown_token='[UNK]',
                     lower_case=False, keep_whitespace=False, normalization_form=NormalizeForm.NONE,
                     preserve_unused_token=True, with_offsets=False):
            if not isinstance(normalization_form, NormalizeForm):
                raise TypeError("Wrong input type for normalization_form, should be enum of 'NormalizeForm'.")

            self.vocab = vocab
            self.suffix_indicator = suffix_indicator
            self.max_bytes_per_token = max_bytes_per_token
            self.unknown_token = unknown_token
            self.lower_case = lower_case
            self.keep_whitespace = keep_whitespace
            self.normalization_form = DE_C_INTER_NORMALIZE_FORM[normalization_form]
            self.preserve_unused_token = preserve_unused_token
            self.with_offsets = with_offsets

        def parse(self):
            return cde.BertTokenizerOperation(self.vocab, self.suffix_indicator, self.max_bytes_per_token,
                                              self.unknown_token, self.lower_case, self.keep_whitespace,
                                              self.normalization_form, self.preserve_unused_token, self.with_offsets)


    class CaseFold(TextTensorOperation):
        """
        Apply case fold operation on UTF-8 string tensor, which is an aggressive normalization that can convert
        more characters into lower case than simple lowercasing.

        Note:
            CaseFold is not supported on Windows platform yet.

        Examples:
            >>> case_op = text.CaseFold()
            >>> text_file_dataset = text_file_dataset.map(operations=case_op)
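            >>> # A hedged illustration: full case folding maps characters that plain lowercasing
            >>> # leaves alone, e.g. the German sharp s 'ß' is folded to 'ss'.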
        """

        def parse(self):
            return cde.CaseFoldOperation()


    class NormalizeUTF8(TextTensorOperation):
        """
        Apply normalize operation on UTF-8 string tensor.

        Note:
            NormalizeUTF8 is not supported on Windows platform yet.

        Args:
            normalize_form (NormalizeForm, optional): Valid values can be any of [NormalizeForm.NONE,
                NormalizeForm.NFC, NormalizeForm.NFKC, NormalizeForm.NFD, NormalizeForm.NFKD], i.e. no
                normalization or one of the four Unicode normalization forms (default=NormalizeForm.NFKC).
                See http://unicode.org/reports/tr15/ for details.

                - NormalizeForm.NONE, do nothing for input string tensor.
                - NormalizeForm.NFC, normalize with Normalization Form C.
                - NormalizeForm.NFKC, normalize with Normalization Form KC.
                - NormalizeForm.NFD, normalize with Normalization Form D.
                - NormalizeForm.NFKD, normalize with Normalization Form KD.

        Examples:
            >>> from mindspore.dataset.text import NormalizeForm
            >>> normalize_op = text.NormalizeUTF8(normalize_form=NormalizeForm.NFC)
            >>> text_file_dataset = text_file_dataset.map(operations=normalize_op)
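            >>> # A hedged illustration of how the forms differ (character chosen for illustration):
            >>> # NFKC folds the full-width letter 'Ａ' to 'A', while NFC leaves it unchanged.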
        """

        def __init__(self, normalize_form=NormalizeForm.NFKC):
            if not isinstance(normalize_form, NormalizeForm):
                raise TypeError("Wrong input type for normalization_form, should be enum of 'NormalizeForm'.")

            normalize_form = replace_none(normalize_form, NormalizeForm.NFKC)
            self.normalize_form = DE_C_INTER_NORMALIZE_FORM[normalize_form]

        def parse(self):
            return cde.NormalizeUTF8Operation(self.normalize_form)


    class RegexReplace(TextTensorOperation):
        """
        Replace a part of the UTF-8 string tensor with given text according to regular expressions.

        See https://unicode-org.github.io/icu/userguide/strings/regexp.html for supported regex pattern.

        Note:
            RegexReplace is not supported on Windows platform yet.

        Args:
            pattern (str): The regular expression pattern.
            replace (str): The string to replace the matched element with.
            replace_all (bool, optional): If False, only replace the first matched element;
                if True, replace all matched elements (default=True).

        Examples:
            >>> pattern = 'Canada'
            >>> replace = 'China'
            >>> replace_op = text.RegexReplace(pattern, replace)
            >>> text_file_dataset = text_file_dataset.map(operations=replace_op)
        """

        @check_regex_replace
        def __init__(self, pattern, replace, replace_all=True):
            self.pattern = pattern
            self.replace = replace
            self.replace_all = replace_all

        def parse(self):
            return cde.RegexReplaceOperation(self.pattern, self.replace, self.replace_all)


    class RegexTokenizer(TextTensorOperation):
        """
        Tokenize a scalar tensor of UTF-8 string by a regex expression pattern.

        See https://unicode-org.github.io/icu/userguide/strings/regexp.html for supported regex pattern.

        Note:
            RegexTokenizer is not supported on Windows platform yet.

        Args:
            delim_pattern (str): The pattern of regex delimiters.
                The original string will be split by matched elements.
            keep_delim_pattern (str, optional): The string matched by 'delim_pattern' can be kept as a token
                if it can be matched by 'keep_delim_pattern'. The default value is an empty string,
                which means that delimiters will not be kept as an output token (default='').
            with_offsets (bool, optional): Whether or not output offsets of tokens (default=False).

        Examples:
            >>> # If with_offsets=False, default output is one column {["text", dtype=str]}
            >>> delim_pattern = r"[ |,]"
            >>> tokenizer_op = text.RegexTokenizer(delim_pattern, with_offsets=False)
            >>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op)
            >>> # If with_offsets=True, then output three columns {["token", dtype=str],
            >>> #                                                   ["offsets_start", dtype=uint32],
            >>> #                                                   ["offsets_limit", dtype=uint32]}
            >>> tokenizer_op = text.RegexTokenizer(delim_pattern, with_offsets=True)
            >>> text_file_dataset_1 = text_file_dataset_1.map(operations=tokenizer_op, input_columns=["text"],
            ...                                               output_columns=["token", "offsets_start",
            ...                                                               "offsets_limit"],
            ...                                               column_order=["token", "offsets_start",
            ...                                                             "offsets_limit"])
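            >>> # A hedged sketch of keeping delimiters (pattern chosen for illustration): with
            >>> # keep_delim_pattern=",", commas are emitted as tokens while spaces are still dropped.
            >>> tokenizer_keep_op = text.RegexTokenizer(delim_pattern, keep_delim_pattern=",", with_offsets=False)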
        """

        @check_regex_tokenizer
        def __init__(self, delim_pattern, keep_delim_pattern='', with_offsets=False):
            self.delim_pattern = delim_pattern
            self.keep_delim_pattern = keep_delim_pattern
            self.with_offsets = with_offsets

        def parse(self):
            return cde.RegexTokenizerOperation(self.delim_pattern, self.keep_delim_pattern, self.with_offsets)


    class UnicodeScriptTokenizer(TextTensorOperation):
        """
        Tokenize a scalar tensor of UTF-8 string based on Unicode script boundaries.

        Note:
            UnicodeScriptTokenizer is not supported on Windows platform yet.

        Args:
            keep_whitespace (bool, optional): Whether or not emit whitespace tokens (default=False).
            with_offsets (bool, optional): Whether or not output offsets of tokens (default=False).

        Examples:
            >>> # If with_offsets=False, default output one column {["text", dtype=str]}
            >>> tokenizer_op = text.UnicodeScriptTokenizer(keep_whitespace=True, with_offsets=False)
            >>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op)
            >>> # If with_offsets=True, then output three columns {["token", dtype=str],
            >>> #                                                  ["offsets_start", dtype=uint32],
            >>> #                                                  ["offsets_limit", dtype=uint32]}
            >>> tokenizer_op = text.UnicodeScriptTokenizer(keep_whitespace=True, with_offsets=True)
            >>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op, input_columns=["text"],
            ...                                           output_columns=["token", "offsets_start", "offsets_limit"],
            ...                                           column_order=["token", "offsets_start", "offsets_limit"])

        """

        @check_unicode_script_tokenizer
        def __init__(self, keep_whitespace=False, with_offsets=False):
            keep_whitespace = replace_none(keep_whitespace, False)
            with_offsets = replace_none(with_offsets, False)
            self.keep_whitespace = keep_whitespace
            self.with_offsets = with_offsets

        def parse(self):
            return cde.UnicodeScriptTokenizerOperation(self.keep_whitespace, self.with_offsets)


    class WhitespaceTokenizer(TextTensorOperation):
        """
        Tokenize a scalar tensor of UTF-8 string on ICU4C defined whitespaces, such as: ' ', '\\\\t', '\\\\r', '\\\\n'.

        Note:
            WhitespaceTokenizer is not supported on Windows platform yet.

        Args:
            with_offsets (bool, optional): Whether or not output offsets of tokens (default=False).

        Examples:
            >>> # If with_offsets=False, default output one column {["text", dtype=str]}
            >>> tokenizer_op = text.WhitespaceTokenizer(with_offsets=False)
            >>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op)
            >>> # If with_offsets=True, then output three columns {["token", dtype=str],
            >>> #                                                   ["offsets_start", dtype=uint32],
            >>> #                                                   ["offsets_limit", dtype=uint32]}
            >>> tokenizer_op = text.WhitespaceTokenizer(with_offsets=True)
            >>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op, input_columns=["text"],
            ...                                           output_columns=["token", "offsets_start", "offsets_limit"],
            ...                                           column_order=["token", "offsets_start", "offsets_limit"])
        """

        @check_with_offsets
        def __init__(self, with_offsets=False):
            self.with_offsets = with_offsets

        def parse(self):
            return cde.WhitespaceTokenizerOperation(self.with_offsets)
