1# Copyright 2020-2022 Huawei Technologies Co., Ltd 2# 3# Licensed under the Apache License, Version 2.0 (the "License"); 4# you may not use this file except in compliance with the License. 5# You may obtain a copy of the License at 6# 7# http://www.apache.org/licenses/LICENSE-2.0 8# 9# Unless required by applicable law or agreed to in writing, software 10# distributed under the License is distributed on an "AS IS" BASIS, 11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12# See the License for the specific language governing permissions and 13# limitations under the License. 14""" 15The module text.transforms is inherited from _c_dataengine 16and is implemented based on ICU4C and cppjieba in C++. 17It's a high performance module to process NLP text. 18Users can use Vocab to build their own dictionary, 19use appropriate tokenizers to split sentences into different tokens, 20and use Lookup to find the index of tokens in Vocab. 21 22.. Note:: 23 A constructor's arguments for every class in this module must be saved into the 24 class attributes (self.xxx) to support save() and load(). 25 26Examples: 27 >>> import mindspore.dataset as ds 28 >>> import mindspore.dataset.text as text 29 >>> 30 >>> # Create a dataset for text sentences saved as line data in a file 31 >>> text_file_list = ["/path/to/text_file_dataset_file"] # contains 1 or multiple text files 32 >>> text_file_dataset = ds.TextFileDataset(dataset_files=text_file_list, shuffle=False) 33 >>> 34 >>> # Tokenize sentences to unicode characters 35 >>> tokenizer = text.UnicodeCharTokenizer() 36 >>> # Load vocabulary from list 37 >>> vocab = text.Vocab.from_list(word_list=['深', '圳', '欢', '迎', '您']) 38 >>> # Use Lookup operation to map tokens to ids 39 >>> lookup = text.Lookup(vocab=vocab) 40 >>> text_file_dataset = text_file_dataset.map(operations=[tokenizer, lookup]) 41 >>> # if text line in dataset_file is: 42 >>> # 深圳欢迎您 43 >>> # then the output will be: 44 >>> # {'text': array([0, 1, 2, 3, 4], dtype=int32)} 45""" 46import json 47import os 48import re 49import platform 50import numpy as np 51 52import mindspore._c_dataengine as cde 53from mindspore.common import dtype as mstype 54 55from .utils import JiebaMode, NormalizeForm, to_str, SPieceTokenizerOutType, SPieceTokenizerLoadType, SentencePieceVocab 56from .validators import check_add_token, check_lookup, check_jieba_add_dict, check_to_vectors, \ 57 check_jieba_add_word, check_jieba_init, check_with_offsets, check_unicode_script_tokenizer, \ 58 check_wordpiece_tokenizer, check_regex_replace, check_regex_tokenizer, check_basic_tokenizer, check_ngram, \ 59 check_pair_truncate, check_to_number, check_bert_tokenizer, check_python_tokenizer, check_slidingwindow, \ 60 check_sentence_piece_tokenizer, check_truncate 61from ..core.datatypes import mstype_to_detype 62from ..core.validator_helpers import replace_none 63from ..transforms.py_transforms_util import Implementation 64from ..transforms.transforms import TensorOperation 65from ..transforms.validators import invalidate_callable 66 67 68class TextTensorOperation(TensorOperation): 69 """ 70 Base class of Text Tensor Ops 71 """ 72 73 def __init__(self): 74 super().__init__() 75 self.implementation = Implementation.C 76 77 def parse(self): 78 raise NotImplementedError("TextTensorOperation has to implement parse() method.") 79 80 81DE_C_INTER_JIEBA_MODE = { 82 JiebaMode.MIX: cde.JiebaMode.DE_JIEBA_MIX, 83 JiebaMode.MP: cde.JiebaMode.DE_JIEBA_MP, 84 JiebaMode.HMM: cde.JiebaMode.DE_JIEBA_HMM 85} 86 87DE_C_INTER_SENTENCEPIECE_LOADTYPE = { 88 SPieceTokenizerLoadType.FILE: cde.SPieceTokenizerLoadType.DE_SPIECE_TOKENIZER_LOAD_KFILE, 89 SPieceTokenizerLoadType.MODEL: cde.SPieceTokenizerLoadType.DE_SPIECE_TOKENIZER_LOAD_KMODEL 90} 91 92DE_C_INTER_SENTENCEPIECE_OUTTYPE = { 93 SPieceTokenizerOutType.STRING: cde.SPieceTokenizerOutType.DE_SPIECE_TOKENIZER_OUTTYPE_KString, 94 SPieceTokenizerOutType.INT: cde.SPieceTokenizerOutType.DE_SPIECE_TOKENIZER_OUTTYPE_KINT 95} 96 97 98class AddToken(TextTensorOperation): 99 """ 100 Add token to beginning or end of sequence. 101 102 Args: 103 token (str): The token to be added. 104 begin (bool, optional): Choose the position where the token is inserted. If True, 105 the token will be inserted at the beginning of the sequence. Otherwise, it will 106 be inserted at the end of the sequence. Default: ``True``. 107 108 Raises: 109 TypeError: If `token` is not of type string. 110 TypeError: If `begin` is not of type bool. 111 112 Supported Platforms: 113 ``CPU`` 114 115 Examples: 116 >>> import mindspore.dataset as ds 117 >>> import mindspore.dataset.text as text 118 >>> 119 >>> # Use the transform in dataset pipeline mode 120 >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=[['a', 'b', 'c', 'd', 'e']], column_names=["text"]) 121 >>> # Data before 122 >>> # | text | 123 >>> # +---------------------------+ 124 >>> # | ['a', 'b', 'c', 'd', 'e'] | 125 >>> # +---------------------------+ 126 >>> add_token_op = text.AddToken(token='TOKEN', begin=True) 127 >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=add_token_op) 128 >>> for item in numpy_slices_dataset.create_dict_iterator(num_epochs=1, output_numpy=True): 129 ... print(item["text"]) 130 ['TOKEN' 'a' 'b' 'c' 'd' 'e'] 131 >>> # Data after 132 >>> # | text | 133 >>> # +---------------------------+ 134 >>> # | ['TOKEN', 'a', 'b', 'c', 'd', 'e'] | 135 >>> # +---------------------------+ 136 >>> 137 >>> # Use the transform in eager mode 138 >>> data = ["happy", "birthday", "to", "you"] 139 >>> output = text.AddToken(token='TOKEN', begin=True)(data) 140 >>> print(output) 141 ['TOKEN' 'happy' 'birthday' 'to' 'you'] 142 143 Tutorial Examples: 144 - `Illustration of text transforms 145 <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/text_gallery.html>`_ 146 """ 147 148 @check_add_token 149 def __init__(self, token, begin=True): 150 super().__init__() 151 self.token = token 152 self.begin = begin 153 154 def parse(self): 155 return cde.AddTokenOperation(self.token, self.begin) 156 157 158class JiebaTokenizer(TextTensorOperation): 159 """ 160 Use Jieba tokenizer to tokenize Chinese strings. 161 162 Note: 163 The dictionary files used by Hidden Markov Model segment and Max Probability segment can be 164 obtained through the `cppjieba GitHub <https://github.com/yanyiwu/cppjieba/tree/master/dict>`_ . 165 Please ensure the validity and integrity of these files. 166 167 Args: 168 hmm_path (str): Path to the dictionary file used by Hidden Markov Model segment. 169 mp_path (str): Path to the dictionary file used by Max Probability segment. 170 mode (JiebaMode, optional): The desired segment algorithms. See :class:`~.text.JiebaMode` 171 for details on optional values. Default: ``JiebaMode.MIX`` . 172 with_offsets (bool, optional): Whether to output the start and end offsets of each 173 token in the original string. Default: ``False`` . 174 175 Raises: 176 TypeError: If `hmm_path` is not of type str. 177 TypeError: If `mp_path` is not of type str. 178 TypeError: If `mode` is not of type :class:`~.text.JiebaMode` . 179 TypeError: If `with_offsets` is not of type bool. 180 181 Supported Platforms: 182 ``CPU`` 183 184 Examples: 185 >>> import mindspore.dataset as ds 186 >>> import mindspore.dataset.text as text 187 >>> from mindspore.dataset.text import JiebaMode 188 >>> 189 >>> # Use the transform in dataset pipeline mode 190 >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=["床前明月光"], column_names=["text"]) 191 >>> 192 >>> # 1) If with_offsets=False, return one data column {["text", dtype=str]} 193 >>> # The paths to jieba_hmm_file and jieba_mp_file can be downloaded directly from the mindspore repository. 194 >>> # Refer to https://gitee.com/mindspore/mindspore/blob/master/tests/ut/data/dataset/jiebadict/hmm_model.utf8 195 >>> # and https://gitee.com/mindspore/mindspore/blob/master/tests/ut/data/dataset/jiebadict/jieba.dict.utf8 196 >>> jieba_hmm_file = "tests/ut/data/dataset/jiebadict/hmm_model.utf8" 197 >>> jieba_mp_file = "tests/ut/data/dataset/jiebadict/jieba.dict.utf8" 198 >>> tokenizer_op = text.JiebaTokenizer(jieba_hmm_file, jieba_mp_file, mode=JiebaMode.MP, with_offsets=False) 199 >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=tokenizer_op) 200 >>> for item in numpy_slices_dataset.create_dict_iterator(num_epochs=1, output_numpy=True): 201 ... print(item["text"]) 202 ['床' '前' '明月光'] 203 >>> 204 >>> # 2) If with_offsets=True, return three columns {["token", dtype=str], ["offsets_start", dtype=uint32], 205 >>> # ["offsets_limit", dtype=uint32]} 206 >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=["床前明月光"], column_names=["text"]) 207 >>> tokenizer_op = text.JiebaTokenizer(jieba_hmm_file, jieba_mp_file, mode=JiebaMode.MP, with_offsets=True) 208 >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=tokenizer_op, input_columns=["text"], 209 ... output_columns=["token", "offsets_start", "offsets_limit"]) 210 >>> for item in numpy_slices_dataset.create_dict_iterator(num_epochs=1, output_numpy=True): 211 ... print(item["token"], item["offsets_start"], item["offsets_limit"]) 212 ['床' '前' '明月光'] [0 3 6] [ 3 6 15] 213 >>> 214 >>> # Use the transform in eager mode 215 >>> data = "床前明月光" 216 >>> output = text.JiebaTokenizer(jieba_hmm_file, jieba_mp_file, mode=JiebaMode.MP)(data) 217 >>> print(output) 218 ['床' '前' '明月光'] 219 220 Tutorial Examples: 221 - `Illustration of text transforms 222 <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/text_gallery.html>`_ 223 """ 224 225 @check_jieba_init 226 def __init__(self, hmm_path, mp_path, mode=JiebaMode.MIX, with_offsets=False): 227 super().__init__() 228 if not isinstance(mode, JiebaMode): 229 raise TypeError("Wrong input type for mode, should be JiebaMode.") 230 231 self.mode = mode 232 self.__check_path__(hmm_path) 233 self.hmm_path = hmm_path 234 self.__check_path__(mp_path) 235 self.mp_path = mp_path 236 self.with_offsets = with_offsets 237 self.words = [] 238 239 def __check_path__(self, model_path): 240 """check model path""" 241 if not os.path.exists(os.path.realpath(model_path)): 242 raise ValueError( 243 " jieba mode file {} is not exist.".format(model_path)) 244 245 def parse(self): 246 jieba_tokenizer = cde.JiebaTokenizerOperation(self.hmm_path, self.mp_path, 247 DE_C_INTER_JIEBA_MODE.get(self.mode), 248 self.with_offsets) 249 for word in self.words: 250 jieba_tokenizer.add_word(word[0], word[1]) 251 return jieba_tokenizer 252 253 @invalidate_callable 254 @check_jieba_add_word 255 def add_word(self, word, freq=None): 256 """ 257 Add a specified word mapping to the Vocab of the tokenizer. 258 259 Args: 260 word (str): The word to be added to the Vocab. 261 freq (int, optional): The frequency of the word to be added. The higher the word frequency, 262 the greater the chance that the word will be tokenized. Default: ``None``, using the 263 default word frequency. 264 265 Examples: 266 >>> import mindspore.dataset as ds 267 >>> import mindspore.dataset.text as text 268 >>> from mindspore.dataset.text import JiebaMode 269 >>> 270 >>> jieba_hmm_file = "/path/to/jieba/hmm/file" 271 >>> jieba_mp_file = "/path/to/jieba/mp/file" 272 >>> jieba_op = text.JiebaTokenizer(jieba_hmm_file, jieba_mp_file, mode=JiebaMode.MP) 273 >>> sentence_piece_vocab_file = "/path/to/sentence/piece/vocab/file" 274 >>> with open(sentence_piece_vocab_file, 'r') as f: 275 ... for line in f: 276 ... word = line.split(',')[0] 277 ... jieba_op.add_word(word) 278 >>> 279 >>> text_file_list = ["/path/to/text_file_dataset_file"] 280 >>> text_file_dataset = ds.TextFileDataset(dataset_files=text_file_list) 281 >>> text_file_dataset = text_file_dataset.map(operations=jieba_op, input_columns=["text"]) 282 """ 283 284 if freq is None: 285 self.words.append((word, 0)) 286 else: 287 self.words.append((word, freq)) 288 289 @invalidate_callable 290 @check_jieba_add_dict 291 def add_dict(self, user_dict): 292 """ 293 Add the specified word mappings to the Vocab of the tokenizer. 294 295 Args: 296 user_dict (Union[str, dict[str, int]]): The word mappings to be added to the Vocab. 297 If the input type is str, it means the path of the file storing the word mappings to be added. 298 Each line of the file should contain two fields separated by a space, where the first field 299 indicates the word itself and the second field should be a number indicating the word frequency. 300 Invalid lines will be ignored and no error or warning will be returned. 301 If the input type is dict[str, int], it means the dictionary storing the word mappings to be added, 302 where the key name is the word itself and the key value is the word frequency. 303 304 Examples: 305 >>> import mindspore.dataset as ds 306 >>> import mindspore.dataset.text as text 307 >>> from mindspore.dataset.text import JiebaMode 308 >>> 309 >>> jieba_hmm_file = "/path/to/jieba/hmm/file" 310 >>> jieba_mp_file = "/path/to/jieba/mp/file" 311 >>> user_dict = {"男默女泪": 10} 312 >>> jieba_op = text.JiebaTokenizer(jieba_hmm_file, jieba_mp_file, mode=JiebaMode.MP) 313 >>> jieba_op.add_dict(user_dict) 314 >>> 315 >>> text_file_list = ["/path/to/text_file_dataset_file"] 316 >>> text_file_dataset = ds.TextFileDataset(dataset_files=text_file_list) 317 >>> text_file_dataset = text_file_dataset.map(operations=jieba_op, input_columns=["text"]) 318 """ 319 320 if isinstance(user_dict, str): 321 self.__add_dict_py_file(user_dict) 322 elif isinstance(user_dict, dict): 323 for k, v in user_dict.items(): 324 self.add_word(k, v) 325 else: 326 raise TypeError("The type of user_dict must str or dict.") 327 328 def __add_dict_py_file(self, file_path): 329 """Add user defined word by file""" 330 words_list = self.__parser_file(file_path) 331 for data in words_list: 332 if data[1] is None: 333 freq = 0 334 else: 335 freq = int(data[1]) 336 self.add_word(data[0], freq) 337 338 def __decode(self, data): 339 """decode the dict file to utf8""" 340 try: 341 data = data.decode('utf-8') 342 except UnicodeDecodeError: 343 raise ValueError("user dict file must be utf8 format.") 344 return data.lstrip('\ufeff') 345 346 def __parser_file(self, file_path): 347 """parser user defined word by file""" 348 if not os.path.exists(file_path): 349 raise ValueError( 350 "user dict file {} is not exist.".format(file_path)) 351 real_file_path = os.path.realpath(file_path) 352 file_dict = open(real_file_path, "r") 353 data_re = re.compile('^\\s*([^\\s*]+?)\\s*([0-9]+)?\\s*$', re.U) 354 words_list = [] 355 for item in file_dict: 356 data = item.strip() 357 if not isinstance(data, str): 358 data = self.__decode(data) 359 tmp = data_re.match(data) 360 if not tmp: 361 continue 362 words = tmp.groups() 363 words_list.append(words) 364 file_dict.close() 365 return words_list 366 367 368class Lookup(TextTensorOperation): 369 """ 370 Look up a word into an id according to the input vocabulary table. 371 372 Args: 373 vocab (Vocab): A vocabulary object. 374 unknown_token (str, optional): Word is used for lookup. In case of the word is out of vocabulary (OOV), 375 the result of lookup will be replaced with unknown_token. If the unknown_token is not specified or 376 it is OOV, runtime error will be thrown. Default: ``None``, means no unknown_token is specified. 377 data_type (mindspore.dtype, optional): The data type that lookup operation maps 378 string to. Default: ``mstype.int32``. 379 380 Raises: 381 TypeError: If `vocab` is not of type text.Vocab. 382 TypeError: If `unknown_token` is not of type string. 383 TypeError: If `data_type` is not of type mindspore.dtype. 384 385 Supported Platforms: 386 ``CPU`` 387 388 Examples: 389 >>> import mindspore.dataset as ds 390 >>> import mindspore.dataset.text as text 391 >>> 392 >>> # Use the transform in dataset pipeline mode 393 >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=["with"], column_names=["text"]) 394 >>> # Load vocabulary from list 395 >>> vocab = text.Vocab.from_list(["?", "##", "with", "the", "test", "符号"]) 396 >>> # Use Lookup operation to map tokens to ids 397 >>> lookup = text.Lookup(vocab) 398 >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=[lookup]) 399 >>> for item in numpy_slices_dataset.create_dict_iterator(num_epochs=1, output_numpy=True): 400 ... print(item["text"]) 401 2 402 >>> 403 >>> # Use the transform in eager mode 404 >>> vocab = text.Vocab.from_list(["?", "##", "with", "the", "test", "符号"]) 405 >>> data = "with" 406 >>> output = text.Lookup(vocab=vocab, unknown_token="test")(data) 407 >>> print(output) 408 2 409 410 Tutorial Examples: 411 - `Illustration of text transforms 412 <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/text_gallery.html>`_ 413 """ 414 415 @check_lookup 416 def __init__(self, vocab, unknown_token=None, data_type=mstype.int32): 417 super().__init__() 418 self.vocab = vocab 419 self.unknown_token = unknown_token 420 self.data_type = data_type 421 422 def parse(self): 423 return cde.LookupOperation(self.vocab.c_vocab, self.unknown_token, str(mstype_to_detype(self.data_type))) 424 425 426class Ngram(TextTensorOperation): 427 """ 428 Generate n-gram from a 1-D string Tensor. 429 430 Refer to `N-gram <https://en.wikipedia.org/wiki/N-gram#Examples>`_ 431 for an overview of what n-gram is and how it works. 432 433 Args: 434 n (list[int]): n in n-gram, which is a list of positive integers. For example, if n=[4, 3], then the result 435 would be a 4-gram followed by a 3-gram in the same tensor. If the number of words is not enough to make up 436 for a n-gram, an empty string will be returned. For example, 3 grams on ["mindspore", "best"] will result in 437 an empty string produced. 438 left_pad (tuple, optional): Padding performed on left side of the sequence shaped like ("pad_token", pad_width). 439 `pad_width` will be capped at n-1. For example, specifying left_pad=("_", 2) would pad left side of the 440 sequence with "__". Default: ``('', 0)``. 441 right_pad (tuple, optional): Padding performed on right side of the sequence shaped like 442 ("pad_token", pad_width). `pad_width` will be capped at n-1. For example, specifying right_pad=("_", 2) 443 would pad right side of the sequence with "__". Default: ``('', 0)``. 444 separator (str, optional): Symbol used to join strings together. For example, if 2-gram is 445 ["mindspore", "amazing"] with separator is ``"-"``, the result would be ["mindspore-amazing"]. 446 Default: ``' '``, which will use whitespace as separator. 447 448 Raises: 449 TypeError: If values of `n` not positive is not of type int. 450 ValueError: If values of `n` not positive. 451 ValueError: If `left_pad` is not a tuple of length 2. 452 ValueError: If `right_pad` is not a tuple of length 2. 453 TypeError: If `separator` is not of type string. 454 455 Supported Platforms: 456 ``CPU`` 457 458 Examples: 459 >>> import numpy as np 460 >>> import mindspore.dataset as ds 461 >>> import mindspore.dataset.text as text 462 >>> 463 >>> # Use the transform in dataset pipeline mode 464 >>> def gen(texts): 465 ... for line in texts: 466 ... yield(np.array(line.split(" "), dtype=str),) 467 >>> data = ["WildRose Country", "Canada's Ocean Playground", "Land of Living Skies"] 468 >>> generator_dataset = ds.GeneratorDataset(gen(data), ["text"]) 469 >>> ngram_op = text.Ngram(3, separator="-") 470 >>> generator_dataset = generator_dataset.map(operations=ngram_op) 471 >>> for item in generator_dataset.create_dict_iterator(num_epochs=1, output_numpy=True): 472 ... print(item["text"]) 473 ... break 474 [''] 475 >>> 476 >>> # Use the transform in eager mode 477 >>> output = ngram_op(data) 478 >>> print(output) 479 ["WildRose Country-Canada's Ocean Playground-Land of Living Skies"] 480 481 Tutorial Examples: 482 - `Illustration of text transforms 483 <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/text_gallery.html>`_ 484 """ 485 486 @check_ngram 487 def __init__(self, n, left_pad=("", 0), right_pad=("", 0), separator=" "): 488 super().__init__() 489 self.ngrams = n 490 self.left_pad = left_pad 491 self.right_pad = right_pad 492 self.separator = separator 493 494 def parse(self): 495 return cde.NgramOperation(self.ngrams, self.left_pad, self.right_pad, self.separator) 496 497 498class PythonTokenizer: 499 """ 500 Class that applies user-defined string tokenizer into input string. 501 502 Args: 503 tokenizer (Callable): Python function that takes a `str` and returns a list of `str` as tokens. 504 505 Raises: 506 TypeError: If `tokenizer` is not a callable Python function. 507 508 Supported Platforms: 509 ``CPU`` 510 511 Examples: 512 >>> import numpy as np 513 >>> import mindspore.dataset as ds 514 >>> import mindspore.dataset.text as text 515 >>> 516 >>> # Use the transform in dataset pipeline mode 517 >>> def my_tokenizer(line): 518 ... return line.split() 519 >>> 520 >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=['Hello world'], column_names=["text"]) 521 >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=text.PythonTokenizer(my_tokenizer)) 522 >>> for item in numpy_slices_dataset.create_dict_iterator(num_epochs=1, output_numpy=True): 523 ... print(item["text"]) 524 ['Hello' 'world'] 525 >>> 526 >>> # Use the transform in eager mode 527 >>> data = np.array('Hello world'.encode()) 528 >>> output = text.PythonTokenizer(my_tokenizer)(data) 529 >>> print(output) 530 ['Hello' 'world'] 531 532 Tutorial Examples: 533 - `Illustration of text transforms 534 <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/text_gallery.html>`_ 535 """ 536 537 @check_python_tokenizer 538 def __init__(self, tokenizer): 539 self.pyfunc = tokenizer 540 self.tokenizer = np.vectorize(lambda x: np.array(tokenizer(x), dtype='U'), signature='()->(n)') 541 self.random = False 542 543 def __call__(self, in_array): 544 if not isinstance(in_array, np.ndarray): 545 raise TypeError("input should be a NumPy array. Got {}.".format(type(in_array))) 546 if in_array.dtype.type is np.bytes_: 547 in_array = to_str(in_array) 548 try: 549 tokens = self.tokenizer(in_array) 550 except Exception as e: 551 raise RuntimeError("Error occurred in Pyfunc [" + str(self.pyfunc.__name__) + "], error message: " + str(e)) 552 return tokens 553 554 def to_json(self): 555 json_obj = {} 556 json_obj["tensor_op_name"] = self.pyfunc.__name__ 557 json_obj["python_module"] = self.__class__.__module__ 558 return json.dumps(json_obj) 559 560 561class SentencePieceTokenizer(TextTensorOperation): 562 """ 563 Tokenize scalar token or 1-D tokens to tokens by sentencepiece. 564 565 Args: 566 mode (Union[str, SentencePieceVocab]): SentencePiece model. 567 If the input parameter is a file, it represents the path of SentencePiece mode to be loaded. 568 If the input parameter is a SentencePieceVocab object, it should be constructed in advanced. 569 out_type (SPieceTokenizerOutType): The type of output, it can be ``SPieceTokenizerOutType.STRING``, 570 ``SPieceTokenizerOutType.INT``. 571 572 - ``SPieceTokenizerOutType.STRING``, means output type of SentencePice Tokenizer is string. 573 - ``SPieceTokenizerOutType.INT``, means output type of SentencePice Tokenizer is int. 574 575 Raises: 576 TypeError: If `mode` is not of type string or SentencePieceVocab. 577 TypeError: If `out_type` is not of type SPieceTokenizerOutType. 578 579 Supported Platforms: 580 ``CPU`` 581 582 Examples: 583 >>> import mindspore.dataset as ds 584 >>> import mindspore.dataset.text as text 585 >>> from mindspore.dataset.text import SentencePieceModel, SPieceTokenizerOutType 586 >>> 587 >>> # Use the transform in dataset pipeline mode 588 >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=['Hello world'], column_names=["text"]) 589 >>> # The paths to sentence_piece_vocab_file can be downloaded directly from the mindspore repository. Refer to 590 >>> # https://gitee.com/mindspore/mindspore/blob/master/tests/ut/data/dataset/test_sentencepiece/vocab.txt 591 >>> sentence_piece_vocab_file = "tests/ut/data/dataset/test_sentencepiece/vocab.txt" 592 >>> vocab = text.SentencePieceVocab.from_file([sentence_piece_vocab_file], 512, 0.9995, 593 ... SentencePieceModel.UNIGRAM, {}) 594 >>> tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING) 595 >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=tokenizer) 596 >>> for item in numpy_slices_dataset.create_dict_iterator(num_epochs=1, output_numpy=True): 597 ... print(item["text"]) 598 ['▁H' 'e' 'l' 'lo' '▁w' 'o' 'r' 'l' 'd'] 599 >>> 600 >>> # Use the transform in eager mode 601 >>> data = "Hello world" 602 >>> vocab = text.SentencePieceVocab.from_file([sentence_piece_vocab_file], 100, 0.9995, 603 ... SentencePieceModel.UNIGRAM, {}) 604 >>> output = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)(data) 605 >>> print(output) 606 ['▁' 'H' 'e' 'l' 'l' 'o' '▁' 'w' 'o' 'r' 'l' 'd'] 607 608 Tutorial Examples: 609 - `Illustration of text transforms 610 <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/text_gallery.html>`_ 611 """ 612 613 @check_sentence_piece_tokenizer 614 def __init__(self, mode, out_type): 615 super().__init__() 616 self.mode = mode 617 self.out_type = out_type 618 619 def parse(self): 620 self.mode = self.mode.c_sentence_piece_vocab if isinstance(self.mode, SentencePieceVocab) else self.mode 621 return cde.SentencePieceTokenizerOperation(self.mode, DE_C_INTER_SENTENCEPIECE_OUTTYPE.get(self.out_type)) 622 623 624class SlidingWindow(TextTensorOperation): 625 """ 626 Construct a tensor from given data (only support 1-D for now), where each element in the dimension axis 627 is a slice of data starting at the corresponding position, with a specified width. 628 629 Args: 630 width (int): The width of the window. It must be an integer and greater than zero. 631 axis (int, optional): The axis along which the sliding window is computed. Default: ``0``. 632 633 Raises: 634 TypeError: If `width` is not of type int. 635 ValueError: If value of `width` is not positive. 636 TypeError: If `axis` is not of type int. 637 638 Supported Platforms: 639 ``CPU`` 640 641 Examples: 642 >>> import mindspore.dataset as ds 643 >>> import mindspore.dataset.text as text 644 >>> 645 >>> # Use the transform in dataset pipeline mode 646 >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=[[1, 2, 3, 4, 5]], column_names=["col1"]) 647 >>> # Data before 648 >>> # | col1 | 649 >>> # +--------------+ 650 >>> # | [[1, 2, 3, 4, 5]] | 651 >>> # +--------------+ 652 >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=text.SlidingWindow(3, 0)) 653 >>> for item in numpy_slices_dataset.create_dict_iterator(num_epochs=1, output_numpy=True): 654 ... print(item["col1"]) 655 [[1 2 3] [2 3 4] [3 4 5]] 656 >>> # Data after 657 >>> # | col1 | 658 >>> # +--------------+ 659 >>> # | [[1, 2, 3], | 660 >>> # | [2, 3, 4], | 661 >>> # | [3, 4, 5]] | 662 >>> # +--------------+ 663 >>> 664 >>> # Use the transform in eager mode 665 >>> data = ["happy", "birthday", "to", "you"] 666 >>> output = text.SlidingWindow(2, 0)(data) 667 >>> print(output) 668 [['happy' 'birthday'] ['birthday' 'to'] ['to' 'you']] 669 670 Tutorial Examples: 671 - `Illustration of text transforms 672 <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/text_gallery.html>`_ 673 """ 674 675 @check_slidingwindow 676 def __init__(self, width, axis=0): 677 super().__init__() 678 self.width = width 679 self.axis = axis 680 681 def parse(self): 682 return cde.SlidingWindowOperation(self.width, self.axis) 683 684 685class ToNumber(TextTensorOperation): 686 """ 687 Tensor operation to convert every element of a string tensor to a number. 688 689 Strings are cast according to the rules specified in the following links, except that any strings which represent 690 negative numbers cannot be cast to an unsigned integer type, rules links are as follows: 691 https://en.cppreference.com/w/cpp/string/basic_string/stof, 692 https://en.cppreference.com/w/cpp/string/basic_string/stoul. 693 694 Args: 695 data_type (mindspore.dtype): Type to be cast to. Must be a numeric type in mindspore.dtype. 696 697 Raises: 698 TypeError: If `data_type` is not of type mindspore.dtype. 699 RuntimeError: If strings are invalid to cast, or are out of range after being cast. 700 701 Supported Platforms: 702 ``CPU`` 703 704 Examples: 705 >>> import mindspore.dataset as ds 706 >>> import mindspore.dataset.text as text 707 >>> from mindspore import dtype as mstype 708 >>> 709 >>> # Use the transform in dataset pipeline mode 710 >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=[["1", "2", "3"]], column_names=["text"]) 711 >>> to_number_op = text.ToNumber(mstype.int8) 712 >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=to_number_op) 713 >>> for item in numpy_slices_dataset.create_dict_iterator(num_epochs=1, output_numpy=True): 714 ... print(item["text"]) 715 [1 2 3] 716 >>> 717 >>> # Use the transform in eager mode 718 >>> data = ["1", "2", "3"] 719 >>> output = text.ToNumber(mstype.uint32)(data) 720 >>> print(output) 721 [1 2 3] 722 723 Tutorial Examples: 724 - `Illustration of text transforms 725 <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/text_gallery.html>`_ 726 """ 727 728 @check_to_number 729 def __init__(self, data_type): 730 super().__init__() 731 data_type = mstype_to_detype(data_type) 732 self.data_type = str(data_type) 733 734 def parse(self): 735 return cde.ToNumberOperation(self.data_type) 736 737 738class ToVectors(TextTensorOperation): 739 """ 740 Look up a token into vectors according to the input vector table. 741 742 Args: 743 vectors (Vectors): A vectors object. 744 unk_init (sequence, optional): Sequence used to initialize out-of-vectors (OOV) token. 745 Default: ``None``, initialize with zero vectors. 746 lower_case_backup (bool, optional): Whether to look up the token in the lower case. If ``False``, 747 each token in the original case will be looked up; if ``True``, each token in the original 748 case will be looked up first, if not found in the keys of the property stoi, the token in the 749 lower case will be looked up. Default: ``False``. 750 751 Raises: 752 TypeError: If `unk_init` is not of type sequence. 753 TypeError: If elements of `unk_init` is not of type float or int. 754 TypeError: If `lower_case_backup` is not of type bool. 755 756 Supported Platforms: 757 ``CPU`` 758 759 Examples: 760 >>> import mindspore.dataset as ds 761 >>> import mindspore.dataset.text as text 762 >>> 763 >>> # Use the transform in dataset pipeline mode 764 >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=["happy", "birthday", "to", "you"], column_names=["text"]) 765 >>> # Load vectors from file 766 >>> # The paths to vectors_file can be downloaded directly from the mindspore repository. Refer to 767 >>> # https://gitee.com/mindspore/mindspore/blob/master/tests/ut/data/dataset/testVectors/vectors.txt 768 >>> vectors_file = "tests/ut/data/dataset/testVectors/vectors.txt" 769 >>> vectors = text.Vectors.from_file(vectors_file) 770 >>> # Use ToVectors operation to map tokens to vectors 771 >>> to_vectors = text.ToVectors(vectors) 772 >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=[to_vectors]) 773 >>> for item in numpy_slices_dataset.create_dict_iterator(num_epochs=1, output_numpy=True): 774 ... print(item["text"]) 775 ... break 776 [0. 0. 0. 0. 0. 0.] 777 >>> 778 >>> # Use the transform in eager mode 779 >>> data = ["happy"] 780 >>> output = text.ToVectors(vectors)(data) 781 >>> print(output) 782 [0. 0. 0. 0. 0. 0.] 783 784 Tutorial Examples: 785 - `Illustration of text transforms 786 <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/text_gallery.html>`_ 787 """ 788 789 @check_to_vectors 790 def __init__(self, vectors, unk_init=None, lower_case_backup=False): 791 super().__init__() 792 self.vectors = vectors 793 self.unk_init = unk_init if unk_init is not None else [] 794 self.lower_case_backup = lower_case_backup 795 796 def parse(self): 797 return cde.ToVectorsOperation(self.vectors, self.unk_init, self.lower_case_backup) 798 799 800class Truncate(TextTensorOperation): 801 """ 802 Truncate the input sequence so that it does not exceed the maximum length. 803 804 Args: 805 max_seq_len (int): Maximum allowable length. 806 807 Raises: 808 TypeError: If `max_length_len` is not of type int. 809 ValueError: If value of `max_length_len` is not greater than or equal to 0. 810 RuntimeError: If the input tensor is not of dtype bool, int, float, double or str. 811 812 Supported Platforms: 813 ``CPU`` 814 815 Examples: 816 >>> import mindspore.dataset as ds 817 >>> import mindspore.dataset.text as text 818 >>> 819 >>> # Use the transform in dataset pipeline mode 820 >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=[['a', 'b', 'c', 'd', 'e']], column_names=["text"], 821 ... shuffle=False) 822 >>> # Data before 823 >>> # | col1 | 824 >>> # +---------------------------+ 825 >>> # | ['a', 'b', 'c', 'd', 'e'] | 826 >>> # +---------------------------+ 827 >>> truncate = text.Truncate(4) 828 >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=truncate, input_columns=["text"]) 829 >>> for item in numpy_slices_dataset.create_dict_iterator(num_epochs=1, output_numpy=True): 830 ... print(item["text"]) 831 ['a' 'b' 'c' 'd'] 832 >>> # Data after 833 >>> # | col1 | 834 >>> # +------------------------+ 835 >>> # | ['a', 'b', 'c', 'd'] | 836 >>> # +------------------------+ 837 >>> 838 >>> # Use the transform in eager mode 839 >>> data = ["happy", "birthday", "to", "you"] 840 >>> output = text.Truncate(2)(data) 841 >>> print(output) 842 ['happy' 'birthday'] 843 844 Tutorial Examples: 845 - `Illustration of text transforms 846 <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/text_gallery.html>`_ 847 """ 848 849 @check_truncate 850 def __init__(self, max_seq_len): 851 super().__init__() 852 self.max_seq_len = max_seq_len 853 854 def parse(self): 855 return cde.TruncateOperation(self.max_seq_len) 856 857 858class TruncateSequencePair(TextTensorOperation): 859 """ 860 Truncate a pair of 1-D string input so that their total length is less than the specified length. 861 862 Args: 863 max_length (int): The maximum total length of the output strings. If it is no less than the 864 total length of the original pair of strings, no truncation is performed; otherwise, the 865 longer of the two input strings is truncated until its total length equals this value. 866 867 Raises: 868 TypeError: If `max_length` is not of type int. 869 870 Supported Platforms: 871 ``CPU`` 872 873 Examples: 874 >>> import mindspore.dataset as ds 875 >>> import mindspore.dataset.text as text 876 >>> 877 >>> # Use the transform in dataset pipeline mode 878 >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=([[1, 2, 3]], [[4, 5]]), column_names=["col1", "col2"]) 879 >>> # Data before 880 >>> # | col1 | col2 | 881 >>> # +-----------+-----------| 882 >>> # | [1, 2, 3] | [4, 5] | 883 >>> # +-----------+-----------+ 884 >>> truncate_sequence_pair_op = text.TruncateSequencePair(max_length=4) 885 >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=truncate_sequence_pair_op, 886 ... input_columns=["col1", "col2"]) 887 >>> for item in numpy_slices_dataset.create_dict_iterator(num_epochs=1, output_numpy=True): 888 ... print(item["col1"], item["col2"]) 889 [1 2] [4 5] 890 >>> # Data after 891 >>> # | col1 | col2 | 892 >>> # +-----------+-----------+ 893 >>> # | [1, 2] | [4, 5] | 894 >>> # +-----------+-----------+ 895 >>> 896 >>> # Use the transform in eager mode 897 >>> data = [["1", "2", "3"], ["4", "5"]] 898 >>> output = text.TruncateSequencePair(4)(*data) 899 >>> print(output) 900 (array(['1', '2'], dtype='<U1'), array(['4', '5'], dtype='<U1')) 901 902 Tutorial Examples: 903 - `Illustration of text transforms 904 <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/text_gallery.html>`_ 905 """ 906 907 @check_pair_truncate 908 def __init__(self, max_length): 909 super().__init__() 910 self.max_length = max_length 911 912 def parse(self): 913 return cde.TruncateSequencePairOperation(self.max_length) 914 915 916class UnicodeCharTokenizer(TextTensorOperation): 917 """ 918 Unpack the Unicode characters in the input strings. 919 920 Args: 921 with_offsets (bool, optional): Whether to output the start and end offsets of each 922 token in the original string. Default: ``False`` . 923 924 Raises: 925 TypeError: If `with_offsets` is not of type bool. 926 927 Supported Platforms: 928 ``CPU`` 929 930 Examples: 931 >>> import mindspore.dataset as ds 932 >>> import mindspore.dataset.text as text 933 >>> 934 >>> # Use the transform in dataset pipeline mode 935 >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=['Welcome To BeiJing!'], column_names=["text"]) 936 >>> 937 >>> # If with_offsets=False, default output one column {["text", dtype=str]} 938 >>> tokenizer_op = text.UnicodeCharTokenizer(with_offsets=False) 939 >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=tokenizer_op) 940 >>> for item in numpy_slices_dataset.create_dict_iterator(num_epochs=1, output_numpy=True): 941 ... print(item["text"]) 942 ... break 943 ['W' 'e' 'l' 'c' 'o' 'm' 'e' ' ' ' ' ' ' ' ' ' ' 'T' 'o' ' ' ' ' ' ' 'B' 'e' 'i' 'J' 'i' 'n' 'g' '!'] 944 >>> 945 >>> # If with_offsets=True, then output three columns {["token", dtype=str], ["offsets_start", dtype=uint32], 946 >>> # ["offsets_limit", dtype=uint32]} 947 >>> tokenizer_op = text.UnicodeCharTokenizer(with_offsets=True) 948 >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=['Welcome To BeiJing!'], column_names=["text"]) 949 >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=tokenizer_op, input_columns=["text"], 950 ... output_columns=["token", "offsets_start", "offsets_limit"]) 951 >>> for item in numpy_slices_dataset.create_dict_iterator(num_epochs=1, output_numpy=True): 952 ... print(item["token"], item["offsets_start"], item["offsets_limit"]) 953 ['W' 'e' 'l' 'c' 'o' 'm' 'e' ' ' ' ' ' ' ' ' ' ' 'T' 'o' ' ' ' ' ' ' 'B' 'e' 'i' 'J' 'i' 'n' 'g' '!'] [ 0 1 2 954 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] [ 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 955 16 17 18 19 20 21 22 23 24 25] 956 >>> 957 >>> # Use the transform in eager mode 958 >>> data = 'Welcome To BeiJing!' 959 >>> output = text.UnicodeCharTokenizer(with_offsets=True)(data) 960 >>> print(output) 961 (array(['W', 'e', 'l', 'c', 'o', 'm', 'e', ' ', ' ', ' ', ' ', ' ', 'T', 'o', ' ', ' ', ' ', 'B', 'e', 'i', 'J', 962 'i', 'n', 'g', '!'], dtype='<U1'), array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 963 17, 18, 19, 20, 21, 22, 23, 24], dtype=uint32), array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 964 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25], dtype=uint32)) 965 966 Tutorial Examples: 967 - `Illustration of text transforms 968 <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/text_gallery.html>`_ 969 """ 970 971 @check_with_offsets 972 def __init__(self, with_offsets=False): 973 super().__init__() 974 self.with_offsets = with_offsets 975 976 def parse(self): 977 return cde.UnicodeCharTokenizerOperation(self.with_offsets) 978 979 980class WordpieceTokenizer(TextTensorOperation): 981 """ 982 Tokenize the input text to subword tokens. 983 984 Args: 985 vocab (Vocab): Vocabulary used to look up words. 986 suffix_indicator (str, optional): Prefix flags used to indicate subword suffixes. Default: ``'##'``. 987 max_bytes_per_token (int, optional): The maximum length of tokenization, words exceeding this length will 988 not be split. Default: ``100``. 989 unknown_token (str, optional): The output for unknown words. When set to an empty string, the corresponding 990 unknown word will be directly returned as the output. Otherwise, the set string will be returned as the 991 output. Default: ``'[UNK]'``. 992 with_offsets (bool, optional): Whether to output the start and end offsets of each 993 token in the original string. Default: ``False`` . 994 995 Raises: 996 TypeError: If `vocab` is not of type :class:`mindspore.dataset.text.Vocab` . 997 TypeError: If `suffix_indicator` is not of type str. 998 TypeError: If `max_bytes_per_token` is not of type int. 999 TypeError: If `unknown_token` is not of type str. 1000 TypeError: If `with_offsets` is not of type bool. 1001 ValueError: If `max_bytes_per_token` is negative. 1002 1003 Supported Platforms: 1004 ``CPU`` 1005 1006 Examples: 1007 >>> import mindspore.dataset as ds 1008 >>> import mindspore.dataset.text as text 1009 >>> 1010 >>> # Use the transform in dataset pipeline mode 1011 >>> seed = ds.config.get_seed() 1012 >>> ds.config.set_seed(12345) 1013 >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=["happy", "birthday", "to", "you"], column_names=["text"]) 1014 >>> 1015 >>> vocab_list = ["book", "cholera", "era", "favor", "##ite", "my", "is", "love", "dur", "##ing", "the"] 1016 >>> vocab = text.Vocab.from_list(vocab_list) 1017 >>> 1018 >>> # If with_offsets=False, default output one column {["text", dtype=str]} 1019 >>> tokenizer_op = text.WordpieceTokenizer(vocab=vocab, unknown_token='[UNK]', 1020 ... max_bytes_per_token=100, with_offsets=False) 1021 >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=tokenizer_op) 1022 >>> for item in numpy_slices_dataset.create_dict_iterator(num_epochs=1, output_numpy=True): 1023 ... print(item["text"]) 1024 ... break 1025 ['[UNK]'] 1026 >>> 1027 >>> # If with_offsets=True, then output three columns {["token", dtype=str], ["offsets_start", dtype=uint32], 1028 >>> # ["offsets_limit", dtype=uint32]} 1029 >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=["happy", "birthday", "to", "you"], column_names=["text"]) 1030 >>> tokenizer_op = text.WordpieceTokenizer(vocab=vocab, unknown_token='[UNK]', 1031 ... max_bytes_per_token=100, with_offsets=True) 1032 >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=tokenizer_op, input_columns=["text"], 1033 ... output_columns=["token", "offsets_start", "offsets_limit"]) 1034 >>> for item in numpy_slices_dataset.create_dict_iterator(num_epochs=1, output_numpy=True): 1035 ... print(item["token"], item["offsets_start"], item["offsets_limit"]) 1036 ... break 1037 ['[UNK]'] [0] [5] 1038 >>> 1039 >>> # Use the transform in eager mode 1040 >>> data = ["happy", "birthday", "to", "you"] 1041 >>> vocab_list = ["book", "cholera", "era", "favor", "**ite", "my", "is", "love", "dur", "**ing", "the"] 1042 >>> vocab = text.Vocab.from_list(vocab_list) 1043 >>> output = text.WordpieceTokenizer(vocab=vocab, suffix_indicator="y", unknown_token='[UNK]')(data) 1044 >>> print(output) 1045 ['[UNK]' '[UNK]' '[UNK]' '[UNK]'] 1046 >>> ds.config.set_seed(seed) 1047 1048 Tutorial Examples: 1049 - `Illustration of text transforms 1050 <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/text_gallery.html>`_ 1051 """ 1052 1053 @check_wordpiece_tokenizer 1054 def __init__(self, vocab, suffix_indicator='##', max_bytes_per_token=100, unknown_token='[UNK]', 1055 with_offsets=False): 1056 super().__init__() 1057 self.vocab = vocab 1058 self.suffix_indicator = suffix_indicator 1059 self.max_bytes_per_token = max_bytes_per_token 1060 self.unknown_token = unknown_token 1061 self.with_offsets = with_offsets 1062 1063 def parse(self): 1064 return cde.WordpieceTokenizerOperation(self.vocab.c_vocab, self.suffix_indicator, self.max_bytes_per_token, 1065 self.unknown_token, self.with_offsets) 1066 1067 1068if platform.system().lower() != 'windows': 1069 DE_C_INTER_NORMALIZE_FORM = { 1070 NormalizeForm.NONE: cde.NormalizeForm.DE_NORMALIZE_NONE, 1071 NormalizeForm.NFC: cde.NormalizeForm.DE_NORMALIZE_NFC, 1072 NormalizeForm.NFKC: cde.NormalizeForm.DE_NORMALIZE_NFKC, 1073 NormalizeForm.NFD: cde.NormalizeForm.DE_NORMALIZE_NFD, 1074 NormalizeForm.NFKD: cde.NormalizeForm.DE_NORMALIZE_NFKD 1075 } 1076 1077 1078 class BasicTokenizer(TextTensorOperation): 1079 """ 1080 Tokenize the input UTF-8 encoded string by specific rules. 1081 1082 Note: 1083 `BasicTokenizer` is not supported on Windows platform yet. 1084 1085 Args: 1086 lower_case (bool, optional): Whether to perform lowercase processing on the text. If True, will fold the 1087 text to lower case and strip accented characters. If False, will only perform normalization on the 1088 text, with mode specified by `normalization_form` . Default: ``False``. 1089 keep_whitespace (bool, optional): If True, the whitespace will be kept in the output. Default: ``False``. 1090 normalization_form (NormalizeForm, optional): The desired normalization form. 1091 See :class:`~.text.NormalizeForm` for details on optional values. 1092 Default: ``NormalizeForm.NFKC`` . 1093 preserve_unused_token (bool, optional): Whether to preserve special tokens. If True, will not split special 1094 tokens like '[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]'. Default: ``True``. 1095 with_offsets (bool, optional): Whether to output the start and end offsets of each 1096 token in the original string. Default: ``False`` . 1097 1098 Raises: 1099 TypeError: If `lower_case` is not of type bool. 1100 TypeError: If `keep_whitespace` is not of type bool. 1101 TypeError: If `normalization_form` is not of type :class:`~.text.NormalizeForm` . 1102 TypeError: If `preserve_unused_token` is not of type bool. 1103 TypeError: If `with_offsets` is not of type bool. 1104 RuntimeError: If dtype of input Tensor is not str. 1105 1106 Supported Platforms: 1107 ``CPU`` 1108 1109 Examples: 1110 >>> import mindspore.dataset as ds 1111 >>> import mindspore.dataset.text as text 1112 >>> from mindspore.dataset.text import NormalizeForm 1113 >>> 1114 >>> # Use the transform in dataset pipeline mode 1115 >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=['Welcome To BeiJing!'], column_names=["text"]) 1116 >>> 1117 >>> # 1) If with_offsets=False, default output one column {["text", dtype=str]} 1118 >>> tokenizer_op = text.BasicTokenizer(lower_case=False, 1119 ... keep_whitespace=False, 1120 ... normalization_form=NormalizeForm.NONE, 1121 ... preserve_unused_token=True, 1122 ... with_offsets=False) 1123 >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=tokenizer_op) 1124 >>> for item in numpy_slices_dataset.create_dict_iterator(num_epochs=1, output_numpy=True): 1125 ... print(item["text"]) 1126 ['Welcome' 'To' 'BeiJing' '!'] 1127 >>> 1128 >>> # 2) If with_offsets=True, then output three columns {["token", dtype=str], 1129 >>> # ["offsets_start", dtype=uint32], 1130 >>> # ["offsets_limit", dtype=uint32]} 1131 >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=['Welcome To BeiJing!'], column_names=["text"]) 1132 >>> tokenizer_op = text.BasicTokenizer(lower_case=False, 1133 ... keep_whitespace=False, 1134 ... normalization_form=NormalizeForm.NONE, 1135 ... preserve_unused_token=True, 1136 ... with_offsets=True) 1137 >>> numpy_slices_dataset = numpy_slices_dataset.map( 1138 ... operations=tokenizer_op, input_columns=["text"], 1139 ... output_columns=["token", "offsets_start", "offsets_limit"]) 1140 >>> for item in numpy_slices_dataset.create_dict_iterator(num_epochs=1, output_numpy=True): 1141 ... print(item["token"], item["offsets_start"], item["offsets_limit"]) 1142 ['Welcome' 'To' 'BeiJing' '!'] [ 0 12 17 24] [ 7 14 24 25] 1143 >>> 1144 >>> # Use the transform in eager mode 1145 >>> data = 'Welcome To BeiJing!' 1146 >>> output = text.BasicTokenizer()(data) 1147 >>> print(output) 1148 ['Welcome' 'To' 'BeiJing' '!'] 1149 1150 Tutorial Examples: 1151 - `Illustration of text transforms 1152 <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/text_gallery.html>`_ 1153 """ 1154 1155 @check_basic_tokenizer 1156 def __init__(self, lower_case=False, keep_whitespace=False, normalization_form=NormalizeForm.NONE, 1157 preserve_unused_token=True, with_offsets=False): 1158 super().__init__() 1159 if not isinstance(normalization_form, NormalizeForm): 1160 raise TypeError("Wrong input type for normalization_form, should be enum of 'NormalizeForm'.") 1161 1162 self.lower_case = lower_case 1163 self.keep_whitespace = keep_whitespace 1164 self.normalization_form = DE_C_INTER_NORMALIZE_FORM.get(normalization_form) 1165 self.preserve_unused_token = preserve_unused_token 1166 self.with_offsets = with_offsets 1167 1168 def parse(self): 1169 return cde.BasicTokenizerOperation(self.lower_case, self.keep_whitespace, self.normalization_form, 1170 self.preserve_unused_token, self.with_offsets) 1171 1172 1173 class BertTokenizer(TextTensorOperation): 1174 """ 1175 Tokenizer used for Bert text process. 1176 1177 Note: 1178 `BertTokenizer` is not supported on Windows platform yet. 1179 1180 Args: 1181 vocab (Vocab): Vocabulary used to look up words. 1182 suffix_indicator (str, optional): Prefix flags used to indicate subword suffixes. Default: ``'##'``. 1183 max_bytes_per_token (int, optional): The maximum length of tokenization, words exceeding this length will 1184 not be split. Default: ``100``. 1185 unknown_token (str, optional): The output for unknown words. When set to an empty string, the corresponding 1186 unknown word will be directly returned as the output. Otherwise, the set string will be returned as the 1187 output. Default: ``'[UNK]'``. 1188 lower_case (bool, optional): Whether to perform lowercase processing on the text. If ``True``, will fold the 1189 text to lower case and strip accented characters. If ``False``, will only perform normalization on the 1190 text, with mode specified by `normalization_form` . Default: ``False``. 1191 keep_whitespace (bool, optional): If ``True``, the whitespace will be kept in the output. 1192 Default: ``False``. 1193 normalization_form (NormalizeForm, optional): The desired normalization form. 1194 See :class:`~.text.NormalizeForm` for details on optional values. 1195 Default: ``NormalizeForm.NFKC`` . 1196 preserve_unused_token (bool, optional): Whether to preserve special tokens. If ``True``, 1197 will not split special tokens like '[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]'. 1198 Default: ``True``. 1199 with_offsets (bool, optional): Whether to output the start and end offsets of each 1200 token in the original string. Default: ``False`` . 1201 1202 Raises: 1203 TypeError: If `vocab` is not of type :class:`mindspore.dataset.text.Vocab` . 1204 TypeError: If `suffix_indicator` is not of type str. 1205 TypeError: If `max_bytes_per_token` is not of type int. 1206 ValueError: If `max_bytes_per_token` is negative. 1207 TypeError: If `unknown_token` is not of type str. 1208 TypeError: If `lower_case` is not of type bool. 1209 TypeError: If `keep_whitespace` is not of type bool. 1210 TypeError: If `normalization_form` is not of type :class:`~.text.NormalizeForm` . 1211 TypeError: If `preserve_unused_token` is not of type bool. 1212 TypeError: If `with_offsets` is not of type bool. 1213 1214 Supported Platforms: 1215 ``CPU`` 1216 1217 Examples: 1218 >>> import numpy as np 1219 >>> import mindspore.dataset as ds 1220 >>> import mindspore.dataset.text as text 1221 >>> from mindspore.dataset.text import NormalizeForm 1222 >>> 1223 >>> # Use the transform in dataset pipeline mode 1224 >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=["床前明月光"], column_names=["text"]) 1225 >>> 1226 >>> # 1) If with_offsets=False, default output one column {["text", dtype=str]} 1227 >>> vocab_list = ["床", "前", "明", "月", "光", "疑", "是", "地", "上", "霜", "举", "头", "望", "低", 1228 ... "思", "故", "乡", "繁", "體", "字", "嘿", "哈", "大", "笑", "嘻", "i", "am", "mak", 1229 ... "make", "small", "mistake", "##s", "during", "work", "##ing", "hour", "+", "/", 1230 ... "-", "=", "12", "28", "40", "16", " ", "I", "[CLS]", "[SEP]", "[UNK]", "[PAD]", "[MASK]"] 1231 >>> vocab = text.Vocab.from_list(vocab_list) 1232 >>> tokenizer_op = text.BertTokenizer(vocab=vocab, suffix_indicator='##', max_bytes_per_token=100, 1233 ... unknown_token='[UNK]', lower_case=False, keep_whitespace=False, 1234 ... normalization_form=NormalizeForm.NONE, preserve_unused_token=True, 1235 ... with_offsets=False) 1236 >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=tokenizer_op) 1237 >>> for item in numpy_slices_dataset.create_dict_iterator(num_epochs=1, output_numpy=True): 1238 ... print(item["text"]) 1239 ['床' '前' '明' '月' '光'] 1240 >>> 1241 >>> # 2) If with_offsets=True, then output three columns {["token", dtype=str], 1242 >>> # ["offsets_start", dtype=uint32], 1243 >>> # ["offsets_limit", dtype=uint32]} 1244 >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=["床前明月光"], column_names=["text"]) 1245 >>> tokenizer_op = text.BertTokenizer(vocab=vocab, suffix_indicator='##', max_bytes_per_token=100, 1246 ... unknown_token='[UNK]', lower_case=False, keep_whitespace=False, 1247 ... normalization_form=NormalizeForm.NONE, preserve_unused_token=True, 1248 ... with_offsets=True) 1249 >>> numpy_slices_dataset = numpy_slices_dataset.map( 1250 ... operations=tokenizer_op, 1251 ... input_columns=["text"], 1252 ... output_columns=["token", "offsets_start", "offsets_limit"]) 1253 >>> for item in numpy_slices_dataset.create_dict_iterator(num_epochs=1, output_numpy=True): 1254 ... print(item["token"], item["offsets_start"], item["offsets_limit"]) 1255 ['床' '前' '明' '月' '光'] [ 0 3 6 9 12] [ 3 6 9 12 15] 1256 >>> 1257 >>> # Use the transform in eager mode 1258 >>> data = "床前明月光" 1259 >>> vocab = text.Vocab.from_list(vocab_list) 1260 >>> tokenizer_op = text.BertTokenizer(vocab=vocab) 1261 >>> output = tokenizer_op(data) 1262 >>> print(output) 1263 ['床' '前' '明' '月' '光'] 1264 1265 Tutorial Examples: 1266 - `Illustration of text transforms 1267 <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/text_gallery.html>`_ 1268 """ 1269 1270 @check_bert_tokenizer 1271 def __init__(self, vocab, suffix_indicator='##', max_bytes_per_token=100, unknown_token='[UNK]', 1272 lower_case=False, keep_whitespace=False, normalization_form=NormalizeForm.NONE, 1273 preserve_unused_token=True, with_offsets=False): 1274 super().__init__() 1275 if not isinstance(normalization_form, NormalizeForm): 1276 raise TypeError("Wrong input type for normalization_form, should be enum of 'NormalizeForm'.") 1277 1278 self.vocab = vocab 1279 self.suffix_indicator = suffix_indicator 1280 self.max_bytes_per_token = max_bytes_per_token 1281 self.unknown_token = unknown_token 1282 self.lower_case = lower_case 1283 self.keep_whitespace = keep_whitespace 1284 self.normalization_form = DE_C_INTER_NORMALIZE_FORM.get(normalization_form) 1285 self.preserve_unused_token = preserve_unused_token 1286 self.with_offsets = with_offsets 1287 1288 def parse(self): 1289 return cde.BertTokenizerOperation(self.vocab.c_vocab, self.suffix_indicator, self.max_bytes_per_token, 1290 self.unknown_token, self.lower_case, self.keep_whitespace, 1291 self.normalization_form, self.preserve_unused_token, self.with_offsets) 1292 1293 1294 class CaseFold(TextTensorOperation): 1295 """ 1296 Apply case fold operation on UTF-8 string tensor, which is aggressive that can convert more characters into 1297 lower case than :code:`str.lower` . For supported normalization forms, please refer to 1298 `ICU_Normalizer2 <https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/classicu_1_1Normalizer2.html>`_ . 1299 1300 Note: 1301 CaseFold is not supported on Windows platform yet. 1302 1303 Supported Platforms: 1304 ``CPU`` 1305 1306 Examples: 1307 >>> import mindspore.dataset as ds 1308 >>> import mindspore.dataset.text as text 1309 >>> 1310 >>> # Use the transform in dataset pipeline mode 1311 >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=['Welcome To BeiJing!'], column_names=["text"]) 1312 >>> case_op = text.CaseFold() 1313 >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=case_op) 1314 >>> for item in numpy_slices_dataset.create_dict_iterator(num_epochs=1, output_numpy=True): 1315 ... print(item["text"]) 1316 welcome to beijing! 1317 >>> 1318 >>> # Use the transform in eager mode 1319 >>> data = 'Welcome To BeiJing!' 1320 >>> output = text.CaseFold()(data) 1321 >>> print(output) 1322 welcome to beijing! 1323 1324 Tutorial Examples: 1325 - `Illustration of text transforms 1326 <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/text_gallery.html>`_ 1327 """ 1328 1329 def parse(self): 1330 return cde.CaseFoldOperation() 1331 1332 1333 class FilterWikipediaXML(TextTensorOperation): 1334 """ 1335 Filter Wikipedia XML dumps to "clean" text consisting only of lowercase letters (a-z, converted from A-Z), 1336 and spaces (never consecutive). 1337 1338 Note: 1339 FilterWikipediaXML is not supported on Windows platform yet. 1340 1341 Supported Platforms: 1342 ``CPU`` 1343 1344 Examples: 1345 >>> import mindspore.dataset as ds 1346 >>> import mindspore.dataset.text as text 1347 >>> 1348 >>> # Use the transform in dataset pipeline mode 1349 >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=["Welcome to China", "!!!", "ABC"], 1350 ... column_names=["text"], shuffle=False) 1351 >>> replace_op = text.FilterWikipediaXML() 1352 >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=replace_op) 1353 >>> for item in numpy_slices_dataset.create_dict_iterator(num_epochs=1, output_numpy=True): 1354 ... print(item["text"]) 1355 ... break 1356 welcome to china 1357 >>> 1358 >>> # Use the transform in eager mode 1359 >>> data = "Welcome to China" 1360 >>> output = replace_op(data) 1361 >>> print(output) 1362 welcome to china 1363 1364 Tutorial Examples: 1365 - `Illustration of text transforms 1366 <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/text_gallery.html>`_ 1367 """ 1368 1369 def parse(self): 1370 return cde.FilterWikipediaXMLOperation() 1371 1372 1373 class NormalizeUTF8(TextTensorOperation): 1374 """ 1375 Normalize the input UTF-8 encoded strings. 1376 1377 Note: 1378 NormalizeUTF8 is not supported on Windows platform yet. 1379 1380 Args: 1381 normalize_form (NormalizeForm, optional): The desired normalization form. 1382 See :class:`~.text.NormalizeForm` for details on optional values. 1383 Default: ``NormalizeForm.NFKC`` . 1384 1385 Raises: 1386 TypeError: If `normalize_form` is not of type :class:`~.text.NormalizeForm`. 1387 1388 Supported Platforms: 1389 ``CPU`` 1390 1391 Examples: 1392 >>> import mindspore.dataset as ds 1393 >>> import mindspore.dataset.text as text 1394 >>> from mindspore.dataset.text import NormalizeForm 1395 >>> 1396 >>> # Use the transform in dataset pipeline mode 1397 >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=["ṩ", "ḍ̇", "q̇", "fi", "2⁵", "ẛ"], 1398 ... column_names=["text"], shuffle=False) 1399 >>> normalize_op = text.NormalizeUTF8(normalize_form=NormalizeForm.NFC) 1400 >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=normalize_op) 1401 >>> for item in numpy_slices_dataset.create_dict_iterator(num_epochs=1, output_numpy=True): 1402 ... print(item["text"]) 1403 ... break 1404 ṩ 1405 >>> 1406 >>> # Use the transform in eager mode 1407 >>> data = ["ṩ", "ḍ̇", "q̇", "fi", "2⁵", "ẛ"] 1408 >>> output = text.NormalizeUTF8(NormalizeForm.NFKC)(data) 1409 >>> print(output) 1410 ['ṩ' 'ḍ̇' 'q̇' 'fi' '25' 'ṡ'] 1411 1412 Tutorial Examples: 1413 - `Illustration of text transforms 1414 <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/text_gallery.html>`_ 1415 """ 1416 1417 def __init__(self, normalize_form=NormalizeForm.NFKC): 1418 super().__init__() 1419 if not isinstance(normalize_form, NormalizeForm): 1420 raise TypeError("Wrong input type for normalization_form, should be enum of 'NormalizeForm'.") 1421 1422 normalize_form = replace_none(normalize_form, NormalizeForm.NFKC) 1423 self.normalize_form = DE_C_INTER_NORMALIZE_FORM.get(normalize_form) 1424 1425 def parse(self): 1426 return cde.NormalizeUTF8Operation(self.normalize_form) 1427 1428 1429 class RegexReplace(TextTensorOperation): 1430 """ 1431 Replace part of the input UTF-8 string with a difference text string using regular expressions. 1432 1433 Note: 1434 RegexReplace is not supported on Windows platform yet. 1435 1436 Args: 1437 pattern (str): The regular expression, used to mean the specific, standard textual syntax for 1438 representing patterns for matching text. 1439 replace (str): The string used to replace the matched elements. 1440 replace_all (bool, optional): Whether to replace all matched elements. If ``False``, only the 1441 first matched element will be replaced; otherwise, all matched elements will be replaced. 1442 Default: ``True``. 1443 1444 Raises: 1445 TypeError: If `pattern` is not of type str. 1446 TypeError: If `replace` is not of type str. 1447 TypeError: If `replace_all` is not of type bool. 1448 1449 Supported Platforms: 1450 ``CPU`` 1451 1452 Examples: 1453 >>> import mindspore.dataset as ds 1454 >>> import mindspore.dataset.text as text 1455 >>> 1456 >>> # Use the transform in dataset pipeline mode 1457 >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=['apple orange apple orange apple'], 1458 ... column_names=["text"]) 1459 >>> regex_replace = text.RegexReplace('apple', 'orange') 1460 >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=regex_replace) 1461 >>> for item in numpy_slices_dataset.create_dict_iterator(num_epochs=1, output_numpy=True): 1462 ... print(item["text"]) 1463 orange orange orange orange orange 1464 >>> 1465 >>> # Use the transform in eager mode 1466 >>> data = 'onetwoonetwoone' 1467 >>> output = text.RegexReplace(pattern="one", replace="two", replace_all=True)(data) 1468 >>> print(output) 1469 twotwotwotwotwo 1470 1471 Tutorial Examples: 1472 - `Illustration of text transforms 1473 <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/text_gallery.html>`_ 1474 """ 1475 1476 @check_regex_replace 1477 def __init__(self, pattern, replace, replace_all=True): 1478 super().__init__() 1479 self.pattern = pattern 1480 self.replace = replace 1481 self.replace_all = replace_all 1482 1483 def parse(self): 1484 return cde.RegexReplaceOperation(self.pattern, self.replace, self.replace_all) 1485 1486 1487 class RegexTokenizer(TextTensorOperation): 1488 """ 1489 Tokenize a scalar tensor of UTF-8 string by regex expression pattern. 1490 1491 See https://unicode-org.github.io/icu/userguide/strings/regexp.html for supported regex pattern. 1492 1493 Note: 1494 RegexTokenizer is not supported on Windows platform yet. 1495 1496 Args: 1497 delim_pattern (str): The pattern of regex delimiters. 1498 The original string will be split by matched elements. 1499 keep_delim_pattern (str, optional): The string matched by 'delim_pattern' can be kept as a token 1500 if it can be matched by 'keep_delim_pattern'. The default value is an empty str 1501 which means that delimiters will not be kept as an output token. Default: ``''``. 1502 with_offsets (bool, optional): Whether to output the start and end offsets of each 1503 token in the original string. Default: ``False`` . 1504 1505 Raises: 1506 TypeError: If `delim_pattern` is not of type string. 1507 TypeError: If `keep_delim_pattern` is not of type string. 1508 TypeError: If `with_offsets` is not of type bool. 1509 1510 Supported Platforms: 1511 ``CPU`` 1512 1513 Examples: 1514 >>> import mindspore.dataset as ds 1515 >>> import mindspore.dataset.text as text 1516 >>> 1517 >>> # Use the transform in dataset pipeline mode 1518 >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=['Welcome |, To |, BeiJing!'], 1519 ... column_names=["text"]) 1520 >>> 1521 >>> # 1) If with_offsets=False, default output is one column {["text", dtype=str]} 1522 >>> delim_pattern = r"[ |,]" 1523 >>> tokenizer_op = text.RegexTokenizer(delim_pattern, with_offsets=False) 1524 >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=tokenizer_op) 1525 >>> for item in numpy_slices_dataset.create_dict_iterator(num_epochs=1, output_numpy=True): 1526 ... print(item["text"]) 1527 ['Welcome' 'To' 'BeiJing!'] 1528 >>> 1529 >>> # 2) If with_offsets=True, then output three columns {["token", dtype=str], 1530 >>> # ["offsets_start", dtype=uint32], 1531 >>> # ["offsets_limit", dtype=uint32]} 1532 >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=['Welcome |, To |, BeiJing!'], 1533 ... column_names=["text"]) 1534 >>> tokenizer_op = text.RegexTokenizer(delim_pattern, with_offsets=True) 1535 >>> numpy_slices_dataset = numpy_slices_dataset.map( 1536 ... operations=tokenizer_op, 1537 ... input_columns=["text"], 1538 ... output_columns=["token", "offsets_start", "offsets_limit"]) 1539 >>> for item in numpy_slices_dataset.create_dict_iterator(num_epochs=1, output_numpy=True): 1540 ... print(item["token"], item["offsets_start"], item["offsets_limit"]) 1541 ['Welcome' 'To' 'BeiJing!'] [ 0 13 21] [ 7 15 29] 1542 >>> 1543 >>> # Use the transform in eager mode 1544 >>> data = 'Welcome To BeiJing!' 1545 >>> output = text.RegexTokenizer(delim_pattern="To", keep_delim_pattern="To", with_offsets=True)(data) 1546 >>> print(output) 1547 (array(['Welcome ', 'To', ' BeiJing!'], dtype='<U12'), 1548 array([ 0, 12, 14], dtype=uint32), array([12, 14, 25], dtype=uint32)) 1549 1550 Tutorial Examples: 1551 - `Illustration of text transforms 1552 <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/text_gallery.html>`_ 1553 """ 1554 1555 @check_regex_tokenizer 1556 def __init__(self, delim_pattern, keep_delim_pattern='', with_offsets=False): 1557 super().__init__() 1558 self.delim_pattern = delim_pattern 1559 self.keep_delim_pattern = keep_delim_pattern 1560 self.with_offsets = with_offsets 1561 1562 def parse(self): 1563 return cde.RegexTokenizerOperation(self.delim_pattern, self.keep_delim_pattern, self.with_offsets) 1564 1565 1566 class UnicodeScriptTokenizer(TextTensorOperation): 1567 """ 1568 Tokenize a scalar tensor of UTF-8 string based on Unicode script boundaries. 1569 1570 Note: 1571 UnicodeScriptTokenizer is not supported on Windows platform yet. 1572 1573 Args: 1574 keep_whitespace (bool, optional): Whether or not emit whitespace tokens. Default: ``False``. 1575 with_offsets (bool, optional): Whether to output the start and end offsets of each 1576 token in the original string. Default: ``False`` . 1577 1578 Raises: 1579 TypeError: If `keep_whitespace` is not of type bool. 1580 TypeError: If `with_offsets` is not of type bool. 1581 1582 Supported Platforms: 1583 ``CPU`` 1584 1585 Examples: 1586 >>> import mindspore.dataset as ds 1587 >>> import mindspore.dataset.text as text 1588 >>> 1589 >>> # Use the transform in dataset pipeline mode 1590 >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=["北 京", "123", "欢 迎", "你"], 1591 ... column_names=["text"], shuffle=False) 1592 >>> 1593 >>> # 1) If with_offsets=False, default output one column {["text", dtype=str]} 1594 >>> tokenizer_op = text.UnicodeScriptTokenizer(keep_whitespace=True, with_offsets=False) 1595 >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=tokenizer_op) 1596 >>> for item in numpy_slices_dataset.create_dict_iterator(num_epochs=1, output_numpy=True): 1597 ... print(item["text"]) 1598 ... break 1599 ['北' ' ' '京'] 1600 >>> 1601 >>> # 2) If with_offsets=True, then output three columns {["token", dtype=str], 1602 >>> # ["offsets_start", dtype=uint32], 1603 >>> # ["offsets_limit", dtype=uint32]} 1604 >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=["北 京", "123", "欢 迎", "你"], 1605 ... column_names=["text"], shuffle=False) 1606 >>> tokenizer_op = text.UnicodeScriptTokenizer(keep_whitespace=True, with_offsets=True) 1607 >>> numpy_slices_dataset = numpy_slices_dataset.map( 1608 ... operations=tokenizer_op, 1609 ... input_columns=["text"], 1610 ... output_columns=["token", "offsets_start", "offsets_limit"]) 1611 >>> for item in numpy_slices_dataset.create_dict_iterator(num_epochs=1, output_numpy=True): 1612 ... print(item["token"], item["offsets_start"], item["offsets_limit"]) 1613 ... break 1614 ['北' ' ' '京'] [0 3 4] [3 4 7] 1615 >>> 1616 >>> # Use the transform in eager mode 1617 >>> data = "北 京" 1618 >>> unicode_script_tokenizer_op = text.UnicodeScriptTokenizer(keep_whitespace=True, with_offsets=False) 1619 >>> output = unicode_script_tokenizer_op(data) 1620 >>> print(output) 1621 ['北' ' ' '京'] 1622 1623 Tutorial Examples: 1624 - `Illustration of text transforms 1625 <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/text_gallery.html>`_ 1626 1627 """ 1628 1629 @check_unicode_script_tokenizer 1630 def __init__(self, keep_whitespace=False, with_offsets=False): 1631 super().__init__() 1632 keep_whitespace = replace_none(keep_whitespace, False) 1633 with_offsets = replace_none(with_offsets, False) 1634 self.keep_whitespace = keep_whitespace 1635 self.with_offsets = with_offsets 1636 1637 def parse(self): 1638 return cde.UnicodeScriptTokenizerOperation(self.keep_whitespace, self.with_offsets) 1639 1640 1641 class WhitespaceTokenizer(TextTensorOperation): 1642 """ 1643 Tokenize a scalar tensor of UTF-8 string on ICU4C defined whitespaces, such as: ' ', '\\\\t', '\\\\r', '\\\\n'. 1644 1645 Note: 1646 WhitespaceTokenizer is not supported on Windows platform yet. 1647 1648 Args: 1649 with_offsets (bool, optional): Whether to output the start and end offsets of each 1650 token in the original string. Default: ``False`` . 1651 1652 Raises: 1653 TypeError: If `with_offsets` is not of type bool. 1654 1655 Supported Platforms: 1656 ``CPU`` 1657 1658 Examples: 1659 >>> import mindspore.dataset as ds 1660 >>> import mindspore.dataset.text as text 1661 >>> 1662 >>> # Use the transform in dataset pipeline mode 1663 >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=['Welcome To BeiJing!'], column_names=["text"]) 1664 >>> 1665 >>> # 1) If with_offsets=False, default output one column {["text", dtype=str]} 1666 >>> tokenizer_op = text.WhitespaceTokenizer(with_offsets=False) 1667 >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=tokenizer_op) 1668 >>> for item in numpy_slices_dataset.create_dict_iterator(num_epochs=1, output_numpy=True): 1669 ... print(item["text"]) 1670 ['Welcome' 'To' 'BeiJing!'] 1671 >>> 1672 >>> # 2) If with_offsets=True, then output three columns {["token", dtype=str], 1673 >>> # ["offsets_start", dtype=uint32], 1674 >>> # ["offsets_limit", dtype=uint32]} 1675 >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=['Welcome To BeiJing!'], column_names=["text"]) 1676 >>> tokenizer_op = text.WhitespaceTokenizer(with_offsets=True) 1677 >>> numpy_slices_dataset = numpy_slices_dataset.map( 1678 ... operations=tokenizer_op, 1679 ... input_columns=["text"], 1680 ... output_columns=["token", "offsets_start", "offsets_limit"]) 1681 >>> for item in numpy_slices_dataset.create_dict_iterator(num_epochs=1, output_numpy=True): 1682 ... print(item["token"], item["offsets_start"], item["offsets_limit"]) 1683 ['Welcome' 'To' 'BeiJing!'] [ 0 12 17] [ 7 14 25] 1684 >>> 1685 >>> # Use the transform in eager mode 1686 >>> data = 'Welcome To BeiJing!' 1687 >>> output = text.WhitespaceTokenizer(with_offsets=True)(data) 1688 >>> print(output) 1689 (array(['Welcome', 'To', 'BeiJing!'], dtype='<U8'), array([ 0, 12, 17], dtype=uint32), 1690 array([ 7, 14, 25], dtype=uint32)) 1691 1692 Tutorial Examples: 1693 - `Illustration of text transforms 1694 <https://www.mindspore.cn/docs/en/master/api_python/samples/dataset/text_gallery.html>`_ 1695 """ 1696 1697 @check_with_offsets 1698 def __init__(self, with_offsets=False): 1699 super().__init__() 1700 self.with_offsets = with_offsets 1701 1702 def parse(self): 1703 return cde.WhitespaceTokenizerOperation(self.with_offsets) 1704