# Copyright 2020-2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
The module text.transforms is inherited from _c_dataengine
and is implemented based on ICU4C and cppjieba in C++.
It's a high performance module to process NLP text.
Users can use Vocab to build their own dictionary,
use appropriate tokenizers to split sentences into different tokens,
and use Lookup to find the index of tokens in Vocab.

.. Note::
    A constructor's arguments for every class in this module must be saved into the
    class attributes (self.xxx) to support save() and load().

Examples:
    >>> text_file_dataset_dir = ["/path/to/text_file_dataset_file"] # contains 1 or multiple text files
    >>> # Create a dataset for text sentences saved as line data in a file
    >>> text_file_dataset = ds.TextFileDataset(dataset_files=text_file_dataset_dir, shuffle=False)
    >>> # Tokenize sentences to unicode characters
    >>> tokenizer = text.UnicodeCharTokenizer()
    >>> # Load vocabulary from list
    >>> vocab = text.Vocab.from_list(word_list=['深', '圳', '欢', '迎', '您'])
    >>> # Use Lookup operator to map tokens to ids
    >>> lookup = text.Lookup(vocab=vocab)
    >>> text_file_dataset = text_file_dataset.map(operations=[tokenizer, lookup])
    >>> # if text line in dataset_file is:
    >>> # 深圳欢迎您
    >>> # then the output will be:
    >>> # {'text': array([0, 1, 2, 3, 4], dtype=int32)}
"""
import os
import re
import platform
import numpy as np

import mindspore._c_dataengine as cde
from mindspore import dtype as mstype

from .utils import JiebaMode, NormalizeForm, to_str, SPieceTokenizerOutType, SPieceTokenizerLoadType
from .validators import check_lookup, check_jieba_add_dict, \
    check_jieba_add_word, check_jieba_init, check_with_offsets, check_unicode_script_tokenizer, \
    check_wordpiece_tokenizer, check_regex_replace, check_regex_tokenizer, check_basic_tokenizer, check_ngram, \
    check_pair_truncate, check_to_number, check_bert_tokenizer, check_python_tokenizer, check_slidingwindow, \
    check_sentence_piece_tokenizer
from ..core.datatypes import mstype_to_detype
from ..core.validator_helpers import replace_none
from ..transforms.c_transforms import TensorOperation


class TextTensorOperation(TensorOperation):
    """
    Base class of Text Tensor Ops
    """

    def parse(self):
        raise NotImplementedError("TextTensorOperation has to implement parse() method.")


DE_C_INTER_JIEBA_MODE = {
    JiebaMode.MIX: cde.JiebaMode.DE_JIEBA_MIX,
    JiebaMode.MP: cde.JiebaMode.DE_JIEBA_MP,
    JiebaMode.HMM: cde.JiebaMode.DE_JIEBA_HMM
}

DE_C_INTER_SENTENCEPIECE_LOADTYPE = {
    SPieceTokenizerLoadType.FILE: cde.SPieceTokenizerLoadType.DE_SPIECE_TOKENIZER_LOAD_KFILE,
    SPieceTokenizerLoadType.MODEL: cde.SPieceTokenizerLoadType.DE_SPIECE_TOKENIZER_LOAD_KMODEL
}

DE_C_INTER_SENTENCEPIECE_OUTTYPE = {
    SPieceTokenizerOutType.STRING: cde.SPieceTokenizerOutType.DE_SPIECE_TOKENIZER_OUTTYPE_KString,
    SPieceTokenizerOutType.INT: cde.SPieceTokenizerOutType.DE_SPIECE_TOKENIZER_OUTTYPE_KINT
}


class JiebaTokenizer(TextTensorOperation):
    """
    Tokenize Chinese string into words based on dictionary.

    Note:
        The integrity of the HMMSegment algorithm and MPSegment algorithm files must be confirmed.

    Args:
        hmm_path (str): Dictionary file is used by HMMSegment algorithm.
            The dictionary can be obtained on the official website of cppjieba.
        mp_path (str): Dictionary file is used by MPSegment algorithm.
            The dictionary can be obtained on the official website of cppjieba.
        mode (JiebaMode, optional): Valid values can be any of [JiebaMode.MP, JiebaMode.HMM,
            JiebaMode.MIX] (default=JiebaMode.MIX).

            - JiebaMode.MP, tokenize with MPSegment algorithm.
            - JiebaMode.HMM, tokenize with Hidden Markov Model Segment algorithm.
            - JiebaMode.MIX, tokenize with a mix of MPSegment and HMMSegment algorithm.
        with_offsets (bool, optional): Whether or not output offsets of tokens (default=False).

    Examples:
        >>> from mindspore.dataset.text import JiebaMode
        >>> # If with_offsets=False, default output one column {["text", dtype=str]}
        >>> jieba_hmm_file = "/path/to/jieba/hmm/file"
        >>> jieba_mp_file = "/path/to/jieba/mp/file"
        >>> tokenizer_op = text.JiebaTokenizer(jieba_hmm_file, jieba_mp_file, mode=JiebaMode.MP, with_offsets=False)
        >>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op)
        >>> # If with_offsets=True, then output three columns {["token", dtype=str], ["offsets_start", dtype=uint32],
        >>> # ["offsets_limit", dtype=uint32]}
        >>> tokenizer_op = text.JiebaTokenizer(jieba_hmm_file, jieba_mp_file, mode=JiebaMode.MP, with_offsets=True)
        >>> text_file_dataset_1 = text_file_dataset_1.map(operations=tokenizer_op, input_columns=["text"],
        ...                                               output_columns=["token", "offsets_start", "offsets_limit"],
        ...                                               column_order=["token", "offsets_start", "offsets_limit"])
    """

    @check_jieba_init
    def __init__(self, hmm_path, mp_path, mode=JiebaMode.MIX, with_offsets=False):
        if not isinstance(mode, JiebaMode):
            raise TypeError("Wrong input type for mode, should be JiebaMode.")

        self.mode = mode
        self.__check_path__(hmm_path)
        self.hmm_path = hmm_path
        self.__check_path__(mp_path)
        self.mp_path = mp_path
        self.with_offsets = with_offsets
        self.words = []

    def parse(self):
        jieba_tokenizer = cde.JiebaTokenizerOperation(self.hmm_path, self.mp_path,
                                                      DE_C_INTER_JIEBA_MODE[self.mode],
                                                      self.with_offsets)
        for word in self.words:
            jieba_tokenizer.add_word(word[0], word[1])
        return jieba_tokenizer

    @check_jieba_add_word
    def add_word(self, word, freq=None):
        """
        Add a user defined word to JiebaTokenizer's dictionary.

        Args:
            word (str): The word to be added to the JiebaTokenizer instance.
                The added word will not be written into the built-in dictionary on disk.
            freq (int, optional): The frequency of the word to be added. The higher the frequency,
                the better chance the word will be tokenized (default=None, use default frequency).

        Examples:
            >>> from mindspore.dataset.text import JiebaMode
            >>> jieba_hmm_file = "/path/to/jieba/hmm/file"
            >>> jieba_mp_file = "/path/to/jieba/mp/file"
            >>> jieba_op = text.JiebaTokenizer(jieba_hmm_file, jieba_mp_file, mode=JiebaMode.MP)
            >>> sentence_piece_vocab_file = "/path/to/sentence/piece/vocab/file"
            >>> with open(sentence_piece_vocab_file, 'r') as f:
            ...     for line in f:
            ...         word = line.split(',')[0]
            ...         jieba_op.add_word(word)
            >>> text_file_dataset = text_file_dataset.map(operations=jieba_op, input_columns=["text"])
        """

        if freq is None:
            self.words.append((word, 0))
        else:
            self.words.append((word, freq))

    @check_jieba_add_dict
    def add_dict(self, user_dict):
        """
        Add a user defined dictionary of word-freq pairs to JiebaTokenizer's dictionary.

        Args:
            user_dict (Union[str, dict]): One of the two loading methods is file path(str) loading
                (according to the Jieba dictionary format) and the other is Python dictionary(dict) loading,
                Python Dict format: {word1:freq1, word2:freq2,...}.
                Jieba dictionary format : word(required), freq(optional), such as:

                .. code-block::

                    word1 freq1
                    word2 None
                    word3 freq3

                Only valid word-freq pairs in user provided file will be added into the dictionary.
                Rows containing invalid input will be ignored. No error nor warning Status is returned.

        Examples:
            >>> from mindspore.dataset.text import JiebaMode
            >>> jieba_hmm_file = "/path/to/jieba/hmm/file"
            >>> jieba_mp_file = "/path/to/jieba/mp/file"
            >>> user_dict = {"男默女泪": 10}
            >>> jieba_op = text.JiebaTokenizer(jieba_hmm_file, jieba_mp_file, mode=JiebaMode.MP)
            >>> jieba_op.add_dict(user_dict)
            >>> text_file_dataset = text_file_dataset.map(operations=jieba_op, input_columns=["text"])
        """

        if isinstance(user_dict, str):
            self.__add_dict_py_file(user_dict)
        elif isinstance(user_dict, dict):
            for k, v in user_dict.items():
                self.add_word(k, v)
        else:
            raise TypeError("The type of user_dict must be str or dict.")

    def __add_dict_py_file(self, file_path):
        """Add user defined words from a file."""
        words_list = self.__parser_file(file_path)
        for data in words_list:
            if data[1] is None:
                freq = 0
            else:
                freq = int(data[1])
            self.add_word(data[0], freq)

    def __parser_file(self, file_path):
        """Parse user defined word-freq pairs from a file."""
        if not os.path.exists(file_path):
            raise ValueError(
                "user dict file {} does not exist.".format(file_path))
        real_file_path = os.path.realpath(file_path)
        file_dict = open(real_file_path)
        data_re = re.compile('^\\s*([^\\s*]+?)\\s*([0-9]+)?\\s*$', re.U)
        words_list = []
        for item in file_dict:
            data = item.strip()
            if not isinstance(data, str):
                data = self.__decode(data)
            tmp = data_re.match(data)
            if not tmp:
                continue
            words = tmp.groups()
            words_list.append(words)
        file_dict.close()
        return words_list

    def __decode(self, data):
        """Decode the dict file to utf8."""
        try:
            data = data.decode('utf-8')
        except UnicodeDecodeError:
            raise ValueError("user dict file must be in utf8 format.")
        return data.lstrip('\ufeff')

    def __check_path__(self, model_path):
        """Check whether the model file exists."""
        if not os.path.exists(os.path.realpath(model_path)):
            raise ValueError(
                "jieba model file {} does not exist.".format(model_path))
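
# A minimal usage sketch (editorial illustration, not part of the original module):
# extending JiebaTokenizer's dictionary either word by word or in bulk. The paths
# below are placeholders and must point to real cppjieba model files.
#
#   >>> jieba_op = text.JiebaTokenizer("/path/to/jieba/hmm/file", "/path/to/jieba/mp/file", mode=JiebaMode.MP)
#   >>> jieba_op.add_word("深度学习", freq=10)            # single word, optional frequency
#   >>> jieba_op.add_dict({"人工智能": 20, "数据集": 5})   # Python dict: {word: freq}
#   >>> jieba_op.add_dict("/path/to/user/dict/file")       # file with "word freq" per line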


class Lookup(TextTensorOperation):
    """
    Look up a word and return its id according to the input vocabulary table.

    Args:
        vocab (Vocab): A vocabulary object.
        unknown_token (str, optional): Word used when an input token is out of vocabulary (OOV).
            The lookup result of an OOV token will be replaced with the id of unknown_token.
            If unknown_token is not specified or is itself OOV, a runtime error will be thrown
            (default=None, no unknown_token is specified).
        data_type (mindspore.dtype, optional): The data type that lookup operation maps
            string to (default=mindspore.int32).

    Examples:
        >>> # Load vocabulary from list
        >>> vocab = text.Vocab.from_list(['深', '圳', '欢', '迎', '您'])
        >>> # Use Lookup operator to map tokens to ids
        >>> lookup = text.Lookup(vocab)
        >>> text_file_dataset = text_file_dataset.map(operations=[lookup])
    """

    @check_lookup
    def __init__(self, vocab, unknown_token=None, data_type=mstype.int32):
        self.vocab = vocab
        self.unknown_token = unknown_token
        self.data_type = data_type

    def parse(self):
        return cde.LookupOperation(self.vocab, self.unknown_token, str(mstype_to_detype(self.data_type)))
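
# A minimal sketch (editorial illustration) of out-of-vocabulary handling: with
# unknown_token set, OOV tokens map to the id of unknown_token instead of raising
# a runtime error. The vocabulary below is illustrative.
#
#   >>> vocab = text.Vocab.from_list(["<unk>", "深", "圳"])
#   >>> lookup = text.Lookup(vocab, unknown_token="<unk>")
#   >>> # '深' -> 1, '圳' -> 2, and any OOV token -> 0 (the id of "<unk>")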


class Ngram(TextTensorOperation):
    """
    TensorOp to generate n-grams from a 1-D string Tensor.

    Refer to https://en.wikipedia.org/wiki/N-gram#Examples for an overview of what n-gram is and how it works.

    Args:
        n (list[int]): n in n-gram, which is a list of positive integers. For example, if n=[4, 3], then the result
            would be a 4-gram followed by a 3-gram in the same tensor. If the number of words is not enough to make up
            an n-gram, an empty string will be returned. For example, a 3-gram on ["mindspore", "best"] will produce
            an empty string.
        left_pad (tuple, optional): Padding performed on left side of the sequence shaped like ("pad_token", pad_width).
            `pad_width` will be capped at n-1. For example, specifying left_pad=("_", 2) would pad left side of the
            sequence with "__" (default=("", 0), no padding).
        right_pad (tuple, optional): Padding performed on right side of the sequence shaped like
            ("pad_token", pad_width). `pad_width` will be capped at n-1. For example, specifying right_pad=("_", 2)
            would pad right side of the sequence with "__" (default=("", 0), no padding).
        separator (str, optional): Symbol used to join strings together. For example, if a 2-gram is
            ["mindspore", "amazing"] with separator="-", the result would be ["mindspore-amazing"]
            (default=' ', a whitespace is used as separator).

    Examples:
        >>> ngram_op = text.Ngram(3, separator="-")
        >>> output = ngram_op(["WildRose Country", "Canada's Ocean Playground", "Land of Living Skies"])
        >>> # output
        >>> # ["WildRose Country-Canada's Ocean Playground-Land of Living Skies"]
        >>> # same ngram_op called through map
        >>> text_file_dataset = text_file_dataset.map(operations=ngram_op)
    """

    @check_ngram
    def __init__(self, n, left_pad=("", 0), right_pad=("", 0), separator=" "):
        self.ngrams = n
        self.left_pad = left_pad
        self.right_pad = right_pad
        self.separator = separator

    def parse(self):
        return cde.NgramOperation(self.ngrams, self.left_pad, self.right_pad, self.separator)
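
# A minimal padding sketch (editorial illustration): left_pad / right_pad extend
# the sequence before n-grams are formed, and pad_width is capped at n-1.
#
#   >>> ngram_op = text.Ngram([2], left_pad=("_", 1), right_pad=("_", 1), separator="-")
#   >>> output = ngram_op(["mindspore", "amazing"])
#   >>> # output: ["_-mindspore", "mindspore-amazing", "amazing-_"]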


class SentencePieceTokenizer(TextTensorOperation):
    """
    Tokenize a scalar token or 1-D tokens to subword tokens by sentencepiece.

    Args:
        mode (Union[str, SentencePieceVocab]): If the input parameter is a path to a SentencePiece model file,
            its type should be str. If the input parameter is a SentencePieceVocab object, its type should be
            SentencePieceVocab.
        out_type (SPieceTokenizerOutType): The type of output, it can be any of [SPieceTokenizerOutType.STRING,
            SPieceTokenizerOutType.INT].

            - SPieceTokenizerOutType.STRING, means output type of SentencePiece Tokenizer is string.
            - SPieceTokenizerOutType.INT, means output type of SentencePiece Tokenizer is int.

    Examples:
        >>> from mindspore.dataset.text import SentencePieceModel, SPieceTokenizerOutType
        >>> sentence_piece_vocab_file = "/path/to/sentence/piece/vocab/file"
        >>> vocab = text.SentencePieceVocab.from_file([sentence_piece_vocab_file], 5000, 0.9995,
        ...                                           SentencePieceModel.UNIGRAM, {})
        >>> tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
        >>> text_file_dataset = text_file_dataset.map(operations=tokenizer)
    """

    @check_sentence_piece_tokenizer
    def __init__(self, mode, out_type):
        self.mode = mode
        self.out_type = out_type

    def parse(self):
        return cde.SentencePieceTokenizerOperation(self.mode, DE_C_INTER_SENTENCEPIECE_OUTTYPE[self.out_type])


class SlidingWindow(TextTensorOperation):
    """
    Construct a tensor from given data (only 1-D data is supported for now), where each element in the dimension
    axis is a slice of data starting at the corresponding position, with a specified width.

    Args:
        width (int): The width of the window. It must be an integer and greater than zero.
        axis (int, optional): The axis along which the sliding window is computed (default=0).

    Examples:
        >>> dataset = ds.NumpySlicesDataset(data=[[1, 2, 3, 4, 5]], column_names="col1")
        >>> # Data before
        >>> # |       col1        |
        >>> # +-------------------+
        >>> # | [[1, 2, 3, 4, 5]] |
        >>> # +-------------------+
        >>> dataset = dataset.map(operations=text.SlidingWindow(3, 0))
        >>> # Data after
        >>> # |     col1     |
        >>> # +--------------+
        >>> # | [[1, 2, 3],  |
        >>> # |  [2, 3, 4],  |
        >>> # |  [3, 4, 5]]  |
        >>> # +--------------+
    """

    @check_slidingwindow
    def __init__(self, width, axis=0):
        self.width = width
        self.axis = axis

    def parse(self):
        return cde.SlidingWindowOperation(self.width, self.axis)


class ToNumber(TextTensorOperation):
    """
    Tensor operation to convert every element of a string tensor to a number.

    Strings are cast according to the rules specified in the following links, except that strings representing
    negative numbers cannot be cast to an unsigned integer type:
    https://en.cppreference.com/w/cpp/string/basic_string/stof,
    https://en.cppreference.com/w/cpp/string/basic_string/stoul.

    Args:
        data_type (mindspore.dtype): Type to be cast to. Must be a numeric type in mindspore.dtype.

    Raises:
        RuntimeError: If strings are invalid to cast, or are out of range after being cast.

    Examples:
        >>> from mindspore import dtype as mstype
        >>> data = [["1", "2", "3"]]
        >>> dataset = ds.NumpySlicesDataset(data)
        >>> to_number_op = text.ToNumber(mstype.int8)
        >>> dataset = dataset.map(operations=to_number_op)
    """

    @check_to_number
    def __init__(self, data_type):
        data_type = mstype_to_detype(data_type)
        self.data_type = str(data_type)

    def parse(self):
        return cde.ToNumberOperation(self.data_type)


class TruncateSequencePair(TextTensorOperation):
    """
    Truncate a pair of rank-1 tensors such that the total length is less than max_length.

    This operation takes two input tensors and returns two output tensors.

    Args:
        max_length (int): Maximum length required.

    Examples:
        >>> dataset = ds.NumpySlicesDataset(data={"col1": [[1, 2, 3]], "col2": [[4, 5]]})
        >>> # Data before
        >>> # |   col1    |   col2    |
        >>> # +-----------+-----------+
        >>> # | [1, 2, 3] |  [4, 5]   |
        >>> # +-----------+-----------+
        >>> truncate_sequence_pair_op = text.TruncateSequencePair(max_length=4)
        >>> dataset = dataset.map(operations=truncate_sequence_pair_op)
        >>> # Data after
        >>> # |   col1    |   col2    |
        >>> # +-----------+-----------+
        >>> # |  [1, 2]   |  [4, 5]   |
        >>> # +-----------+-----------+
    """

    @check_pair_truncate
    def __init__(self, max_length):
        self.max_length = max_length

    def parse(self):
        return cde.TruncateSequencePairOperation(self.max_length)
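
# A minimal sketch (editorial illustration) of the truncation rule suggested by
# the example above: elements are dropped from the end of the currently longer
# sequence until the combined length fits max_length.
#
#   >>> # col1=[1, 2, 3], col2=[4, 5], max_length=4
#   >>> # total length 5 > 4 and col1 is longer -> drop its last element
#   >>> # result: col1=[1, 2], col2=[4, 5]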


class UnicodeCharTokenizer(TextTensorOperation):
    """
    Tokenize a scalar tensor of UTF-8 string to Unicode characters.

    Args:
        with_offsets (bool, optional): Whether or not output offsets of tokens (default=False).

    Examples:
        >>> # If with_offsets=False, default output one column {["text", dtype=str]}
        >>> tokenizer_op = text.UnicodeCharTokenizer(with_offsets=False)
        >>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op)
        >>> # If with_offsets=True, then output three columns {["token", dtype=str], ["offsets_start", dtype=uint32],
        >>> # ["offsets_limit", dtype=uint32]}
        >>> tokenizer_op = text.UnicodeCharTokenizer(with_offsets=True)
        >>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op, input_columns=["text"],
        ...                                           output_columns=["token", "offsets_start", "offsets_limit"],
        ...                                           column_order=["token", "offsets_start", "offsets_limit"])
    """

    @check_with_offsets
    def __init__(self, with_offsets=False):
        self.with_offsets = with_offsets

    def parse(self):
        return cde.UnicodeCharTokenizerOperation(self.with_offsets)


class WordpieceTokenizer(TextTensorOperation):
    """
    Tokenize a scalar token or 1-D tokens to 1-D subword tokens.

    Args:
        vocab (Vocab): A vocabulary object.
        suffix_indicator (str, optional): Used to show that the subword is the last part of a word (default='##').
        max_bytes_per_token (int, optional): Tokens exceeding this length will not be further split (default=100).
        unknown_token (str, optional): When a token cannot be found: if 'unknown_token' is an empty string,
            return the token directly, else return 'unknown_token' (default='[UNK]').
        with_offsets (bool, optional): Whether or not output offsets of tokens (default=False).

    Examples:
        >>> vocab_list = ["book", "cholera", "era", "favor", "##ite", "my", "is", "love", "dur", "##ing", "the"]
        >>> vocab = text.Vocab.from_list(vocab_list)
        >>> # If with_offsets=False, default output one column {["text", dtype=str]}
        >>> tokenizer_op = text.WordpieceTokenizer(vocab=vocab, unknown_token='[UNK]',
        ...                                        max_bytes_per_token=100, with_offsets=False)
        >>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op)
        >>> # If with_offsets=True, then output three columns {["token", dtype=str], ["offsets_start", dtype=uint32],
        >>> # ["offsets_limit", dtype=uint32]}
        >>> tokenizer_op = text.WordpieceTokenizer(vocab=vocab, unknown_token='[UNK]',
        ...                                        max_bytes_per_token=100, with_offsets=True)
        >>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op, input_columns=["text"],
        ...                                           output_columns=["token", "offsets_start", "offsets_limit"],
        ...                                           column_order=["token", "offsets_start", "offsets_limit"])
    """

    @check_wordpiece_tokenizer
    def __init__(self, vocab, suffix_indicator='##', max_bytes_per_token=100,
                 unknown_token='[UNK]', with_offsets=False):
        self.vocab = vocab
        self.suffix_indicator = suffix_indicator
        self.max_bytes_per_token = max_bytes_per_token
        self.unknown_token = unknown_token
        self.with_offsets = with_offsets

    def parse(self):
        return cde.WordpieceTokenizerOperation(self.vocab, self.suffix_indicator, self.max_bytes_per_token,
                                               self.unknown_token, self.with_offsets)


class PythonTokenizer:
    """
    Class that applies a user-defined string tokenizer to the input string.

    Args:
        tokenizer (Callable): Python function that takes a `str` and returns a list of `str` as tokens.

    Examples:
        >>> def my_tokenizer(line):
        ...     return line.split()
        >>> text_file_dataset = text_file_dataset.map(operations=text.PythonTokenizer(my_tokenizer))
    """

    @check_python_tokenizer
    def __init__(self, tokenizer):
        self.pyfunc = tokenizer
        self.tokenizer = np.vectorize(lambda x: np.array(tokenizer(x), dtype='U'), signature='()->(n)')
        self.random = False

    def __call__(self, in_array):
        if not isinstance(in_array, np.ndarray):
            raise TypeError("input should be a NumPy array. Got {}.".format(type(in_array)))
        if in_array.dtype.type is np.bytes_:
            in_array = to_str(in_array)
        try:
            tokens = self.tokenizer(in_array)
        except Exception as e:
            raise RuntimeError("Error occurred in Pyfunc [" + str(self.pyfunc.__name__) + "], error message: " + str(e))
        return tokens
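
# A minimal eager-call sketch (editorial illustration): PythonTokenizer can be
# invoked directly on a NumPy string array outside of a map() pipeline.
#
#   >>> import numpy as np
#   >>> tokenizer = text.PythonTokenizer(lambda line: line.split())
#   >>> tokenizer(np.array("Welcome to Shenzhen"))
#   >>> # roughly: array(['Welcome', 'to', 'Shenzhen'], dtype='<U8')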


if platform.system().lower() != 'windows':
    DE_C_INTER_NORMALIZE_FORM = {
        NormalizeForm.NONE: cde.NormalizeForm.DE_NORMALIZE_NONE,
        NormalizeForm.NFC: cde.NormalizeForm.DE_NORMALIZE_NFC,
        NormalizeForm.NFKC: cde.NormalizeForm.DE_NORMALIZE_NFKC,
        NormalizeForm.NFD: cde.NormalizeForm.DE_NORMALIZE_NFD,
        NormalizeForm.NFKD: cde.NormalizeForm.DE_NORMALIZE_NFKD
    }


    class BasicTokenizer(TextTensorOperation):
        """
        Tokenize a scalar tensor of UTF-8 string by specific rules.

        Note:
            BasicTokenizer is not supported on Windows platform yet.

        Args:
            lower_case (bool, optional): If True, apply CaseFold, NormalizeUTF8 with `NFD` mode, RegexReplace operation
                on input text to fold the text to lower case and strip accent characters. If False, only apply
                NormalizeUTF8 operation with the specified mode on input text (default=False).
            keep_whitespace (bool, optional): If True, the whitespace will be kept in output tokens (default=False).
            normalization_form (NormalizeForm, optional): Used to specify a specific normalize mode
                (default=NormalizeForm.NONE). This is only effective when `lower_case` is False. It can be any of
                [NormalizeForm.NONE, NormalizeForm.NFC, NormalizeForm.NFKC, NormalizeForm.NFD, NormalizeForm.NFKD].

                - NormalizeForm.NONE, do nothing for input string tensor.
                - NormalizeForm.NFC, normalize with Normalization Form C.
                - NormalizeForm.NFKC, normalize with Normalization Form KC.
                - NormalizeForm.NFD, normalize with Normalization Form D.
                - NormalizeForm.NFKD, normalize with Normalization Form KD.

            preserve_unused_token (bool, optional): If True, do not split special tokens like
                '[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]' (default=True).
            with_offsets (bool, optional): Whether or not output offsets of tokens (default=False).

        Examples:
            >>> from mindspore.dataset.text import NormalizeForm
            >>>
            >>> # If with_offsets=False, default output one column {["text", dtype=str]}
            >>> tokenizer_op = text.BasicTokenizer(lower_case=False,
            ...                                    keep_whitespace=False,
            ...                                    normalization_form=NormalizeForm.NONE,
            ...                                    preserve_unused_token=True,
            ...                                    with_offsets=False)
            >>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op)
            >>> # If with_offsets=True, then output three columns {["token", dtype=str],
            >>> #                                                  ["offsets_start", dtype=uint32],
            >>> #                                                  ["offsets_limit", dtype=uint32]}
            >>> tokenizer_op = text.BasicTokenizer(lower_case=False,
            ...                                    keep_whitespace=False,
            ...                                    normalization_form=NormalizeForm.NONE,
            ...                                    preserve_unused_token=True,
            ...                                    with_offsets=True)
            >>> text_file_dataset_1 = text_file_dataset_1.map(operations=tokenizer_op, input_columns=["text"],
            ...                                               output_columns=["token", "offsets_start",
            ...                                                               "offsets_limit"],
            ...                                               column_order=["token", "offsets_start",
            ...                                                             "offsets_limit"])

        """

        @check_basic_tokenizer
        def __init__(self, lower_case=False, keep_whitespace=False, normalization_form=NormalizeForm.NONE,
                     preserve_unused_token=True, with_offsets=False):
            if not isinstance(normalization_form, NormalizeForm):
                raise TypeError("Wrong input type for normalization_form, should be enum of 'NormalizeForm'.")

            self.lower_case = lower_case
            self.keep_whitespace = keep_whitespace
            self.normalization_form = DE_C_INTER_NORMALIZE_FORM[normalization_form]
            self.preserve_unused_token = preserve_unused_token
            self.with_offsets = with_offsets

        def parse(self):
            return cde.BasicTokenizerOperation(self.lower_case, self.keep_whitespace, self.normalization_form,
                                               self.preserve_unused_token, self.with_offsets)
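
    # A minimal sketch (editorial illustration, indicative output only): with
    # lower_case=True the text is case-folded and accents are stripped before
    # tokens are split on whitespace and punctuation.
    #
    #   >>> tokenizer_op = text.BasicTokenizer(lower_case=True)
    #   >>> # "Welcome to Beijing!"  ->  ["welcome", "to", "beijing", "!"]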


    class BertTokenizer(TextTensorOperation):
        """
        Tokenizer used for Bert text process.

        Note:
            BertTokenizer is not supported on Windows platform yet.

        Args:
            vocab (Vocab): A vocabulary object.
            suffix_indicator (str, optional): Used to show that the subword is the last part of a word (default='##').
            max_bytes_per_token (int, optional): Tokens exceeding this length will not be further
                split (default=100).
            unknown_token (str, optional): When an unknown token is found, return the token directly if `unknown_token`
                is an empty string, else return `unknown_token` instead (default='[UNK]').
            lower_case (bool, optional): If True, apply CaseFold, NormalizeUTF8 with `NFD` mode, RegexReplace operation
                on input text to fold the text to lower case and strip accented characters. If False, only apply
                NormalizeUTF8 operation with the specified mode on input text (default=False).
            keep_whitespace (bool, optional): If True, the whitespace will be kept in output tokens (default=False).
            normalization_form (NormalizeForm, optional): This parameter is used to specify a specific normalize mode,
                only effective when `lower_case` is False. See NormalizeUTF8 for details (default=NormalizeForm.NONE).
            preserve_unused_token (bool, optional): If True, do not split special tokens like
                '[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]' (default=True).
            with_offsets (bool, optional): Whether or not output offsets of tokens (default=False).

        Examples:
            >>> from mindspore.dataset.text import NormalizeForm
            >>>
            >>> # If with_offsets=False, default output one column {["text", dtype=str]}
            >>> vocab_list = ["床", "前", "明", "月", "光", "疑", "是", "地", "上", "霜", "举", "头", "望", "低",
            ...               "思", "故", "乡", "繁", "體", "字", "嘿", "哈", "大", "笑", "嘻", "i", "am", "mak",
            ...               "make", "small", "mistake", "##s", "during", "work", "##ing", "hour", "", "",
            ...               "", "", "+", "/", "-", "=", "12", "28", "40", "16", " ", "I", "[CLS]", "[SEP]",
            ...               "[UNK]", "[PAD]", "[MASK]", "[unused1]", "[unused10]"]
            >>> vocab = text.Vocab.from_list(vocab_list)
            >>> tokenizer_op = text.BertTokenizer(vocab=vocab, suffix_indicator='##', max_bytes_per_token=100,
            ...                                   unknown_token='[UNK]', lower_case=False, keep_whitespace=False,
            ...                                   normalization_form=NormalizeForm.NONE, preserve_unused_token=True,
            ...                                   with_offsets=False)
            >>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op)
            >>> # If with_offsets=True, then output three columns {["token", dtype=str],
            >>> #                                                  ["offsets_start", dtype=uint32],
            >>> #                                                  ["offsets_limit", dtype=uint32]}
            >>> tokenizer_op = text.BertTokenizer(vocab=vocab, suffix_indicator='##', max_bytes_per_token=100,
            ...                                   unknown_token='[UNK]', lower_case=False, keep_whitespace=False,
            ...                                   normalization_form=NormalizeForm.NONE, preserve_unused_token=True,
            ...                                   with_offsets=True)
            >>> text_file_dataset_1 = text_file_dataset_1.map(operations=tokenizer_op, input_columns=["text"],
            ...                                               output_columns=["token", "offsets_start",
            ...                                                               "offsets_limit"],
            ...                                               column_order=["token", "offsets_start",
            ...                                                             "offsets_limit"])

        """

        @check_bert_tokenizer
        def __init__(self, vocab, suffix_indicator='##', max_bytes_per_token=100, unknown_token='[UNK]',
                     lower_case=False, keep_whitespace=False, normalization_form=NormalizeForm.NONE,
                     preserve_unused_token=True, with_offsets=False):
            if not isinstance(normalization_form, NormalizeForm):
                raise TypeError("Wrong input type for normalization_form, should be enum of 'NormalizeForm'.")

            self.vocab = vocab
            self.suffix_indicator = suffix_indicator
            self.max_bytes_per_token = max_bytes_per_token
            self.unknown_token = unknown_token
            self.lower_case = lower_case
            self.keep_whitespace = keep_whitespace
            self.normalization_form = DE_C_INTER_NORMALIZE_FORM[normalization_form]
            self.preserve_unused_token = preserve_unused_token
            self.with_offsets = with_offsets

        def parse(self):
            return cde.BertTokenizerOperation(self.vocab, self.suffix_indicator, self.max_bytes_per_token,
                                              self.unknown_token, self.lower_case, self.keep_whitespace,
                                              self.normalization_form, self.preserve_unused_token, self.with_offsets)


    class CaseFold(TextTensorOperation):
        """
        Apply case fold operation on UTF-8 string tensor, which is aggressive and can convert more characters into
        lower case than simple lowercasing.

        Note:
            CaseFold is not supported on Windows platform yet.

        Examples:
            >>> case_op = text.CaseFold()
            >>> text_file_dataset = text_file_dataset.map(operations=case_op)
        """

        def parse(self):
            return cde.CaseFoldOperation()


    class NormalizeUTF8(TextTensorOperation):
        """
        Apply normalize operation on UTF-8 string tensor.

        Note:
            NormalizeUTF8 is not supported on Windows platform yet.

        Args:
            normalize_form (NormalizeForm, optional): Valid values can be any of [NormalizeForm.NONE,
                NormalizeForm.NFC, NormalizeForm.NFKC, NormalizeForm.NFD, NormalizeForm.NFKD], i.e. none or one of
                the four Unicode normalization forms (default=NormalizeForm.NFKC).
                See http://unicode.org/reports/tr15/ for details.

                - NormalizeForm.NONE, do nothing for input string tensor.
                - NormalizeForm.NFC, normalize with Normalization Form C.
                - NormalizeForm.NFKC, normalize with Normalization Form KC.
                - NormalizeForm.NFD, normalize with Normalization Form D.
                - NormalizeForm.NFKD, normalize with Normalization Form KD.

        Examples:
            >>> from mindspore.dataset.text import NormalizeForm
            >>> normalize_op = text.NormalizeUTF8(normalize_form=NormalizeForm.NFC)
            >>> text_file_dataset = text_file_dataset.map(operations=normalize_op)
        """

        def __init__(self, normalize_form=NormalizeForm.NFKC):
            if not isinstance(normalize_form, NormalizeForm):
                raise TypeError("Wrong input type for normalize_form, should be enum of 'NormalizeForm'.")

            normalize_form = replace_none(normalize_form, NormalizeForm.NFKC)
            self.normalize_form = DE_C_INTER_NORMALIZE_FORM[normalize_form]

        def parse(self):
            return cde.NormalizeUTF8Operation(self.normalize_form)


    class RegexReplace(TextTensorOperation):
        """
        Replace a part of UTF-8 string tensor with given text according to regular expressions.

        See https://unicode-org.github.io/icu/userguide/strings/regexp.html for supported regex patterns.

        Note:
            RegexReplace is not supported on Windows platform yet.

        Args:
            pattern (str): The regex expression pattern.
            replace (str): The string to replace the matched element.
            replace_all (bool, optional): If False, only replace the first matched element;
                if True, replace all matched elements (default=True).

        Examples:
            >>> pattern = 'Canada'
            >>> replace = 'China'
            >>> replace_op = text.RegexReplace(pattern, replace)
            >>> text_file_dataset = text_file_dataset.map(operations=replace_op)
        """

        @check_regex_replace
        def __init__(self, pattern, replace, replace_all=True):
            self.pattern = pattern
            self.replace = replace
            self.replace_all = replace_all

        def parse(self):
            return cde.RegexReplaceOperation(self.pattern, self.replace, self.replace_all)
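
    # A minimal sketch (editorial illustration) of replace_all semantics,
    # assuming an input column containing the string "apple apple":
    #
    #   >>> replace_op = text.RegexReplace(pattern="apple", replace="orange", replace_all=False)
    #   >>> # "apple apple" -> "orange apple"   (only the first match is replaced)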


    class RegexTokenizer(TextTensorOperation):
        """
        Tokenize a scalar tensor of UTF-8 string by regex expression pattern.

        See https://unicode-org.github.io/icu/userguide/strings/regexp.html for supported regex patterns.

        Note:
            RegexTokenizer is not supported on Windows platform yet.

        Args:
            delim_pattern (str): The pattern of regex delimiters.
                The original string will be split by matched elements.
            keep_delim_pattern (str, optional): The string matched by 'delim_pattern' can be kept as a token
                if it can be matched by 'keep_delim_pattern'. The default value is an empty str
                which means that delimiters will not be kept as an output token (default='').
            with_offsets (bool, optional): Whether or not output offsets of tokens (default=False).

        Examples:
            >>> # If with_offsets=False, default output is one column {["text", dtype=str]}
            >>> delim_pattern = r"[ |,]"
            >>> tokenizer_op = text.RegexTokenizer(delim_pattern, with_offsets=False)
            >>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op)
            >>> # If with_offsets=True, then output three columns {["token", dtype=str],
            >>> #                                                  ["offsets_start", dtype=uint32],
            >>> #                                                  ["offsets_limit", dtype=uint32]}
            >>> tokenizer_op = text.RegexTokenizer(delim_pattern, with_offsets=True)
            >>> text_file_dataset_1 = text_file_dataset_1.map(operations=tokenizer_op, input_columns=["text"],
            ...                                               output_columns=["token", "offsets_start",
            ...                                                               "offsets_limit"],
            ...                                               column_order=["token", "offsets_start",
            ...                                                             "offsets_limit"])
        """

        @check_regex_tokenizer
        def __init__(self, delim_pattern, keep_delim_pattern='', with_offsets=False):
            self.delim_pattern = delim_pattern
            self.keep_delim_pattern = keep_delim_pattern
            self.with_offsets = with_offsets

        def parse(self):
            return cde.RegexTokenizerOperation(self.delim_pattern, self.keep_delim_pattern, self.with_offsets)


    class UnicodeScriptTokenizer(TextTensorOperation):
        """
        Tokenize a scalar tensor of UTF-8 string based on Unicode script boundaries.

        Note:
            UnicodeScriptTokenizer is not supported on Windows platform yet.

        Args:
            keep_whitespace (bool, optional): Whether or not to emit whitespace tokens (default=False).
            with_offsets (bool, optional): Whether or not output offsets of tokens (default=False).

        Examples:
            >>> # If with_offsets=False, default output one column {["text", dtype=str]}
            >>> tokenizer_op = text.UnicodeScriptTokenizer(keep_whitespace=True, with_offsets=False)
            >>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op)
            >>> # If with_offsets=True, then output three columns {["token", dtype=str],
            >>> #                                                  ["offsets_start", dtype=uint32],
            >>> #                                                  ["offsets_limit", dtype=uint32]}
            >>> tokenizer_op = text.UnicodeScriptTokenizer(keep_whitespace=True, with_offsets=True)
            >>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op, input_columns=["text"],
            ...                                           output_columns=["token", "offsets_start", "offsets_limit"],
            ...                                           column_order=["token", "offsets_start", "offsets_limit"])

        """

        @check_unicode_script_tokenizer
        def __init__(self, keep_whitespace=False, with_offsets=False):
            keep_whitespace = replace_none(keep_whitespace, False)
            with_offsets = replace_none(with_offsets, False)
            self.keep_whitespace = keep_whitespace
            self.with_offsets = with_offsets

        def parse(self):
            return cde.UnicodeScriptTokenizerOperation(self.keep_whitespace, self.with_offsets)


    class WhitespaceTokenizer(TextTensorOperation):
        """
        Tokenize a scalar tensor of UTF-8 string on ICU4C defined whitespaces, such as: ' ', '\\\\t', '\\\\r', '\\\\n'.

        Note:
            WhitespaceTokenizer is not supported on Windows platform yet.

        Args:
            with_offsets (bool, optional): Whether or not output offsets of tokens (default=False).

        Examples:
            >>> # If with_offsets=False, default output one column {["text", dtype=str]}
            >>> tokenizer_op = text.WhitespaceTokenizer(with_offsets=False)
            >>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op)
            >>> # If with_offsets=True, then output three columns {["token", dtype=str],
            >>> #                                                  ["offsets_start", dtype=uint32],
            >>> #                                                  ["offsets_limit", dtype=uint32]}
            >>> tokenizer_op = text.WhitespaceTokenizer(with_offsets=True)
            >>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op, input_columns=["text"],
            ...                                           output_columns=["token", "offsets_start", "offsets_limit"],
            ...                                           column_order=["token", "offsets_start", "offsets_limit"])
        """

        @check_with_offsets
        def __init__(self, with_offsets=False):
            self.with_offsets = with_offsets

        def parse(self):
            return cde.WhitespaceTokenizerOperation(self.with_offsets)
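
# A minimal end-to-end sketch (editorial illustration) combining two of the
# operations above, assuming a text file whose lines hold whitespace-separated
# integers; the path is a placeholder and WhitespaceTokenizer requires a
# non-Windows platform.
#
#   >>> dataset = ds.TextFileDataset(dataset_files=["/path/to/number_file"], shuffle=False)
#   >>> dataset = dataset.map(operations=[text.WhitespaceTokenizer(), text.ToNumber(mstype.int32)])
#   >>> # a line such as "1 2 3" becomes array([1, 2, 3], dtype=int32)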