# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Testing tokenizer ops in DE
"""
import numpy as np
import mindspore.dataset as ds
from mindspore import log as logger
import mindspore.dataset.text as text

DATA_FILE = "../data/dataset/testTokenizerData/1.txt"
NORMALIZE_FILE = "../data/dataset/testTokenizerData/normalize.txt"
REGEX_REPLACE_FILE = "../data/dataset/testTokenizerData/regex_replace.txt"
REGEX_TOKENIZER_FILE = "../data/dataset/testTokenizerData/regex_tokenizer.txt"


def split_by_unicode_char(input_strs):
    """
    Split UTF-8 strings into lists of Unicode characters.
    """
    out = []
    for s in input_strs:
        out.append([c for c in s])
    return out


def test_unicode_char_tokenizer_default():
    """
    Test UnicodeCharTokenizer with default parameters
    """
    input_strs = ("Welcome to Beijing!", "北京欢迎您!", "我喜欢English!", "  ")
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.UnicodeCharTokenizer()
    dataset = dataset.map(operations=tokenizer)
    tokens = []
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        token = text.to_str(i['text']).tolist()
        tokens.append(token)
    logger.info("The out tokens are: {}".format(tokens))
    assert split_by_unicode_char(input_strs) == tokens


def test_unicode_char_tokenizer_with_offsets():
    """
    Test UnicodeCharTokenizer with with_offsets=True
    """
    input_strs = ("Welcome to Beijing!", "北京欢迎您!", "我喜欢English!", "  ")
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.UnicodeCharTokenizer(with_offsets=True)
    dataset = dataset.map(operations=tokenizer, input_columns=['text'],
                          output_columns=['token', 'offsets_start', 'offsets_limit'],
                          column_order=['token', 'offsets_start', 'offsets_limit'])
    tokens = []
    # Offsets are UTF-8 byte positions, so each Chinese character spans 3 bytes.
    expected_offsets_start = [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18],
                              [0, 3, 6, 9, 12, 15], [0, 3, 6, 9, 10, 11, 12, 13, 14, 15, 16], [0, 1]]
    expected_offsets_limit = [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
                              [3, 6, 9, 12, 15, 18], [3, 6, 9, 10, 11, 12, 13, 14, 15, 16, 17], [1, 2]]
    count = 0
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        token = text.to_str(i['token']).tolist()
        tokens.append(token)
        np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
        np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
        count += 1
    logger.info("The out tokens are: {}".format(tokens))
    assert split_by_unicode_char(input_strs) == tokens


def test_whitespace_tokenizer_default():
    """
    Test WhitespaceTokenizer with default parameters
    """
    whitespace_strs = [["Welcome", "to", "Beijing!"],
                       ["北京欢迎您!"],
                       ["我喜欢English!"],
                       [""]]
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.WhitespaceTokenizer()
    dataset = dataset.map(operations=tokenizer)
    tokens = []
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        token = text.to_str(i['text']).tolist()
        tokens.append(token)
    logger.info("The out tokens are: {}".format(tokens))
    assert whitespace_strs == tokens


def test_whitespace_tokenizer_with_offsets():
    """
    Test WhitespaceTokenizer with with_offsets=True
    """
    whitespace_strs = [["Welcome", "to", "Beijing!"],
                       ["北京欢迎您!"],
                       ["我喜欢English!"],
                       [""]]
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.WhitespaceTokenizer(with_offsets=True)
    dataset = dataset.map(operations=tokenizer, input_columns=['text'],
                          output_columns=['token', 'offsets_start', 'offsets_limit'],
                          column_order=['token', 'offsets_start', 'offsets_limit'])
    tokens = []
    expected_offsets_start = [[0, 8, 11], [0], [0], [0]]
    expected_offsets_limit = [[7, 10, 19], [18], [17], [0]]
    count = 0
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        token = text.to_str(i['token']).tolist()
        tokens.append(token)
        np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
        np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
        count += 1

    logger.info("The out tokens are: {}".format(tokens))
    assert whitespace_strs == tokens


def test_unicode_script_tokenizer_default():
    """
    Test UnicodeScriptTokenizer with keep_whitespace=False
    """
    unicode_script_strs = [["Welcome", "to", "Beijing", "!"],
                           ["北京欢迎您", "!"],
                           ["我喜欢", "English", "!"],
                           [""]]
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.UnicodeScriptTokenizer(keep_whitespace=False)
    dataset = dataset.map(operations=tokenizer)

    tokens = []
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        token = text.to_str(i['text']).tolist()
        tokens.append(token)
    logger.info("The out tokens are: {}".format(tokens))
    assert unicode_script_strs == tokens


def test_unicode_script_tokenizer_default2():
    """
    Test UnicodeScriptTokenizer with keep_whitespace=True
    """
    unicode_script_strs2 = [["Welcome", " ", "to", " ", "Beijing", "!"],
                            ["北京欢迎您", "!"],
                            ["我喜欢", "English", "!"],
                            ["  "]]
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.UnicodeScriptTokenizer(keep_whitespace=True)
    dataset = dataset.map(operations=tokenizer)
    tokens = []
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        token = text.to_str(i['text']).tolist()
        tokens.append(token)
    logger.info("The out tokens are: {}".format(tokens))
    assert unicode_script_strs2 == tokens


def test_unicode_script_tokenizer_with_offsets():
    """
    Test UnicodeScriptTokenizer with keep_whitespace=False and with_offsets=True
    """
    unicode_script_strs = [["Welcome", "to", "Beijing", "!"],
                           ["北京欢迎您", "!"],
                           ["我喜欢", "English", "!"],
                           [""]]
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.UnicodeScriptTokenizer(keep_whitespace=False, with_offsets=True)
    dataset = dataset.map(operations=tokenizer, input_columns=['text'],
                          output_columns=['token', 'offsets_start', 'offsets_limit'],
                          column_order=['token', 'offsets_start', 'offsets_limit'])
    tokens = []
    expected_offsets_start = [[0, 8, 11, 18], [0, 15], [0, 9, 16], [0]]
    expected_offsets_limit = [[7, 10, 18, 19], [15, 18], [9, 16, 17], [0]]
    count = 0
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        token = text.to_str(i['token']).tolist()
        tokens.append(token)
        np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
        np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
        count += 1
    logger.info("The out tokens are: {}".format(tokens))
    assert unicode_script_strs == tokens


def test_unicode_script_tokenizer_with_offsets2():
    """
    Test UnicodeScriptTokenizer with keep_whitespace=True and with_offsets=True
    """
    unicode_script_strs2 = [["Welcome", " ", "to", " ", "Beijing", "!"],
                            ["北京欢迎您", "!"],
                            ["我喜欢", "English", "!"],
                            ["  "]]
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.UnicodeScriptTokenizer(keep_whitespace=True, with_offsets=True)
    dataset = dataset.map(operations=tokenizer, input_columns=['text'],
                          output_columns=['token', 'offsets_start', 'offsets_limit'],
                          column_order=['token', 'offsets_start', 'offsets_limit'])
    tokens = []
    expected_offsets_start = [[0, 7, 8, 10, 11, 18], [0, 15], [0, 9, 16], [0]]
    expected_offsets_limit = [[7, 8, 10, 11, 18, 19], [15, 18], [9, 16, 17], [2]]
    count = 0
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        token = text.to_str(i['token']).tolist()
        tokens.append(token)
        np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
        np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
        count += 1
    logger.info("The out tokens are: {}".format(tokens))
    assert unicode_script_strs2 == tokens


def test_case_fold():
    """
    Test CaseFold
    """
    expect_strs = ["welcome to beijing!", "北京欢迎您!", "我喜欢english!", "  "]
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    op = text.CaseFold()
    dataset = dataset.map(operations=op)

    lower_strs = []
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        token = text.to_str(i['text']).tolist()
        lower_strs.append(token)
    assert lower_strs == expect_strs


def test_normalize_utf8():
    """
    Test NormalizeUTF8
    """

    def normalize(normalize_form):
        dataset = ds.TextFileDataset(NORMALIZE_FILE, shuffle=False)
        normalize_op = text.NormalizeUTF8(normalize_form=normalize_form)
        dataset = dataset.map(operations=normalize_op)
        out_bytes = []
        out_texts = []
        for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
            out_bytes.append(i['text'])
            out_texts.append(text.to_str(i['text']).tolist())
        logger.info("The out bytes are: {}".format(out_bytes))
        logger.info("The out texts are: {}".format(out_texts))
        return out_bytes

    expect_normalize_data = [
        # NFC
        [b'\xe1\xb9\xa9', b'\xe1\xb8\x8d\xcc\x87', b'q\xcc\xa3\xcc\x87',
         b'\xef\xac\x81', b'2\xe2\x81\xb5', b'\xe1\xba\x9b\xcc\xa3'],
        # NFKC
        [b'\xe1\xb9\xa9', b'\xe1\xb8\x8d\xcc\x87', b'q\xcc\xa3\xcc\x87',
         b'fi', b'25', b'\xe1\xb9\xa9'],
        # NFD
        [b's\xcc\xa3\xcc\x87', b'd\xcc\xa3\xcc\x87', b'q\xcc\xa3\xcc\x87',
         b'\xef\xac\x81', b'2\xe2\x81\xb5', b'\xc5\xbf\xcc\xa3\xcc\x87'],
        # NFKD
        [b's\xcc\xa3\xcc\x87', b'd\xcc\xa3\xcc\x87', b'q\xcc\xa3\xcc\x87',
         b'fi', b'25', b's\xcc\xa3\xcc\x87']
    ]
    assert normalize(text.utils.NormalizeForm.NFC) == expect_normalize_data[0]
    assert normalize(text.utils.NormalizeForm.NFKC) == expect_normalize_data[1]
    assert normalize(text.utils.NormalizeForm.NFD) == expect_normalize_data[2]
    assert normalize(text.utils.NormalizeForm.NFKD) == expect_normalize_data[3]


def test_regex_replace():
    """
    Test RegexReplace
    """

    def regex_replace(first, last, expect_str, pattern, replace):
        dataset = ds.TextFileDataset(REGEX_REPLACE_FILE, shuffle=False)
        if first > 1:
            dataset = dataset.skip(first - 1)
        if last >= first:
            dataset = dataset.take(last - first + 1)
        replace_op = text.RegexReplace(pattern, replace)
        dataset = dataset.map(operations=replace_op)
        out_text = []
        for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
            token = text.to_str(i['text']).tolist()
            out_text.append(token)
        logger.info("Out: {}".format(out_text))
        logger.info("Exp: {}".format(expect_str))
        assert expect_str == out_text

    regex_replace(1, 2, ['H____ W____', "L__'_ G_"], "\\p{Ll}", '_')
    regex_replace(3, 5, ['hello', 'world', '31:beijing'], "^(\\d:|b:)", "")
    regex_replace(6, 6, ["WelcometoChina!"], "\\s+", "")
    regex_replace(7, 8, ['我不想长大', 'WelcometoShenzhen!'], "\\p{Cc}|\\p{Cf}|\\s+", "")


def test_regex_tokenizer_default():
    """
    Test RegexTokenizer
    """

    def regex_tokenizer(first, last, expect_str, delim_pattern, keep_delim_pattern):
        dataset = ds.TextFileDataset(REGEX_TOKENIZER_FILE, shuffle=False)
        if first > 1:
            dataset = dataset.skip(first - 1)
        if last >= first:
            dataset = dataset.take(last - first + 1)
        tokenizer_op = text.RegexTokenizer(delim_pattern, keep_delim_pattern)
        dataset = dataset.map(operations=tokenizer_op)
        out_text = []
        count = 0
        for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
            token = text.to_str(i['text']).tolist()
            np.testing.assert_array_equal(token, expect_str[count])
            count += 1
            out_text.append(token)
        logger.info("Out: {}".format(out_text))
        logger.info("Exp: {}".format(expect_str))

    regex_tokenizer(1, 1, [['Welcome', 'to', 'Shenzhen!']], "\\s+", "")
    regex_tokenizer(1, 1, [['Welcome', ' ', 'to', ' ', 'Shenzhen!']], "\\s+", "\\s+")
    regex_tokenizer(2, 2, [['北', '京', '欢', '迎', '您', '!Welcome to Beijing!']], r"\p{Han}", r"\p{Han}")
    regex_tokenizer(3, 3, [['12', '¥+', '36', '¥=?']], r"[\p{P}|\p{S}]+", r"[\p{P}|\p{S}]+")
    regex_tokenizer(3, 3, [['12', '36']], r"[\p{P}|\p{S}]+", "")
    regex_tokenizer(3, 3, [['¥+', '¥=?']], r"[\p{N}]+", "")


def test_regex_tokenizer_with_offsets():
    """
    Test RegexTokenizer with with_offsets=True
    """

    def regex_tokenizer(first, last, expect_str, expected_offsets_start, expected_offsets_limit, delim_pattern,
                        keep_delim_pattern):
        dataset = ds.TextFileDataset(REGEX_TOKENIZER_FILE, shuffle=False)
        if first > 1:
            dataset = dataset.skip(first - 1)
        if last >= first:
            dataset = dataset.take(last - first + 1)
        tokenizer_op = text.RegexTokenizer(delim_pattern, keep_delim_pattern, with_offsets=True)
        dataset = dataset.map(operations=tokenizer_op, input_columns=['text'],
                              output_columns=['token', 'offsets_start', 'offsets_limit'],
                              column_order=['token', 'offsets_start', 'offsets_limit'])
        out_text = []
        count = 0
        for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
            token = text.to_str(i['token']).tolist()
            np.testing.assert_array_equal(token, expect_str[count])
            np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
            np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
            count += 1
            out_text.append(token)
logger.info("Out:", out_text) 359 logger.info("Exp:", expect_str) 360 361 regex_tokenizer(1, 1, [['Welcome', 'to', 'Shenzhen!']], [[0, 8, 11]], [[7, 10, 20]], "\\s+", "") 362 regex_tokenizer(1, 1, [['Welcome', ' ', 'to', ' ', 'Shenzhen!']], [[0, 7, 8, 10, 11]], [[7, 8, 10, 11, 20]], 363 "\\s+", "\\s+") 364 regex_tokenizer(2, 2, [['北', '京', '欢', '迎', '您', '!Welcome to Beijing!']], [[0, 3, 6, 9, 12, 15]], 365 [[3, 6, 9, 12, 15, 35]], r"\p{Han}", r"\p{Han}") 366 regex_tokenizer(3, 3, [['12', '¥+', '36', '¥=?']], [[0, 2, 6, 8]], [[2, 6, 8, 13]], 367 r"[\p{P}|\p{S}]+", r"[\p{P}|\p{S}]+") 368 regex_tokenizer(3, 3, [['12', '36']], [[0, 6]], [[2, 8]], r"[\p{P}|\p{S}]+", "") 369 regex_tokenizer(3, 3, [['¥+', '¥=?']], [[2, 8]], [[6, 13]], r"[\p{N}]+", "") 370 371 372if __name__ == '__main__': 373 test_unicode_char_tokenizer_default() 374 test_unicode_char_tokenizer_with_offsets() 375 test_whitespace_tokenizer_default() 376 test_whitespace_tokenizer_with_offsets() 377 test_unicode_script_tokenizer_default() 378 test_unicode_script_tokenizer_default2() 379 test_unicode_script_tokenizer_with_offsets() 380 test_unicode_script_tokenizer_with_offsets2() 381 test_case_fold() 382 test_normalize_utf8() 383 test_regex_replace() 384 test_regex_tokenizer_default() 385 test_regex_tokenizer_with_offsets() 386