# Copyright 2020-2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import numpy as np
import pytest
import mindspore.dataset as ds
from mindspore.dataset.text import JiebaTokenizer
from mindspore.dataset.text import JiebaMode, to_str
from mindspore import log as logger

DATA_FILE = "../data/dataset/testJiebaDataset/3.txt"
DATA_ALL_FILE = "../data/dataset/testJiebaDataset/*"

HMM_FILE = "../data/dataset/jiebadict/hmm_model.utf8"
MP_FILE = "../data/dataset/jiebadict/jieba.dict.utf8"


def test_jieba_callable():
    """
    Test jieba tokenizer op is callable
    """
    logger.info("test_jieba_callable")
    jieba_op1 = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
    jieba_op2 = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.HMM)

    # test one tensor
    text1 = "今天天气太好了我们一起去外面玩吧"
    text2 = "男默女泪市长江大桥"
    assert np.array_equal(jieba_op1(text1), ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧'])
    assert np.array_equal(jieba_op2(text1), ['今天', '天气', '太', '好', '了', '我们', '一起', '去', '外面', '玩', '吧'])
    jieba_op1.add_word("男默女泪")
    assert np.array_equal(jieba_op1(text2), ['男默女泪', '市', '长江大桥'])

    # test input multiple tensors
    with pytest.raises(RuntimeError) as info:
        _ = jieba_op1(text1, text2)
    assert "JiebaTokenizerOp: input should be one column data." in str(info.value)
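

# A note on modes (an explanatory aside, based on jieba's documented behavior):
# JiebaMode.MP segments with the max-probability dictionary algorithm,
# JiebaMode.HMM uses a hidden Markov model that can also split words missing
# from the dictionary, and JiebaMode.MIX combines the two. The three tests
# below run the same sentence through each mode.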


def test_jieba_1():
    """Test jieba tokenizer with MP mode"""
    data = ds.TextFileDataset(DATA_FILE)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    num_parallel_workers=1)
    expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧']
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for index, item in enumerate(ret):
            assert item == expect[index]


def test_jieba_1_1():
    """Test jieba tokenizer with HMM mode"""
    data = ds.TextFileDataset(DATA_FILE)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.HMM)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    num_parallel_workers=1)
    expect = ['今天', '天气', '太', '好', '了', '我们', '一起', '去', '外面', '玩', '吧']
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for index, item in enumerate(ret):
            assert item == expect[index]


def test_jieba_1_2():
    """Test jieba tokenizer with MIX mode"""
    data = ds.TextFileDataset(DATA_FILE)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MIX)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    num_parallel_workers=1)
    expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧']
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for index, item in enumerate(ret):
            assert item == expect[index]


def test_jieba_2():
    """Test add_word"""
    DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
    data = ds.TextFileDataset(DATA_FILE4)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
    jieba_op.add_word("男默女泪")
    expect = ['男默女泪', '市', '长江大桥']
    data = data.map(operations=jieba_op, input_columns=["text"],
                    num_parallel_workers=2)
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for index, item in enumerate(ret):
            assert item == expect[index]


def test_jieba_2_1():
    """Test add_word with freq"""
    DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
    data = ds.TextFileDataset(DATA_FILE4)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
    jieba_op.add_word("男默女泪", 10)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    num_parallel_workers=2)
    expect = ['男默女泪', '市', '长江大桥']
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for index, item in enumerate(ret):
            assert item == expect[index]


def test_jieba_2_2():
    """Test add_word with invalid None input"""
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
    # add_word must reject None; pytest.raises makes the test fail if no
    # exception is raised (the old try/except silently passed in that case).
    with pytest.raises(ValueError):
        jieba_op.add_word(None)


def test_jieba_2_3():
    """Test add_word with freq, the value of freq affects the result of segmentation"""
    DATA_FILE4 = "../data/dataset/testJiebaDataset/6.txt"
    data = ds.TextFileDataset(DATA_FILE4)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
    jieba_op.add_word("江大桥", 20000)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    num_parallel_workers=2)
    expect = ['江州', '市长', '江大桥', '参加', '了', '长江大桥', '的', '通车', '仪式']
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for index, item in enumerate(ret):
            assert item == expect[index]
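

# add_dict accepts either a Python dict mapping word -> frequency (tests 3
# and 3_1 below) or a path to a user-dictionary file (test 4).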


def test_jieba_3():
    """Test add_dict with dict"""
    DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
    user_dict = {
        "男默女泪": 10
    }
    data = ds.TextFileDataset(DATA_FILE4)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
    jieba_op.add_dict(user_dict)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    num_parallel_workers=1)
    expect = ['男默女泪', '市', '长江大桥']
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for index, item in enumerate(ret):
            assert item == expect[index]


def test_jieba_3_1():
    """Test add_dict with dict of multiple words"""
    DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
    user_dict = {
        "男默女泪": 10,
        "江大桥": 20000
    }
    data = ds.TextFileDataset(DATA_FILE4)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
    jieba_op.add_dict(user_dict)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    num_parallel_workers=1)
    expect = ['男默女泪', '市长', '江大桥']
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for index, item in enumerate(ret):
            assert item == expect[index]


def test_jieba_4():
    """Test add_dict with user dict file path"""
    DATA_FILE4 = "../data/dataset/testJiebaDataset/3.txt"
    DICT_FILE = "../data/dataset/testJiebaDataset/user_dict.txt"

    data = ds.TextFileDataset(DATA_FILE4)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
    jieba_op.add_dict(DICT_FILE)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    num_parallel_workers=1)
    expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧']
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for index, item in enumerate(ret):
            assert item == expect[index]


def test_jieba_4_1():
    """Test add_dict with invalid file path"""
    DICT_FILE = ""
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
    # An empty path is invalid; the test should fail if no error is raised.
    with pytest.raises(ValueError):
        jieba_op.add_dict(DICT_FILE)


def test_jieba_5():
    """Test add_word with freq, single parallel worker"""
    DATA_FILE4 = "../data/dataset/testJiebaDataset/6.txt"

    data = ds.TextFileDataset(DATA_FILE4)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
    jieba_op.add_word("江大桥", 20000)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    num_parallel_workers=1)
    expect = ['江州', '市长', '江大桥', '参加', '了', '长江大桥', '的', '通车', '仪式']
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for index, item in enumerate(ret):
            assert item == expect[index]
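

# The with_offsets=True variants below emit three columns: the tokens plus
# each token's start and (exclusive) end offset, measured in bytes of the
# UTF-8 input; each CJK character occupies 3 bytes, so '今天天气' spans
# bytes [0, 12).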


def test_jieba_with_offsets_1():
    """Test jieba tokenizer with MP mode"""
    data = ds.TextFileDataset(DATA_FILE)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    output_columns=["token", "offsets_start", "offsets_limit"],
                    column_order=["token", "offsets_start", "offsets_limit"],
                    num_parallel_workers=1)
    expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧']
    expected_offsets_start = [0, 12, 21, 27, 33, 36, 42]
    expected_offsets_limit = [12, 21, 27, 33, 36, 42, 48]
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["token"])
        for index, item in enumerate(ret):
            assert item == expect[index]
        for index, item in enumerate(i["offsets_start"]):
            assert item == expected_offsets_start[index]
        for index, item in enumerate(i["offsets_limit"]):
            assert item == expected_offsets_limit[index]


def test_jieba_with_offsets_1_1():
    """Test jieba tokenizer with HMM mode"""
    data = ds.TextFileDataset(DATA_FILE)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.HMM, with_offsets=True)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    output_columns=["token", "offsets_start", "offsets_limit"],
                    column_order=["token", "offsets_start", "offsets_limit"],
                    num_parallel_workers=1)
    expect = ['今天', '天气', '太', '好', '了', '我们', '一起', '去', '外面', '玩', '吧']
    expected_offsets_start = [0, 6, 12, 15, 18, 21, 27, 33, 36, 42, 45]
    expected_offsets_limit = [6, 12, 15, 18, 21, 27, 33, 36, 42, 45, 48]
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["token"])
        for index, item in enumerate(ret):
            assert item == expect[index]
        for index, item in enumerate(i["offsets_start"]):
            assert item == expected_offsets_start[index]
        for index, item in enumerate(i["offsets_limit"]):
            assert item == expected_offsets_limit[index]


def test_jieba_with_offsets_1_2():
    """Test jieba tokenizer with MIX mode"""
    data = ds.TextFileDataset(DATA_FILE)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MIX, with_offsets=True)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    output_columns=["token", "offsets_start", "offsets_limit"],
                    column_order=["token", "offsets_start", "offsets_limit"],
                    num_parallel_workers=1)
    expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧']
    expected_offsets_start = [0, 12, 21, 27, 33, 36, 42]
    expected_offsets_limit = [12, 21, 27, 33, 36, 42, 48]
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["token"])
        for index, item in enumerate(ret):
            assert item == expect[index]
        for index, item in enumerate(i["offsets_start"]):
            assert item == expected_offsets_start[index]
        for index, item in enumerate(i["offsets_limit"]):
            assert item == expected_offsets_limit[index]


def test_jieba_with_offsets_2():
    """Test add_word"""
    DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
    data = ds.TextFileDataset(DATA_FILE4)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
    jieba_op.add_word("男默女泪")
    expect = ['男默女泪', '市', '长江大桥']
    data = data.map(operations=jieba_op, input_columns=["text"],
                    output_columns=["token", "offsets_start", "offsets_limit"],
                    column_order=["token", "offsets_start", "offsets_limit"],
                    num_parallel_workers=2)
    expected_offsets_start = [0, 12, 15]
    expected_offsets_limit = [12, 15, 27]
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["token"])
        for index, item in enumerate(ret):
            assert item == expect[index]
        for index, item in enumerate(i["offsets_start"]):
            assert item == expected_offsets_start[index]
        for index, item in enumerate(i["offsets_limit"]):
            assert item == expected_offsets_limit[index]
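

# The next two tests pass an explicit frequency to add_word: with freq=10 for
# '男默女泪' the expected segmentation matches the default, while test 2_2
# uses a frequency (20000 for '江大桥') large enough to change how the
# ambiguous span '市长江大桥' is segmented.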


def test_jieba_with_offsets_2_1():
    """Test add_word with freq"""
    DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
    data = ds.TextFileDataset(DATA_FILE4)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
    jieba_op.add_word("男默女泪", 10)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    output_columns=["token", "offsets_start", "offsets_limit"],
                    column_order=["token", "offsets_start", "offsets_limit"],
                    num_parallel_workers=2)
    expect = ['男默女泪', '市', '长江大桥']
    expected_offsets_start = [0, 12, 15]
    expected_offsets_limit = [12, 15, 27]
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["token"])
        for index, item in enumerate(ret):
            assert item == expect[index]
        for index, item in enumerate(i["offsets_start"]):
            assert item == expected_offsets_start[index]
        for index, item in enumerate(i["offsets_limit"]):
            assert item == expected_offsets_limit[index]


def test_jieba_with_offsets_2_2():
    """Test add_word with freq, the value of freq affects the result of segmentation"""
    DATA_FILE4 = "../data/dataset/testJiebaDataset/6.txt"
    data = ds.TextFileDataset(DATA_FILE4)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
    jieba_op.add_word("江大桥", 20000)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    output_columns=["token", "offsets_start", "offsets_limit"],
                    column_order=["token", "offsets_start", "offsets_limit"],
                    num_parallel_workers=2)
    expect = ['江州', '市长', '江大桥', '参加', '了', '长江大桥', '的', '通车', '仪式']
    expected_offsets_start = [0, 6, 12, 21, 27, 30, 42, 45, 51]
    expected_offsets_limit = [6, 12, 21, 27, 30, 42, 45, 51, 57]
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["token"])
        for index, item in enumerate(ret):
            assert item == expect[index]
        for index, item in enumerate(i["offsets_start"]):
            assert item == expected_offsets_start[index]
        for index, item in enumerate(i["offsets_limit"]):
            assert item == expected_offsets_limit[index]


def test_jieba_with_offsets_3():
    """Test add_dict with dict"""
    DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
    user_dict = {
        "男默女泪": 10
    }
    data = ds.TextFileDataset(DATA_FILE4)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
    jieba_op.add_dict(user_dict)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    output_columns=["token", "offsets_start", "offsets_limit"],
                    column_order=["token", "offsets_start", "offsets_limit"],
                    num_parallel_workers=1)
    expect = ['男默女泪', '市', '长江大桥']
    expected_offsets_start = [0, 12, 15]
    expected_offsets_limit = [12, 15, 27]
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["token"])
        for index, item in enumerate(ret):
            assert item == expect[index]
        for index, item in enumerate(i["offsets_start"]):
            assert item == expected_offsets_start[index]
        for index, item in enumerate(i["offsets_limit"]):
            assert item == expected_offsets_limit[index]


def test_jieba_with_offsets_3_1():
    """Test add_dict with dict of multiple words"""
    DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
    user_dict = {
        "男默女泪": 10,
        "江大桥": 20000
    }
    data = ds.TextFileDataset(DATA_FILE4)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
    jieba_op.add_dict(user_dict)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    output_columns=["token", "offsets_start", "offsets_limit"],
                    column_order=["token", "offsets_start", "offsets_limit"],
                    num_parallel_workers=1)
    expect = ['男默女泪', '市长', '江大桥']
    expected_offsets_start = [0, 12, 18]
    expected_offsets_limit = [12, 18, 27]
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["token"])
        for index, item in enumerate(ret):
            assert item == expect[index]
        for index, item in enumerate(i["offsets_start"]):
            assert item == expected_offsets_start[index]
        for index, item in enumerate(i["offsets_limit"]):
            assert item == expected_offsets_limit[index]
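

# add_dict also accepts a path to a user-dictionary file on disk;
# test_jieba_with_offsets_4 below loads user_dict.txt instead of passing a
# Python dict.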


def test_jieba_with_offsets_4():
    """Test add_dict with user dict file path"""
    DATA_FILE4 = "../data/dataset/testJiebaDataset/3.txt"
    DICT_FILE = "../data/dataset/testJiebaDataset/user_dict.txt"

    data = ds.TextFileDataset(DATA_FILE4)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
    jieba_op.add_dict(DICT_FILE)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    output_columns=["token", "offsets_start", "offsets_limit"],
                    column_order=["token", "offsets_start", "offsets_limit"],
                    num_parallel_workers=1)
    expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧']
    expected_offsets_start = [0, 12, 21, 27, 33, 36, 42]
    expected_offsets_limit = [12, 21, 27, 33, 36, 42, 48]
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["token"])
        for index, item in enumerate(ret):
            assert item == expect[index]
        for index, item in enumerate(i["offsets_start"]):
            assert item == expected_offsets_start[index]
        for index, item in enumerate(i["offsets_limit"]):
            assert item == expected_offsets_limit[index]


def test_jieba_with_offsets_5():
    """Test add_word with freq, single parallel worker"""
    DATA_FILE4 = "../data/dataset/testJiebaDataset/6.txt"

    data = ds.TextFileDataset(DATA_FILE4)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
    jieba_op.add_word("江大桥", 20000)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    output_columns=["token", "offsets_start", "offsets_limit"],
                    column_order=["token", "offsets_start", "offsets_limit"],
                    num_parallel_workers=1)
    expect = ['江州', '市长', '江大桥', '参加', '了', '长江大桥', '的', '通车', '仪式']
    expected_offsets_start = [0, 6, 12, 21, 27, 30, 42, 45, 51]
    expected_offsets_limit = [6, 12, 21, 27, 30, 42, 45, 51, 57]
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["token"])
        for index, item in enumerate(ret):
            assert item == expect[index]
        for index, item in enumerate(i["offsets_start"]):
            assert item == expected_offsets_start[index]
        for index, item in enumerate(i["offsets_limit"]):
            assert item == expected_offsets_limit[index]
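

# map() also accepts a plain Python callable as the tokenize operation;
# pytoken_op below splits the generated sentence into fixed-size character
# chunks (5 + 5 + remainder) to exercise that path.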


def gen():
    text = np.array("今天天气太好了我们一起去外面玩吧".encode("UTF8"), dtype='S')
    yield (text,)


def pytoken_op(input_data):
    te = str(to_str(input_data))
    tokens = []
    tokens.append(te[:5].encode("UTF8"))
    tokens.append(te[5:10].encode("UTF8"))
    tokens.append(te[10:].encode("UTF8"))
    return np.array(tokens, dtype='S')


def test_jieba_6():
    """Test a Python callable tokenizer with GeneratorDataset"""
    data = ds.GeneratorDataset(gen, column_names=["text"])
    data = data.map(operations=pytoken_op, input_columns=["text"],
                    num_parallel_workers=1)
    expect = ['今天天气太', '好了我们一', '起去外面玩吧']
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for index, item in enumerate(ret):
            assert item == expect[index]


if __name__ == "__main__":
    test_jieba_callable()
    test_jieba_1()
    test_jieba_1_1()
    test_jieba_1_2()
    test_jieba_2()
    test_jieba_2_1()
    test_jieba_2_2()
    test_jieba_2_3()
    test_jieba_3()
    test_jieba_3_1()
    test_jieba_4()
    test_jieba_4_1()
    test_jieba_5()
    test_jieba_6()
    test_jieba_with_offsets_1()
    test_jieba_with_offsets_1_1()
    test_jieba_with_offsets_1_2()
    test_jieba_with_offsets_2()
    test_jieba_with_offsets_2_1()
    test_jieba_with_offsets_2_2()
    test_jieba_with_offsets_3()
    test_jieba_with_offsets_3_1()
    test_jieba_with_offsets_4()
    test_jieba_with_offsets_5()