# Copyright 2020-2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import numpy as np
import pytest
import mindspore.dataset as ds
from mindspore.dataset.text import JiebaTokenizer
from mindspore.dataset.text import JiebaMode, to_str
from mindspore import log as logger

DATA_FILE = "../data/dataset/testJiebaDataset/3.txt"
DATA_ALL_FILE = "../data/dataset/testJiebaDataset/*"

HMM_FILE = "../data/dataset/jiebadict/hmm_model.utf8"
MP_FILE = "../data/dataset/jiebadict/jieba.dict.utf8"
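# JiebaTokenizer loads both dictionary files up front: MP_FILE backs the
# max-probability (MP) segmenter, HMM_FILE backs the hidden-Markov-model (HMM)
# segmenter, and JiebaMode.MIX combines the two.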


def test_jieba_callable():
    """Test jieba tokenizer op is callable"""
    logger.info("test_jieba_callable")
    jieba_op1 = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
    jieba_op2 = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.HMM)

    # test one tensor
    text1 = "今天天气太好了我们一起去外面玩吧"
    text2 = "男默女泪市长江大桥"
    assert np.array_equal(jieba_op1(text1), ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧'])
    assert np.array_equal(jieba_op2(text1), ['今天', '天气', '太', '好', '了', '我们', '一起', '去', '外面', '玩', '吧'])
    jieba_op1.add_word("男默女泪")
    assert np.array_equal(jieba_op1(text2), ['男默女泪', '市', '长江大桥'])

    # test input multiple tensors
    with pytest.raises(RuntimeError) as info:
        _ = jieba_op1(text1, text2)
    assert "JiebaTokenizerOp: input should be one column data." in str(info.value)


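# The pipeline tests below share one pattern: read a one-line text file with
# TextFileDataset, map JiebaTokenizer over the "text" column, then compare the
# decoded tokens against a hard-coded expectation. A helper capturing that
# pattern would look like this sketch (the tests keep their explicit loops so
# each case stays self-contained):
def _check_tokens(dataset, expect, column="text"):
    """Sketch: assert that `column` of every row in `dataset` equals `expect`."""
    for row in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        for index, item in enumerate(to_str(row[column])):
            assert item == expect[index]

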
def test_jieba_1():
    """Test jieba tokenizer with MP mode"""
    data = ds.TextFileDataset(DATA_FILE)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    num_parallel_workers=1)
    expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧']
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for index, item in enumerate(ret):
            assert item == expect[index]


def test_jieba_1_1():
    """Test jieba tokenizer with HMM mode"""
    data = ds.TextFileDataset(DATA_FILE)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.HMM)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    num_parallel_workers=1)
    expect = ['今天', '天气', '太', '好', '了', '我们', '一起', '去', '外面', '玩', '吧']
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for index, item in enumerate(ret):
            assert item == expect[index]


def test_jieba_1_2():
    """Test jieba tokenizer with MIX mode (MP plus HMM)"""
    data = ds.TextFileDataset(DATA_FILE)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MIX)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    num_parallel_workers=1)
    expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧']
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for index, item in enumerate(ret):
            assert item == expect[index]


def test_jieba_2():
    """Test add_word"""
    DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
    data = ds.TextFileDataset(DATA_FILE4)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
    jieba_op.add_word("男默女泪")
    expect = ['男默女泪', '市', '长江大桥']
    data = data.map(operations=jieba_op, input_columns=["text"],
                    num_parallel_workers=2)
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for index, item in enumerate(ret):
            assert item == expect[index]


def test_jieba_2_1():
    """Test add_word with freq"""
    DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
    data = ds.TextFileDataset(DATA_FILE4)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
    jieba_op.add_word("男默女泪", 10)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    num_parallel_workers=2)
    expect = ['男默女泪', '市', '长江大桥']
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for index, item in enumerate(ret):
            assert item == expect[index]


def test_jieba_2_2():
    """Test add_word with invalid None input"""
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
    with pytest.raises(ValueError):
        jieba_op.add_word(None)


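# "江州市长江大桥..." is the classic segmentation-ambiguity example: MP mode
# splits it as '市' + '长江大桥' by default (see test_jieba_2 above), while
# boosting '江大桥' to a high freq flips the split to '市长' + '江大桥'.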
def test_jieba_2_3():
    """Test that the freq passed to add_word changes the segmentation result"""
    DATA_FILE6 = "../data/dataset/testJiebaDataset/6.txt"
    data = ds.TextFileDataset(DATA_FILE6)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
    jieba_op.add_word("江大桥", 20000)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    num_parallel_workers=2)
    expect = ['江州', '市长', '江大桥', '参加', '了', '长江大桥', '的', '通车', '仪式']
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for index, item in enumerate(ret):
            assert item == expect[index]


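# add_dict accepts either a {word: freq} dict (test_jieba_3*) or the path to a
# user-dict text file (test_jieba_4), so a custom vocabulary can be supplied
# inline or shipped as a file.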
def test_jieba_3():
    """Test add_dict with a dict"""
    DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
    user_dict = {
        "男默女泪": 10
    }
    data = ds.TextFileDataset(DATA_FILE4)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
    jieba_op.add_dict(user_dict)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    num_parallel_workers=1)
    expect = ['男默女泪', '市', '长江大桥']
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for index, item in enumerate(ret):
            assert item == expect[index]


def test_jieba_3_1():
    """Test add_dict with a dict containing two words"""
    DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
    user_dict = {
        "男默女泪": 10,
        "江大桥": 20000
    }
    data = ds.TextFileDataset(DATA_FILE4)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
    jieba_op.add_dict(user_dict)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    num_parallel_workers=1)
    expect = ['男默女泪', '市长', '江大桥']
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for index, item in enumerate(ret):
            assert item == expect[index]


def test_jieba_4():
    """Test add_dict with a user-dict file path"""
    DATA_FILE3 = "../data/dataset/testJiebaDataset/3.txt"
    DICT_FILE = "../data/dataset/testJiebaDataset/user_dict.txt"

    data = ds.TextFileDataset(DATA_FILE3)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
    jieba_op.add_dict(DICT_FILE)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    num_parallel_workers=1)
    expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧']
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for index, item in enumerate(ret):
            assert item == expect[index]


def test_jieba_4_1():
    """Test add_dict with an invalid (empty) file path"""
    DICT_FILE = ""
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
    with pytest.raises(ValueError):
        jieba_op.add_dict(DICT_FILE)


def test_jieba_5():
    """Test add_word with freq inside a dataset pipeline"""
    DATA_FILE6 = "../data/dataset/testJiebaDataset/6.txt"

    data = ds.TextFileDataset(DATA_FILE6)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
    jieba_op.add_word("江大桥", 20000)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    num_parallel_workers=1)
    expect = ['江州', '市长', '江大桥', '参加', '了', '长江大桥', '的', '通车', '仪式']
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for index, item in enumerate(ret):
            assert item == expect[index]


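# The with_offsets variants emit three columns per row: token, offsets_start
# and offsets_limit. Offsets count bytes of the UTF-8 input, so each CJK
# character contributes 3 bytes; e.g. '今天天气' (4 characters) covers bytes
# [0, 12).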
def test_jieba_with_offsets_1():
    """Test jieba tokenizer with MP mode and offsets"""
    data = ds.TextFileDataset(DATA_FILE)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    output_columns=["token", "offsets_start", "offsets_limit"],
                    column_order=["token", "offsets_start", "offsets_limit"],
                    num_parallel_workers=1)
    expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧']
    expected_offsets_start = [0, 12, 21, 27, 33, 36, 42]
    expected_offsets_limit = [12, 21, 27, 33, 36, 42, 48]
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["token"])
        for index, item in enumerate(ret):
            assert item == expect[index]
        for index, item in enumerate(i["offsets_start"]):
            assert item == expected_offsets_start[index]
        for index, item in enumerate(i["offsets_limit"]):
            assert item == expected_offsets_limit[index]


def test_jieba_with_offsets_1_1():
    """Test jieba tokenizer with HMM mode and offsets"""
    data = ds.TextFileDataset(DATA_FILE)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.HMM, with_offsets=True)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    output_columns=["token", "offsets_start", "offsets_limit"],
                    column_order=["token", "offsets_start", "offsets_limit"],
                    num_parallel_workers=1)
    expect = ['今天', '天气', '太', '好', '了', '我们', '一起', '去', '外面', '玩', '吧']
    expected_offsets_start = [0, 6, 12, 15, 18, 21, 27, 33, 36, 42, 45]
    expected_offsets_limit = [6, 12, 15, 18, 21, 27, 33, 36, 42, 45, 48]
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["token"])
        for index, item in enumerate(ret):
            assert item == expect[index]
        for index, item in enumerate(i["offsets_start"]):
            assert item == expected_offsets_start[index]
        for index, item in enumerate(i["offsets_limit"]):
            assert item == expected_offsets_limit[index]


def test_jieba_with_offsets_1_2():
    """Test jieba tokenizer with MIX mode and offsets"""
    data = ds.TextFileDataset(DATA_FILE)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MIX, with_offsets=True)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    output_columns=["token", "offsets_start", "offsets_limit"],
                    column_order=["token", "offsets_start", "offsets_limit"],
                    num_parallel_workers=1)
    expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧']
    expected_offsets_start = [0, 12, 21, 27, 33, 36, 42]
    expected_offsets_limit = [12, 21, 27, 33, 36, 42, 48]
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["token"])
        for index, item in enumerate(ret):
            assert item == expect[index]
        for index, item in enumerate(i["offsets_start"]):
            assert item == expected_offsets_start[index]
        for index, item in enumerate(i["offsets_limit"]):
            assert item == expected_offsets_limit[index]


def test_jieba_with_offsets_2():
    """Test add_word with offsets"""
    DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
    data = ds.TextFileDataset(DATA_FILE4)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
    jieba_op.add_word("男默女泪")
    expect = ['男默女泪', '市', '长江大桥']
    data = data.map(operations=jieba_op, input_columns=["text"],
                    output_columns=["token", "offsets_start", "offsets_limit"],
                    column_order=["token", "offsets_start", "offsets_limit"],
                    num_parallel_workers=2)
    expected_offsets_start = [0, 12, 15]
    expected_offsets_limit = [12, 15, 27]
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["token"])
        for index, item in enumerate(ret):
            assert item == expect[index]
        for index, item in enumerate(i["offsets_start"]):
            assert item == expected_offsets_start[index]
        for index, item in enumerate(i["offsets_limit"]):
            assert item == expected_offsets_limit[index]


def test_jieba_with_offsets_2_1():
    """Test add_word with freq and offsets"""
    DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
    data = ds.TextFileDataset(DATA_FILE4)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
    jieba_op.add_word("男默女泪", 10)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    output_columns=["token", "offsets_start", "offsets_limit"],
                    column_order=["token", "offsets_start", "offsets_limit"],
                    num_parallel_workers=2)
    expect = ['男默女泪', '市', '长江大桥']
    expected_offsets_start = [0, 12, 15]
    expected_offsets_limit = [12, 15, 27]
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["token"])
        for index, item in enumerate(ret):
            assert item == expect[index]
        for index, item in enumerate(i["offsets_start"]):
            assert item == expected_offsets_start[index]
        for index, item in enumerate(i["offsets_limit"]):
            assert item == expected_offsets_limit[index]


def test_jieba_with_offsets_2_2():
    """Test that the freq passed to add_word changes the segmentation, with offsets"""
    DATA_FILE6 = "../data/dataset/testJiebaDataset/6.txt"
    data = ds.TextFileDataset(DATA_FILE6)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
    jieba_op.add_word("江大桥", 20000)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    output_columns=["token", "offsets_start", "offsets_limit"],
                    column_order=["token", "offsets_start", "offsets_limit"],
                    num_parallel_workers=2)
    expect = ['江州', '市长', '江大桥', '参加', '了', '长江大桥', '的', '通车', '仪式']
    expected_offsets_start = [0, 6, 12, 21, 27, 30, 42, 45, 51]
    expected_offsets_limit = [6, 12, 21, 27, 30, 42, 45, 51, 57]
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["token"])
        for index, item in enumerate(ret):
            assert item == expect[index]
        for index, item in enumerate(i["offsets_start"]):
            assert item == expected_offsets_start[index]
        for index, item in enumerate(i["offsets_limit"]):
            assert item == expected_offsets_limit[index]


def test_jieba_with_offsets_3():
    """Test add_dict with a dict, with offsets"""
    DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
    user_dict = {
        "男默女泪": 10
    }
    data = ds.TextFileDataset(DATA_FILE4)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
    jieba_op.add_dict(user_dict)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    output_columns=["token", "offsets_start", "offsets_limit"],
                    column_order=["token", "offsets_start", "offsets_limit"],
                    num_parallel_workers=1)
    expect = ['男默女泪', '市', '长江大桥']
    expected_offsets_start = [0, 12, 15]
    expected_offsets_limit = [12, 15, 27]
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["token"])
        for index, item in enumerate(ret):
            assert item == expect[index]
        for index, item in enumerate(i["offsets_start"]):
            assert item == expected_offsets_start[index]
        for index, item in enumerate(i["offsets_limit"]):
            assert item == expected_offsets_limit[index]


def test_jieba_with_offsets_3_1():
    """Test add_dict with a dict containing two words, with offsets"""
    DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
    user_dict = {
        "男默女泪": 10,
        "江大桥": 20000
    }
    data = ds.TextFileDataset(DATA_FILE4)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
    jieba_op.add_dict(user_dict)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    output_columns=["token", "offsets_start", "offsets_limit"],
                    column_order=["token", "offsets_start", "offsets_limit"],
                    num_parallel_workers=1)
    expect = ['男默女泪', '市长', '江大桥']
    expected_offsets_start = [0, 12, 18]
    expected_offsets_limit = [12, 18, 27]
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["token"])
        for index, item in enumerate(ret):
            assert item == expect[index]
        for index, item in enumerate(i["offsets_start"]):
            assert item == expected_offsets_start[index]
        for index, item in enumerate(i["offsets_limit"]):
            assert item == expected_offsets_limit[index]


def test_jieba_with_offsets_4():
    """Test add_dict with a user-dict file path, with offsets"""
    DATA_FILE3 = "../data/dataset/testJiebaDataset/3.txt"
    DICT_FILE = "../data/dataset/testJiebaDataset/user_dict.txt"

    data = ds.TextFileDataset(DATA_FILE3)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
    jieba_op.add_dict(DICT_FILE)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    output_columns=["token", "offsets_start", "offsets_limit"],
                    column_order=["token", "offsets_start", "offsets_limit"],
                    num_parallel_workers=1)
    expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧']
    expected_offsets_start = [0, 12, 21, 27, 33, 36, 42]
    expected_offsets_limit = [12, 21, 27, 33, 36, 42, 48]
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["token"])
        for index, item in enumerate(ret):
            assert item == expect[index]
        for index, item in enumerate(i["offsets_start"]):
            assert item == expected_offsets_start[index]
        for index, item in enumerate(i["offsets_limit"]):
            assert item == expected_offsets_limit[index]


def test_jieba_with_offsets_5():
    """Test add_word with freq inside a dataset pipeline, with offsets"""
    DATA_FILE6 = "../data/dataset/testJiebaDataset/6.txt"

    data = ds.TextFileDataset(DATA_FILE6)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
    jieba_op.add_word("江大桥", 20000)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    output_columns=["token", "offsets_start", "offsets_limit"],
                    column_order=["token", "offsets_start", "offsets_limit"],
                    num_parallel_workers=1)
    expect = ['江州', '市长', '江大桥', '参加', '了', '长江大桥', '的', '通车', '仪式']
    expected_offsets_start = [0, 6, 12, 21, 27, 30, 42, 45, 51]
    expected_offsets_limit = [6, 12, 21, 27, 30, 42, 45, 51, 57]
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["token"])
        for index, item in enumerate(ret):
            assert item == expect[index]
        for index, item in enumerate(i["offsets_start"]):
            assert item == expected_offsets_start[index]
        for index, item in enumerate(i["offsets_limit"]):
            assert item == expected_offsets_limit[index]


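# map() also accepts a plain Python callable in place of a built-in op, as long
# as it returns numpy arrays. pytoken_op below slices the generated sentence
# into three chunks (5 + 5 + 6 characters) to exercise that path.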
def gen():
    """Yield one UTF-8 sentence as a bytes numpy array."""
    text = np.array("今天天气太好了我们一起去外面玩吧".encode("UTF8"), dtype='S')
    yield (text,)


def pytoken_op(input_data):
    """Naive Python 'tokenizer': split the decoded string into fixed slices."""
    te = str(to_str(input_data))
    tokens = []
    tokens.append(te[:5].encode("UTF8"))
    tokens.append(te[5:10].encode("UTF8"))
    tokens.append(te[10:].encode("UTF8"))
    return np.array(tokens, dtype='S')


def test_jieba_6():
    """Test tokenizing with a user-defined Python callable instead of a built-in op"""
    data = ds.GeneratorDataset(gen, column_names=["text"])
    data = data.map(operations=pytoken_op, input_columns=["text"],
                    num_parallel_workers=1)
    expect = ['今天天气太', '好了我们一', '起去外面玩吧']
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for index, item in enumerate(ret):
            assert item == expect[index]


if __name__ == "__main__":
    test_jieba_callable()
    test_jieba_1()
    test_jieba_1_1()
    test_jieba_1_2()
    test_jieba_2()
    test_jieba_2_1()
    test_jieba_2_2()
    test_jieba_2_3()
    test_jieba_3()
    test_jieba_3_1()
    test_jieba_4()
    test_jieba_4_1()
    test_jieba_5()
    test_jieba_6()
    test_jieba_with_offsets_1()
    test_jieba_with_offsets_1_1()
    test_jieba_with_offsets_1_2()
    test_jieba_with_offsets_2()
    test_jieba_with_offsets_2_1()
    test_jieba_with_offsets_2_2()
    test_jieba_with_offsets_3()
    test_jieba_with_offsets_3_1()
    test_jieba_with_offsets_4()
    test_jieba_with_offsets_5()