# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Testing tokenizer ops (UnicodeCharTokenizer, WhitespaceTokenizer, UnicodeScriptTokenizer,
CaseFold, NormalizeUTF8, RegexReplace and RegexTokenizer) in DE
"""
import numpy as np
import mindspore.dataset as ds
from mindspore import log as logger
import mindspore.dataset.text as text

DATA_FILE = "../data/dataset/testTokenizerData/1.txt"
NORMALIZE_FILE = "../data/dataset/testTokenizerData/normalize.txt"
REGEX_REPLACE_FILE = "../data/dataset/testTokenizerData/regex_replace.txt"
REGEX_TOKENIZER_FILE = "../data/dataset/testTokenizerData/regex_tokenizer.txt"


def split_by_unicode_char(input_strs):
    """
    Split UTF-8 strings into unicode characters
    """
    out = []
    for s in input_strs:
        out.append([c for c in s])
    return out


def test_unicode_char_tokenizer_default():
    """
    Test UnicodeCharTokenizer
    """
    input_strs = ("Welcome to Beijing!", "北京欢迎您!", "我喜欢English!", "  ")
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.UnicodeCharTokenizer()
    dataset = dataset.map(operations=tokenizer)
    tokens = []
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        token = text.to_str(i['text']).tolist()
        tokens.append(token)
    logger.info("The output tokens are: {}".format(tokens))
    assert split_by_unicode_char(input_strs) == tokens


def test_unicode_char_tokenizer_with_offsets():
    """
    Test UnicodeCharTokenizer with with_offsets=True
    """
    input_strs = ("Welcome to Beijing!", "北京欢迎您!", "我喜欢English!", "  ")
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.UnicodeCharTokenizer(with_offsets=True)
    dataset = dataset.map(operations=tokenizer, input_columns=['text'],
                          output_columns=['token', 'offsets_start', 'offsets_limit'],
                          column_order=['token', 'offsets_start', 'offsets_limit'])
    tokens = []
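    # Offsets are byte positions in the UTF-8 encoded line: ASCII characters advance
    # by 1 byte, while each CJK character below occupies 3 bytes (hence 0, 3, 6, ...).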
    expected_offsets_start = [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18],
                              [0, 3, 6, 9, 12, 15], [0, 3, 6, 9, 10, 11, 12, 13, 14, 15, 16], [0, 1]]
    expected_offsets_limit = [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
                              [3, 6, 9, 12, 15, 18], [3, 6, 9, 10, 11, 12, 13, 14, 15, 16, 17], [1, 2]]
    count = 0
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        token = text.to_str(i['token']).tolist()
        tokens.append(token)
        np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
        np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
        count += 1
    logger.info("The output tokens are: {}".format(tokens))
    assert split_by_unicode_char(input_strs) == tokens


def test_whitespace_tokenizer_default():
    """
    Test WhitespaceTokenizer
    """
    whitespace_strs = [["Welcome", "to", "Beijing!"],
                       ["北京欢迎您!"],
                       ["我喜欢English!"],
                       [""]]
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.WhitespaceTokenizer()
    dataset = dataset.map(operations=tokenizer)
    tokens = []
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        token = text.to_str(i['text']).tolist()
        tokens.append(token)
    logger.info("The output tokens are: {}".format(tokens))
    assert whitespace_strs == tokens


def test_whitespace_tokenizer_with_offsets():
    """
    Test WhitespaceTokenizer with with_offsets=True
    """
    whitespace_strs = [["Welcome", "to", "Beijing!"],
                       ["北京欢迎您!"],
                       ["我喜欢English!"],
                       [""]]
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.WhitespaceTokenizer(with_offsets=True)
    dataset = dataset.map(operations=tokenizer, input_columns=['text'],
                          output_columns=['token', 'offsets_start', 'offsets_limit'],
                          column_order=['token', 'offsets_start', 'offsets_limit'])
    tokens = []
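    # A line containing only whitespace produces a single empty token whose start and
    # limit offsets are both 0, which is why the last expected entries are [0] / [0].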
    expected_offsets_start = [[0, 8, 11], [0], [0], [0]]
    expected_offsets_limit = [[7, 10, 19], [18], [17], [0]]
    count = 0
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        token = text.to_str(i['token']).tolist()
        tokens.append(token)
        np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
        np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
        count += 1

    logger.info("The output tokens are: {}".format(tokens))
    assert whitespace_strs == tokens


def test_unicode_script_tokenizer_default():
    """
    Test UnicodeScriptTokenizer with keep_whitespace=False
    """
    unicode_script_strs = [["Welcome", "to", "Beijing", "!"],
                           ["北京欢迎您", "!"],
                           ["我喜欢", "English", "!"],
                           [""]]
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.UnicodeScriptTokenizer(keep_whitespace=False)
    dataset = dataset.map(operations=tokenizer)

    tokens = []
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        token = text.to_str(i['text']).tolist()
        tokens.append(token)
    logger.info("The output tokens are: {}".format(tokens))
    assert unicode_script_strs == tokens


def test_unicode_script_tokenizer_default2():
    """
    Test UnicodeScriptTokenizer with keep_whitespace=True
    """
    unicode_script_strs2 = [["Welcome", " ", "to", " ", "Beijing", "!"],
                            ["北京欢迎您", "!"],
                            ["我喜欢", "English", "!"],
                            ["  "]]
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.UnicodeScriptTokenizer(keep_whitespace=True)
    dataset = dataset.map(operations=tokenizer)
    tokens = []
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        token = text.to_str(i['text']).tolist()
        tokens.append(token)
    logger.info("The output tokens are: {}".format(tokens))
    assert unicode_script_strs2 == tokens


def test_unicode_script_tokenizer_with_offsets():
    """
    Test UnicodeScriptTokenizer with keep_whitespace=False and with_offsets=True
    """
    unicode_script_strs = [["Welcome", "to", "Beijing", "!"],
                           ["北京欢迎您", "!"],
                           ["我喜欢", "English", "!"],
                           [""]]
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.UnicodeScriptTokenizer(keep_whitespace=False, with_offsets=True)
    dataset = dataset.map(operations=tokenizer, input_columns=['text'],
                          output_columns=['token', 'offsets_start', 'offsets_limit'],
                          column_order=['token', 'offsets_start', 'offsets_limit'])
    tokens = []
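    # The tokenizer splits where the Unicode script changes, so "我喜欢English!" breaks
    # after byte 9 (three 3-byte Han characters) and again before the trailing "!".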
    expected_offsets_start = [[0, 8, 11, 18], [0, 15], [0, 9, 16], [0]]
    expected_offsets_limit = [[7, 10, 18, 19], [15, 18], [9, 16, 17], [0]]
    count = 0
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        token = text.to_str(i['token']).tolist()
        tokens.append(token)
        np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
        np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
        count += 1
    logger.info("The output tokens are: {}".format(tokens))
    assert unicode_script_strs == tokens


def test_unicode_script_tokenizer_with_offsets2():
    """
    Test UnicodeScriptTokenizer with keep_whitespace=True and with_offsets=True
    """
    unicode_script_strs2 = [["Welcome", " ", "to", " ", "Beijing", "!"],
                            ["北京欢迎您", "!"],
                            ["我喜欢", "English", "!"],
                            ["  "]]
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.UnicodeScriptTokenizer(keep_whitespace=True, with_offsets=True)
    dataset = dataset.map(operations=tokenizer, input_columns=['text'],
                          output_columns=['token', 'offsets_start', 'offsets_limit'],
                          column_order=['token', 'offsets_start', 'offsets_limit'])
    tokens = []
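    # With keep_whitespace=True the space runs become tokens of their own, and the
    # whitespace-only line "  " is returned as one token covering bytes 0 to 2.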
    expected_offsets_start = [[0, 7, 8, 10, 11, 18], [0, 15], [0, 9, 16], [0]]
    expected_offsets_limit = [[7, 8, 10, 11, 18, 19], [15, 18], [9, 16, 17], [2]]
    count = 0
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        token = text.to_str(i['token']).tolist()
        tokens.append(token)
        np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
        np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
        count += 1
    logger.info("The output tokens are: {}".format(tokens))
    assert unicode_script_strs2 == tokens


def test_case_fold():
    """
    Test CaseFold
    """
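    # CaseFold applies Unicode case folding: the ASCII letters are lowercased while the
    # Chinese text and the whitespace-only line pass through unchanged.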
    expect_strs = ["welcome to beijing!", "北京欢迎您!", "我喜欢english!", "  "]
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    op = text.CaseFold()
    dataset = dataset.map(operations=op)

    lower_strs = []
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        token = text.to_str(i['text']).tolist()
        lower_strs.append(token)
    assert lower_strs == expect_strs


def test_normalize_utf8():
    """
    Test NormalizeUTF8
    """

    def normalize(normalize_form):
        dataset = ds.TextFileDataset(NORMALIZE_FILE, shuffle=False)
        normalize_op = text.NormalizeUTF8(normalize_form=normalize_form)
        dataset = dataset.map(operations=normalize_op)
        out_bytes = []
        out_texts = []
        for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
            out_bytes.append(i['text'])
            out_texts.append(text.to_str(i['text']).tolist())
        logger.info("The output bytes are: {}".format(out_bytes))
        logger.info("The output texts are: {}".format(out_texts))
        return out_bytes

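    # Expected NormalizeUTF8 output bytes for each form. NFC/NFD only compose/decompose
    # canonically, so the "ﬁ" ligature (b'\xef\xac\x81') and the superscript five
    # (b'\xe2\x81\xb5') survive, while the compatibility forms NFKC/NFKD map them to
    # plain "fi" and "5".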
    expect_normalize_data = [
        # NFC
        [b'\xe1\xb9\xa9', b'\xe1\xb8\x8d\xcc\x87', b'q\xcc\xa3\xcc\x87',
         b'\xef\xac\x81', b'2\xe2\x81\xb5', b'\xe1\xba\x9b\xcc\xa3'],
        # NFKC
        [b'\xe1\xb9\xa9', b'\xe1\xb8\x8d\xcc\x87', b'q\xcc\xa3\xcc\x87',
         b'fi', b'25', b'\xe1\xb9\xa9'],
        # NFD
        [b's\xcc\xa3\xcc\x87', b'd\xcc\xa3\xcc\x87', b'q\xcc\xa3\xcc\x87',
         b'\xef\xac\x81', b'2\xe2\x81\xb5', b'\xc5\xbf\xcc\xa3\xcc\x87'],
        # NFKD
        [b's\xcc\xa3\xcc\x87', b'd\xcc\xa3\xcc\x87', b'q\xcc\xa3\xcc\x87',
         b'fi', b'25', b's\xcc\xa3\xcc\x87']
    ]
    assert normalize(text.utils.NormalizeForm.NFC) == expect_normalize_data[0]
    assert normalize(text.utils.NormalizeForm.NFKC) == expect_normalize_data[1]
    assert normalize(text.utils.NormalizeForm.NFD) == expect_normalize_data[2]
    assert normalize(text.utils.NormalizeForm.NFKD) == expect_normalize_data[3]


def test_regex_replace():
    """
    Test RegexReplace
    """

    def regex_replace(first, last, expect_str, pattern, replace):
        dataset = ds.TextFileDataset(REGEX_REPLACE_FILE, shuffle=False)
        if first > 1:
            dataset = dataset.skip(first - 1)
        if last >= first:
            dataset = dataset.take(last - first + 1)
        replace_op = text.RegexReplace(pattern, replace)
        dataset = dataset.map(operations=replace_op)
        out_text = []
        for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
            token = text.to_str(i['text']).tolist()
            out_text.append(token)
        logger.info("Out: {}".format(out_text))
        logger.info("Exp: {}".format(expect_str))
        assert expect_str == out_text

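    # The replacement patterns use Unicode property classes: \p{Ll} matches any lowercase
    # letter, ^(\d:|b:) matches a "<digit>:" or "b:" prefix at the start of a line,
    # \s+ matches whitespace runs, and \p{Cc}/\p{Cf} match control and format characters.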
    regex_replace(1, 2, ['H____ W____', "L__'_ G_"], "\\p{Ll}", '_')
    regex_replace(3, 5, ['hello', 'world', '31:beijing'], "^(\\d:|b:)", "")
    regex_replace(6, 6, ["WelcometoChina!"], "\\s+", "")
    regex_replace(7, 8, ['我不想长大', 'WelcometoShenzhen!'], "\\p{Cc}|\\p{Cf}|\\s+", "")


def test_regex_tokenizer_default():
    """
    Test RegexTokenizer
    """

    def regex_tokenizer(first, last, expect_str, delim_pattern, keep_delim_pattern):
        dataset = ds.TextFileDataset(REGEX_TOKENIZER_FILE, shuffle=False)
        if first > 1:
            dataset = dataset.skip(first - 1)
        if last >= first:
            dataset = dataset.take(last - first + 1)
        tokenizer_op = text.RegexTokenizer(delim_pattern, keep_delim_pattern)
        dataset = dataset.map(operations=tokenizer_op)
        out_text = []
        count = 0
        for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
            token = text.to_str(i['text']).tolist()
            np.testing.assert_array_equal(token, expect_str[count])
            count += 1
            out_text.append(token)
        logger.info("Out: {}".format(out_text))
        logger.info("Exp: {}".format(expect_str))

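    # Text is split on matches of delim_pattern; a matched delimiter is kept as its own
    # token only when it also matches keep_delim_pattern (e.g. "\s+"/"\s+" keeps the
    # spaces, "\s+"/"" drops them).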
    regex_tokenizer(1, 1, [['Welcome', 'to', 'Shenzhen!']], "\\s+", "")
    regex_tokenizer(1, 1, [['Welcome', ' ', 'to', ' ', 'Shenzhen!']], "\\s+", "\\s+")
    regex_tokenizer(2, 2, [['北', '京', '欢', '迎', '您', '!Welcome to Beijing!']], r"\p{Han}", r"\p{Han}")
    regex_tokenizer(3, 3, [['12', '¥+', '36', '¥=?']], r"[\p{P}|\p{S}]+", r"[\p{P}|\p{S}]+")
    regex_tokenizer(3, 3, [['12', '36']], r"[\p{P}|\p{S}]+", "")
    regex_tokenizer(3, 3, [['¥+', '¥=?']], r"[\p{N}]+", "")


def test_regex_tokenizer_with_offsets():
    """
    Test RegexTokenizer with with_offsets=True
    """

    def regex_tokenizer(first, last, expect_str, expected_offsets_start, expected_offsets_limit, delim_pattern,
                        keep_delim_pattern):
        dataset = ds.TextFileDataset(REGEX_TOKENIZER_FILE, shuffle=False)
        if first > 1:
            dataset = dataset.skip(first - 1)
        if last >= first:
            dataset = dataset.take(last - first + 1)
        tokenizer_op = text.RegexTokenizer(delim_pattern, keep_delim_pattern, with_offsets=True)
        dataset = dataset.map(operations=tokenizer_op, input_columns=['text'],
                              output_columns=['token', 'offsets_start', 'offsets_limit'],
                              column_order=['token', 'offsets_start', 'offsets_limit'])
        out_text = []
        count = 0
        for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
            token = text.to_str(i['token']).tolist()
            np.testing.assert_array_equal(token, expect_str[count])
            np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
            np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
            count += 1
            out_text.append(token)
        logger.info("Out: {}".format(out_text))
        logger.info("Exp: {}".format(expect_str))

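    # offsets_start/offsets_limit are byte positions of each token in the UTF-8 encoded
    # line, so multi-byte characters (Han characters, the currency sign) widen the spans.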
    regex_tokenizer(1, 1, [['Welcome', 'to', 'Shenzhen!']], [[0, 8, 11]], [[7, 10, 20]], "\\s+", "")
    regex_tokenizer(1, 1, [['Welcome', ' ', 'to', ' ', 'Shenzhen!']], [[0, 7, 8, 10, 11]], [[7, 8, 10, 11, 20]],
                    "\\s+", "\\s+")
    regex_tokenizer(2, 2, [['北', '京', '欢', '迎', '您', '!Welcome to Beijing!']], [[0, 3, 6, 9, 12, 15]],
                    [[3, 6, 9, 12, 15, 35]], r"\p{Han}", r"\p{Han}")
    regex_tokenizer(3, 3, [['12', '¥+', '36', '¥=?']], [[0, 2, 6, 8]], [[2, 6, 8, 13]],
                    r"[\p{P}|\p{S}]+", r"[\p{P}|\p{S}]+")
    regex_tokenizer(3, 3, [['12', '36']], [[0, 6]], [[2, 8]], r"[\p{P}|\p{S}]+", "")
    regex_tokenizer(3, 3, [['¥+', '¥=?']], [[2, 8]], [[6, 13]], r"[\p{N}]+", "")


if __name__ == '__main__':
    test_unicode_char_tokenizer_default()
    test_unicode_char_tokenizer_with_offsets()
    test_whitespace_tokenizer_default()
    test_whitespace_tokenizer_with_offsets()
    test_unicode_script_tokenizer_default()
    test_unicode_script_tokenizer_default2()
    test_unicode_script_tokenizer_with_offsets()
    test_unicode_script_tokenizer_with_offsets2()
    test_case_fold()
    test_normalize_utf8()
    test_regex_replace()
    test_regex_tokenizer_default()
    test_regex_tokenizer_with_offsets()