# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
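"""
Eager-mode tests for mindspore.dataset.text transforms: SlidingWindow, ToNumber,
WhitespaceTokenizer and PythonTokenizer, each applied directly to Python inputs.
"""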
import numpy as np

import mindspore.dataset.text.transforms as T
import mindspore.common.dtype as mstype
from mindspore import log as logger


def test_sliding_window():
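    """SlidingWindow(width=2) should yield every pair of adjacent tokens."""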
    txt = ["Welcome", "to", "Beijing", "!"]
    sliding_window = T.SlidingWindow(width=2)
    txt = sliding_window(txt)
    logger.info("Result: {}".format(txt))

    expected = [['Welcome', 'to'], ['to', 'Beijing'], ['Beijing', '!']]
    np.testing.assert_equal(txt, expected)


def test_to_number():
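    """ToNumber should convert a numeric string to the requested integer dtype."""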
    txt = ["123456"]
    to_number = T.ToNumber(mstype.int32)
    txt = to_number(txt)
    logger.info("Result: {}, type: {}".format(txt, type(txt[0])))

    assert txt == 123456


def test_whitespace_tokenizer():
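    """WhitespaceTokenizer should split a sentence on whitespace."""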
    txt = "Welcome to Beijing !"
    txt = T.WhitespaceTokenizer()(txt)
    logger.info("Tokenize result: {}".format(txt))

    expected = ['Welcome', 'to', 'Beijing', '!']
    np.testing.assert_equal(txt, expected)


def test_python_tokenizer():
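    """PythonTokenizer should apply a user-defined tokenizer to both bytes and str inputs."""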
    # User-defined whitespace tokenizer; returns [""] so an empty line still yields a token.
    def my_tokenizer(line):
        words = line.split()
        if not words:
            return [""]
        return words

    txt1 = np.array("Welcome to Beijing !".encode())
    txt1 = T.PythonTokenizer(my_tokenizer)(txt1)
    logger.info("Tokenize result: {}".format(txt1))

    txt2 = np.array("Welcome to Beijing !")
    txt2 = T.PythonTokenizer(my_tokenizer)(txt2)
    logger.info("Tokenize result: {}".format(txt2))

    expected = ['Welcome', 'to', 'Beijing', '!']
    np.testing.assert_equal(txt1, expected)
    np.testing.assert_equal(txt2, expected)


if __name__ == '__main__':
    test_sliding_window()
    test_to_number()
    test_whitespace_tokenizer()
    test_python_tokenizer()