# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests that call mindspore.dataset.text transforms directly on in-memory inputs."""
import numpy as np
import mindspore.dataset.text.transforms as T
import mindspore.common.dtype as mstype
from mindspore import log as logger


def test_sliding_window():
    """SlidingWindow(width=2) over four tokens yields the three adjacent pairs."""
    txt = ["Welcome", "to", "Beijing", "!"]
    sliding_window = T.SlidingWindow(width=2)
    txt = sliding_window(txt)
    logger.info("Result: {}".format(txt))

    expected = [['Welcome', 'to'], ['to', 'Beijing'], ['Beijing', '!']]
    np.testing.assert_equal(txt, expected)


def test_to_number():
    """ToNumber(mstype.int32) turns the string "123456" into a value equal to 123456."""
    txt = ["123456"]
    to_number = T.ToNumber(mstype.int32)
    txt = to_number(txt)
    logger.info("Result: {}, type: {}".format(txt, type(txt[0])))

    assert txt == 123456


def test_whitespace_tokenizer():
    """WhitespaceTokenizer splits a space-separated sentence into its four tokens."""
    txt = "Welcome to Beijing !"
    txt = T.WhitespaceTokenizer()(txt)
    logger.info("Tokenize result: {}".format(txt))

    expected = ['Welcome', 'to', 'Beijing', '!']
    np.testing.assert_equal(txt, expected)


def test_python_tokenizer():
    """PythonTokenizer with a user-supplied splitter gives the same tokens
    for a bytes-backed and a str-backed numpy input."""
    # whitespace tokenizer
    def my_tokenizer(line):
        words = line.split()
        # Return a single empty token rather than an empty list when the
        # line contains no words.
        if not words:
            return [""]
        return words

    # Input given as UTF-8 encoded bytes wrapped in a numpy array.
    txt1 = np.array("Welcome to Beijing !".encode())
    txt1 = T.PythonTokenizer(my_tokenizer)(txt1)
    logger.info("Tokenize result: {}".format(txt1))

    # Same sentence given as a plain str wrapped in a numpy array.
    txt2 = np.array("Welcome to Beijing !")
    txt2 = T.PythonTokenizer(my_tokenizer)(txt2)
    logger.info("Tokenize result: {}".format(txt2))

    expected = ['Welcome', 'to', 'Beijing', '!']
    np.testing.assert_equal(txt1, expected)
    np.testing.assert_equal(txt2, expected)


if __name__ == '__main__':
    test_sliding_window()
    test_to_number()
    test_whitespace_tokenizer()
    test_python_tokenizer()