1# Copyright 2020 Huawei Technologies Co., Ltd 2# 3# Licensed under the Apache License, Version 2.0 (the "License"); 4# you may not use this file except in compliance with the License. 5# You may obtain a copy of the License at 6# 7# http://www.apache.org/licenses/LICENSE-2.0 8# 9# Unless required by applicable law or agreed to in writing, software 10# distributed under the License is distributed on an "AS IS" BASIS, 11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12# See the License for the specific language governing permissions and 13# limitations under the License. 14# ============================================================================== 15""" 16Testing PythonTokenizer op in DE 17""" 18import mindspore.dataset as ds 19import mindspore.dataset.text as text 20from mindspore import log as logger 21 22DATA_FILE = "../data/dataset/testTokenizerData/1.txt" 23 24 25def test_whitespace_tokenizer_ch(): 26 """ 27 Test PythonTokenizer 28 """ 29 whitespace_strs = [["Welcome", "to", "Beijing!"], 30 ["北京欢迎您!"], 31 ["我喜欢English!"], 32 [""]] 33 34 def my_tokenizer(line): 35 words = line.split() 36 if not words: 37 return [""] 38 return words 39 40 dataset = ds.TextFileDataset(DATA_FILE, shuffle=False) 41 tokenizer = text.PythonTokenizer(my_tokenizer) 42 dataset = dataset.map(operations=tokenizer, num_parallel_workers=1) 43 tokens = [] 44 for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True): 45 s = text.to_str(i['text']).tolist() 46 tokens.append(s) 47 logger.info("The out tokens is : {}".format(tokens)) 48 assert whitespace_strs == tokens 49 50 51if __name__ == '__main__': 52 test_whitespace_tokenizer_ch() 53