# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Testing PythonTokenizer op in DE
"""
import mindspore.dataset as ds
import mindspore.dataset.text as text
from mindspore import log as logger

DATA_FILE = "../data/dataset/testTokenizerData/1.txt"


def test_whitespace_tokenizer_ch():
    """
    Check that PythonTokenizer applies a user-supplied whitespace splitter.

    Reads DATA_FILE in file order (shuffle disabled), maps a Python callable
    over each row through text.PythonTokenizer, and asserts that the
    collected per-row token lists equal the expected ones.
    """
    expected_tokens = [["Welcome", "to", "Beijing!"],
                       ["北京欢迎您!"],
                       ["我喜欢English!"],
                       [""]]

    def split_on_whitespace(line):
        # A line with no words yields a single empty-string token rather
        # than an empty list.
        return line.split() or [""]

    pipeline = ds.TextFileDataset(DATA_FILE, shuffle=False)
    pipeline = pipeline.map(operations=text.PythonTokenizer(split_on_whitespace),
                            num_parallel_workers=1)

    # One token list per row, in iteration order.
    tokens = [
        text.to_str(row['text']).tolist()
        for row in pipeline.create_dict_iterator(num_epochs=1, output_numpy=True)
    ]
    logger.info("The out tokens is : {}".format(tokens))
    assert expected_tokens == tokens


if __name__ == '__main__':
    test_whitespace_tokenizer_ch()