• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1# Copyright 2020 Huawei Technologies Co., Ltd
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14# ==============================================================================
15"""
16Testing PythonTokenizer op in DE
17"""
18import mindspore.dataset as ds
19import mindspore.dataset.text as text
20from mindspore import log as logger
21
22DATA_FILE = "../data/dataset/testTokenizerData/1.txt"
23
24
def test_whitespace_tokenizer_ch():
    """
    Test PythonTokenizer: tokenize each input line on whitespace and compare
    against the expected tokens for the mixed Chinese/English test file.
    """
    expected = [["Welcome", "to", "Beijing!"],
                ["北京欢迎您!"],
                ["我喜欢English!"],
                [""]]

    def split_on_whitespace(sentence):
        # An all-whitespace/empty line would tokenize to nothing; emit a
        # single empty token instead so every input row produces output.
        pieces = sentence.split()
        return pieces if pieces else [""]

    data = ds.TextFileDataset(DATA_FILE, shuffle=False)
    data = data.map(operations=text.PythonTokenizer(split_on_whitespace),
                    num_parallel_workers=1)

    actual = []
    for row in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        actual.append(text.to_str(row['text']).tolist())

    logger.info("The out tokens is : {}".format(actual))
    assert expected == actual
49
50
# Allow running this test directly as a script (outside pytest).
if __name__ == '__main__':
    test_whitespace_tokenizer_ch()
53