# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

import numpy as np

import mindspore.dataset as ds
import mindspore.dataset.text as text

# words.txt contains "home is behind the world head", one word per line
DATA_FILE = "../data/dataset/testVocab/words.txt"
VOCAB_FILE = "../data/dataset/testVocab/vocab_list.txt"
HMM_FILE = "../data/dataset/jiebadict/hmm_model.utf8"
MP_FILE = "../data/dataset/jiebadict/jieba.dict.utf8"


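# Both tests below seed jieba with the words from VOCAB_FILE (the field before
# the first comma on each line), so the tokenizer splits each input line along
# known vocab words before Lookup maps the resulting tokens to ids.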
def test_on_tokenized_line():
    """Lookup ids on jieba-tokenized lines, with special tokens in the vocab."""
    data = ds.TextFileDataset("../data/dataset/testVocab/lines.txt", shuffle=False)
    jieba_op = text.JiebaTokenizer(HMM_FILE, MP_FILE, mode=text.JiebaMode.MP)
    # register every vocab word with jieba so segmentation follows vocab boundaries
    with open(VOCAB_FILE, 'r') as f:
        for line in f:
            word = line.split(',')[0]
            jieba_op.add_word(word)
    data = data.map(operations=jieba_op, input_columns=["text"])
    # "<pad>" and "<unk>" take ids 0 and 1, shifting every vocab word up by 2
    vocab = text.Vocab.from_file(VOCAB_FILE, ",", special_tokens=["<pad>", "<unk>"])
    lookup = text.Lookup(vocab, "<unk>")
    data = data.map(operations=lookup, input_columns=["text"])
    res = np.array([[10, 1, 11, 1, 12, 1, 15, 1, 13, 1, 14],
                    [11, 1, 12, 1, 10, 1, 14, 1, 13, 1, 15]], dtype=np.int32)
    # one expected row per line of lines.txt
    for i, d in enumerate(data.create_dict_iterator(num_epochs=1, output_numpy=True)):
        np.testing.assert_array_equal(d["text"], res[i])


def test_on_tokenized_line_with_no_special_tokens():
    """Lookup ids on jieba-tokenized lines, with no special tokens in the vocab."""
    data = ds.TextFileDataset("../data/dataset/testVocab/lines.txt", shuffle=False)
    jieba_op = text.JiebaTokenizer(HMM_FILE, MP_FILE, mode=text.JiebaMode.MP)
    # register every vocab word with jieba so segmentation follows vocab boundaries
    with open(VOCAB_FILE, 'r') as f:
        for line in f:
            word = line.split(',')[0]
            jieba_op.add_word(word)

    data = data.map(operations=jieba_op, input_columns=["text"])
    # without special tokens, ids start at 0 and out-of-vocabulary tokens fall
    # back to the id of the in-vocab word "not"
    vocab = text.Vocab.from_file(VOCAB_FILE, ",")
    lookup = text.Lookup(vocab, "not")
    data = data.map(operations=lookup, input_columns=["text"])
    res = np.array([[8, 0, 9, 0, 10, 0, 13, 0, 11, 0, 12],
                    [9, 0, 10, 0, 8, 0, 12, 0, 11, 0, 13]], dtype=np.int32)
    for i, d in enumerate(data.create_dict_iterator(num_epochs=1, output_numpy=True)):
        np.testing.assert_array_equal(d["text"], res[i])


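# A minimal sketch (not part of the original suite) of the relationship between
# the two expected arrays above, assuming the 0s in the no-special-tokens run
# are out-of-vocabulary tokens mapped to "not" (id 0): prepending ["<pad>",
# "<unk>"] shifts every in-vocab id up by 2, while OOV tokens resolve to
# "<unk>" (id 1) instead. Uses the first expected line from each test.
def sketch_special_token_id_shift():
    plain = np.array([8, 0, 9, 0, 10, 0, 13, 0, 11, 0, 12], dtype=np.int32)
    with_special = np.array([10, 1, 11, 1, 12, 1, 15, 1, 13, 1, 14], dtype=np.int32)
    # OOV slots (0 -> "<unk>" = 1); everything else shifts by len(special_tokens)
    np.testing.assert_array_equal(np.where(plain == 0, 1, plain + 2), with_special)

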
if __name__ == '__main__':
    test_on_tokenized_line()
    test_on_tokenized_line_with_no_special_tokens()