# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
15"""
16Testing from_dataset in mindspore.dataset
17"""
import numpy as np
import mindspore.dataset as ds
import mindspore.dataset.text as text


def test_demo_basic_from_dataset():
    """ this is a tutorial on how from_dataset should be used in a normal use case"""
    data = ds.TextFileDataset("../data/dataset/testVocab/words.txt", shuffle=False)
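    # Build the vocab from the "text" column; freq_range=None and top_k=None keep
    # every word, and special_first=True places the special tokens at the start.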
    vocab = text.Vocab.from_dataset(data, "text", freq_range=None, top_k=None,
                                    special_tokens=["<pad>", "<unk>"],
                                    special_first=True)
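    # Lookup replaces each word with its vocab id; out-of-vocabulary words map to
    # the id of "<unk>".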
    data = data.map(operations=text.Lookup(vocab, "<unk>"), input_columns=["text"])
    res = []
    for d in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        res.append(d["text"].item())
    assert res == [4, 5, 3, 6, 7, 2], res


def test_demo_basic_from_dataset_with_tokenizer():
    """ this is a tutorial on how from_dataset should be used in a normal use case with a tokenizer"""
    data = ds.TextFileDataset("../data/dataset/testTokenizerData/1.txt", shuffle=False)
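    # Tokenize each line into individual characters so the vocab is built over
    # characters rather than whole words.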
    data = data.map(operations=text.UnicodeCharTokenizer(), input_columns=["text"])
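    # Here the dataset has a single "text" column; passing None for columns relies
    # on from_dataset's default column handling to pick it up.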
    vocab = text.Vocab.from_dataset(data, None, freq_range=None, top_k=None, special_tokens=["<pad>", "<unk>"],
                                    special_first=True)
    data = data.map(operations=text.Lookup(vocab, "<unk>"), input_columns=["text"])
    res = []
    for d in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        res.append(list(d["text"]))
    assert res == [[13, 3, 7, 14, 9, 17, 3, 2, 19, 9, 2, 11, 3, 4, 16, 4, 8, 6, 5], [21, 20, 10, 25, 23, 26],
                   [24, 22, 10, 12, 8, 6, 7, 4, 18, 15, 5], [2, 2]]


def test_from_dataset():
    """ test build vocab with generator dataset """

    def gen_corpus():
        # key: word, value: number of occurrences; letters are used so their
        # ordering in the vocab is easy to see
        corpus = {"Z": 4, "Y": 4, "X": 4, "W": 3, "U": 3, "V": 2, "T": 1}
        for k, v in corpus.items():
            yield (np.array([k] * v, dtype='S'),)

    def test_config(freq_range, top_k):
        corpus_dataset = ds.GeneratorDataset(gen_corpus, column_names=["text"])
        vocab = text.Vocab.from_dataset(corpus_dataset, None, freq_range, top_k, special_tokens=["<pad>", "<unk>"],
                                        special_first=True)
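        # With special_first=True, "<pad>" and "<unk>" take ids 0 and 1; corpus
        # words get ids from 2 upward, ordered by descending frequency with ties
        # broken lexicographically (the asserts below reflect this ordering).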
        corpus_dataset = corpus_dataset.map(operations=text.Lookup(vocab, "<unk>"), input_columns="text")
        res = []
        for d in corpus_dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
            res.append(list(d["text"]))
        return res

    # take words whose frequency is within [3, 4]; words with the same frequency
    # are ordered alphabetically
    test1_res = test_config(freq_range=(3, 4), top_k=4)
    assert test1_res == [[4, 4, 4, 4], [3, 3, 3, 3], [2, 2, 2, 2], [1, 1, 1], [5, 5, 5], [1, 1], [1]], str(test1_res)

    # test words with frequency range [2, inf]; only the last word will be filtered out
    test2_res = test_config((2, None), None)
    assert test2_res == [[4, 4, 4, 4], [3, 3, 3, 3], [2, 2, 2, 2], [6, 6, 6], [5, 5, 5], [7, 7], [1]], str(test2_res)

    # test filtering only by top_k
    test3_res = test_config(None, 4)
    assert test3_res == [[4, 4, 4, 4], [3, 3, 3, 3], [2, 2, 2, 2], [1, 1, 1], [5, 5, 5], [1, 1], [1]], str(test3_res)

    # test filtering out the most frequent words
    test4_res = test_config((None, 3), 100)
    assert test4_res == [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [3, 3, 3], [2, 2, 2], [4, 4], [5]], str(test4_res)

    # test top_k == 1
    test5_res = test_config(None, 1)
    assert test5_res == [[1, 1, 1, 1], [1, 1, 1, 1], [2, 2, 2, 2], [1, 1, 1], [1, 1, 1], [1, 1], [1]], str(test5_res)

    # test min_frequency == max_frequency
    test6_res = test_config((4, 4), None)
    assert test6_res == [[4, 4, 4, 4], [3, 3, 3, 3], [2, 2, 2, 2], [1, 1, 1], [1, 1, 1], [1, 1], [1]], str(test6_res)


def test_from_dataset_special_token():
    """ test build vocab with special tokens using a generator dataset """

    def gen_corpus():
        # key: word, value: number of occurrences; letters are used so their
        # ordering in the vocab is easy to see
        corpus = {"D": 1, "C": 1, "B": 1, "A": 1}
        for k, v in corpus.items():
            yield (np.array([k] * v, dtype='S'),)

    def gen_input(texts):
        for word in texts.split(" "):
            yield (np.array(word, dtype='S'),)

    def test_config(texts, top_k, special_tokens, special_first):
        corpus_dataset = ds.GeneratorDataset(gen_corpus, column_names=["text"])
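        # special_first=True gives the special tokens the lowest ids (0 and 1);
        # special_first=False appends them after the corpus words instead.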
        vocab = text.Vocab.from_dataset(corpus_dataset, None, None, top_k, special_tokens, special_first)
        data = ds.GeneratorDataset(gen_input(texts), column_names=["text"])
        data = data.map(operations=text.Lookup(vocab, "<unk>"), input_columns="text")
        res = []
        for d in data.create_dict_iterator(num_epochs=1, output_numpy=True):
            res.append(d["text"].item())
        return res

    # test special tokens are inserted before the corpus words
    assert test_config("A B C D <pad> <unk>", 4, ["<pad>", "<unk>"], True) == [2, 3, 4, 5, 0, 1]
    # test special tokens are inserted after the corpus words
    assert test_config("A B C D <pad> <unk>", 4, ["<pad>", "<unk>"], False) == [0, 1, 2, 3, 4, 5]


def test_from_dataset_exceptions():
    """ test various exceptions that are checked in the validator """

    def test_config(columns, freq_range, top_k, s):
        try:
            data = ds.TextFileDataset("../data/dataset/testVocab/words.txt", shuffle=False)
            vocab = text.Vocab.from_dataset(data, columns, freq_range, top_k)
            assert isinstance(vocab, text.Vocab)
        except (TypeError, ValueError) as e:
            assert s in str(e), str(e)

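    # each case below passes one invalid argument and checks that the validator's
    # error message contains the expected text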
    test_config("text", (), 1, "freq_range needs to be a tuple of 2 element.")
    test_config("text", (2, 3), 1.2345,
                "Argument top_k with value 1.2345 is not of type [<class 'int'>, <class 'NoneType'>]")
    test_config(23, (2, 3), 1.2345, "Argument col[0] with value 23 is not of type [<class 'str'>]")
    test_config("text", (100, 1), 12, "frequency range [a,b] should be 0 <= a <= b (a,b are inclusive)")
    test_config("text", (2, 3), 0, "top_k must be greater than 0")
    test_config([123], (2, 3), -1, "top_k must be greater than 0")


if __name__ == '__main__':
    test_demo_basic_from_dataset()
    test_from_dataset()
    test_from_dataset_exceptions()
    test_demo_basic_from_dataset_with_tokenizer()
    test_from_dataset_special_token()