# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Testing from_dataset in mindspore.dataset
"""
import numpy as np
import mindspore.dataset as ds
import mindspore.dataset.text as text


def test_demo_basic_from_dataset():
    """ a tutorial on how from_dataset is used in a typical use case """
    data = ds.TextFileDataset("../data/dataset/testVocab/words.txt", shuffle=False)
    vocab = text.Vocab.from_dataset(data, "text", freq_range=None, top_k=None,
                                    special_tokens=["<pad>", "<unk>"],
                                    special_first=True)
    data = data.map(operations=text.Lookup(vocab, "<unk>"), input_columns=["text"])
    res = []
    for d in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        res.append(d["text"].item())
    assert res == [4, 5, 3, 6, 7, 2], res


def test_demo_basic_from_dataset_with_tokenizer():
    """ a tutorial on how from_dataset is used in a typical use case with a tokenizer """
    data = ds.TextFileDataset("../data/dataset/testTokenizerData/1.txt", shuffle=False)
    data = data.map(operations=text.UnicodeCharTokenizer(), input_columns=["text"])
    vocab = text.Vocab.from_dataset(data, None, freq_range=None, top_k=None, special_tokens=["<pad>", "<unk>"],
                                    special_first=True)
    data = data.map(operations=text.Lookup(vocab, "<unk>"), input_columns=["text"])
    res = []
    for d in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        res.append(list(d["text"]))
    assert res == [[13, 3, 7, 14, 9, 17, 3, 2, 19, 9, 2, 11, 3, 4, 16, 4, 8, 6, 5], [21, 20, 10, 25, 23, 26],
                   [24, 22, 10, 12, 8, 6, 7, 4, 18, 15, 5], [2, 2]]


def test_from_dataset():
    """ test building a vocab from a generator dataset with freq_range and top_k """

    def gen_corpus():
        # key: word, value: number of occurrences; letters are used so that their ordering is apparent
        corpus = {"Z": 4, "Y": 4, "X": 4, "W": 3, "U": 3, "V": 2, "T": 1}
        for k, v in corpus.items():
            yield (np.array([k] * v, dtype='S'),)

    def test_config(freq_range, top_k):
        corpus_dataset = ds.GeneratorDataset(gen_corpus, column_names=["text"])
        vocab = text.Vocab.from_dataset(corpus_dataset, None, freq_range, top_k, special_tokens=["<pad>", "<unk>"],
                                        special_first=True)
        corpus_dataset = corpus_dataset.map(operations=text.Lookup(vocab, "<unk>"), input_columns="text")
        res = []
        for d in corpus_dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
            res.append(list(d["text"]))
        return res

    # take words whose frequency is within [3, 4]; words with the same frequency are ordered alphabetically
    test1_res = test_config(freq_range=(3, 4), top_k=4)
    assert test1_res == [[4, 4, 4, 4], [3, 3, 3, 3], [2, 2, 2, 2], [1, 1, 1], [5, 5, 5], [1, 1], [1]], str(test1_res)

    # test words with frequency range [2, inf); only the last word will be filtered out
    test2_res = test_config((2, None), None)
    assert test2_res == [[4, 4, 4, 4], [3, 3, 3, 3], [2, 2, 2, 2], [6, 6, 6], [5, 5, 5], [7, 7], [1]], str(test2_res)

    # test filtering only by top_k
    test3_res = test_config(None, 4)
    assert test3_res == [[4, 4, 4, 4], [3, 3, 3, 3], [2, 2, 2, 2], [1, 1, 1], [5, 5, 5], [1, 1], [1]], str(test3_res)

    # test filtering out the most frequent words
    test4_res = test_config((None, 3), 100)
    assert test4_res == [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [3, 3, 3], [2, 2, 2], [4, 4], [5]], str(test4_res)

    # test top_k == 1
    test5_res = test_config(None, 1)
    assert test5_res == [[1, 1, 1, 1], [1, 1, 1, 1], [2, 2, 2, 2], [1, 1, 1], [1, 1, 1], [1, 1], [1]], str(test5_res)

    # test min_frequency == max_frequency
    test6_res = test_config((4, 4), None)
    assert test6_res == [[4, 4, 4, 4], [3, 3, 3, 3], [2, 2, 2, 2], [1, 1, 1], [1, 1, 1], [1, 1], [1]], str(test6_res)
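

# The asserts in test_from_dataset follow from Vocab.from_dataset's word-id
# assignment rules: filter words to the inclusive frequency range, keep at most
# top_k of the most frequent words (ties broken alphabetically), then number the
# surviving words after (or before) the special tokens. The helper below is a
# minimal pure-Python sketch of those rules, added here for illustration only;
# it is a hypothetical reference model, not the MindSpore implementation, and
# pytest does not collect it (no test_ prefix).
def _vocab_order_sketch(word_freq, freq_range=None, top_k=None,
                        special_tokens=("<pad>", "<unk>"), special_first=True):
    """Return the word -> id mapping that the asserts above imply."""
    min_f, max_f = freq_range if freq_range is not None else (None, None)
    # keep only words whose frequency lies in the inclusive range [min_f, max_f]
    words = [w for w, f in word_freq.items()
             if (min_f is None or f >= min_f) and (max_f is None or f <= max_f)]
    # higher frequency first; ties are broken alphabetically
    words.sort(key=lambda w: (-word_freq[w], w))
    if top_k is not None:
        words = words[:top_k]
    ordered = list(special_tokens) + words if special_first else words + list(special_tokens)
    return {w: i for i, w in enumerate(ordered)}

# e.g. reproducing test1 above:
# _vocab_order_sketch({"Z": 4, "Y": 4, "X": 4, "W": 3, "U": 3, "V": 2, "T": 1},
#                     freq_range=(3, 4), top_k=4)
# -> {'<pad>': 0, '<unk>': 1, 'X': 2, 'Y': 3, 'Z': 4, 'U': 5}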


def test_from_dataset_special_token():
    """ test special_tokens and special_first when building a vocab from a generator dataset """

    def gen_corpus():
        # key: word, value: number of occurrences; letters are used so that their ordering is apparent
        corpus = {"D": 1, "C": 1, "B": 1, "A": 1}
        for k, v in corpus.items():
            yield (np.array([k] * v, dtype='S'),)

    def gen_input(texts):
        for word in texts.split(" "):
            yield (np.array(word, dtype='S'),)

    def test_config(texts, top_k, special_tokens, special_first):
        corpus_dataset = ds.GeneratorDataset(gen_corpus, column_names=["text"])
        vocab = text.Vocab.from_dataset(corpus_dataset, None, None, top_k, special_tokens, special_first)
        data = ds.GeneratorDataset(gen_input(texts), column_names=["text"])
        data = data.map(operations=text.Lookup(vocab, "<unk>"), input_columns="text")
        res = []
        for d in data.create_dict_iterator(num_epochs=1, output_numpy=True):
            res.append(d["text"].item())
        return res

    # test that special tokens are inserted before the regular words
    assert test_config("A B C D <pad> <unk>", 4, ["<pad>", "<unk>"], True) == [2, 3, 4, 5, 0, 1]
    # test that special tokens are inserted after the regular words
    assert test_config("A B C D <pad> <unk>", 4, ["<pad>", "<unk>"], False) == [0, 1, 2, 3, 4, 5]


def test_from_dataset_exceptions():
    """ test various exceptions that are checked in the validator """

    def test_config(columns, freq_range, top_k, s):
        try:
            data = ds.TextFileDataset("../data/dataset/testVocab/words.txt", shuffle=False)
            vocab = text.Vocab.from_dataset(data, columns, freq_range, top_k)
            assert isinstance(vocab, text.Vocab)
        except (TypeError, ValueError) as e:
            assert s in str(e), str(e)

    test_config("text", (), 1, "freq_range needs to be a tuple of 2 element.")
    test_config("text", (2, 3), 1.2345,
                "Argument top_k with value 1.2345 is not of type [<class 'int'>, <class 'NoneType'>]")
    test_config(23, (2, 3), 1.2345, "Argument col[0] with value 23 is not of type [<class 'str'>]")
    test_config("text", (100, 1), 12, "frequency range [a,b] should be 0 <= a <= b (a,b are inclusive)")
    test_config("text", (2, 3), 0, "top_k must be greater than 0")
    test_config([123], (2, 3), -1, "top_k must be greater than 0")


if __name__ == '__main__':
    test_demo_basic_from_dataset()
    test_from_dataset()
    test_from_dataset_exceptions()
    test_demo_basic_from_dataset_with_tokenizer()
    test_from_dataset_special_token()
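

# A minimal pure-Python sketch (not the MindSpore implementation) of the
# behaviour text.Lookup(vocab, "<unk>") exhibits in the tests above: every
# token maps to its vocabulary id, and out-of-vocabulary tokens fall back to
# the id of the unknown token. `_lookup_sketch` is a hypothetical helper added
# for illustration only; pytest does not collect it and the main block above
# does not call it.
def _lookup_sketch(tokens, word_to_id, unknown_token="<unk>"):
    unk_id = word_to_id[unknown_token]
    return [word_to_id.get(t, unk_id) for t in tokens]

# e.g. with the special_first=True vocab from test_from_dataset_special_token:
# _lookup_sketch(["A", "B", "OOV"], {"<pad>": 0, "<unk>": 1, "A": 2, "B": 3, "C": 4, "D": 5})
# -> [2, 3, 1]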