# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import copy
import numpy as np
import mindspore.dataset.text as text
import mindspore.dataset as ds
from mindspore.dataset.text import SentencePieceModel, to_str, SPieceTokenizerOutType

VOCAB_FILE = "../data/dataset/test_sentencepiece/botchan.txt"
DATA_FILE = "../data/dataset/testTokenizerData/sentencepiece_tokenizer.txt"


def test_sentence_piece_tokenizer_callable():
    """Tokenize a raw string by calling the tokenizer directly."""
    vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.UNIGRAM, {})
    tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
    data = '123'
    assert np.array_equal(tokenizer(data), ['▁', '12', '3'])


def test_from_vocab_to_str_UNIGRAM():
    """Tokenize to strings with a UNIGRAM vocab built from a file."""
    vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.UNIGRAM, {})
    tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    dataset = dataset.map(operations=tokenizer)
    expect = ['▁I', '▁sa', 'w', '▁a', '▁girl', '▁with', '▁a', '▁te', 'les', 'co', 'pe', '.']
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for key, value in enumerate(ret):
            assert value == expect[key]


def test_from_vocab_to_str_BPE():
    """Tokenize to strings with a BPE vocab built from a file."""
    vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.BPE, {})
    tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    dataset = dataset.map(operations=tokenizer)
    expect = ['▁I', '▁saw', '▁a', '▁girl', '▁with', '▁a', '▁te', 'les', 'c', 'ope', '.']
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for key, value in enumerate(ret):
            assert value == expect[key]


def test_from_vocab_to_str_CHAR():
    """Tokenize to strings with a CHAR vocab built from a file."""
    vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.CHAR, {})
    tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    dataset = dataset.map(operations=tokenizer)
    expect = ['▁', 'I', '▁', 's', 'a', 'w', '▁', 'a', '▁', 'g', 'i', 'r', 'l', '▁', 'w', 'i', 't', 'h',
              '▁', 'a', '▁', 't', 'e', 'l', 'e', 's', 'c', 'o', 'p', 'e', '.']
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for key, value in enumerate(ret):
            assert value == expect[key]


def test_from_vocab_to_str_WORD():
    """Tokenize to strings with a WORD vocab built from a file."""
    vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.WORD, {})
    tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    dataset = dataset.map(operations=tokenizer)
    expect = ['▁I', '▁saw', '▁a', '▁girl', '▁with', '▁a', '▁telescope.']
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for key, value in enumerate(ret):
            assert value == expect[key]


def test_from_vocab_to_int():
    """Tokenize to token ids with a UNIGRAM vocab built from a file."""
    vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.UNIGRAM, {})
    tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.INT)
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    dataset = dataset.map(operations=tokenizer)
    expect = [6, 329, 183, 8, 945, 23, 8, 3783, 4382, 4641, 1405, 4]
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = i["text"]
        for key, value in enumerate(ret):
            assert value == expect[key]


def test_from_file_to_str():
    """Tokenize to strings with a model file saved via save_model."""
    vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.UNIGRAM, {})
    text.SentencePieceVocab.save_model(vocab, "./", "m.model")
    tokenizer = text.SentencePieceTokenizer("./m.model", out_type=SPieceTokenizerOutType.STRING)
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    dataset = dataset.map(operations=tokenizer)
    expect = ['▁I', '▁sa', 'w', '▁a', '▁girl', '▁with', '▁a', '▁te', 'les', 'co', 'pe', '.']
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for key, value in enumerate(ret):
            assert value == expect[key]


def test_from_file_to_int():
    """Tokenize to token ids with a model file saved via save_model."""
    vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.UNIGRAM, {})
    text.SentencePieceVocab.save_model(vocab, "./", "m.model")
    tokenizer = text.SentencePieceTokenizer("./m.model", out_type=SPieceTokenizerOutType.INT)
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    dataset = dataset.map(operations=tokenizer)
    expect = [6, 329, 183, 8, 945, 23, 8, 3783, 4382, 4641, 1405, 4]
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = i["text"]
        for key, value in enumerate(ret):
            assert value == expect[key]


def test_build_from_dataset():
    """Tokenize to strings with a vocab built from a dataset."""
    data = ds.TextFileDataset(VOCAB_FILE, shuffle=False)
    vocab = text.SentencePieceVocab.from_dataset(data, ["text"], 5000, 0.9995, SentencePieceModel.UNIGRAM, {})
    tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    dataset = dataset.map(operations=tokenizer)
    expect = ['▁I', '▁sa', 'w', '▁a', '▁girl', '▁with', '▁a', '▁te', 'les', 'co', 'pe', '.']
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for key, value in enumerate(ret):
            assert value == expect[key]


def apply_func(dataset):
    """Rename the 'text' column to 'text2'."""
    input_columns = ['text']
    output_columns = ['text2']
    dataset = dataset.rename(input_columns, output_columns)
    return dataset


def zip_test(dataset):
    """Check tokenized output after zipping two copies of the dataset."""
    dataset_1 = copy.deepcopy(dataset)
    dataset_2 = copy.deepcopy(dataset)
    dataset_1 = dataset_1.apply(apply_func)
    dataset_zip = ds.zip((dataset_1, dataset_2))
    expect = ['▁I', '▁sa', 'w', '▁a', '▁girl', '▁with', '▁a', '▁te', 'les', 'co', 'pe', '.']
    for i in dataset_zip.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for key, value in enumerate(ret):
            assert value == expect[key]


def concat_test(dataset):
    """Check tokenized output after concatenating the dataset with a copy of itself."""
    dataset_1 = copy.deepcopy(dataset)
    dataset = dataset.concat(dataset_1)
    expect = ['▁I', '▁sa', 'w', '▁a', '▁girl', '▁with', '▁a', '▁te', 'les', 'co', 'pe', '.']
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for key, value in enumerate(ret):
            assert value == expect[key]


def test_with_zip_concat():
    """Tokenize with multiple parallel workers, then run the zip and concat checks."""
    data = ds.TextFileDataset(VOCAB_FILE, shuffle=False)
    vocab = text.SentencePieceVocab.from_dataset(data, ["text"], 5000, 0.9995, SentencePieceModel.UNIGRAM, {})
    tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    dataset = dataset.map(operations=tokenizer, num_parallel_workers=2)
    zip_test(dataset)
    concat_test(dataset)


if __name__ == "__main__":
    test_sentence_piece_tokenizer_callable()
    test_from_vocab_to_str_UNIGRAM()
    test_from_vocab_to_str_BPE()
    test_from_vocab_to_str_CHAR()
    test_from_vocab_to_str_WORD()
    test_from_vocab_to_int()
    test_from_file_to_str()
    test_from_file_to_int()
    test_build_from_dataset()
    test_with_zip_concat()