# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Testing BertTokenizer op in DE
"""
import numpy as np
import pytest
import mindspore.dataset as ds
from mindspore import log as logger
import mindspore.dataset.text as text

BERT_TOKENIZER_FILE = "../data/dataset/testTokenizerData/bert_tokenizer.txt"

vocab_bert = [
    "床", "前", "明", "月", "光", "疑", "是", "地", "上", "霜", "举", "头", "望", "低", "思", "故", "乡",
    "繁", "體", "字", "嘿", "哈", "大", "笑", "嘻",
    "i", "am", "mak", "make", "small", "mistake", "##s", "during", "work", "##ing", "hour",
    "", "", "", "", "+", "/", "-", "=", "12", "28", "40", "16", " ", "I",
    "[CLS]", "[SEP]", "[UNK]", "[PAD]", "[MASK]", "[unused1]", "[unused10]"
]
pad = '<pad>'
test_paras = [
    # test chinese text
    dict(
        first=1,
        last=4,
        expect_str=[['床', '前', '明', '月', '光'],
                    ['疑', '是', '地', '上', '霜'],
                    ['举', '头', '望', '明', '月'],
                    ['低', '头', '思', '故', '乡']],
        expected_offsets_start=[[0, 3, 6, 9, 12],
                                [0, 3, 6, 9, 12],
                                [0, 3, 6, 9, 12],
                                [0, 3, 6, 9, 12]],
        expected_offsets_limit=[[3, 6, 9, 12, 15],
                                [3, 6, 9, 12, 15],
                                [3, 6, 9, 12, 15],
                                [3, 6, 9, 12, 15]],
        vocab_list=vocab_bert
    ),
    # test english text
    dict(
        first=5,
        last=5,
        expect_str=[['i', 'am', 'mak', '##ing', 'small', 'mistake', '##s', 'during', 'work', '##ing', 'hour', '##s']],
        expected_offsets_start=[[0, 2, 5, 8, 12, 18, 25, 27, 34, 38, 42, 46]],
        expected_offsets_limit=[[1, 4, 8, 11, 17, 25, 26, 33, 38, 41, 46, 47]],
        lower_case=True,
        vocab_list=vocab_bert
    ),
    dict(
        first=5,
        last=5,
        expect_str=[['I', "am", 'mak', '##ing', 'small', 'mistake', '##s', 'during', 'work', '##ing', 'hour', '##s']],
        expected_offsets_start=[[0, 2, 5, 8, 12, 18, 25, 27, 34, 38, 42, 46]],
        expected_offsets_limit=[[1, 4, 8, 11, 17, 25, 26, 33, 38, 41, 46, 47]],
        lower_case=False,
        vocab_list=vocab_bert
    ),
    # test emoji tokens
    dict(
        first=6,
        last=7,
        expect_str=[
            ['', '嘿', '嘿', '', '哈', '哈', '', '大', '笑', '', '嘻', '嘻'],
            ['繁', '體', '字']],
        expected_offsets_start=[[0, 4, 7, 10, 14, 17, 20, 24, 27, 30, 34, 37], [0, 3, 6]],
        expected_offsets_limit=[[4, 7, 10, 14, 17, 20, 24, 27, 30, 34, 37, 40], [3, 6, 9]],
        normalization_form=text.utils.NormalizeForm.NFKC,
        vocab_list=vocab_bert
    ),
    # test preserved tokens
    dict(
        first=8,
        last=14,
        expect_str=[
            ['[UNK]', '[CLS]'],
            ['[UNK]', '[SEP]'],
            ['[UNK]', '[UNK]'],
            ['[UNK]', '[PAD]'],
            ['[UNK]', '[MASK]'],
            ['[unused1]'],
            ['[unused10]']
        ],
        expected_offsets_start=[[0, 7], [0, 7], [0, 7], [0, 7], [0, 7], [0], [0]],
        expected_offsets_limit=[[6, 12], [6, 12], [6, 12], [6, 12], [6, 13], [9], [10]],
        lower_case=False,
        vocab_list=vocab_bert,
        preserve_unused_token=True,
    ),
    dict(
        first=8,
        last=14,
        expect_str=[
            ['[UNK]', '[CLS]'],
            ['[UNK]', '[SEP]'],
            ['[UNK]', '[UNK]'],
            ['[UNK]', '[PAD]'],
            ['[UNK]', '[MASK]'],
            ['[unused1]'],
            ['[unused10]']
        ],
        expected_offsets_start=[[0, 7], [0, 7], [0, 7], [0, 7], [0, 7], [0], [0]],
        expected_offsets_limit=[[6, 12], [6, 12], [6, 12], [6, 12], [6, 13], [9], [10]],
        lower_case=True,
        vocab_list=vocab_bert,
        preserve_unused_token=True,
    ),
    # test special symbol
    dict(
        first=15,
        last=15,
        expect_str=[['12', '+', '/', '-', '28', '=', '40', '/', '-', '16']],
        expected_offsets_start=[[0, 2, 3, 4, 5, 7, 8, 10, 11, 12]],
        expected_offsets_limit=[[2, 3, 4, 5, 7, 8, 10, 11, 12, 14]],
        preserve_unused_token=True,
        vocab_list=vocab_bert
    ),
    # test non-default params
    dict(
        first=8,
        last=8,
        expect_str=[['[UNK]', ' ', '[CLS]']],
        expected_offsets_start=[[0, 6, 7]],
        expected_offsets_limit=[[6, 7, 12]],
        lower_case=False,
        vocab_list=vocab_bert,
        preserve_unused_token=True,
        keep_whitespace=True
    ),
    dict(
        first=8,
        last=8,
        expect_str=[['unused', ' ', '[CLS]']],
        expected_offsets_start=[[0, 6, 7]],
        expected_offsets_limit=[[6, 7, 12]],
        lower_case=False,
        vocab_list=vocab_bert,
        preserve_unused_token=True,
        keep_whitespace=True,
        unknown_token=''
    ),
    dict(
        first=8,
        last=8,
        expect_str=[['unused', ' ', '[', 'CLS', ']']],
        expected_offsets_start=[[0, 6, 7, 8, 11]],
        expected_offsets_limit=[[6, 7, 8, 11, 12]],
        lower_case=False,
        vocab_list=vocab_bert,
        preserve_unused_token=False,
        keep_whitespace=True,
        unknown_token=''
    ),
]


def check_bert_tokenizer_default(first, last, expect_str,
                                 expected_offsets_start, expected_offsets_limit,
                                 vocab_list, suffix_indicator='##',
                                 max_bytes_per_token=100, unknown_token='[UNK]',
                                 lower_case=False, keep_whitespace=False,
                                 normalization_form=text.utils.NormalizeForm.NONE,
                                 preserve_unused_token=False):
    # The offsets parameters are accepted only so that test_paras can be shared with
    # the with_offsets variant below; they are not checked here.
    dataset = ds.TextFileDataset(BERT_TOKENIZER_FILE, shuffle=False)
    if first > 1:
        dataset = dataset.skip(first - 1)
    if last >= first:
        dataset = dataset.take(last - first + 1)
    vocab = text.Vocab.from_list(vocab_list)
    tokenizer_op = text.BertTokenizer(
        vocab=vocab, suffix_indicator=suffix_indicator,
        max_bytes_per_token=max_bytes_per_token, unknown_token=unknown_token,
        lower_case=lower_case, keep_whitespace=keep_whitespace,
        normalization_form=normalization_form,
        preserve_unused_token=preserve_unused_token)
    dataset = dataset.map(operations=tokenizer_op)
    count = 0
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        token = text.to_str(i['text'])
        logger.info("Out: %s", token)
        logger.info("Exp: %s", expect_str[count])
        np.testing.assert_array_equal(token, expect_str[count])
        count = count + 1


def check_bert_tokenizer_with_offsets(first, last, expect_str,
                                      expected_offsets_start, expected_offsets_limit,
                                      vocab_list, suffix_indicator='##',
                                      max_bytes_per_token=100, unknown_token='[UNK]',
                                      lower_case=False, keep_whitespace=False,
                                      normalization_form=text.utils.NormalizeForm.NONE,
                                      preserve_unused_token=False):
    dataset = ds.TextFileDataset(BERT_TOKENIZER_FILE, shuffle=False)
    if first > 1:
        dataset = dataset.skip(first - 1)
    if last >= first:
        dataset = dataset.take(last - first + 1)
    vocab = text.Vocab.from_list(vocab_list)
    tokenizer_op = text.BertTokenizer(
        vocab=vocab, suffix_indicator=suffix_indicator, max_bytes_per_token=max_bytes_per_token,
        unknown_token=unknown_token, lower_case=lower_case, keep_whitespace=keep_whitespace,
        normalization_form=normalization_form, preserve_unused_token=preserve_unused_token, with_offsets=True)
    dataset = dataset.map(operations=tokenizer_op, input_columns=['text'],
                          output_columns=['token', 'offsets_start', 'offsets_limit'],
                          column_order=['token', 'offsets_start', 'offsets_limit'])
    count = 0
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        token = text.to_str(i['token'])
        logger.info("Out: %s", token)
        logger.info("Exp: %s", expect_str[count])
        np.testing.assert_array_equal(token, expect_str[count])
        np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
        np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
        count = count + 1


def test_bert_tokenizer_default():
    """
    Test BertTokenizer when with_offsets=False
    """
    for paras in test_paras:
        check_bert_tokenizer_default(**paras)


def test_bert_tokenizer_with_offsets():
    """
    Test BertTokenizer when with_offsets=True
    """
    for paras in test_paras:
        check_bert_tokenizer_with_offsets(**paras)


def test_bert_tokenizer_callable_invalid_input():
    """
    Test BertTokenizer in eager mode with invalid input
    """
    data = {'张三': 18, '王五': 20}
    vocab = text.Vocab.from_list(vocab_bert)
    tokenizer_op = text.BertTokenizer(vocab=vocab)

    with pytest.raises(TypeError) as info:
        _ = tokenizer_op(data)
    assert "Invalid user input. Got <class 'dict'>: {'张三': 18, '王五': 20}, cannot be converted into tensor." in str(info)


if __name__ == '__main__':
    test_bert_tokenizer_callable_invalid_input()
    test_bert_tokenizer_default()
    test_bert_tokenizer_with_offsets()
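

# A minimal eager-mode sketch (the demo_ prefix keeps it out of pytest collection):
# BertTokenizer can also be called directly on a single string, which is handy for
# reproducing the expected offsets in test_paras by hand. offsets_start/offsets_limit
# are byte positions into the UTF-8 input, which is why each Chinese character in the
# first test case advances the offset by 3. The function name and the assumption that
# an eager call returns a NumPy array of tokens are this sketch's, not the test suite's.
def demo_bert_tokenizer_eager():
    vocab = text.Vocab.from_list(vocab_bert)
    tokenizer = text.BertTokenizer(vocab=vocab)
    # Expected to split into ['床', '前', '明', '月', '光'] per test_paras[0].
    tokens = tokenizer("床前明月光")
    logger.info("Eager tokens: %s", tokens)
    return tokens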