# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Testing BasicTokenizer op in DE
"""
import numpy as np
import mindspore.dataset as ds
from mindspore import log as logger
import mindspore.dataset.text as text

BASIC_TOKENIZER_FILE = "../data/dataset/testTokenizerData/basic_tokenizer.txt"

test_paras = [
    dict(
        first=1,
        last=6,
        expected_tokens=
        [['Welcome', 'to', 'Beijing', '北', '京', '欢', '迎', '您'],
         ['長', '風', '破', '浪', '會', '有', '時', ',', '直', '掛', '雲', '帆', '濟', '滄', '海'],
         ['', '嘿', '嘿', '', '哈', '哈', '', '大', '笑', '', '嘻', '嘻'],
         ['明', '朝', '(', '1368', '—', '1644', '年', ')', '和', '清', '朝',
          '(', '1644', '—', '1911', '年', ')', ',', '是', '中', '国', '封',
          '建', '王', '朝', '史', '上', '最', '后', '两', '个', '朝', '代'],
         ['明', '代', '(', '1368', '-', '1644', ')', 'と', '清', '代',
          '(', '1644', '-', '1911', ')', 'は', '、', '中', '国', 'の', '封',
          '建', '王', '朝', 'の', '歴', '史', 'における', '最', '後', 'の2つの', '王', '朝', 'でした'],
         ['명나라', '(', '1368', '-', '1644', ')', '와', '청나라', '(', '1644', '-', '1911', ')', '는',
          '중국', '봉건', '왕조의', '역사에서', '마지막', '두', '왕조였다']],
        expected_offsets_start=[[0, 8, 11, 18, 21, 24, 27, 30],
                                [0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42],
                                [0, 4, 7, 10, 14, 17, 20, 24, 27, 30, 34, 37],
                                [0, 3, 6, 9, 13, 16, 20, 23, 26, 29, 32, 35, 38, 42, 45, 49,
                                 52, 55, 58, 61, 64, 67, 70, 73, 76, 79, 82, 85, 88, 91, 94, 97, 100],
                                [0, 3, 6, 9, 13, 14, 18, 21, 24, 27, 30, 33, 37, 38, 42, 45, 48, 51,
                                 54, 57, 60, 63, 66, 69, 72, 75, 78, 81, 93, 96, 99, 109, 112, 115],
                                [0, 10, 11, 15, 16, 20, 21, 25, 35, 36, 40, 41, 45, 46, 50, 57, 64, 74, 87, 97, 101]],
        expected_offsets_limit=[[7, 10, 18, 21, 24, 27, 30, 33],
                                [3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45],
                                [4, 7, 10, 14, 17, 20, 24, 27, 30, 34, 37, 40],
                                [3, 6, 9, 13, 16, 20, 23, 26, 29, 32, 35, 38, 42, 45, 49, 52, 55, 58,
                                 61, 64, 67, 70, 73, 76, 79, 82, 85, 88, 91, 94, 97, 100, 103],
                                [3, 6, 9, 13, 14, 18, 21, 24, 27, 30, 33, 37, 38, 42, 45, 48, 51, 54,
                                 57, 60, 63, 66, 69, 72, 75, 78, 81, 93, 96, 99, 109, 112, 115, 124],
                                [9, 11, 15, 16, 20, 21, 24, 34, 36, 40, 41, 45, 46, 49, 56, 63, 73, 86, 96, 100, 113]]
    ),
    dict(
        first=7,
        last=7,
        expected_tokens=[['this', 'is', 'a', 'funky', 'string']],
        expected_offsets_start=[[0, 5, 8, 10, 16]],
        expected_offsets_limit=[[4, 7, 9, 15, 22]],
        lower_case=True
    ),
]


def check_basic_tokenizer_default(first, last, expected_tokens, expected_offsets_start, expected_offsets_limit,
                                  lower_case=False, keep_whitespace=False,
                                  normalization_form=text.utils.NormalizeForm.NONE, preserve_unused_token=False):
    """Tokenize lines [first, last] of the test file and compare the output tokens with expected_tokens."""
    dataset = ds.TextFileDataset(BASIC_TOKENIZER_FILE, shuffle=False)
    if first > 1:
        dataset = dataset.skip(first - 1)
    if last >= first:
        dataset = dataset.take(last - first + 1)

    basic_tokenizer = text.BasicTokenizer(lower_case=lower_case,
                                          keep_whitespace=keep_whitespace,
                                          normalization_form=normalization_form,
                                          preserve_unused_token=preserve_unused_token)

    dataset = dataset.map(operations=basic_tokenizer)
    count = 0
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        token = text.to_str(i['text'])
        logger.info("Out: %s", token)
        logger.info("Exp: %s", expected_tokens[count])
        np.testing.assert_array_equal(token, expected_tokens[count])
        count = count + 1


def check_basic_tokenizer_with_offsets(first, last, expected_tokens, expected_offsets_start, expected_offsets_limit,
                                       lower_case=False, keep_whitespace=False,
                                       normalization_form=text.utils.NormalizeForm.NONE, preserve_unused_token=False):
    """Tokenize lines [first, last] of the test file and compare tokens plus start/limit byte offsets."""
    dataset = ds.TextFileDataset(BASIC_TOKENIZER_FILE, shuffle=False)
    if first > 1:
        dataset = dataset.skip(first - 1)
    if last >= first:
        dataset = dataset.take(last - first + 1)

    basic_tokenizer = text.BasicTokenizer(lower_case=lower_case,
                                          keep_whitespace=keep_whitespace,
                                          normalization_form=normalization_form,
                                          preserve_unused_token=preserve_unused_token,
                                          with_offsets=True)

    dataset = dataset.map(operations=basic_tokenizer, input_columns=['text'],
                          output_columns=['token', 'offsets_start', 'offsets_limit'],
                          column_order=['token', 'offsets_start', 'offsets_limit'])
    count = 0
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        token = text.to_str(i['token'])
        logger.info("Out: %s", token)
        logger.info("Exp: %s", expected_tokens[count])
        np.testing.assert_array_equal(token, expected_tokens[count])
        np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
        np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
        count = count + 1


def test_basic_tokenizer_with_offsets():
    """
    Test BasicTokenizer with with_offsets=True: check tokens and byte offsets.
    """
    for paras in test_paras:
        check_basic_tokenizer_with_offsets(**paras)


def test_basic_tokenizer_default():
    """
    Test BasicTokenizer with default output: check tokens only.
    """
    for paras in test_paras:
        check_basic_tokenizer_default(**paras)


if __name__ == '__main__':
    test_basic_tokenizer_default()
    test_basic_tokenizer_with_offsets()