1# Copyright 2019 Huawei Technologies Co., Ltd 2# 3# Licensed under the Apache License, Version 2.0 (the "License"); 4# you may not use this file except in compliance with the License. 5# You may obtain a copy of the License at 6# 7# http://www.apache.org/licenses/LICENSE-2.0 8# 9# Unless required by applicable law or agreed to in writing, software 10# distributed under the License is distributed on an "AS IS" BASIS, 11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12# See the License for the specific language governing permissions and 13# limitations under the License. 14# ============================================================================== 15 16import numpy as np 17 18import mindspore.dataset as ds 19import mindspore.dataset.text as text 20 21# this file contains "home is behind the world head" each word is 1 line 22DATA_FILE = "../data/dataset/testVocab/words.txt" 23VOCAB_FILE = "../data/dataset/testVocab/vocab_list.txt" 24HMM_FILE = "../data/dataset/jiebadict/hmm_model.utf8" 25MP_FILE = "../data/dataset/jiebadict/jieba.dict.utf8" 26 27 28def test_on_tokenized_line(): 29 data = ds.TextFileDataset("../data/dataset/testVocab/lines.txt", shuffle=False) 30 jieba_op = text.JiebaTokenizer(HMM_FILE, MP_FILE, mode=text.JiebaMode.MP) 31 with open(VOCAB_FILE, 'r') as f: 32 for line in f: 33 word = line.split(',')[0] 34 jieba_op.add_word(word) 35 data = data.map(operations=jieba_op, input_columns=["text"]) 36 vocab = text.Vocab.from_file(VOCAB_FILE, ",", special_tokens=["<pad>", "<unk>"]) 37 lookup = text.Lookup(vocab, "<unk>") 38 data = data.map(operations=lookup, input_columns=["text"]) 39 res = np.array([[10, 1, 11, 1, 12, 1, 15, 1, 13, 1, 14], 40 [11, 1, 12, 1, 10, 1, 14, 1, 13, 1, 15]], dtype=np.int32) 41 for i, d in enumerate(data.create_dict_iterator(num_epochs=1, output_numpy=True)): 42 np.testing.assert_array_equal(d["text"], res[i]) 43 44 45def test_on_tokenized_line_with_no_special_tokens(): 46 data = ds.TextFileDataset("../data/dataset/testVocab/lines.txt", shuffle=False) 47 jieba_op = text.JiebaTokenizer(HMM_FILE, MP_FILE, mode=text.JiebaMode.MP) 48 with open(VOCAB_FILE, 'r') as f: 49 for line in f: 50 word = line.split(',')[0] 51 jieba_op.add_word(word) 52 53 data = data.map(operations=jieba_op, input_columns=["text"]) 54 vocab = text.Vocab.from_file(VOCAB_FILE, ",") 55 lookup = text.Lookup(vocab, "not") 56 data = data.map(operations=lookup, input_columns=["text"]) 57 res = np.array([[8, 0, 9, 0, 10, 0, 13, 0, 11, 0, 12], 58 [9, 0, 10, 0, 8, 0, 12, 0, 11, 0, 13]], dtype=np.int32) 59 for i, d in enumerate(data.create_dict_iterator(num_epochs=1, output_numpy=True)): 60 np.testing.assert_array_equal(d["text"], res[i]) 61 62 63if __name__ == '__main__': 64 test_on_tokenized_line() 65 test_on_tokenized_line_with_no_special_tokens() 66