1 /**
2 * Copyright 2021 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "minddata/dataset/engine/datasetops/source/iwslt_op.h"
18
19 #include <fstream>
20 #include <string>
21 #include <utility>
22 #include <vector>
23
24 #include "minddata/dataset/util/status.h"
25 #include "utils/file_utils.h"
26
27 namespace mindspore {
28 namespace dataset {
IWSLTOp(int32_t num_workers,int64_t num_samples,int32_t worker_connector_size,int32_t op_connector_size,bool shuffle_files,int32_t num_devices,int32_t device_id,std::unique_ptr<DataSchema> data_schema,IWSLTType type,const std::string & dataset_dir,const std::string & usage,const std::vector<std::string> & language_pair,const std::string & valid_set,const std::string & test_set)29 IWSLTOp::IWSLTOp(int32_t num_workers, int64_t num_samples, int32_t worker_connector_size, int32_t op_connector_size,
30 bool shuffle_files, int32_t num_devices, int32_t device_id, std::unique_ptr<DataSchema> data_schema,
31 IWSLTType type, const std::string &dataset_dir, const std::string &usage,
32 const std::vector<std::string> &language_pair, const std::string &valid_set,
33 const std::string &test_set)
34 : NonMappableLeafOp(num_workers, worker_connector_size, num_samples, op_connector_size, shuffle_files, num_devices,
35 device_id),
36 iwslt_type_(type),
37 data_schema_(std::move(data_schema)),
38 dataset_dir_(dataset_dir),
39 usage_(usage),
40 language_pair_(language_pair),
41 valid_set_(valid_set),
42 test_set_(test_set) {}
43
Init()44 Status IWSLTOp::Init() {
45 RETURN_IF_NOT_OK(this->GetFiles());
46 RETURN_IF_NOT_OK(filename_index_->insert(src_target_file_list_));
47
48 int32_t safe_queue_size = static_cast<int32_t>(std::ceil(src_target_file_list_.size() / num_workers_) + 1);
49 io_block_queues_.Init(num_workers_, safe_queue_size);
50
51 jagged_rows_connector_ = std::make_unique<JaggedConnector>(num_workers_, 1, worker_connector_size_);
52 return Status::OK();
53 }
54
Split(const std::string & s,const std::string & delim)55 std::vector<std::string> IWSLTOp::Split(const std::string &s, const std::string &delim) {
56 std::vector<std::string> res;
57 std::string::size_type pos1 = 0;
58 std::string::size_type pos2 = s.find(delim);
59 while (std::string::npos != pos2) {
60 res.push_back(s.substr(pos1, pos2 - pos1));
61
62 pos1 = pos2 + delim.size();
63 pos2 = s.find(delim, pos1);
64 }
65 if (pos1 != s.length()) {
66 res.push_back(s.substr(pos1));
67 }
68 return res;
69 }
70
Trim(std::string * text,const std::string & character)71 Status IWSLTOp::Trim(std::string *text, const std::string &character) {
72 RETURN_UNEXPECTED_IF_NULL(text);
73 CHECK_FAIL_RETURN_UNEXPECTED(!text->empty(), "Invalid file, read an empty line.");
74 (void)text->erase(0, text->find_first_not_of(character));
75 (void)text->erase(text->find_last_not_of(character) + 1);
76 return Status::OK();
77 }
78
LoadTensor(const std::string & line,TensorRow * out_row,size_t index)79 Status IWSLTOp::LoadTensor(const std::string &line, TensorRow *out_row, size_t index) {
80 RETURN_UNEXPECTED_IF_NULL(out_row);
81 std::shared_ptr<Tensor> tensor;
82 RETURN_IF_NOT_OK(Tensor::CreateScalar(line, &tensor));
83 (*out_row)[index] = std::move(tensor);
84 return Status::OK();
85 }
86
LoadFile(const std::string & file,int64_t start_offset,int64_t end_offset,int32_t worker_id)87 Status IWSLTOp::LoadFile(const std::string &file, int64_t start_offset, int64_t end_offset, int32_t worker_id) {
88 std::ifstream handle(file, std::ifstream::in);
89 std::string line;
90 if (!handle.is_open()) {
91 RETURN_STATUS_UNEXPECTED("Invalid file, failed to open " + DatasetName() + " file: " + file);
92 }
93
94 int64_t rows_total = 0;
95 while (getline(handle, line)) {
96 if (line.empty()) {
97 continue;
98 }
99 // If read to the end offset of this file, break.
100 if (rows_total >= end_offset) {
101 break;
102 }
103 // Skip line before start offset.
104 if (rows_total < start_offset) {
105 rows_total++;
106 continue;
107 }
108
109 const int kColumnSize = 2;
110 TensorRow tRow(kColumnSize, nullptr);
111 tRow.setPath({file, file});
112
113 // Remove the newline character.
114 auto s = Trim(&line, "\n");
115 if (s != Status::OK()) {
116 handle.close();
117 return s;
118 }
119 s = Trim(&line, "\r");
120 if (s != Status::OK()) {
121 handle.close();
122 return s;
123 }
124 std::vector<std::string> sentence_list = Split(line, "#*$");
125 if (!sentence_list.empty() && sentence_list.size() == kColumnSize) {
126 s = LoadTensor(sentence_list[0], &tRow, 0);
127 if (s != Status::OK()) {
128 handle.close();
129 return s;
130 }
131 s = LoadTensor(sentence_list[1], &tRow, 1);
132 if (s != Status::OK()) {
133 handle.close();
134 return s;
135 }
136 s = jagged_rows_connector_->Add(worker_id, std::move(tRow));
137 if (s != Status::OK()) {
138 handle.close();
139 return s;
140 }
141 rows_total++;
142 }
143 }
144 handle.close();
145 return Status::OK();
146 }
147
FillIOBlockQueue(const std::vector<int64_t> & i_keys)148 Status IWSLTOp::FillIOBlockQueue(const std::vector<int64_t> &i_keys) {
149 int32_t queue_index = 0;
150 int64_t pre_count = 0;
151 int64_t start_offset = 0;
152 int64_t end_offset = 0;
153 bool finish = false;
154 while (!finish) {
155 std::vector<std::pair<std::string, int64_t>> file_index;
156 if (!i_keys.empty()) {
157 for (auto it = i_keys.begin(); it != i_keys.end(); ++it) {
158 {
159 if (!GetLoadIoBlockQueue()) {
160 break;
161 }
162 }
163 file_index.emplace_back(std::pair<std::string, int64_t>((*filename_index_)[*it], *it));
164 }
165 } else {
166 for (auto it = filename_index_->begin(); it != filename_index_->end(); ++it) {
167 {
168 if (!GetLoadIoBlockQueue()) {
169 break;
170 }
171 }
172 file_index.emplace_back(std::pair<std::string, int64_t>(it.value(), it.key()));
173 }
174 }
175 for (auto file_info : file_index) {
176 if (NeedPushFileToBlockQueue(file_info.first, &start_offset, &end_offset, pre_count)) {
177 auto ioBlock = std::make_unique<FilenameBlock>(file_info.second, start_offset, end_offset, IOBlock::kFlagNone);
178 RETURN_IF_NOT_OK(PushIoBlockQueue(queue_index, std::move(ioBlock)));
179 queue_index = (queue_index + 1) % num_workers_;
180 }
181
182 pre_count += filename_numrows_[file_info.first];
183 }
184
185 if (pre_count < (static_cast<int64_t>(device_id_) + 1) * num_rows_per_shard_) {
186 finish = false;
187 } else {
188 finish = true;
189 }
190 }
191
192 RETURN_IF_NOT_OK(PostEndOfEpoch(queue_index));
193 return Status::OK();
194 }
195
Print(std::ostream & out,bool show_all) const196 void IWSLTOp::Print(std::ostream &out, bool show_all) const {
197 if (!show_all) {
198 // Call the super class for displaying any common 1-liner info.
199 ParallelOp::Print(out, show_all);
200 // Then show any custom derived-internal 1-liner info for this op.
201 out << "\n";
202 } else {
203 // Call the super class for displaying any common detailed info.
204 ParallelOp::Print(out, show_all);
205 // Then show any custom derived-internal stuff.
206 out << "\nSample count: " << total_rows_ << "\nDevice id: " << device_id_ << "\nNumber of devices: " << num_devices_
207 << "\nShuffle files: " << ((shuffle_files_) ? "yes" : "no") << "\nIWSLT files list:\n";
208 for (int i = 0; i < src_target_file_list_.size(); ++i) {
209 out << " " << src_target_file_list_[i];
210 }
211 out << "\nData Schema:\n";
212 out << *data_schema_ << "\n\n";
213 }
214 }
215
CountFileRows(const std::string & file)216 int64_t IWSLTOp::CountFileRows(const std::string &file) {
217 std::ifstream handle(file, std::ifstream::in);
218 if (!handle.is_open()) {
219 MS_LOG(ERROR) << "Invalid file, failed to open file: " << file;
220 return 0;
221 }
222
223 std::string line;
224 int64_t count = 0;
225 while (getline(handle, line)) {
226 if (!line.empty()) {
227 count++;
228 }
229 }
230 handle.close();
231 return count;
232 }
233
CalculateNumRowsPerShard()234 Status IWSLTOp::CalculateNumRowsPerShard() {
235 for (auto it = filename_index_->begin(); it != filename_index_->end(); ++it) {
236 int64_t count = CountFileRows(it.value());
237 filename_numrows_[it.value()] = count;
238 num_rows_ += count;
239 }
240 if (num_rows_ == 0) {
241 std::stringstream ss;
242 for (int i = 0; i < src_target_file_list_.size(); ++i) {
243 ss << " " << src_target_file_list_[i];
244 }
245 std::string file_list = ss.str();
246 RETURN_STATUS_UNEXPECTED("Invalid data, " + DatasetName(true) +
247 "Dataset API can't read the data file (interface mismatch or no data found). Check " +
248 DatasetName() + ": " + file_list);
249 }
250
251 num_rows_per_shard_ = static_cast<int64_t>(std::ceil(num_rows_ * 1.0 / num_devices_));
252 MS_LOG(DEBUG) << "Number rows per shard is " << num_rows_per_shard_;
253 return Status::OK();
254 }
255
ComputeColMap()256 Status IWSLTOp::ComputeColMap() {
257 // Set the column name mapping (base class field).
258 if (column_name_id_map_.empty()) {
259 for (int32_t i = 0; i < data_schema_->NumColumns(); ++i) {
260 column_name_id_map_[data_schema_->Column(i).Name()] = i;
261 }
262 } else {
263 MS_LOG(WARNING) << "Column name map is already set!";
264 }
265 return Status::OK();
266 }
267
CountTotalRows(IWSLTType type,const std::string & dataset_dir,const std::string & usage,const std::vector<std::string> & language_pair,const std::string & valid_set,const std::string & test_set,int64_t * count)268 Status IWSLTOp::CountTotalRows(IWSLTType type, const std::string &dataset_dir, const std::string &usage,
269 const std::vector<std::string> &language_pair, const std::string &valid_set,
270 const std::string &test_set, int64_t *count) {
271 RETURN_UNEXPECTED_IF_NULL(count);
272 int32_t num_workers = GlobalContext::config_manager()->num_parallel_workers();
273 int32_t connector_que_size = GlobalContext::config_manager()->op_connector_size();
274 int32_t worker_connector_size = GlobalContext::config_manager()->worker_connector_size();
275 const int32_t shard_id = 0;
276 const int32_t num_shards = 1;
277 const int64_t num_samples = 0;
278 bool shuffle_files = false;
279 // Do internal Schema generation.
280 auto schema = std::make_unique<DataSchema>();
281
282 // Create and initialize.
283 std::shared_ptr<IWSLTOp> op = std::make_shared<IWSLTOp>(
284 num_workers, num_samples, worker_connector_size, connector_que_size, shuffle_files, num_shards, shard_id,
285 std::move(schema), type, dataset_dir, usage, language_pair, valid_set, test_set);
286 RETURN_IF_NOT_OK(op->Init());
287
288 *count = 0;
289 std::vector<std::string> file_list = op->FileNames();
290 for (auto file : file_list) {
291 *count += op->CountFileRows(file);
292 }
293 return Status::OK();
294 }
295
LoadXmlDocument(XMLDocument * xml_document,const std::string & file_path,XMLElement ** doc)296 Status LoadXmlDocument(XMLDocument *xml_document, const std::string &file_path, XMLElement **doc) {
297 RETURN_UNEXPECTED_IF_NULL(xml_document);
298 XMLError e = xml_document->LoadFile(common::SafeCStr(file_path));
299 if (e != XMLError::XML_SUCCESS) {
300 RETURN_STATUS_UNEXPECTED("Invalid file, failed to load xml file: " + file_path);
301 }
302 XMLElement *root = xml_document->RootElement();
303 if (root == nullptr) {
304 RETURN_STATUS_UNEXPECTED("Invalid data, failed to load root element for xml file.");
305 }
306 XMLElement *firstChild = root->FirstChildElement();
307 if (firstChild == nullptr) {
308 RETURN_STATUS_UNEXPECTED("Invalid data, no first child found in " + file_path);
309 }
310 *doc = firstChild->FirstChildElement("doc");
311 if (*doc == nullptr) {
312 RETURN_STATUS_UNEXPECTED("Invalid data, no doc found in " + file_path);
313 }
314 return Status::OK();
315 }
316
CleanXmlFile(const std::string & src_file_path,const std::string & target_file_path,const std::string & new_file_path)317 Status IWSLTOp::CleanXmlFile(const std::string &src_file_path, const std::string &target_file_path,
318 const std::string &new_file_path) {
319 XMLDocument xml_document1, xml_document2;
320 XMLElement *src_doc = nullptr;
321 XMLElement *target_doc = nullptr;
322
323 RETURN_IF_NOT_OK(LoadXmlDocument(&xml_document1, src_file_path, &src_doc));
324 RETURN_IF_NOT_OK(LoadXmlDocument(&xml_document2, target_file_path, &target_doc));
325 std::string src_content, target_content;
326 std::ofstream new_file(new_file_path, std::ofstream::out);
327 CHECK_FAIL_RETURN_UNEXPECTED(new_file.is_open(), "Invalid file, failed to open file: " + new_file_path);
328
329 while (src_doc != nullptr && target_doc != nullptr) {
330 XMLElement *src_seg = src_doc->FirstChildElement("seg");
331 XMLElement *target_seg = target_doc->FirstChildElement("seg");
332 while (src_seg != nullptr && target_seg != nullptr) {
333 src_content = src_seg->GetText();
334 target_content = target_seg->GetText();
335 auto s = Trim(&src_content, " ");
336 if (s != Status::OK()) {
337 new_file.close();
338 return s;
339 }
340 s = Trim(&target_content, " ");
341 if (s != Status::OK()) {
342 new_file.close();
343 return s;
344 }
345 src_seg = src_seg->NextSiblingElement();
346 target_seg = target_seg->NextSiblingElement();
347 new_file << (src_content + "#*$" + target_content + "\n");
348 }
349 src_doc = src_doc->NextSiblingElement();
350 target_doc = target_doc->NextSiblingElement();
351 }
352
353 new_file.close();
354
355 ChangeFileMode(new_file_path, S_IRUSR | S_IWUSR);
356
357 return Status::OK();
358 }
359
IsContainTags(const std::string & content)360 bool IWSLTOp::IsContainTags(const std::string &content) {
361 std::vector<std::string> xml_tags = {"<url", "<keywords", "<talkid", "<description", "<reviewer",
362 "<translator", "<title", "<speaker", "<doc", "</doc"};
363 int i = 0;
364 int size = xml_tags.size();
365 while (i < size) {
366 if (content.find(xml_tags[i]) != std::string::npos) {
367 return true;
368 }
369 i++;
370 }
371 return false;
372 }
373
CleanTagFile(const std::string & src_file_path,const std::string & target_file_path,const std::string & new_file_path)374 Status IWSLTOp::CleanTagFile(const std::string &src_file_path, const std::string &target_file_path,
375 const std::string &new_file_path) {
376 std::ifstream src_handle(src_file_path, std::ifstream::in);
377 std::ifstream target_handle(target_file_path, std::ifstream::in);
378
379 std::ofstream new_file(new_file_path, std::ios::out | std::ios::trunc);
380 std::string src_content, target_content;
381 while (getline(src_handle, src_content)) {
382 while (getline(target_handle, target_content)) {
383 if (!IsContainTags(src_content) && !IsContainTags(target_content)) {
384 auto s = Trim(&src_content, " ");
385 if (s != Status::OK()) {
386 src_handle.close();
387 target_handle.close();
388 new_file.close();
389 return s;
390 }
391 s = Trim(&target_content, " ");
392 if (s != Status::OK()) {
393 src_handle.close();
394 target_handle.close();
395 new_file.close();
396 return s;
397 }
398 new_file << (src_content + "#*$" + target_content + "\n");
399 }
400 break;
401 }
402 }
403 new_file.close();
404
405 ChangeFileMode(new_file_path, S_IRUSR | S_IWUSR);
406
407 src_handle.close();
408 target_handle.close();
409 return Status::OK();
410 }
411
GenerateNewFile(const std::vector<std::string> & src_file_list,const std::vector<std::string> & target_file_list,std::vector<std::string> * src_target_file_list)412 Status IWSLTOp::GenerateNewFile(const std::vector<std::string> &src_file_list,
413 const std::vector<std::string> &target_file_list,
414 std::vector<std::string> *src_target_file_list) {
415 RETURN_UNEXPECTED_IF_NULL(src_target_file_list);
416 std::string::size_type position;
417 std::string new_path;
418 std::string src_path, target_path;
419 for (int i = 0; i < src_file_list.size(); i++) {
420 src_path = src_file_list[i];
421 target_path = target_file_list[i];
422
423 // Add new train file name.
424 position = src_path.find(".tags");
425 if (position != std::string::npos) {
426 new_path = src_path;
427 const int kTagSize = 5;
428 const int kSuffixSize = 3;
429 new_path = new_path.replace(new_path.find(".tags"), kTagSize, "");
430 new_path = new_path.substr(0, new_path.length() - kSuffixSize);
431
432 // Write data to the new file path.
433 RETURN_IF_NOT_OK(CleanTagFile(src_path, target_path, new_path));
434 src_target_file_list->push_back(new_path);
435 } else {
436 // Add new valid or test file name.
437 // Delete suffix.
438 const int kSuffixXMLSize = 7;
439 new_path = src_path;
440 new_path = new_path.substr(0, new_path.length() - kSuffixXMLSize);
441 // Write data to the new file path.
442 RETURN_IF_NOT_OK(CleanXmlFile(src_path, target_path, new_path));
443 src_target_file_list->push_back(new_path);
444 }
445 }
446 return Status::OK();
447 }
448
GenerateIWSLT2016TagsFileName(Path dir,const std::string & src_language,const std::string & target_language,const std::string & suffix)449 std::string IWSLTOp::GenerateIWSLT2016TagsFileName(Path dir, const std::string &src_language,
450 const std::string &target_language, const std::string &suffix) {
451 Path src_language_path(src_language);
452 Path target_language_path(target_language);
453 Path sub_dir(src_language + "-" + target_language);
454 Path file_name("train.tags." + src_language + "-" + target_language + "." + suffix);
455 Path file_path = dir / "texts" / src_language_path / target_language_path / sub_dir / file_name;
456 return file_path.ToString();
457 }
458
GenerateIWSLT2016XMLFileName(Path dir,const std::string & src_language,const std::string & target_language,const std::string & set_type,const std::string & suffix)459 std::string IWSLTOp::GenerateIWSLT2016XMLFileName(Path dir, const std::string &src_language,
460 const std::string &target_language, const std::string &set_type,
461 const std::string &suffix) {
462 Path src_language_path(src_language);
463 Path target_language_path(target_language);
464 Path sub_dir(src_language + "-" + target_language);
465 Path file_name("IWSLT16.TED." + set_type + "." + src_language + "-" + target_language + "." + suffix + ".xml");
466 Path file_path = dir / "texts" / src_language_path / target_language_path / sub_dir / file_name;
467 return file_path.ToString();
468 }
469
GenerateIWSLT2017TagsFileName(Path dir,const std::string & src_language,const std::string & target_language,const std::string & suffix)470 std::string IWSLTOp::GenerateIWSLT2017TagsFileName(Path dir, const std::string &src_language,
471 const std::string &target_language, const std::string &suffix) {
472 Path sub_const_dir("texts");
473 Path sub_src_language_dir("DeEnItNlRo");
474 Path sub_tgt_language_dir("DeEnItNlRo");
475 Path sub_src_tgt_dir("DeEnItNlRo-DeEnItNlRo");
476 Path file_name("train.tags." + src_language + "-" + target_language + "." + suffix);
477 Path file_path = dir / sub_const_dir / sub_src_language_dir / sub_tgt_language_dir / sub_src_tgt_dir / file_name;
478 return file_path.ToString();
479 }
480
GenerateIWSLT2017XMLFileName(Path dir,const std::string & src_language,const std::string & target_language,const std::string & set_type,const std::string & suffix)481 std::string IWSLTOp::GenerateIWSLT2017XMLFileName(Path dir, const std::string &src_language,
482 const std::string &target_language, const std::string &set_type,
483 const std::string &suffix) {
484 Path sub_const_dir("texts");
485 Path sub_src_language_dir("DeEnItNlRo");
486 Path sub_tgt_language_dir("DeEnItNlRo");
487 Path sub_src_tgt_dir("DeEnItNlRo-DeEnItNlRo");
488 Path file_name("IWSLT17.TED." + set_type + "." + src_language + "-" + target_language + "." + suffix + ".xml");
489 Path file_path = dir / sub_const_dir / sub_src_language_dir / sub_tgt_language_dir / sub_src_tgt_dir / file_name;
490 return file_path.ToString();
491 }
492
GetFiles()493 Status IWSLTOp::GetFiles() {
494 std::vector<std::string> src_path_list;
495 std::vector<std::string> target_path_list;
496 auto real_dataset_dir = FileUtils::GetRealPath(dataset_dir_.c_str());
497 CHECK_FAIL_RETURN_UNEXPECTED(real_dataset_dir.has_value(), "Get real path failed: " + dataset_dir_);
498 Path root_dir(real_dataset_dir.value());
499
500 if (iwslt_type_ == kIWSLT2016) {
501 if (usage_ == "train" || usage_ == "all") {
502 src_path_list.push_back(
503 GenerateIWSLT2016TagsFileName(root_dir, language_pair_[0], language_pair_[1], language_pair_[0]));
504 target_path_list.push_back(
505 GenerateIWSLT2016TagsFileName(root_dir, language_pair_[0], language_pair_[1], language_pair_[1]));
506 }
507 if (usage_ == "valid" || usage_ == "all") {
508 src_path_list.push_back(
509 GenerateIWSLT2016XMLFileName(root_dir, language_pair_[0], language_pair_[1], valid_set_, language_pair_[0]));
510 target_path_list.push_back(
511 GenerateIWSLT2016XMLFileName(root_dir, language_pair_[0], language_pair_[1], valid_set_, language_pair_[1]));
512 }
513 if (usage_ == "test" || usage_ == "all") {
514 src_path_list.push_back(
515 GenerateIWSLT2016XMLFileName(root_dir, language_pair_[0], language_pair_[1], test_set_, language_pair_[0]));
516 target_path_list.push_back(
517 GenerateIWSLT2016XMLFileName(root_dir, language_pair_[0], language_pair_[1], test_set_, language_pair_[1]));
518 }
519 } else {
520 if (usage_ == "train" || usage_ == "all") {
521 src_path_list.push_back(
522 GenerateIWSLT2017TagsFileName(root_dir, language_pair_[0], language_pair_[1], language_pair_[0]));
523 target_path_list.push_back(
524 GenerateIWSLT2017TagsFileName(root_dir, language_pair_[0], language_pair_[1], language_pair_[1]));
525 }
526 if (usage_ == "valid" || usage_ == "all") {
527 src_path_list.push_back(
528 GenerateIWSLT2017XMLFileName(root_dir, language_pair_[0], language_pair_[1], valid_set_, language_pair_[0]));
529 target_path_list.push_back(
530 GenerateIWSLT2017XMLFileName(root_dir, language_pair_[0], language_pair_[1], valid_set_, language_pair_[1]));
531 }
532 if (usage_ == "test" || usage_ == "all") {
533 src_path_list.push_back(
534 GenerateIWSLT2017XMLFileName(root_dir, language_pair_[0], language_pair_[1], test_set_, language_pair_[0]));
535 target_path_list.push_back(
536 GenerateIWSLT2017XMLFileName(root_dir, language_pair_[0], language_pair_[1], test_set_, language_pair_[1]));
537 }
538 }
539 RETURN_IF_NOT_OK(GenerateNewFile(src_path_list, target_path_list, &src_target_file_list_));
540 return Status::OK();
541 }
542 } // namespace dataset
543 } // namespace mindspore
544