• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2021 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "minddata/dataset/engine/datasetops/source/iwslt_op.h"
18 
19 #include <fstream>
20 #include <string>
21 #include <utility>
22 #include <vector>
23 
24 #include "minddata/dataset/util/status.h"
25 #include "utils/file_utils.h"
26 
27 namespace mindspore {
28 namespace dataset {
IWSLTOp(int32_t num_workers,int64_t num_samples,int32_t worker_connector_size,int32_t op_connector_size,bool shuffle_files,int32_t num_devices,int32_t device_id,std::unique_ptr<DataSchema> data_schema,IWSLTType type,const std::string & dataset_dir,const std::string & usage,const std::vector<std::string> & language_pair,const std::string & valid_set,const std::string & test_set)29 IWSLTOp::IWSLTOp(int32_t num_workers, int64_t num_samples, int32_t worker_connector_size, int32_t op_connector_size,
30                  bool shuffle_files, int32_t num_devices, int32_t device_id, std::unique_ptr<DataSchema> data_schema,
31                  IWSLTType type, const std::string &dataset_dir, const std::string &usage,
32                  const std::vector<std::string> &language_pair, const std::string &valid_set,
33                  const std::string &test_set)
34     : NonMappableLeafOp(num_workers, worker_connector_size, num_samples, op_connector_size, shuffle_files, num_devices,
35                         device_id),
36       iwslt_type_(type),
37       data_schema_(std::move(data_schema)),
38       dataset_dir_(dataset_dir),
39       usage_(usage),
40       language_pair_(language_pair),
41       valid_set_(valid_set),
42       test_set_(test_set) {}
43 
Init()44 Status IWSLTOp::Init() {
45   RETURN_IF_NOT_OK(this->GetFiles());
46   RETURN_IF_NOT_OK(filename_index_->insert(src_target_file_list_));
47 
48   int32_t safe_queue_size = static_cast<int32_t>(std::ceil(src_target_file_list_.size() / num_workers_) + 1);
49   io_block_queues_.Init(num_workers_, safe_queue_size);
50 
51   jagged_rows_connector_ = std::make_unique<JaggedConnector>(num_workers_, 1, worker_connector_size_);
52   return Status::OK();
53 }
54 
Split(const std::string & s,const std::string & delim)55 std::vector<std::string> IWSLTOp::Split(const std::string &s, const std::string &delim) {
56   std::vector<std::string> res;
57   std::string::size_type pos1 = 0;
58   std::string::size_type pos2 = s.find(delim);
59   while (std::string::npos != pos2) {
60     res.push_back(s.substr(pos1, pos2 - pos1));
61 
62     pos1 = pos2 + delim.size();
63     pos2 = s.find(delim, pos1);
64   }
65   if (pos1 != s.length()) {
66     res.push_back(s.substr(pos1));
67   }
68   return res;
69 }
70 
Trim(std::string * text,const std::string & character)71 Status IWSLTOp::Trim(std::string *text, const std::string &character) {
72   RETURN_UNEXPECTED_IF_NULL(text);
73   CHECK_FAIL_RETURN_UNEXPECTED(!text->empty(), "Invalid file, read an empty line.");
74   (void)text->erase(0, text->find_first_not_of(character));
75   (void)text->erase(text->find_last_not_of(character) + 1);
76   return Status::OK();
77 }
78 
LoadTensor(const std::string & line,TensorRow * out_row,size_t index)79 Status IWSLTOp::LoadTensor(const std::string &line, TensorRow *out_row, size_t index) {
80   RETURN_UNEXPECTED_IF_NULL(out_row);
81   std::shared_ptr<Tensor> tensor;
82   RETURN_IF_NOT_OK(Tensor::CreateScalar(line, &tensor));
83   (*out_row)[index] = std::move(tensor);
84   return Status::OK();
85 }
86 
LoadFile(const std::string & file,int64_t start_offset,int64_t end_offset,int32_t worker_id)87 Status IWSLTOp::LoadFile(const std::string &file, int64_t start_offset, int64_t end_offset, int32_t worker_id) {
88   std::ifstream handle(file, std::ifstream::in);
89   std::string line;
90   if (!handle.is_open()) {
91     RETURN_STATUS_UNEXPECTED("Invalid file, failed to open " + DatasetName() + " file: " + file);
92   }
93 
94   int64_t rows_total = 0;
95   while (getline(handle, line)) {
96     if (line.empty()) {
97       continue;
98     }
99     // If read to the end offset of this file, break.
100     if (rows_total >= end_offset) {
101       break;
102     }
103     // Skip line before start offset.
104     if (rows_total < start_offset) {
105       rows_total++;
106       continue;
107     }
108 
109     const int kColumnSize = 2;
110     TensorRow tRow(kColumnSize, nullptr);
111     tRow.setPath({file, file});
112 
113     // Remove the newline character.
114     auto s = Trim(&line, "\n");
115     if (s != Status::OK()) {
116       handle.close();
117       return s;
118     }
119     s = Trim(&line, "\r");
120     if (s != Status::OK()) {
121       handle.close();
122       return s;
123     }
124     std::vector<std::string> sentence_list = Split(line, "#*$");
125     if (!sentence_list.empty() && sentence_list.size() == kColumnSize) {
126       s = LoadTensor(sentence_list[0], &tRow, 0);
127       if (s != Status::OK()) {
128         handle.close();
129         return s;
130       }
131       s = LoadTensor(sentence_list[1], &tRow, 1);
132       if (s != Status::OK()) {
133         handle.close();
134         return s;
135       }
136       s = jagged_rows_connector_->Add(worker_id, std::move(tRow));
137       if (s != Status::OK()) {
138         handle.close();
139         return s;
140       }
141       rows_total++;
142     }
143   }
144   handle.close();
145   return Status::OK();
146 }
147 
FillIOBlockQueue(const std::vector<int64_t> & i_keys)148 Status IWSLTOp::FillIOBlockQueue(const std::vector<int64_t> &i_keys) {
149   int32_t queue_index = 0;
150   int64_t pre_count = 0;
151   int64_t start_offset = 0;
152   int64_t end_offset = 0;
153   bool finish = false;
154   while (!finish) {
155     std::vector<std::pair<std::string, int64_t>> file_index;
156     if (!i_keys.empty()) {
157       for (auto it = i_keys.begin(); it != i_keys.end(); ++it) {
158         {
159           if (!GetLoadIoBlockQueue()) {
160             break;
161           }
162         }
163         file_index.emplace_back(std::pair<std::string, int64_t>((*filename_index_)[*it], *it));
164       }
165     } else {
166       for (auto it = filename_index_->begin(); it != filename_index_->end(); ++it) {
167         {
168           if (!GetLoadIoBlockQueue()) {
169             break;
170           }
171         }
172         file_index.emplace_back(std::pair<std::string, int64_t>(it.value(), it.key()));
173       }
174     }
175     for (auto file_info : file_index) {
176       if (NeedPushFileToBlockQueue(file_info.first, &start_offset, &end_offset, pre_count)) {
177         auto ioBlock = std::make_unique<FilenameBlock>(file_info.second, start_offset, end_offset, IOBlock::kFlagNone);
178         RETURN_IF_NOT_OK(PushIoBlockQueue(queue_index, std::move(ioBlock)));
179         queue_index = (queue_index + 1) % num_workers_;
180       }
181 
182       pre_count += filename_numrows_[file_info.first];
183     }
184 
185     if (pre_count < (static_cast<int64_t>(device_id_) + 1) * num_rows_per_shard_) {
186       finish = false;
187     } else {
188       finish = true;
189     }
190   }
191 
192   RETURN_IF_NOT_OK(PostEndOfEpoch(queue_index));
193   return Status::OK();
194 }
195 
Print(std::ostream & out,bool show_all) const196 void IWSLTOp::Print(std::ostream &out, bool show_all) const {
197   if (!show_all) {
198     // Call the super class for displaying any common 1-liner info.
199     ParallelOp::Print(out, show_all);
200     // Then show any custom derived-internal 1-liner info for this op.
201     out << "\n";
202   } else {
203     // Call the super class for displaying any common detailed info.
204     ParallelOp::Print(out, show_all);
205     // Then show any custom derived-internal stuff.
206     out << "\nSample count: " << total_rows_ << "\nDevice id: " << device_id_ << "\nNumber of devices: " << num_devices_
207         << "\nShuffle files: " << ((shuffle_files_) ? "yes" : "no") << "\nIWSLT files list:\n";
208     for (int i = 0; i < src_target_file_list_.size(); ++i) {
209       out << " " << src_target_file_list_[i];
210     }
211     out << "\nData Schema:\n";
212     out << *data_schema_ << "\n\n";
213   }
214 }
215 
CountFileRows(const std::string & file)216 int64_t IWSLTOp::CountFileRows(const std::string &file) {
217   std::ifstream handle(file, std::ifstream::in);
218   if (!handle.is_open()) {
219     MS_LOG(ERROR) << "Invalid file, failed to open file: " << file;
220     return 0;
221   }
222 
223   std::string line;
224   int64_t count = 0;
225   while (getline(handle, line)) {
226     if (!line.empty()) {
227       count++;
228     }
229   }
230   handle.close();
231   return count;
232 }
233 
CalculateNumRowsPerShard()234 Status IWSLTOp::CalculateNumRowsPerShard() {
235   for (auto it = filename_index_->begin(); it != filename_index_->end(); ++it) {
236     int64_t count = CountFileRows(it.value());
237     filename_numrows_[it.value()] = count;
238     num_rows_ += count;
239   }
240   if (num_rows_ == 0) {
241     std::stringstream ss;
242     for (int i = 0; i < src_target_file_list_.size(); ++i) {
243       ss << " " << src_target_file_list_[i];
244     }
245     std::string file_list = ss.str();
246     RETURN_STATUS_UNEXPECTED("Invalid data, " + DatasetName(true) +
247                              "Dataset API can't read the data file (interface mismatch or no data found). Check " +
248                              DatasetName() + ": " + file_list);
249   }
250 
251   num_rows_per_shard_ = static_cast<int64_t>(std::ceil(num_rows_ * 1.0 / num_devices_));
252   MS_LOG(DEBUG) << "Number rows per shard is " << num_rows_per_shard_;
253   return Status::OK();
254 }
255 
ComputeColMap()256 Status IWSLTOp::ComputeColMap() {
257   // Set the column name mapping (base class field).
258   if (column_name_id_map_.empty()) {
259     for (int32_t i = 0; i < data_schema_->NumColumns(); ++i) {
260       column_name_id_map_[data_schema_->Column(i).Name()] = i;
261     }
262   } else {
263     MS_LOG(WARNING) << "Column name map is already set!";
264   }
265   return Status::OK();
266 }
267 
CountTotalRows(IWSLTType type,const std::string & dataset_dir,const std::string & usage,const std::vector<std::string> & language_pair,const std::string & valid_set,const std::string & test_set,int64_t * count)268 Status IWSLTOp::CountTotalRows(IWSLTType type, const std::string &dataset_dir, const std::string &usage,
269                                const std::vector<std::string> &language_pair, const std::string &valid_set,
270                                const std::string &test_set, int64_t *count) {
271   RETURN_UNEXPECTED_IF_NULL(count);
272   int32_t num_workers = GlobalContext::config_manager()->num_parallel_workers();
273   int32_t connector_que_size = GlobalContext::config_manager()->op_connector_size();
274   int32_t worker_connector_size = GlobalContext::config_manager()->worker_connector_size();
275   const int32_t shard_id = 0;
276   const int32_t num_shards = 1;
277   const int64_t num_samples = 0;
278   bool shuffle_files = false;
279   // Do internal Schema generation.
280   auto schema = std::make_unique<DataSchema>();
281 
282   // Create and initialize.
283   std::shared_ptr<IWSLTOp> op = std::make_shared<IWSLTOp>(
284     num_workers, num_samples, worker_connector_size, connector_que_size, shuffle_files, num_shards, shard_id,
285     std::move(schema), type, dataset_dir, usage, language_pair, valid_set, test_set);
286   RETURN_IF_NOT_OK(op->Init());
287 
288   *count = 0;
289   std::vector<std::string> file_list = op->FileNames();
290   for (auto file : file_list) {
291     *count += op->CountFileRows(file);
292   }
293   return Status::OK();
294 }
295 
LoadXmlDocument(XMLDocument * xml_document,const std::string & file_path,XMLElement ** doc)296 Status LoadXmlDocument(XMLDocument *xml_document, const std::string &file_path, XMLElement **doc) {
297   RETURN_UNEXPECTED_IF_NULL(xml_document);
298   XMLError e = xml_document->LoadFile(common::SafeCStr(file_path));
299   if (e != XMLError::XML_SUCCESS) {
300     RETURN_STATUS_UNEXPECTED("Invalid file, failed to load xml file: " + file_path);
301   }
302   XMLElement *root = xml_document->RootElement();
303   if (root == nullptr) {
304     RETURN_STATUS_UNEXPECTED("Invalid data, failed to load root element for xml file.");
305   }
306   XMLElement *firstChild = root->FirstChildElement();
307   if (firstChild == nullptr) {
308     RETURN_STATUS_UNEXPECTED("Invalid data, no first child found in " + file_path);
309   }
310   *doc = firstChild->FirstChildElement("doc");
311   if (*doc == nullptr) {
312     RETURN_STATUS_UNEXPECTED("Invalid data, no doc found in " + file_path);
313   }
314   return Status::OK();
315 }
316 
CleanXmlFile(const std::string & src_file_path,const std::string & target_file_path,const std::string & new_file_path)317 Status IWSLTOp::CleanXmlFile(const std::string &src_file_path, const std::string &target_file_path,
318                              const std::string &new_file_path) {
319   XMLDocument xml_document1, xml_document2;
320   XMLElement *src_doc = nullptr;
321   XMLElement *target_doc = nullptr;
322 
323   RETURN_IF_NOT_OK(LoadXmlDocument(&xml_document1, src_file_path, &src_doc));
324   RETURN_IF_NOT_OK(LoadXmlDocument(&xml_document2, target_file_path, &target_doc));
325   std::string src_content, target_content;
326   std::ofstream new_file(new_file_path, std::ofstream::out);
327   CHECK_FAIL_RETURN_UNEXPECTED(new_file.is_open(), "Invalid file, failed to open file: " + new_file_path);
328 
329   while (src_doc != nullptr && target_doc != nullptr) {
330     XMLElement *src_seg = src_doc->FirstChildElement("seg");
331     XMLElement *target_seg = target_doc->FirstChildElement("seg");
332     while (src_seg != nullptr && target_seg != nullptr) {
333       src_content = src_seg->GetText();
334       target_content = target_seg->GetText();
335       auto s = Trim(&src_content, " ");
336       if (s != Status::OK()) {
337         new_file.close();
338         return s;
339       }
340       s = Trim(&target_content, " ");
341       if (s != Status::OK()) {
342         new_file.close();
343         return s;
344       }
345       src_seg = src_seg->NextSiblingElement();
346       target_seg = target_seg->NextSiblingElement();
347       new_file << (src_content + "#*$" + target_content + "\n");
348     }
349     src_doc = src_doc->NextSiblingElement();
350     target_doc = target_doc->NextSiblingElement();
351   }
352 
353   new_file.close();
354 
355   ChangeFileMode(new_file_path, S_IRUSR | S_IWUSR);
356 
357   return Status::OK();
358 }
359 
IsContainTags(const std::string & content)360 bool IWSLTOp::IsContainTags(const std::string &content) {
361   std::vector<std::string> xml_tags = {"<url",        "<keywords", "<talkid",  "<description", "<reviewer",
362                                        "<translator", "<title",    "<speaker", "<doc",         "</doc"};
363   int i = 0;
364   int size = xml_tags.size();
365   while (i < size) {
366     if (content.find(xml_tags[i]) != std::string::npos) {
367       return true;
368     }
369     i++;
370   }
371   return false;
372 }
373 
CleanTagFile(const std::string & src_file_path,const std::string & target_file_path,const std::string & new_file_path)374 Status IWSLTOp::CleanTagFile(const std::string &src_file_path, const std::string &target_file_path,
375                              const std::string &new_file_path) {
376   std::ifstream src_handle(src_file_path, std::ifstream::in);
377   std::ifstream target_handle(target_file_path, std::ifstream::in);
378 
379   std::ofstream new_file(new_file_path, std::ios::out | std::ios::trunc);
380   std::string src_content, target_content;
381   while (getline(src_handle, src_content)) {
382     while (getline(target_handle, target_content)) {
383       if (!IsContainTags(src_content) && !IsContainTags(target_content)) {
384         auto s = Trim(&src_content, " ");
385         if (s != Status::OK()) {
386           src_handle.close();
387           target_handle.close();
388           new_file.close();
389           return s;
390         }
391         s = Trim(&target_content, " ");
392         if (s != Status::OK()) {
393           src_handle.close();
394           target_handle.close();
395           new_file.close();
396           return s;
397         }
398         new_file << (src_content + "#*$" + target_content + "\n");
399       }
400       break;
401     }
402   }
403   new_file.close();
404 
405   ChangeFileMode(new_file_path, S_IRUSR | S_IWUSR);
406 
407   src_handle.close();
408   target_handle.close();
409   return Status::OK();
410 }
411 
GenerateNewFile(const std::vector<std::string> & src_file_list,const std::vector<std::string> & target_file_list,std::vector<std::string> * src_target_file_list)412 Status IWSLTOp::GenerateNewFile(const std::vector<std::string> &src_file_list,
413                                 const std::vector<std::string> &target_file_list,
414                                 std::vector<std::string> *src_target_file_list) {
415   RETURN_UNEXPECTED_IF_NULL(src_target_file_list);
416   std::string::size_type position;
417   std::string new_path;
418   std::string src_path, target_path;
419   for (int i = 0; i < src_file_list.size(); i++) {
420     src_path = src_file_list[i];
421     target_path = target_file_list[i];
422 
423     // Add new train file name.
424     position = src_path.find(".tags");
425     if (position != std::string::npos) {
426       new_path = src_path;
427       const int kTagSize = 5;
428       const int kSuffixSize = 3;
429       new_path = new_path.replace(new_path.find(".tags"), kTagSize, "");
430       new_path = new_path.substr(0, new_path.length() - kSuffixSize);
431 
432       // Write data to the new file path.
433       RETURN_IF_NOT_OK(CleanTagFile(src_path, target_path, new_path));
434       src_target_file_list->push_back(new_path);
435     } else {
436       // Add new valid or test file name.
437       // Delete suffix.
438       const int kSuffixXMLSize = 7;
439       new_path = src_path;
440       new_path = new_path.substr(0, new_path.length() - kSuffixXMLSize);
441       // Write data to the new file path.
442       RETURN_IF_NOT_OK(CleanXmlFile(src_path, target_path, new_path));
443       src_target_file_list->push_back(new_path);
444     }
445   }
446   return Status::OK();
447 }
448 
GenerateIWSLT2016TagsFileName(Path dir,const std::string & src_language,const std::string & target_language,const std::string & suffix)449 std::string IWSLTOp::GenerateIWSLT2016TagsFileName(Path dir, const std::string &src_language,
450                                                    const std::string &target_language, const std::string &suffix) {
451   Path src_language_path(src_language);
452   Path target_language_path(target_language);
453   Path sub_dir(src_language + "-" + target_language);
454   Path file_name("train.tags." + src_language + "-" + target_language + "." + suffix);
455   Path file_path = dir / "texts" / src_language_path / target_language_path / sub_dir / file_name;
456   return file_path.ToString();
457 }
458 
GenerateIWSLT2016XMLFileName(Path dir,const std::string & src_language,const std::string & target_language,const std::string & set_type,const std::string & suffix)459 std::string IWSLTOp::GenerateIWSLT2016XMLFileName(Path dir, const std::string &src_language,
460                                                   const std::string &target_language, const std::string &set_type,
461                                                   const std::string &suffix) {
462   Path src_language_path(src_language);
463   Path target_language_path(target_language);
464   Path sub_dir(src_language + "-" + target_language);
465   Path file_name("IWSLT16.TED." + set_type + "." + src_language + "-" + target_language + "." + suffix + ".xml");
466   Path file_path = dir / "texts" / src_language_path / target_language_path / sub_dir / file_name;
467   return file_path.ToString();
468 }
469 
GenerateIWSLT2017TagsFileName(Path dir,const std::string & src_language,const std::string & target_language,const std::string & suffix)470 std::string IWSLTOp::GenerateIWSLT2017TagsFileName(Path dir, const std::string &src_language,
471                                                    const std::string &target_language, const std::string &suffix) {
472   Path sub_const_dir("texts");
473   Path sub_src_language_dir("DeEnItNlRo");
474   Path sub_tgt_language_dir("DeEnItNlRo");
475   Path sub_src_tgt_dir("DeEnItNlRo-DeEnItNlRo");
476   Path file_name("train.tags." + src_language + "-" + target_language + "." + suffix);
477   Path file_path = dir / sub_const_dir / sub_src_language_dir / sub_tgt_language_dir / sub_src_tgt_dir / file_name;
478   return file_path.ToString();
479 }
480 
GenerateIWSLT2017XMLFileName(Path dir,const std::string & src_language,const std::string & target_language,const std::string & set_type,const std::string & suffix)481 std::string IWSLTOp::GenerateIWSLT2017XMLFileName(Path dir, const std::string &src_language,
482                                                   const std::string &target_language, const std::string &set_type,
483                                                   const std::string &suffix) {
484   Path sub_const_dir("texts");
485   Path sub_src_language_dir("DeEnItNlRo");
486   Path sub_tgt_language_dir("DeEnItNlRo");
487   Path sub_src_tgt_dir("DeEnItNlRo-DeEnItNlRo");
488   Path file_name("IWSLT17.TED." + set_type + "." + src_language + "-" + target_language + "." + suffix + ".xml");
489   Path file_path = dir / sub_const_dir / sub_src_language_dir / sub_tgt_language_dir / sub_src_tgt_dir / file_name;
490   return file_path.ToString();
491 }
492 
GetFiles()493 Status IWSLTOp::GetFiles() {
494   std::vector<std::string> src_path_list;
495   std::vector<std::string> target_path_list;
496   auto real_dataset_dir = FileUtils::GetRealPath(dataset_dir_.c_str());
497   CHECK_FAIL_RETURN_UNEXPECTED(real_dataset_dir.has_value(), "Get real path failed: " + dataset_dir_);
498   Path root_dir(real_dataset_dir.value());
499 
500   if (iwslt_type_ == kIWSLT2016) {
501     if (usage_ == "train" || usage_ == "all") {
502       src_path_list.push_back(
503         GenerateIWSLT2016TagsFileName(root_dir, language_pair_[0], language_pair_[1], language_pair_[0]));
504       target_path_list.push_back(
505         GenerateIWSLT2016TagsFileName(root_dir, language_pair_[0], language_pair_[1], language_pair_[1]));
506     }
507     if (usage_ == "valid" || usage_ == "all") {
508       src_path_list.push_back(
509         GenerateIWSLT2016XMLFileName(root_dir, language_pair_[0], language_pair_[1], valid_set_, language_pair_[0]));
510       target_path_list.push_back(
511         GenerateIWSLT2016XMLFileName(root_dir, language_pair_[0], language_pair_[1], valid_set_, language_pair_[1]));
512     }
513     if (usage_ == "test" || usage_ == "all") {
514       src_path_list.push_back(
515         GenerateIWSLT2016XMLFileName(root_dir, language_pair_[0], language_pair_[1], test_set_, language_pair_[0]));
516       target_path_list.push_back(
517         GenerateIWSLT2016XMLFileName(root_dir, language_pair_[0], language_pair_[1], test_set_, language_pair_[1]));
518     }
519   } else {
520     if (usage_ == "train" || usage_ == "all") {
521       src_path_list.push_back(
522         GenerateIWSLT2017TagsFileName(root_dir, language_pair_[0], language_pair_[1], language_pair_[0]));
523       target_path_list.push_back(
524         GenerateIWSLT2017TagsFileName(root_dir, language_pair_[0], language_pair_[1], language_pair_[1]));
525     }
526     if (usage_ == "valid" || usage_ == "all") {
527       src_path_list.push_back(
528         GenerateIWSLT2017XMLFileName(root_dir, language_pair_[0], language_pair_[1], valid_set_, language_pair_[0]));
529       target_path_list.push_back(
530         GenerateIWSLT2017XMLFileName(root_dir, language_pair_[0], language_pair_[1], valid_set_, language_pair_[1]));
531     }
532     if (usage_ == "test" || usage_ == "all") {
533       src_path_list.push_back(
534         GenerateIWSLT2017XMLFileName(root_dir, language_pair_[0], language_pair_[1], test_set_, language_pair_[0]));
535       target_path_list.push_back(
536         GenerateIWSLT2017XMLFileName(root_dir, language_pair_[0], language_pair_[1], test_set_, language_pair_[1]));
537     }
538   }
539   RETURN_IF_NOT_OK(GenerateNewFile(src_path_list, target_path_list, &src_target_file_list_));
540   return Status::OK();
541 }
542 }  // namespace dataset
543 }  // namespace mindspore
544