• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2020-2022 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_LITEAPI_INCLUDE_DATASETS_H_
18 #define MINDSPORE_CCSRC_MINDDATA_DATASET_LITEAPI_INCLUDE_DATASETS_H_
19 
20 #include <sys/stat.h>
21 #include <unistd.h>
22 
23 #include <algorithm>
24 #include <map>
25 #include <memory>
26 #include <set>
27 #include <string>
28 #include <unordered_map>
29 #include <unordered_set>
30 #include <utility>
31 #include <vector>
32 
33 #include "include/api/dual_abi_helper.h"
34 #include "include/api/types.h"
35 #include "include/dataset/iterator.h"
36 #include "include/dataset/samplers.h"
37 #include "include/dataset/transforms.h"
38 
39 namespace mindspore {
40 namespace dataset {
// Forward declarations
class Tensor;
class TensorShape;
class TreeAdapter;
class TreeAdapterLite;
class TreeGetters;

class DatasetCache;
class DatasetNode;

class Iterator;

class TensorOperation;
class SchemaObj;
class SamplerObj;

// Callback type accepted by Dataset::Map
class DSCallback;

// Dataset classes (in alphabetical order)
class BatchDataset;
class MapDataset;
class ProjectDataset;
class ShuffleDataset;
62 
63 /// \class Dataset datasets.h
64 /// \brief A base class to represent a dataset in the data pipeline.
65 class DATASET_API Dataset : public std::enable_shared_from_this<Dataset> {
66  public:
67   // need friend class so they can access the children_ field
68   friend class Iterator;
69   friend class DataQueueNode;
70 
71   /// \brief Constructor
72   Dataset();
73 
74   /// \brief Destructor
75   virtual ~Dataset() = default;
76 
77   /// \brief Gets the dataset size
78   /// \param[in] estimate This is only supported by some of the ops and it's used to speed up the process of getting
79   ///     dataset size at the expense of accuracy.
80   /// \return dataset size. If failed, return -1
81   int64_t GetDatasetSize(bool estimate = false);
82 
83   /// \brief Gets the output type
84   /// \return a vector of DataType. If failed, return an empty vector
85   std::vector<mindspore::DataType> GetOutputTypes();
86 
87   /// \brief Gets the output shape
88   /// \return a vector of TensorShape. If failed, return an empty vector
89   std::vector<std::vector<int64_t>> GetOutputShapes();
90 
91   /// \brief Gets the batch size
92   /// \return int64_t
93   int64_t GetBatchSize();
94 
95   /// \brief Gets the repeat count
96   /// \return int64_t
97   int64_t GetRepeatCount();
98 
99   /// \brief Gets the number of classes
100   /// \return number of classes. If failed, return -1
101   int64_t GetNumClasses();
102 
103   /// \brief Gets the column names
104   /// \return Names of the columns. If failed, return an empty vector
GetColumnNames()105   std::vector<std::string> GetColumnNames() { return VectorCharToString(GetColumnNamesCharIF()); }
106 
107   /// \brief Gets the class indexing
108   /// \return a map of ClassIndexing. If failed, return an empty map
GetClassIndexing()109   std::vector<std::pair<std::string, std::vector<int32_t>>> GetClassIndexing() {
110     return ClassIndexCharToString(GetClassIndexingCharIF());
111   }
112 
113   /// \brief Setter function for runtime number of workers
114   /// \param[in] num_workers The number of threads in this operator
115   /// \return Shared pointer to the original object
116   /// \par Example
117   /// \code
118   ///      /* Set number of workers(threads) to process the dataset in parallel */
119   ///      std::shared_ptr<Dataset> ds = ImageFolder(folder_path, true);
120   ///      ds = ds->SetNumWorkers(16);
121   /// \endcode
122   std::shared_ptr<Dataset> SetNumWorkers(int32_t num_workers);
123 
124   /// \brief Function to create an PullBasedIterator over the Dataset
125   /// \return Shared pointer to the Iterator
126   /// \par Example
127   /// \code
128   ///      /* dataset is an instance of Dataset object */
129   ///      std::shared_ptr<Iterator> = dataset->CreatePullBasedIterator();
130   ///      std::unordered_map<std::string, mindspore::MSTensor> row;
131   ///      iter->GetNextRow(&row);
132   /// \endcode
133   std::shared_ptr<PullIterator> CreatePullBasedIterator();
134 
135   /// \brief Function to create an Iterator over the Dataset pipeline
136   /// \param[in] num_epochs Number of epochs to run through the pipeline, default -1 which means infinite epochs.
137   ///     An empty row is returned at the end of each epoch
138   /// \return Shared pointer to the Iterator
139   /// \par Example
140   /// \code
141   ///      /* dataset is an instance of Dataset object */
142   ///      std::shared_ptr<Iterator> = dataset->CreateIterator();
143   ///      std::unordered_map<std::string, mindspore::MSTensor> row;
144   ///      iter->GetNextRow(&row);
145   /// \endcode
146   std::shared_ptr<Iterator> CreateIterator(int32_t num_epochs = -1) { return CreateIteratorCharIF(num_epochs); }
147 
148   /// \brief Function to transfer data through a device.
149   /// \note If device is Ascend, features of data will be transferred one by one. The limitation
150   ///     of data transmission per time is 256M.
151   /// \param[in] queue_name Channel name (default="", create new unique name).
152   /// \param[in] device_type Type of device (default="", get from MSContext).
153   /// \param[in] device_id id of device (default=1, get from MSContext).
154   /// \param[in] num_epochs Number of epochs (default=-1, infinite epochs).
155   /// \param[in] send_epoch_end Whether to send end of sequence to device or not (default=true).
156   /// \param[in] total_batches Number of batches to be sent to the device (default=0, all data).
157   /// \param[in] create_data_info_queue Whether to create queue which stores types and shapes
158   ///     of data or not(default=false).
159   /// \return Returns true if no error encountered else false.
160   bool DeviceQueue(const std::string &queue_name = "", const std::string &device_type = "", int32_t device_id = 0,
161                    int32_t num_epochs = -1, bool send_epoch_end = true, int32_t total_batches = 0,
162                    bool create_data_info_queue = false) {
163     return DeviceQueueCharIF(StringToChar(queue_name), StringToChar(device_type), device_id, num_epochs, send_epoch_end,
164                              total_batches, create_data_info_queue);
165   }
166 
167   /// \brief Function to create a Saver to save the dynamic data processed by the dataset pipeline
168   /// \note Usage restrictions:
169   ///     1. Supported dataset formats: 'mindrecord' only
170   ///     2. To save the samples in order, set dataset's shuffle to false and num_files to 1.
171   ///     3. Before calling the function, do not use batch operator, repeat operator or data augmentation operators
172   ///        with random attribute in map operator.
173   ///     4. Mindrecord does not support bool, uint64, multi-dimensional uint8(drop dimension) nor
174   ///        multi-dimensional string.
175   /// \param[in] dataset_path Path to dataset file
176   /// \param[in] num_files Number of dataset files (default=1)
177   /// \param[in] dataset_type Dataset format (default="mindrecord")
178   /// \return Returns true if no error encountered else false
179   /// \par Example
180   /// \code
181   ///      /* Create a dataset and save its data into MindRecord */
182   ///      std::string folder_path = "/path/to/cifar_dataset";
183   ///      std::shared_ptr<Dataset> ds = Cifar10(folder_path, "all", std::make_shared<SequentialSampler>(0, 10));
184   ///      std::string save_file = "Cifar10Data.mindrecord";
185   ///      bool rc = ds->Save(save_file);
186   /// \endcode
187   bool Save(const std::string &dataset_path, int32_t num_files = 1, const std::string &dataset_type = "mindrecord") {
188     return SaveCharIF(StringToChar(dataset_path), num_files, StringToChar(dataset_type));
189   }
190 
191   /// \brief Function to create a BatchDataset
192   /// \note Combines batch_size number of consecutive rows into batches
193   /// \param[in] batch_size The number of rows each batch is created with
194   /// \param[in] drop_remainder Determines whether or not to drop the last possibly incomplete
195   ///     batch. If true, and if there are less than batch_size rows
196   ///     available to make the last batch, then those rows will
197   ///     be dropped and not propagated to the next node
198   /// \return Shared pointer to the current BatchDataset
199   /// \par Example
200   /// \code
201   ///      /* Create a dataset where every 100 rows is combined into a batch */
202   ///      std::shared_ptr<Dataset> ds = ImageFolder(folder_path, true);
203   ///      ds = ds->Batch(100, true);
204   /// \endcode
205   std::shared_ptr<BatchDataset> Batch(int32_t batch_size, bool drop_remainder = false);
206 
207   /// \brief Function to create a MapDataset
208   /// \note Applies each operation in operations to this dataset
209   /// \param[in] operations Vector of raw pointers to TensorTransform objects to be applied on the dataset. Operations
210   ///     are applied in the order they appear in this list
211   /// \param[in] input_columns Vector of the names of the columns that will be passed to the first
212   ///     operation as input. The size of this list must match the number of
213   ///     input columns expected by the first operator. The default input_columns
214   ///     is the first column
215   /// \param[in] output_columns Vector of names assigned to the columns outputted by the last operation
216   ///     This parameter is mandatory if len(input_columns) != len(output_columns)
217   ///     The size of this list must match the number of output columns of the
218   ///     last operation. The default output_columns will have the same
219   ///     name as the input columns, i.e., the columns will be replaced
220   /// \param[in] cache Tensor cache to use. (default=nullptr which means no cache is used).
221   /// \param[in] callbacks List of Dataset callbacks to be called.
222   /// \return Shared pointer to the current MapDataset
223   /// \par Example
224   /// \code
225   ///     // Create objects for the tensor ops
226   ///     std::shared_ptr<TensorTransform> decode_op = std::make_shared<vision::Decode>(true);
227   ///     std::shared_ptr<TensorTransform> random_color_op = std::make_shared<vision::RandomColor>(0.0, 0.0);
228   ///
229   ///     /* 1) Simple map example */
230   ///     // Apply decode_op on column "image". This column will be replaced by the outputted
231   ///     // column of decode_op.
232   ///     dataset = dataset->Map({decode_op}, {"image"});
233   ///
234   ///     // Decode and rename column "image" to "decoded_image".
235   ///     dataset = dataset->Map({decode_op}, {"image"}, {"decoded_image"});
236   ///
237   ///    /* 2) Map example with more than one operation */
238   ///    // Create a dataset where the images are decoded, then randomly color jittered.
239   ///    // decode_op takes column "image" as input and outputs one column. The column
240   ///    // outputted by decode_op is passed as input to random_jitter_op.
241   ///    // random_jitter_op will output one column. Column "image" will be replaced by
242   ///    // the column outputted by random_jitter_op (the very last operation). All other
243   ///    // columns are unchanged.
244   ///    dataset = dataset->Map({decode_op, random_jitter_op}, {"image"})
245   /// \endcode
246   std::shared_ptr<MapDataset> Map(const std::vector<TensorTransform *> &operations,
247                                   const std::vector<std::string> &input_columns = {},
248                                   const std::vector<std::string> &output_columns = {},
249                                   const std::shared_ptr<DatasetCache> &cache = nullptr,
250                                   const std::vector<std::shared_ptr<DSCallback>> &callbacks = {}) {
251     std::vector<std::shared_ptr<TensorOperation>> transform_ops;
252     (void)std::transform(
253       operations.begin(), operations.end(), std::back_inserter(transform_ops),
254       [](TensorTransform *op) -> std::shared_ptr<TensorOperation> { return op != nullptr ? op->Parse() : nullptr; });
255     return std::make_shared<MapDataset>(shared_from_this(), transform_ops, VectorStringToChar(input_columns),
256                                         VectorStringToChar(output_columns), cache, callbacks);
257   }
258 
259   /// \brief Function to create a MapDataset
260   /// \note Applies each operation in operations to this dataset
261   /// \param[in] operations Vector of shared pointers to TensorTransform objects to be applied on the dataset.
262   ///     Operations are applied in the order they appear in this list
263   /// \param[in] input_columns Vector of the names of the columns that will be passed to the first
264   ///     operation as input. The size of this list must match the number of
265   ///     input columns expected by the first operator. The default input_columns
266   ///     is the first column
267   /// \param[in] output_columns Vector of names assigned to the columns outputted by the last operation
268   ///     This parameter is mandatory if len(input_columns) != len(output_columns)
269   ///     The size of this list must match the number of output columns of the
270   ///     last operation. The default output_columns will have the same
271   ///     name as the input columns, i.e., the columns will be replaced
272   /// \param[in] cache Tensor cache to use. (default=nullptr which means no cache is used).
273   /// \param[in] callbacks List of Dataset callbacks to be called.
274   /// \return Shared pointer to the current MapDataset
275   std::shared_ptr<MapDataset> Map(const std::vector<std::shared_ptr<TensorTransform>> &operations,
276                                   const std::vector<std::string> &input_columns = {},
277                                   const std::vector<std::string> &output_columns = {},
278                                   const std::shared_ptr<DatasetCache> &cache = nullptr,
279                                   const std::vector<std::shared_ptr<DSCallback>> &callbacks = {}) {
280     std::vector<std::shared_ptr<TensorOperation>> transform_ops;
281     (void)std::transform(operations.begin(), operations.end(), std::back_inserter(transform_ops),
282                          [](const std::shared_ptr<TensorTransform> &op) -> std::shared_ptr<TensorOperation> {
283                            return op != nullptr ? op->Parse() : nullptr;
284                          });
285     return std::make_shared<MapDataset>(shared_from_this(), transform_ops, VectorStringToChar(input_columns),
286                                         VectorStringToChar(output_columns), cache, callbacks);
287   }
288 
289   /// \brief Function to create a MapDataset
290   /// \note Applies each operation in operations to this dataset
291   /// \param[in] operations Vector of TensorTransform objects to be applied on the dataset. Operations are applied in
292   ///     the order they appear in this list
293   /// \param[in] input_columns Vector of the names of the columns that will be passed to the first
294   ///     operation as input. The size of this list must match the number of
295   ///     input columns expected by the first operator. The default input_columns
296   ///     is the first column
297   /// \param[in] output_columns Vector of names assigned to the columns outputted by the last operation
298   ///     This parameter is mandatory if len(input_columns) != len(output_columns)
299   ///     The size of this list must match the number of output columns of the
300   ///     last operation. The default output_columns will have the same
301   ///     name as the input columns, i.e., the columns will be replaced
302   /// \param[in] cache Tensor cache to use. (default=nullptr which means no cache is used).
303   /// \param[in] callbacks List of Dataset callbacks to be called.
304   /// \return Shared pointer to the current MapDataset
305   std::shared_ptr<MapDataset> Map(const std::vector<std::reference_wrapper<TensorTransform>> &operations,
306                                   const std::vector<std::string> &input_columns = {},
307                                   const std::vector<std::string> &output_columns = {},
308                                   const std::shared_ptr<DatasetCache> &cache = nullptr,
309                                   const std::vector<std::shared_ptr<DSCallback>> &callbacks = {}) {
310     std::vector<std::shared_ptr<TensorOperation>> transform_ops;
311     (void)std::transform(operations.begin(), operations.end(), std::back_inserter(transform_ops),
312                          [](TensorTransform &op) -> std::shared_ptr<TensorOperation> { return op.Parse(); });
313     return std::make_shared<MapDataset>(shared_from_this(), transform_ops, VectorStringToChar(input_columns),
314                                         VectorStringToChar(output_columns), cache, callbacks);
315   }
316 
317   /// \brief Function to create a Project Dataset
318   /// \note Applies project to the dataset
319   /// \param[in] columns The name of columns to project
320   /// \return Shared pointer to the current Dataset
321   /// \par Example
322   /// \code
323   ///      /* Reorder the original column names in dataset */
324   ///      std::shared_ptr<Dataset> ds = Mnist(folder_path, "all", std::make_shared<RandomSampler>(false, 10));
325   ///      ds = ds->Project({"label", "image"});
326   /// \endcode
Project(const std::vector<std::string> & columns)327   std::shared_ptr<ProjectDataset> Project(const std::vector<std::string> &columns) {
328     return std::make_shared<ProjectDataset>(shared_from_this(), VectorStringToChar(columns));
329   }
330 
331   /// \brief Function to create a Shuffle Dataset
332   /// \note Randomly shuffles the rows of this dataset
333   /// \param[in] buffer_size The size of the buffer (must be larger than 1) for shuffling
334   /// \return Shared pointer to the current ShuffleDataset
335   /// \par Example
336   /// \code
337   ///      /* Rename the original column names in dataset */
338   ///      std::shared_ptr<Dataset> ds = Mnist(folder_path, "all", std::make_shared<RandomSampler>(false, 10));
339   ///      ds = ds->Rename({"image", "label"}, {"image_output", "label_output"});
340   /// \endcode
Shuffle(int32_t buffer_size)341   std::shared_ptr<ShuffleDataset> Shuffle(int32_t buffer_size) {
342     return std::make_shared<ShuffleDataset>(shared_from_this(), buffer_size);
343   }
344 
IRNode()345   std::shared_ptr<DatasetNode> IRNode() { return ir_node_; }
346 
347  protected:
348   std::shared_ptr<TreeGetters> tree_getters_;
349   std::shared_ptr<DatasetNode> ir_node_;
350 
351  private:
352   // Char interface(CharIF) of GetColumnNames
353   std::vector<std::vector<char>> GetColumnNamesCharIF();
354 
355   // Char interface(CharIF) of GetClassIndexing
356   std::vector<std::pair<std::vector<char>, std::vector<int32_t>>> GetClassIndexingCharIF();
357 
358   // Char interface(CharIF) of CreateIterator
359   std::shared_ptr<Iterator> CreateIteratorCharIF(int32_t num_epochs);
360 
361   // Char interface(CharIF) of DeviceQueue
362   bool DeviceQueueCharIF(const std::vector<char> &queue_name, const std::vector<char> &device_type, int32_t device_id,
363                          int32_t num_epochs, bool send_epoch_end, int32_t total_batches, bool create_data_info_queue);
364 
365   // Char interface(CharIF) of Save
366   bool SaveCharIF(const std::vector<char> &dataset_path, int32_t num_files, const std::vector<char> &dataset_type);
367 };
368 
369 class DATASET_API SchemaObj {
370  public:
371   /// \brief Constructor
SchemaObj(StringToChar (schema_file))372   explicit SchemaObj(const std::string &schema_file = "") : SchemaObj(StringToChar(schema_file)) {}
373 
374   /// \brief Destructor
375   ~SchemaObj() = default;
376 
377   /// \brief SchemaObj Init function
378   /// \return bool true if schema initialization is successful
379   Status Init();
380 
381   /// \brief Add new column to the schema with unknown shape of rank 1
382   /// \param[in] name Name of the column.
383   /// \param[in] ms_type Data type of the column(mindspore::DataType).
384   /// \return Status code
add_column(const std::string & name,mindspore::DataType ms_type)385   Status add_column(const std::string &name, mindspore::DataType ms_type) {
386     return add_column_char(StringToChar(name), ms_type);
387   }
388 
389   /// \brief Add new column to the schema with unknown shape of rank 1
390   /// \param[in] name Name of the column.
391   /// \param[in] ms_type Data type of the column(std::string).
392   /// \param[in] shape Shape of the column.
393   /// \return Status code
add_column(const std::string & name,const std::string & ms_type)394   Status add_column(const std::string &name, const std::string &ms_type) {
395     return add_column_char(StringToChar(name), StringToChar(ms_type));
396   }
397 
398   /// \brief Add new column to the schema
399   /// \param[in] name Name of the column.
400   /// \param[in] ms_type Data type of the column(mindspore::DataType).
401   /// \param[in] shape Shape of the column.
402   /// \return Status code
add_column(const std::string & name,mindspore::DataType ms_type,const std::vector<int32_t> & shape)403   Status add_column(const std::string &name, mindspore::DataType ms_type, const std::vector<int32_t> &shape) {
404     return add_column_char(StringToChar(name), ms_type, shape);
405   }
406 
407   /// \brief Add new column to the schema
408   /// \param[in] name Name of the column.
409   /// \param[in] ms_type Data type of the column(std::string).
410   /// \param[in] shape Shape of the column.
411   /// \return Status code
add_column(const std::string & name,const std::string & ms_type,const std::vector<int32_t> & shape)412   Status add_column(const std::string &name, const std::string &ms_type, const std::vector<int32_t> &shape) {
413     return add_column_char(StringToChar(name), StringToChar(ms_type), shape);
414   }
415 
416   /// \brief Get a JSON string of the schema
417   /// \return JSON string of the schema
to_json()418   std::string to_json() { return CharToString(to_json_char()); }
419 
420   /// \brief Get a JSON string of the schema
to_string()421   std::string to_string() { return to_json(); }
422 
423   /// \brief Set a new value to dataset_type
424   void set_dataset_type(const std::string &dataset_type);
425 
426   /// \brief Set a new value to num_rows
427   void set_num_rows(int32_t num_rows);
428 
429   /// \brief Get the current num_rows
430   int32_t get_num_rows() const;
431 
432   /// \brief Get schema file from JSON file
433   /// \param[in] json_string Name of JSON file to be parsed.
434   /// \return Status code
FromJSONString(const std::string & json_string)435   Status FromJSONString(const std::string &json_string) { return FromJSONStringCharIF(StringToChar(json_string)); }
436 
437   /// \brief Parse and add column information
438   /// \param[in] json_string Name of JSON string for column dataset attribute information, decoded from schema file.
439   /// \return Status code
ParseColumnString(const std::string & json_string)440   Status ParseColumnString(const std::string &json_string) {
441     return ParseColumnStringCharIF(StringToChar(json_string));
442   }
443 
444  private:
445   // Char constructor of SchemaObj
446   explicit SchemaObj(const std::vector<char> &schema_file);
447 
448   // Char interface of add_column
449   Status add_column_char(const std::vector<char> &name, mindspore::DataType ms_type);
450 
451   Status add_column_char(const std::vector<char> &name, const std::vector<char> &ms_type);
452 
453   Status add_column_char(const std::vector<char> &name, mindspore::DataType ms_type, const std::vector<int32_t> &shape);
454 
455   Status add_column_char(const std::vector<char> &name, const std::vector<char> &ms_type,
456                          const std::vector<int32_t> &shape);
457 
458   // Char interface of to_json
459   const std::vector<char> to_json_char();
460 
461   // Char interface of FromJSONString
462   Status FromJSONStringCharIF(const std::vector<char> &json_string);
463 
464   // Char interface of ParseColumnString
465   Status ParseColumnStringCharIF(const std::vector<char> &json_string);
466 
467   struct Data;
468   std::shared_ptr<Data> data_;
469 };
470 
471 class DATASET_API BatchDataset : public Dataset {
472  public:
473   BatchDataset(const std::shared_ptr<Dataset> &input, int32_t batch_size, bool drop_remainder = false);
474 
475   ~BatchDataset() override = default;
476 };
477 
478 class DATASET_API MapDataset : public Dataset {
479  public:
480   MapDataset(const std::shared_ptr<Dataset> &input, const std::vector<std::shared_ptr<TensorOperation>> &operations,
481              const std::vector<std::vector<char>> &input_columns, const std::vector<std::vector<char>> &output_columns,
482              const std::shared_ptr<DatasetCache> &cache, const std::vector<std::shared_ptr<DSCallback>> &callbacks);
483 
484   ~MapDataset() override = default;
485 };
486 
487 class DATASET_API ProjectDataset : public Dataset {
488  public:
489   ProjectDataset(const std::shared_ptr<Dataset> &input, const std::vector<std::vector<char>> &columns);
490 
491   ~ProjectDataset() override = default;
492 };
493 
494 class DATASET_API ShuffleDataset : public Dataset {
495  public:
496   ShuffleDataset(const std::shared_ptr<Dataset> &input, int32_t buffer_size);
497 
498   ~ShuffleDataset() override = default;
499 };
500 
501 /// \brief Function to create a SchemaObj.
502 /// \param[in] schema_file Path of schema file.
503 /// \note The reason for using this API is that std::string will be constrained by the
504 ///    compiler option '_GLIBCXX_USE_CXX11_ABI' while char is free of this restriction.
505 /// \return Shared pointer to the current schema.
506 std::shared_ptr<SchemaObj> DATASET_API SchemaCharIF(const std::vector<char> &schema_file);
507 
508 /// \brief Function to create a SchemaObj.
509 /// \param[in] schema_file Path of schema file.
510 /// \return Shared pointer to the current schema.
511 inline std::shared_ptr<SchemaObj> DATASET_API Schema(const std::string &schema_file = "") {
512   return SchemaCharIF(StringToChar(schema_file));
513 }
514 
515 class DATASET_API AlbumDataset : public Dataset {
516  public:
517   /// \brief Constructor of AlbumDataset.
518   /// \param[in] dataset_dir Path to the root directory that contains the dataset.
519   /// \param[in] data_schema Path to dataset schema file.
520   /// \param[in] column_names Column names used to specify columns to load, if empty, will read all columns
521   ///     (default = {}).
522   /// \param[in] decode The option to decode the images in dataset (default = false).
523   /// \param[in] sampler Shared pointer to a sampler object used to choose samples from the dataset. If sampler is not
524   ///     given, a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()).
525   /// \param[in] cache Tensor cache to use (default=nullptr which means no cache is used).
526   AlbumDataset(const std::vector<char> &dataset_dir, const std::vector<char> &data_schema,
527                const std::vector<std::vector<char>> &column_names, bool decode, const std::shared_ptr<Sampler> &sampler,
528                const std::shared_ptr<DatasetCache> &cache);
529 
530   /// \brief Constructor of AlbumDataset.
531   /// \param[in] dataset_dir Path to the root directory that contains the dataset.
532   /// \param[in] data_schema Path to dataset schema file.
533   /// \param[in] column_names Column names used to specify columns to load.
534   /// \param[in] decode The option to decode the images in dataset.
535   /// \param[in] sampler Raw pointer to a sampler object used to choose samples from the dataset.
536   /// \param[in] cache Tensor cache to use (default=nullptr which means no cache is used).
537   AlbumDataset(const std::vector<char> &dataset_dir, const std::vector<char> &data_schema,
538                const std::vector<std::vector<char>> &column_names, bool decode, const Sampler *sampler,
539                const std::shared_ptr<DatasetCache> &cache);
540 
541   /// \brief Constructor of AlbumDataset.
542   /// \param[in] dataset_dir Path to the root directory that contains the dataset.
543   /// \param[in] data_schema Path to dataset schema file.
544   /// \param[in] column_names Column names used to specify columns to load.
545   /// \param[in] decode The option to decode the images in dataset.
546   /// \param[in] sampler Sampler object used to choose samples from the dataset.
547   /// \param[in] cache Tensor cache to use (default=nullptr which means no cache is used).
548   AlbumDataset(const std::vector<char> &dataset_dir, const std::vector<char> &data_schema,
549                const std::vector<std::vector<char>> &column_names, bool decode,
550                const std::reference_wrapper<Sampler> &sampler, const std::shared_ptr<DatasetCache> &cache);
551 
552   /// \brief Destructor of AlbumDataset.
553   ~AlbumDataset() override = default;
554 };
555 
556 /// \brief Function to create an AlbumDataset
557 /// \note The generated dataset is specified through setting a schema
558 /// \param[in] dataset_dir Path to the root directory that contains the dataset
559 /// \param[in] data_schema Path to dataset schema file
560 /// \param[in] column_names Column names used to specify columns to load, if empty, will read all columns.
561 ///     (default = {})
562 /// \param[in] decode the option to decode the images in dataset (default = false)
563 /// \param[in] sampler Shared pointer to a sampler object used to choose samples from the dataset. If sampler is not
564 /// given, a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler())
565 /// \param[in] cache Tensor cache to use. (default=nullptr which means no cache is used).
566 /// \return Shared pointer to the current Dataset
567 /// \par Example
568 /// \code
569 ///      /* Define dataset path and MindData object */
570 ///      std::string folder_path = "/path/to/album_dataset_directory";
571 ///      std::string schema_file = "/path/to/album_schema_file";
572 ///      std::vector<std::string> column_names = {"image", "label", "id"};
573 ///      std::shared_ptr<Dataset> ds = Album(folder_path, schema_file, column_names);
574 ///
575 ///      /* Create iterator to read dataset */
576 ///      std::shared_ptr<Iterator> iter = ds->CreateIterator();
577 ///      std::unordered_map<std::string, mindspore::MSTensor> row;
578 ///      iter->GetNextRow(&row);
579 ///
580 ///      /* Note: As we defined before, each data dictionary owns keys "image", "label" and "id" */
581 ///      auto image = row["image"];
582 /// \endcode
583 inline std::shared_ptr<AlbumDataset> DATASET_API
584 Album(const std::string &dataset_dir, const std::string &data_schema, const std::vector<std::string> &column_names = {},
585       bool decode = false, const std::shared_ptr<Sampler> &sampler = std::make_shared<RandomSampler>(),
586       const std::shared_ptr<DatasetCache> &cache = nullptr) {
587   return std::make_shared<AlbumDataset>(StringToChar(dataset_dir), StringToChar(data_schema),
588                                         VectorStringToChar(column_names), decode, sampler, cache);
589 }
590 
591 /// \brief Function to create an AlbumDataset
592 /// \note The generated dataset is specified through setting a schema
593 /// \param[in] dataset_dir Path to the root directory that contains the dataset
594 /// \param[in] data_schema Path to dataset schema file
595 /// \param[in] column_names Column names used to specify columns to load
596 /// \param[in] decode the option to decode the images in dataset
597 /// \param[in] sampler Raw pointer to a sampler object used to choose samples from the dataset.
598 /// \param[in] cache Tensor cache to use. (default=nullptr which means no cache is used).
599 /// \return Shared pointer to the current Dataset
600 inline std::shared_ptr<AlbumDataset> DATASET_API Album(const std::string &dataset_dir, const std::string &data_schema,
601                                                        const std::vector<std::string> &column_names, bool decode,
602                                                        const Sampler *sampler,
603                                                        const std::shared_ptr<DatasetCache> &cache = nullptr) {
604   return std::make_shared<AlbumDataset>(StringToChar(dataset_dir), StringToChar(data_schema),
605                                         VectorStringToChar(column_names), decode, sampler, cache);
606 }
607 
608 /// \brief Function to create an AlbumDataset
609 /// \note The generated dataset is specified through setting a schema
610 /// \param[in] dataset_dir Path to the root directory that contains the dataset
611 /// \param[in] data_schema Path to dataset schema file
612 /// \param[in] column_names Column names used to specify columns to load
613 /// \param[in] decode the option to decode the images in dataset
614 /// \param[in] sampler Sampler object used to choose samples from the dataset.
615 /// \param[in] cache Tensor cache to use. (default=nullptr which means no cache is used).
616 /// \return Shared pointer to the current Dataset
617 inline std::shared_ptr<AlbumDataset> DATASET_API Album(const std::string &dataset_dir, const std::string &data_schema,
618                                                        const std::vector<std::string> &column_names, bool decode,
619                                                        const std::reference_wrapper<Sampler> sampler,
620                                                        const std::shared_ptr<DatasetCache> &cache = nullptr) {
621   return std::make_shared<AlbumDataset>(StringToChar(dataset_dir), StringToChar(data_schema),
622                                         VectorStringToChar(column_names), decode, sampler, cache);
623 }
624 
625 class DATASET_API MnistDataset : public Dataset {
626  public:
627   /// \brief Constructor of MnistDataset.
628   /// \param[in] dataset_dir Path to the root directory that contains the dataset.
629   /// \param[in] usage Part of dataset of MNIST, can be "train", "test" or "all" (default = "all").
630   /// \param[in] sampler Shared pointer to a sampler object used to choose samples from the dataset. If sampler is not
631   ///     given, a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()).
632   /// \param[in] cache Tensor cache to use (default=nullptr which means no cache is used).
633   MnistDataset(const std::vector<char> &dataset_dir, const std::vector<char> &usage,
634                const std::shared_ptr<Sampler> &sampler, const std::shared_ptr<DatasetCache> &cache);
635 
636   /// \brief Constructor of MnistDataset.
637   /// \param[in] dataset_dir Path to the root directory that contains the dataset.
638   /// \param[in] usage Part of dataset of MNIST, can be "train", "test" or "all".
639   /// \param[in] sampler Raw pointer to a sampler object used to choose samples from the dataset.
640   /// \param[in] cache Tensor cache to use (default=nullptr which means no cache is used).
641   MnistDataset(const std::vector<char> &dataset_dir, const std::vector<char> &usage, const Sampler *sampler,
642                const std::shared_ptr<DatasetCache> &cache);
643 
644   /// \brief Constructor of MnistDataset.
645   /// \param[in] dataset_dir Path to the root directory that contains the dataset.
646   /// \param[in] usage Part of dataset of MNIST, can be "train", "test" or "all".
647   /// \param[in] sampler Sampler object used to choose samples from the dataset.
648   /// \param[in] cache Tensor cache to use (default=nullptr which means no cache is used).
649   MnistDataset(const std::vector<char> &dataset_dir, const std::vector<char> &usage,
650                const std::reference_wrapper<Sampler> &sampler, const std::shared_ptr<DatasetCache> &cache);
651 
652   /// Destructor of MnistDataset.
653   ~MnistDataset() override = default;
654 };
655 
656 /// \brief Function to create a MnistDataset
657 /// \note The generated dataset has two columns ["image", "label"]
658 /// \param[in] dataset_dir Path to the root directory that contains the dataset
659 /// \param[in] usage of MNIST, can be "train", "test" or "all" (default = "all").
660 /// \param[in] sampler Shared pointer to a sampler object used to choose samples from the dataset. If sampler is not
661 /// given, a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler())
662 /// \param[in] cache Tensor cache to use. (default=nullptr which means no cache is used).
663 /// \return Shared pointer to the current MnistDataset
664 /// \par Example
665 /// \code
666 ///      /* Define dataset path and MindData object */
667 ///      std::string folder_path = "/path/to/mnist_dataset_directory";
668 ///      std::shared_ptr<Dataset> ds = Mnist(folder_path, "all", std::make_shared<RandomSampler>(false, 20));
669 ///
670 ///      /* Create iterator to read dataset */
671 ///      std::shared_ptr<Iterator> iter = ds->CreateIterator();
672 ///      std::unordered_map<std::string, mindspore::MSTensor> row;
673 ///      iter->GetNextRow(&row);
674 ///
675 ///      /* Note: In MNIST dataset, each dictionary has keys "image" and "label" */
676 ///      auto image = row["image"];
677 /// \endcode
678 inline std::shared_ptr<MnistDataset> DATASET_API
679 Mnist(const std::string &dataset_dir, const std::string &usage = "all",
680       const std::shared_ptr<Sampler> &sampler = std::make_shared<RandomSampler>(),
681       const std::shared_ptr<DatasetCache> &cache = nullptr) {
682   return std::make_shared<MnistDataset>(StringToChar(dataset_dir), StringToChar(usage), sampler, cache);
683 }
684 
685 /// \brief Function to create a MnistDataset
686 /// \note The generated dataset has two columns ["image", "label"]
687 /// \param[in] dataset_dir Path to the root directory that contains the dataset
688 /// \param[in] usage of MNIST, can be "train", "test" or "all"
689 /// \param[in] sampler Raw pointer to a sampler object used to choose samples from the dataset.
690 /// \param[in] cache Tensor cache to use. (default=nullptr which means no cache is used).
691 /// \return Shared pointer to the current MnistDataset
692 inline std::shared_ptr<MnistDataset> DATASET_API Mnist(const std::string &dataset_dir, const std::string &usage,
693                                                        const Sampler *sampler,
694                                                        const std::shared_ptr<DatasetCache> &cache = nullptr) {
695   return std::make_shared<MnistDataset>(StringToChar(dataset_dir), StringToChar(usage), sampler, cache);
696 }
697 
698 /// \brief Function to create a MnistDataset
699 /// \note The generated dataset has two columns ["image", "label"]
700 /// \param[in] dataset_dir Path to the root directory that contains the dataset
701 /// \param[in] usage of MNIST, can be "train", "test" or "all"
702 /// \param[in] sampler Sampler object used to choose samples from the dataset.
703 /// \param[in] cache Tensor cache to use. (default=nullptr which means no cache is used).
704 /// \return Shared pointer to the current MnistDataset
705 inline std::shared_ptr<MnistDataset> DATASET_API Mnist(const std::string &dataset_dir, const std::string &usage,
706                                                        const std::reference_wrapper<Sampler> sampler,
707                                                        const std::shared_ptr<DatasetCache> &cache = nullptr) {
708   return std::make_shared<MnistDataset>(StringToChar(dataset_dir), StringToChar(usage), sampler, cache);
709 }
710 }  // namespace dataset
711 }  // namespace mindspore
#endif  // MINDSPORE_CCSRC_MINDDATA_DATASET_LITEAPI_INCLUDE_DATASETS_H_
713