/**
 * Copyright 2020-2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_LITEAPI_INCLUDE_DATASETS_H_
#define MINDSPORE_CCSRC_MINDDATA_DATASET_LITEAPI_INCLUDE_DATASETS_H_

#include <sys/stat.h>
#include <unistd.h>

#include <algorithm>
#include <cstdint>
#include <functional>
#include <iterator>
#include <map>
#include <memory>
#include <set>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>

#include "include/api/dual_abi_helper.h"
#include "include/api/types.h"
#include "include/dataset/iterator.h"
#include "include/dataset/samplers.h"
#include "include/dataset/transforms.h"

namespace mindspore {
namespace dataset {
class Tensor;
class TensorShape;
class TreeAdapter;
class TreeAdapterLite;
class TreeGetters;

class DatasetCache;
class DatasetNode;

class Iterator;

class TensorOperation;
class SchemaObj;
class SamplerObj;

// Dataset classes (in alphabetical order)
class BatchDataset;
class MapDataset;
class ProjectDataset;
class ShuffleDataset;
class DSCallback;

/// \class Dataset datasets.h
/// \brief A base class to represent a dataset in the data pipeline.
65 class DATASET_API Dataset : public std::enable_shared_from_this<Dataset> { 66 public: 67 // need friend class so they can access the children_ field 68 friend class Iterator; 69 friend class DataQueueNode; 70 71 /// \brief Constructor 72 Dataset(); 73 74 /// \brief Destructor 75 virtual ~Dataset() = default; 76 77 /// \brief Gets the dataset size 78 /// \param[in] estimate This is only supported by some of the ops and it's used to speed up the process of getting 79 /// dataset size at the expense of accuracy. 80 /// \return dataset size. If failed, return -1 81 int64_t GetDatasetSize(bool estimate = false); 82 83 /// \brief Gets the output type 84 /// \return a vector of DataType. If failed, return an empty vector 85 std::vector<mindspore::DataType> GetOutputTypes(); 86 87 /// \brief Gets the output shape 88 /// \return a vector of TensorShape. If failed, return an empty vector 89 std::vector<std::vector<int64_t>> GetOutputShapes(); 90 91 /// \brief Gets the batch size 92 /// \return int64_t 93 int64_t GetBatchSize(); 94 95 /// \brief Gets the repeat count 96 /// \return int64_t 97 int64_t GetRepeatCount(); 98 99 /// \brief Gets the number of classes 100 /// \return number of classes. If failed, return -1 101 int64_t GetNumClasses(); 102 103 /// \brief Gets the column names 104 /// \return Names of the columns. If failed, return an empty vector GetColumnNames()105 std::vector<std::string> GetColumnNames() { return VectorCharToString(GetColumnNamesCharIF()); } 106 107 /// \brief Gets the class indexing 108 /// \return a map of ClassIndexing. 
If failed, return an empty map GetClassIndexing()109 std::vector<std::pair<std::string, std::vector<int32_t>>> GetClassIndexing() { 110 return ClassIndexCharToString(GetClassIndexingCharIF()); 111 } 112 113 /// \brief Setter function for runtime number of workers 114 /// \param[in] num_workers The number of threads in this operator 115 /// \return Shared pointer to the original object 116 /// \par Example 117 /// \code 118 /// /* Set number of workers(threads) to process the dataset in parallel */ 119 /// std::shared_ptr<Dataset> ds = ImageFolder(folder_path, true); 120 /// ds = ds->SetNumWorkers(16); 121 /// \endcode 122 std::shared_ptr<Dataset> SetNumWorkers(int32_t num_workers); 123 124 /// \brief Function to create an PullBasedIterator over the Dataset 125 /// \return Shared pointer to the Iterator 126 /// \par Example 127 /// \code 128 /// /* dataset is an instance of Dataset object */ 129 /// std::shared_ptr<Iterator> = dataset->CreatePullBasedIterator(); 130 /// std::unordered_map<std::string, mindspore::MSTensor> row; 131 /// iter->GetNextRow(&row); 132 /// \endcode 133 std::shared_ptr<PullIterator> CreatePullBasedIterator(); 134 135 /// \brief Function to create an Iterator over the Dataset pipeline 136 /// \param[in] num_epochs Number of epochs to run through the pipeline, default -1 which means infinite epochs. 137 /// An empty row is returned at the end of each epoch 138 /// \return Shared pointer to the Iterator 139 /// \par Example 140 /// \code 141 /// /* dataset is an instance of Dataset object */ 142 /// std::shared_ptr<Iterator> = dataset->CreateIterator(); 143 /// std::unordered_map<std::string, mindspore::MSTensor> row; 144 /// iter->GetNextRow(&row); 145 /// \endcode 146 std::shared_ptr<Iterator> CreateIterator(int32_t num_epochs = -1) { return CreateIteratorCharIF(num_epochs); } 147 148 /// \brief Function to transfer data through a device. 149 /// \note If device is Ascend, features of data will be transferred one by one. 
The limitation 150 /// of data transmission per time is 256M. 151 /// \param[in] queue_name Channel name (default="", create new unique name). 152 /// \param[in] device_type Type of device (default="", get from MSContext). 153 /// \param[in] device_id id of device (default=1, get from MSContext). 154 /// \param[in] num_epochs Number of epochs (default=-1, infinite epochs). 155 /// \param[in] send_epoch_end Whether to send end of sequence to device or not (default=true). 156 /// \param[in] total_batches Number of batches to be sent to the device (default=0, all data). 157 /// \param[in] create_data_info_queue Whether to create queue which stores types and shapes 158 /// of data or not(default=false). 159 /// \return Returns true if no error encountered else false. 160 bool DeviceQueue(const std::string &queue_name = "", const std::string &device_type = "", int32_t device_id = 0, 161 int32_t num_epochs = -1, bool send_epoch_end = true, int32_t total_batches = 0, 162 bool create_data_info_queue = false) { 163 return DeviceQueueCharIF(StringToChar(queue_name), StringToChar(device_type), device_id, num_epochs, send_epoch_end, 164 total_batches, create_data_info_queue); 165 } 166 167 /// \brief Function to create a Saver to save the dynamic data processed by the dataset pipeline 168 /// \note Usage restrictions: 169 /// 1. Supported dataset formats: 'mindrecord' only 170 /// 2. To save the samples in order, set dataset's shuffle to false and num_files to 1. 171 /// 3. Before calling the function, do not use batch operator, repeat operator or data augmentation operators 172 /// with random attribute in map operator. 173 /// 4. Mindrecord does not support bool, uint64, multi-dimensional uint8(drop dimension) nor 174 /// multi-dimensional string. 
175 /// \param[in] dataset_path Path to dataset file 176 /// \param[in] num_files Number of dataset files (default=1) 177 /// \param[in] dataset_type Dataset format (default="mindrecord") 178 /// \return Returns true if no error encountered else false 179 /// \par Example 180 /// \code 181 /// /* Create a dataset and save its data into MindRecord */ 182 /// std::string folder_path = "/path/to/cifar_dataset"; 183 /// std::shared_ptr<Dataset> ds = Cifar10(folder_path, "all", std::make_shared<SequentialSampler>(0, 10)); 184 /// std::string save_file = "Cifar10Data.mindrecord"; 185 /// bool rc = ds->Save(save_file); 186 /// \endcode 187 bool Save(const std::string &dataset_path, int32_t num_files = 1, const std::string &dataset_type = "mindrecord") { 188 return SaveCharIF(StringToChar(dataset_path), num_files, StringToChar(dataset_type)); 189 } 190 191 /// \brief Function to create a BatchDataset 192 /// \note Combines batch_size number of consecutive rows into batches 193 /// \param[in] batch_size The number of rows each batch is created with 194 /// \param[in] drop_remainder Determines whether or not to drop the last possibly incomplete 195 /// batch. If true, and if there are less than batch_size rows 196 /// available to make the last batch, then those rows will 197 /// be dropped and not propagated to the next node 198 /// \return Shared pointer to the current BatchDataset 199 /// \par Example 200 /// \code 201 /// /* Create a dataset where every 100 rows is combined into a batch */ 202 /// std::shared_ptr<Dataset> ds = ImageFolder(folder_path, true); 203 /// ds = ds->Batch(100, true); 204 /// \endcode 205 std::shared_ptr<BatchDataset> Batch(int32_t batch_size, bool drop_remainder = false); 206 207 /// \brief Function to create a MapDataset 208 /// \note Applies each operation in operations to this dataset 209 /// \param[in] operations Vector of raw pointers to TensorTransform objects to be applied on the dataset. 
Operations 210 /// are applied in the order they appear in this list 211 /// \param[in] input_columns Vector of the names of the columns that will be passed to the first 212 /// operation as input. The size of this list must match the number of 213 /// input columns expected by the first operator. The default input_columns 214 /// is the first column 215 /// \param[in] output_columns Vector of names assigned to the columns outputted by the last operation 216 /// This parameter is mandatory if len(input_columns) != len(output_columns) 217 /// The size of this list must match the number of output columns of the 218 /// last operation. The default output_columns will have the same 219 /// name as the input columns, i.e., the columns will be replaced 220 /// \param[in] cache Tensor cache to use. (default=nullptr which means no cache is used). 221 /// \param[in] callbacks List of Dataset callbacks to be called. 222 /// \return Shared pointer to the current MapDataset 223 /// \par Example 224 /// \code 225 /// // Create objects for the tensor ops 226 /// std::shared_ptr<TensorTransform> decode_op = std::make_shared<vision::Decode>(true); 227 /// std::shared_ptr<TensorTransform> random_color_op = std::make_shared<vision::RandomColor>(0.0, 0.0); 228 /// 229 /// /* 1) Simple map example */ 230 /// // Apply decode_op on column "image". This column will be replaced by the outputted 231 /// // column of decode_op. 232 /// dataset = dataset->Map({decode_op}, {"image"}); 233 /// 234 /// // Decode and rename column "image" to "decoded_image". 235 /// dataset = dataset->Map({decode_op}, {"image"}, {"decoded_image"}); 236 /// 237 /// /* 2) Map example with more than one operation */ 238 /// // Create a dataset where the images are decoded, then randomly color jittered. 239 /// // decode_op takes column "image" as input and outputs one column. The column 240 /// // outputted by decode_op is passed as input to random_jitter_op. 241 /// // random_jitter_op will output one column. 
Column "image" will be replaced by 242 /// // the column outputted by random_jitter_op (the very last operation). All other 243 /// // columns are unchanged. 244 /// dataset = dataset->Map({decode_op, random_jitter_op}, {"image"}) 245 /// \endcode 246 std::shared_ptr<MapDataset> Map(const std::vector<TensorTransform *> &operations, 247 const std::vector<std::string> &input_columns = {}, 248 const std::vector<std::string> &output_columns = {}, 249 const std::shared_ptr<DatasetCache> &cache = nullptr, 250 const std::vector<std::shared_ptr<DSCallback>> &callbacks = {}) { 251 std::vector<std::shared_ptr<TensorOperation>> transform_ops; 252 (void)std::transform( 253 operations.begin(), operations.end(), std::back_inserter(transform_ops), 254 [](TensorTransform *op) -> std::shared_ptr<TensorOperation> { return op != nullptr ? op->Parse() : nullptr; }); 255 return std::make_shared<MapDataset>(shared_from_this(), transform_ops, VectorStringToChar(input_columns), 256 VectorStringToChar(output_columns), cache, callbacks); 257 } 258 259 /// \brief Function to create a MapDataset 260 /// \note Applies each operation in operations to this dataset 261 /// \param[in] operations Vector of shared pointers to TensorTransform objects to be applied on the dataset. 262 /// Operations are applied in the order they appear in this list 263 /// \param[in] input_columns Vector of the names of the columns that will be passed to the first 264 /// operation as input. The size of this list must match the number of 265 /// input columns expected by the first operator. The default input_columns 266 /// is the first column 267 /// \param[in] output_columns Vector of names assigned to the columns outputted by the last operation 268 /// This parameter is mandatory if len(input_columns) != len(output_columns) 269 /// The size of this list must match the number of output columns of the 270 /// last operation. 
The default output_columns will have the same 271 /// name as the input columns, i.e., the columns will be replaced 272 /// \param[in] cache Tensor cache to use. (default=nullptr which means no cache is used). 273 /// \param[in] callbacks List of Dataset callbacks to be called. 274 /// \return Shared pointer to the current MapDataset 275 std::shared_ptr<MapDataset> Map(const std::vector<std::shared_ptr<TensorTransform>> &operations, 276 const std::vector<std::string> &input_columns = {}, 277 const std::vector<std::string> &output_columns = {}, 278 const std::shared_ptr<DatasetCache> &cache = nullptr, 279 const std::vector<std::shared_ptr<DSCallback>> &callbacks = {}) { 280 std::vector<std::shared_ptr<TensorOperation>> transform_ops; 281 (void)std::transform(operations.begin(), operations.end(), std::back_inserter(transform_ops), 282 [](const std::shared_ptr<TensorTransform> &op) -> std::shared_ptr<TensorOperation> { 283 return op != nullptr ? op->Parse() : nullptr; 284 }); 285 return std::make_shared<MapDataset>(shared_from_this(), transform_ops, VectorStringToChar(input_columns), 286 VectorStringToChar(output_columns), cache, callbacks); 287 } 288 289 /// \brief Function to create a MapDataset 290 /// \note Applies each operation in operations to this dataset 291 /// \param[in] operations Vector of TensorTransform objects to be applied on the dataset. Operations are applied in 292 /// the order they appear in this list 293 /// \param[in] input_columns Vector of the names of the columns that will be passed to the first 294 /// operation as input. The size of this list must match the number of 295 /// input columns expected by the first operator. 
The default input_columns 296 /// is the first column 297 /// \param[in] output_columns Vector of names assigned to the columns outputted by the last operation 298 /// This parameter is mandatory if len(input_columns) != len(output_columns) 299 /// The size of this list must match the number of output columns of the 300 /// last operation. The default output_columns will have the same 301 /// name as the input columns, i.e., the columns will be replaced 302 /// \param[in] cache Tensor cache to use. (default=nullptr which means no cache is used). 303 /// \param[in] callbacks List of Dataset callbacks to be called. 304 /// \return Shared pointer to the current MapDataset 305 std::shared_ptr<MapDataset> Map(const std::vector<std::reference_wrapper<TensorTransform>> &operations, 306 const std::vector<std::string> &input_columns = {}, 307 const std::vector<std::string> &output_columns = {}, 308 const std::shared_ptr<DatasetCache> &cache = nullptr, 309 const std::vector<std::shared_ptr<DSCallback>> &callbacks = {}) { 310 std::vector<std::shared_ptr<TensorOperation>> transform_ops; 311 (void)std::transform(operations.begin(), operations.end(), std::back_inserter(transform_ops), 312 [](TensorTransform &op) -> std::shared_ptr<TensorOperation> { return op.Parse(); }); 313 return std::make_shared<MapDataset>(shared_from_this(), transform_ops, VectorStringToChar(input_columns), 314 VectorStringToChar(output_columns), cache, callbacks); 315 } 316 317 /// \brief Function to create a Project Dataset 318 /// \note Applies project to the dataset 319 /// \param[in] columns The name of columns to project 320 /// \return Shared pointer to the current Dataset 321 /// \par Example 322 /// \code 323 /// /* Reorder the original column names in dataset */ 324 /// std::shared_ptr<Dataset> ds = Mnist(folder_path, "all", std::make_shared<RandomSampler>(false, 10)); 325 /// ds = ds->Project({"label", "image"}); 326 /// \endcode Project(const std::vector<std::string> & columns)327 
std::shared_ptr<ProjectDataset> Project(const std::vector<std::string> &columns) { 328 return std::make_shared<ProjectDataset>(shared_from_this(), VectorStringToChar(columns)); 329 } 330 331 /// \brief Function to create a Shuffle Dataset 332 /// \note Randomly shuffles the rows of this dataset 333 /// \param[in] buffer_size The size of the buffer (must be larger than 1) for shuffling 334 /// \return Shared pointer to the current ShuffleDataset 335 /// \par Example 336 /// \code 337 /// /* Rename the original column names in dataset */ 338 /// std::shared_ptr<Dataset> ds = Mnist(folder_path, "all", std::make_shared<RandomSampler>(false, 10)); 339 /// ds = ds->Rename({"image", "label"}, {"image_output", "label_output"}); 340 /// \endcode Shuffle(int32_t buffer_size)341 std::shared_ptr<ShuffleDataset> Shuffle(int32_t buffer_size) { 342 return std::make_shared<ShuffleDataset>(shared_from_this(), buffer_size); 343 } 344 IRNode()345 std::shared_ptr<DatasetNode> IRNode() { return ir_node_; } 346 347 protected: 348 std::shared_ptr<TreeGetters> tree_getters_; 349 std::shared_ptr<DatasetNode> ir_node_; 350 351 private: 352 // Char interface(CharIF) of GetColumnNames 353 std::vector<std::vector<char>> GetColumnNamesCharIF(); 354 355 // Char interface(CharIF) of GetClassIndexing 356 std::vector<std::pair<std::vector<char>, std::vector<int32_t>>> GetClassIndexingCharIF(); 357 358 // Char interface(CharIF) of CreateIterator 359 std::shared_ptr<Iterator> CreateIteratorCharIF(int32_t num_epochs); 360 361 // Char interface(CharIF) of DeviceQueue 362 bool DeviceQueueCharIF(const std::vector<char> &queue_name, const std::vector<char> &device_type, int32_t device_id, 363 int32_t num_epochs, bool send_epoch_end, int32_t total_batches, bool create_data_info_queue); 364 365 // Char interface(CharIF) of Save 366 bool SaveCharIF(const std::vector<char> &dataset_path, int32_t num_files, const std::vector<char> &dataset_type); 367 }; 368 369 class DATASET_API SchemaObj { 370 public: 371 /// 
\brief Constructor SchemaObj(StringToChar (schema_file))372 explicit SchemaObj(const std::string &schema_file = "") : SchemaObj(StringToChar(schema_file)) {} 373 374 /// \brief Destructor 375 ~SchemaObj() = default; 376 377 /// \brief SchemaObj Init function 378 /// \return bool true if schema initialization is successful 379 Status Init(); 380 381 /// \brief Add new column to the schema with unknown shape of rank 1 382 /// \param[in] name Name of the column. 383 /// \param[in] ms_type Data type of the column(mindspore::DataType). 384 /// \return Status code add_column(const std::string & name,mindspore::DataType ms_type)385 Status add_column(const std::string &name, mindspore::DataType ms_type) { 386 return add_column_char(StringToChar(name), ms_type); 387 } 388 389 /// \brief Add new column to the schema with unknown shape of rank 1 390 /// \param[in] name Name of the column. 391 /// \param[in] ms_type Data type of the column(std::string). 392 /// \param[in] shape Shape of the column. 393 /// \return Status code add_column(const std::string & name,const std::string & ms_type)394 Status add_column(const std::string &name, const std::string &ms_type) { 395 return add_column_char(StringToChar(name), StringToChar(ms_type)); 396 } 397 398 /// \brief Add new column to the schema 399 /// \param[in] name Name of the column. 400 /// \param[in] ms_type Data type of the column(mindspore::DataType). 401 /// \param[in] shape Shape of the column. 402 /// \return Status code add_column(const std::string & name,mindspore::DataType ms_type,const std::vector<int32_t> & shape)403 Status add_column(const std::string &name, mindspore::DataType ms_type, const std::vector<int32_t> &shape) { 404 return add_column_char(StringToChar(name), ms_type, shape); 405 } 406 407 /// \brief Add new column to the schema 408 /// \param[in] name Name of the column. 409 /// \param[in] ms_type Data type of the column(std::string). 410 /// \param[in] shape Shape of the column. 
411 /// \return Status code add_column(const std::string & name,const std::string & ms_type,const std::vector<int32_t> & shape)412 Status add_column(const std::string &name, const std::string &ms_type, const std::vector<int32_t> &shape) { 413 return add_column_char(StringToChar(name), StringToChar(ms_type), shape); 414 } 415 416 /// \brief Get a JSON string of the schema 417 /// \return JSON string of the schema to_json()418 std::string to_json() { return CharToString(to_json_char()); } 419 420 /// \brief Get a JSON string of the schema to_string()421 std::string to_string() { return to_json(); } 422 423 /// \brief Set a new value to dataset_type 424 void set_dataset_type(const std::string &dataset_type); 425 426 /// \brief Set a new value to num_rows 427 void set_num_rows(int32_t num_rows); 428 429 /// \brief Get the current num_rows 430 int32_t get_num_rows() const; 431 432 /// \brief Get schema file from JSON file 433 /// \param[in] json_string Name of JSON file to be parsed. 434 /// \return Status code FromJSONString(const std::string & json_string)435 Status FromJSONString(const std::string &json_string) { return FromJSONStringCharIF(StringToChar(json_string)); } 436 437 /// \brief Parse and add column information 438 /// \param[in] json_string Name of JSON string for column dataset attribute information, decoded from schema file. 
439 /// \return Status code ParseColumnString(const std::string & json_string)440 Status ParseColumnString(const std::string &json_string) { 441 return ParseColumnStringCharIF(StringToChar(json_string)); 442 } 443 444 private: 445 // Char constructor of SchemaObj 446 explicit SchemaObj(const std::vector<char> &schema_file); 447 448 // Char interface of add_column 449 Status add_column_char(const std::vector<char> &name, mindspore::DataType ms_type); 450 451 Status add_column_char(const std::vector<char> &name, const std::vector<char> &ms_type); 452 453 Status add_column_char(const std::vector<char> &name, mindspore::DataType ms_type, const std::vector<int32_t> &shape); 454 455 Status add_column_char(const std::vector<char> &name, const std::vector<char> &ms_type, 456 const std::vector<int32_t> &shape); 457 458 // Char interface of to_json 459 const std::vector<char> to_json_char(); 460 461 // Char interface of FromJSONString 462 Status FromJSONStringCharIF(const std::vector<char> &json_string); 463 464 // Char interface of ParseColumnString 465 Status ParseColumnStringCharIF(const std::vector<char> &json_string); 466 467 struct Data; 468 std::shared_ptr<Data> data_; 469 }; 470 471 class DATASET_API BatchDataset : public Dataset { 472 public: 473 BatchDataset(const std::shared_ptr<Dataset> &input, int32_t batch_size, bool drop_remainder = false); 474 475 ~BatchDataset() override = default; 476 }; 477 478 class DATASET_API MapDataset : public Dataset { 479 public: 480 MapDataset(const std::shared_ptr<Dataset> &input, const std::vector<std::shared_ptr<TensorOperation>> &operations, 481 const std::vector<std::vector<char>> &input_columns, const std::vector<std::vector<char>> &output_columns, 482 const std::shared_ptr<DatasetCache> &cache, const std::vector<std::shared_ptr<DSCallback>> &callbacks); 483 484 ~MapDataset() override = default; 485 }; 486 487 class DATASET_API ProjectDataset : public Dataset { 488 public: 489 ProjectDataset(const std::shared_ptr<Dataset> 
&input, const std::vector<std::vector<char>> &columns); 490 491 ~ProjectDataset() override = default; 492 }; 493 494 class DATASET_API ShuffleDataset : public Dataset { 495 public: 496 ShuffleDataset(const std::shared_ptr<Dataset> &input, int32_t buffer_size); 497 498 ~ShuffleDataset() override = default; 499 }; 500 501 /// \brief Function to create a SchemaObj. 502 /// \param[in] schema_file Path of schema file. 503 /// \note The reason for using this API is that std::string will be constrained by the 504 /// compiler option '_GLIBCXX_USE_CXX11_ABI' while char is free of this restriction. 505 /// \return Shared pointer to the current schema. 506 std::shared_ptr<SchemaObj> DATASET_API SchemaCharIF(const std::vector<char> &schema_file); 507 508 /// \brief Function to create a SchemaObj. 509 /// \param[in] schema_file Path of schema file. 510 /// \return Shared pointer to the current schema. 511 inline std::shared_ptr<SchemaObj> DATASET_API Schema(const std::string &schema_file = "") { 512 return SchemaCharIF(StringToChar(schema_file)); 513 } 514 515 class DATASET_API AlbumDataset : public Dataset { 516 public: 517 /// \brief Constructor of AlbumDataset. 518 /// \param[in] dataset_dir Path to the root directory that contains the dataset. 519 /// \param[in] data_schema Path to dataset schema file. 520 /// \param[in] column_names Column names used to specify columns to load, if empty, will read all columns 521 /// (default = {}). 522 /// \param[in] decode The option to decode the images in dataset (default = false). 523 /// \param[in] sampler Shared pointer to a sampler object used to choose samples from the dataset. If sampler is not 524 /// given, a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()). 525 /// \param[in] cache Tensor cache to use (default=nullptr which means no cache is used). 
526 AlbumDataset(const std::vector<char> &dataset_dir, const std::vector<char> &data_schema, 527 const std::vector<std::vector<char>> &column_names, bool decode, const std::shared_ptr<Sampler> &sampler, 528 const std::shared_ptr<DatasetCache> &cache); 529 530 /// \brief Constructor of AlbumDataset. 531 /// \param[in] dataset_dir Path to the root directory that contains the dataset. 532 /// \param[in] data_schema Path to dataset schema file. 533 /// \param[in] column_names Column names used to specify columns to load. 534 /// \param[in] decode The option to decode the images in dataset. 535 /// \param[in] sampler Raw pointer to a sampler object used to choose samples from the dataset. 536 /// \param[in] cache Tensor cache to use (default=nullptr which means no cache is used). 537 AlbumDataset(const std::vector<char> &dataset_dir, const std::vector<char> &data_schema, 538 const std::vector<std::vector<char>> &column_names, bool decode, const Sampler *sampler, 539 const std::shared_ptr<DatasetCache> &cache); 540 541 /// \brief Constructor of AlbumDataset. 542 /// \param[in] dataset_dir Path to the root directory that contains the dataset. 543 /// \param[in] data_schema Path to dataset schema file. 544 /// \param[in] column_names Column names used to specify columns to load. 545 /// \param[in] decode The option to decode the images in dataset. 546 /// \param[in] sampler Sampler object used to choose samples from the dataset. 547 /// \param[in] cache Tensor cache to use (default=nullptr which means no cache is used). 548 AlbumDataset(const std::vector<char> &dataset_dir, const std::vector<char> &data_schema, 549 const std::vector<std::vector<char>> &column_names, bool decode, 550 const std::reference_wrapper<Sampler> &sampler, const std::shared_ptr<DatasetCache> &cache); 551 552 /// \brief Destructor of AlbumDataset. 
553 ~AlbumDataset() override = default; 554 }; 555 556 /// \brief Function to create an AlbumDataset 557 /// \note The generated dataset is specified through setting a schema 558 /// \param[in] dataset_dir Path to the root directory that contains the dataset 559 /// \param[in] data_schema Path to dataset schema file 560 /// \param[in] column_names Column names used to specify columns to load, if empty, will read all columns. 561 /// (default = {}) 562 /// \param[in] decode the option to decode the images in dataset (default = false) 563 /// \param[in] sampler Shared pointer to a sampler object used to choose samples from the dataset. If sampler is not 564 /// given, a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()) 565 /// \param[in] cache Tensor cache to use. (default=nullptr which means no cache is used). 566 /// \return Shared pointer to the current Dataset 567 /// \par Example 568 /// \code 569 /// /* Define dataset path and MindData object */ 570 /// std::string folder_path = "/path/to/album_dataset_directory"; 571 /// std::string schema_file = "/path/to/album_schema_file"; 572 /// std::vector<std::string> column_names = {"image", "label", "id"}; 573 /// std::shared_ptr<Dataset> ds = Album(folder_path, schema_file, column_names); 574 /// 575 /// /* Create iterator to read dataset */ 576 /// std::shared_ptr<Iterator> iter = ds->CreateIterator(); 577 /// std::unordered_map<std::string, mindspore::MSTensor> row; 578 /// iter->GetNextRow(&row); 579 /// 580 /// /* Note: As we defined before, each data dictionary owns keys "image", "label" and "id" */ 581 /// auto image = row["image"]; 582 /// \endcode 583 inline std::shared_ptr<AlbumDataset> DATASET_API 584 Album(const std::string &dataset_dir, const std::string &data_schema, const std::vector<std::string> &column_names = {}, 585 bool decode = false, const std::shared_ptr<Sampler> &sampler = std::make_shared<RandomSampler>(), 586 const std::shared_ptr<DatasetCache> 
&cache = nullptr) { 587 return std::make_shared<AlbumDataset>(StringToChar(dataset_dir), StringToChar(data_schema), 588 VectorStringToChar(column_names), decode, sampler, cache); 589 } 590 591 /// \brief Function to create an AlbumDataset 592 /// \note The generated dataset is specified through setting a schema 593 /// \param[in] dataset_dir Path to the root directory that contains the dataset 594 /// \param[in] data_schema Path to dataset schema file 595 /// \param[in] column_names Column names used to specify columns to load 596 /// \param[in] decode the option to decode the images in dataset 597 /// \param[in] sampler Raw pointer to a sampler object used to choose samples from the dataset. 598 /// \param[in] cache Tensor cache to use. (default=nullptr which means no cache is used). 599 /// \return Shared pointer to the current Dataset 600 inline std::shared_ptr<AlbumDataset> DATASET_API Album(const std::string &dataset_dir, const std::string &data_schema, 601 const std::vector<std::string> &column_names, bool decode, 602 const Sampler *sampler, 603 const std::shared_ptr<DatasetCache> &cache = nullptr) { 604 return std::make_shared<AlbumDataset>(StringToChar(dataset_dir), StringToChar(data_schema), 605 VectorStringToChar(column_names), decode, sampler, cache); 606 } 607 608 /// \brief Function to create an AlbumDataset 609 /// \note The generated dataset is specified through setting a schema 610 /// \param[in] dataset_dir Path to the root directory that contains the dataset 611 /// \param[in] data_schema Path to dataset schema file 612 /// \param[in] column_names Column names used to specify columns to load 613 /// \param[in] decode the option to decode the images in dataset 614 /// \param[in] sampler Sampler object used to choose samples from the dataset. 615 /// \param[in] cache Tensor cache to use. (default=nullptr which means no cache is used). 
616 /// \return Shared pointer to the current Dataset 617 inline std::shared_ptr<AlbumDataset> DATASET_API Album(const std::string &dataset_dir, const std::string &data_schema, 618 const std::vector<std::string> &column_names, bool decode, 619 const std::reference_wrapper<Sampler> sampler, 620 const std::shared_ptr<DatasetCache> &cache = nullptr) { 621 return std::make_shared<AlbumDataset>(StringToChar(dataset_dir), StringToChar(data_schema), 622 VectorStringToChar(column_names), decode, sampler, cache); 623 } 624 625 class DATASET_API MnistDataset : public Dataset { 626 public: 627 /// \brief Constructor of MnistDataset. 628 /// \param[in] dataset_dir Path to the root directory that contains the dataset. 629 /// \param[in] usage Part of dataset of MNIST, can be "train", "test" or "all" (default = "all"). 630 /// \param[in] sampler Shared pointer to a sampler object used to choose samples from the dataset. If sampler is not 631 /// given, a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()). 632 /// \param[in] cache Tensor cache to use (default=nullptr which means no cache is used). 633 MnistDataset(const std::vector<char> &dataset_dir, const std::vector<char> &usage, 634 const std::shared_ptr<Sampler> &sampler, const std::shared_ptr<DatasetCache> &cache); 635 636 /// \brief Constructor of MnistDataset. 637 /// \param[in] dataset_dir Path to the root directory that contains the dataset. 638 /// \param[in] usage Part of dataset of MNIST, can be "train", "test" or "all". 639 /// \param[in] sampler Raw pointer to a sampler object used to choose samples from the dataset. 640 /// \param[in] cache Tensor cache to use (default=nullptr which means no cache is used). 641 MnistDataset(const std::vector<char> &dataset_dir, const std::vector<char> &usage, const Sampler *sampler, 642 const std::shared_ptr<DatasetCache> &cache); 643 644 /// \brief Constructor of MnistDataset. 
645 /// \param[in] dataset_dir Path to the root directory that contains the dataset. 646 /// \param[in] usage Part of dataset of MNIST, can be "train", "test" or "all". 647 /// \param[in] sampler Sampler object used to choose samples from the dataset. 648 /// \param[in] cache Tensor cache to use (default=nullptr which means no cache is used). 649 MnistDataset(const std::vector<char> &dataset_dir, const std::vector<char> &usage, 650 const std::reference_wrapper<Sampler> &sampler, const std::shared_ptr<DatasetCache> &cache); 651 652 /// Destructor of MnistDataset. 653 ~MnistDataset() override = default; 654 }; 655 656 /// \brief Function to create a MnistDataset 657 /// \note The generated dataset has two columns ["image", "label"] 658 /// \param[in] dataset_dir Path to the root directory that contains the dataset 659 /// \param[in] usage of MNIST, can be "train", "test" or "all" (default = "all"). 660 /// \param[in] sampler Shared pointer to a sampler object used to choose samples from the dataset. If sampler is not 661 /// given, a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()) 662 /// \param[in] cache Tensor cache to use. (default=nullptr which means no cache is used). 
663 /// \return Shared pointer to the current MnistDataset 664 /// \par Example 665 /// \code 666 /// /* Define dataset path and MindData object */ 667 /// std::string folder_path = "/path/to/mnist_dataset_directory"; 668 /// std::shared_ptr<Dataset> ds = Mnist(folder_path, "all", std::make_shared<RandomSampler>(false, 20)); 669 /// 670 /// /* Create iterator to read dataset */ 671 /// std::shared_ptr<Iterator> iter = ds->CreateIterator(); 672 /// std::unordered_map<std::string, mindspore::MSTensor> row; 673 /// iter->GetNextRow(&row); 674 /// 675 /// /* Note: In MNIST dataset, each dictionary has keys "image" and "label" */ 676 /// auto image = row["image"]; 677 /// \endcode 678 inline std::shared_ptr<MnistDataset> DATASET_API 679 Mnist(const std::string &dataset_dir, const std::string &usage = "all", 680 const std::shared_ptr<Sampler> &sampler = std::make_shared<RandomSampler>(), 681 const std::shared_ptr<DatasetCache> &cache = nullptr) { 682 return std::make_shared<MnistDataset>(StringToChar(dataset_dir), StringToChar(usage), sampler, cache); 683 } 684 685 /// \brief Function to create a MnistDataset 686 /// \note The generated dataset has two columns ["image", "label"] 687 /// \param[in] dataset_dir Path to the root directory that contains the dataset 688 /// \param[in] usage of MNIST, can be "train", "test" or "all" 689 /// \param[in] sampler Raw pointer to a sampler object used to choose samples from the dataset. 690 /// \param[in] cache Tensor cache to use. (default=nullptr which means no cache is used). 
691 /// \return Shared pointer to the current MnistDataset 692 inline std::shared_ptr<MnistDataset> DATASET_API Mnist(const std::string &dataset_dir, const std::string &usage, 693 const Sampler *sampler, 694 const std::shared_ptr<DatasetCache> &cache = nullptr) { 695 return std::make_shared<MnistDataset>(StringToChar(dataset_dir), StringToChar(usage), sampler, cache); 696 } 697 698 /// \brief Function to create a MnistDataset 699 /// \note The generated dataset has two columns ["image", "label"] 700 /// \param[in] dataset_dir Path to the root directory that contains the dataset 701 /// \param[in] usage of MNIST, can be "train", "test" or "all" 702 /// \param[in] sampler Sampler object used to choose samples from the dataset. 703 /// \param[in] cache Tensor cache to use. (default=nullptr which means no cache is used). 704 /// \return Shared pointer to the current MnistDataset 705 inline std::shared_ptr<MnistDataset> DATASET_API Mnist(const std::string &dataset_dir, const std::string &usage, 706 const std::reference_wrapper<Sampler> sampler, 707 const std::shared_ptr<DatasetCache> &cache = nullptr) { 708 return std::make_shared<MnistDataset>(StringToChar(dataset_dir), StringToChar(usage), sampler, cache); 709 } 710 } // namespace dataset 711 } // namespace mindspore 712 #endif // MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_DATASET_DATASETS_H_ 713