1 /** 2 * Copyright 2020-2024 Huawei Technologies Co., Ltd 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_DATASET_VISION_H_ 18 #define MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_DATASET_VISION_H_ 19 20 #include <map> 21 #include <memory> 22 #include <string> 23 #include <tuple> 24 #include <utility> 25 #include <vector> 26 27 #include "include/api/dual_abi_helper.h" 28 #include "include/api/status.h" 29 #include "include/dataset/constants.h" 30 #include "include/dataset/transforms.h" 31 #include "include/dataset/vision_lite.h" 32 33 namespace mindspore { 34 namespace dataset { 35 class TensorOperation; 36 37 // Transform operations for performing computer vision. 38 namespace vision { 39 /// \brief Apply brightness adjustment on input image. 40 class DATASET_API AdjustBrightness final : public TensorTransform { 41 public: 42 /// \brief Constructor. 43 /// \param[in] brightness_factor Adjusts image brightness, non negative real number. 44 /// \par Example 45 /// \code 46 /// /* Define operations */ 47 /// auto decode_op = vision::Decode(); 48 /// auto adjust_brightness_op = vision::AdjustBrightness(2.0); 49 /// 50 /// /* dataset is an instance of Dataset object */ 51 /// dataset = dataset->Map({decode_op, adjust_brightness_op}, // operations 52 /// {"image"}); // input columns 53 /// \endcode 54 explicit AdjustBrightness(float brightness_factor); 55 56 /// \brief Destructor. 57 ~AdjustBrightness() override = default; 58 59 protected: 60 /// \brief Function to convert TensorTransform object into a TensorOperation object. 61 /// \return Shared pointer to TensorOperation object. 62 std::shared_ptr<TensorOperation> Parse() override; 63 64 private: 65 struct Data; 66 std::shared_ptr<Data> data_; 67 }; 68 69 /// \brief Apply contrast adjustment on input image. 70 class DATASET_API AdjustContrast final : public TensorTransform { 71 public: 72 /// \brief Constructor. 73 /// \param[in] contrast_factor Adjusts image contrast, non negative real number. 74 /// \par Example 75 /// \code 76 /// /* Define operations */ 77 /// auto decode_op = vision::Decode(); 78 /// auto adjust_contrast_op = vision::AdjustContrast(10.0); 79 /// 80 /// /* dataset is an instance of Dataset object */ 81 /// dataset = dataset->Map({decode_op, adjust_contrast_op}, // operations 82 /// {"image"}); // input columns 83 /// \endcode 84 explicit AdjustContrast(float contrast_factor); 85 86 /// \brief Destructor. 87 ~AdjustContrast() override = default; 88 89 protected: 90 /// \brief Function to convert TensorTransform object into a TensorOperation object. 91 /// \return Shared pointer to TensorOperation object. 92 std::shared_ptr<TensorOperation> Parse() override; 93 94 private: 95 struct Data; 96 std::shared_ptr<Data> data_; 97 }; 98 99 /// \brief AdjustGamma TensorTransform. 100 /// \note Apply gamma correction on input image. 101 class DATASET_API AdjustGamma final : public TensorTransform { 102 public: 103 /// \brief Constructor. 104 /// \param[in] gamma Non negative real number, which makes the output image pixel value 105 /// exponential in relation to the input image pixel value. 106 /// \param[in] gain The constant multiplier. Default: 1.0. 107 /// \par Example 108 /// \code 109 /// /* Define operations */ 110 /// auto decode_op = vision::Decode(); 111 /// auto adjust_gamma_op = vision::AdjustGamma(10.0); 112 /// 113 /// /* dataset is an instance of Dataset object */ 114 /// dataset = dataset->Map({decode_op, adjust_gamma_op}, // operations 115 /// {"image"}); // input columns 116 /// \endcode 117 explicit AdjustGamma(float gamma, float gain = 1.0); 118 119 /// \brief Destructor. 120 ~AdjustGamma() override = default; 121 122 protected: 123 /// \brief Function to convert TensorTransform object into a TensorOperation object. 124 /// \return Shared pointer to TensorOperation object. 125 std::shared_ptr<TensorOperation> Parse() override; 126 127 private: 128 struct Data; 129 std::shared_ptr<Data> data_; 130 }; 131 132 /// \note Apply hue adjustment on input image. 133 class DATASET_API AdjustHue final : public TensorTransform { 134 public: 135 /// \brief Constructor. 136 /// \param[in] hue_factor How much to shift the hue channel, must be in the interval [-0.5, 0.5]. 137 /// \par Example 138 /// \code 139 /// /* Define operations */ 140 /// auto decode_op = vision::Decode(); 141 /// auto adjust_hue_op = vision::AdjustHue(0.2); 142 /// 143 /// /* dataset is an instance of Dataset object */ 144 /// dataset = dataset->Map({decode_op, adjust_contrast_op}, // operations 145 /// {"image"}); // input columns 146 /// \endcode 147 explicit AdjustHue(float hue_factor); 148 149 /// \brief Destructor. 150 ~AdjustHue() override = default; 151 152 protected: 153 /// \brief Function to convert TensorTransform object into a TensorOperation object. 154 /// \return Shared pointer to TensorOperation object. 155 std::shared_ptr<TensorOperation> Parse() override; 156 157 private: 158 struct Data; 159 std::shared_ptr<Data> data_; 160 }; 161 162 /// \brief Adjust the color saturation of the input image. 163 class DATASET_API AdjustSaturation final : public TensorTransform { 164 public: 165 /// \brief Constructor. 166 /// \param[in] saturation_factor Adjust image saturation, non negative real number. 167 /// \par Example 168 /// \code 169 /// /* Define operations */ 170 /// auto decode_op = vision::Decode(); 171 /// auto adjust_saturation_op = vision::AdjustSaturation(2.0); 172 /// 173 /// /* dataset is an instance of Dataset object */ 174 /// dataset = dataset->Map({decode_op, adjust_saturation_op}, // operations 175 /// {"image"}); // input columns 176 /// \endcode 177 explicit AdjustSaturation(float saturation_factor); 178 179 /// \brief Destructor. 180 ~AdjustSaturation() override = default; 181 182 protected: 183 /// \brief Function to convert TensorTransform object into a TensorOperation object. 184 /// \return Shared pointer to TensorOperation object. 185 std::shared_ptr<TensorOperation> Parse() override; 186 187 private: 188 struct Data; 189 std::shared_ptr<Data> data_; 190 }; 191 192 /// \brief Apply adjust sharpness on input image. Input image is expected to be in [H, W, C] or [H, W] format. 193 class DATASET_API AdjustSharpness final : public TensorTransform { 194 public: 195 /// \brief Constructor. 196 /// \param[in] sharpness_factor How much to adjust the sharpness. Can be any Non negative real number. 197 /// 0 gives a blurred image, 1 gives the original image while 2 increases the Sharpness by a factor of 2. 198 /// \par Example 199 /// \code 200 /// /* Define operations */ 201 /// auto decode_op = vision::Decode(); 202 /// auto adjust_sharpness_op = vision::AdjustSharpness(2.0); 203 /// 204 /// /* dataset is an instance of Dataset object */ 205 /// dataset = dataset->Map({decode_op, adjust_sharpness_op}, // operations 206 /// {"image"}); // input columns 207 /// \endcode 208 explicit AdjustSharpness(float sharpness_factor); 209 210 /// \brief Destructor. 211 ~AdjustSharpness() override = default; 212 213 protected: 214 /// \brief Function to convert TensorTransform object into a TensorOperation object. 215 /// \return Shared pointer to TensorOperation object. 216 std::shared_ptr<TensorOperation> Parse() override; 217 218 private: 219 struct Data; 220 std::shared_ptr<Data> data_; 221 }; 222 223 /// \brief Apply AutoAugment data augmentation method. 224 class DATASET_API AutoAugment final : public TensorTransform { 225 public: 226 /// \brief Constructor. 227 /// \param[in] policy An enum for the data auto augmentation policy (default=AutoAugmentPolicy::kImageNet). 228 /// - AutoAugmentPolicy::kImageNet, AutoAugment policy learned on the ImageNet dataset. 229 /// - AutoAugmentPolicy::kCifar10, AutoAugment policy learned on the Cifar10 dataset. 230 /// - AutoAugmentPolicy::kSVHN, AutoAugment policy learned on the SVHN dataset. 231 /// \param[in] interpolation An enum for the mode of interpolation (default=InterpolationMode::kNearestNeighbour). 232 /// - InterpolationMode::kNearestNeighbour, Interpolation method is nearest-neighbor interpolation. 233 /// - InterpolationMode::kLinear, Interpolation method is blinear interpolation. 234 /// - InterpolationMode::kCubic, Interpolation method is bicubic interpolation. 235 /// - InterpolationMode::kArea, Interpolation method is pixel area interpolation. 236 /// \param[in] fill_value A vector representing the pixel intensity of the borders (default={0, 0, 0}). 237 /// \par Example 238 /// \code 239 /// /* Define operations */ 240 /// auto decode_op = vision::Decode(); 241 /// auto auto_augment_op = vision::AutoAugment(AutoAugmentPolicy::kImageNet, 242 /// InterpolationMode::kNearestNeighbour, {0, 0, 0}); 243 /// /* dataset is an instance of Dataset object */ 244 /// dataset = dataset->Map({decode_op, auto_augment_op}, // operations 245 /// {"image"}); // input columns 246 /// \endcode 247 explicit AutoAugment(AutoAugmentPolicy policy = AutoAugmentPolicy::kImageNet, 248 InterpolationMode interpolation = InterpolationMode::kNearestNeighbour, 249 const std::vector<uint8_t> &fill_value = {0, 0, 0}); 250 251 /// \brief Destructor. 252 ~AutoAugment() override = default; 253 254 protected: 255 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 256 /// \return Shared pointer to TensorOperation object. 257 std::shared_ptr<TensorOperation> Parse() override; 258 259 private: 260 struct Data; 261 std::shared_ptr<Data> data_; 262 }; 263 264 /// \brief Apply automatic contrast on the input image. 265 class DATASET_API AutoContrast final : public TensorTransform { 266 public: 267 /// \brief Constructor. 268 /// \param[in] cutoff Percent of pixels to cut off from the histogram, the valid range of cutoff value is 0 to 50. 269 /// \param[in] ignore Pixel values to ignore. 270 /// \par Example 271 /// \code 272 /// /* Define operations */ 273 /// auto decode_op = vision::Decode(); 274 /// auto autocontrast_op = vision::AutoContrast(10.0, {10, 20}); 275 /// 276 /// /* dataset is an instance of Dataset object */ 277 /// dataset = dataset->Map({decode_op, autocontrast_op}, // operations 278 /// {"image"}); // input columns 279 /// \endcode 280 explicit AutoContrast(float cutoff = 0.0, const std::vector<uint32_t> &ignore = {}); 281 282 /// \brief Destructor. 283 ~AutoContrast() override = default; 284 285 protected: 286 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 287 /// \return Shared pointer to TensorOperation object. 288 std::shared_ptr<TensorOperation> Parse() override; 289 290 private: 291 struct Data; 292 std::shared_ptr<Data> data_; 293 }; 294 295 /// \brief BoundingBoxAugment TensorTransform. 296 /// \note Apply a given image transform on a random selection of bounding box regions of a given image. 297 class DATASET_API BoundingBoxAugment final : public TensorTransform { 298 public: 299 /// \brief Constructor. 300 /// \param[in] transform Raw pointer to the TensorTransform operation. 301 /// \param[in] ratio Ratio of bounding boxes to apply augmentation on. Range: [0, 1] (default=0.3). 302 /// \par Example 303 /// \code 304 /// /* Define operations */ 305 /// TensorTransform *rotate_op = new vision::RandomRotation({-180, 180}); 306 /// auto bbox_aug_op = vision::BoundingBoxAugment(rotate_op, 0.5); 307 /// 308 /// /* dataset is an instance of Dataset object */ 309 /// dataset = dataset->Map({bbox_aug_op}, // operations 310 /// {"image", "bbox"}); // input columns 311 /// \endcode 312 explicit BoundingBoxAugment(TensorTransform *transform, float ratio = 0.3); 313 314 /// \brief Constructor. 315 /// \param[in] transform Smart pointer to the TensorTransform operation. 316 /// \param[in] ratio Ratio of bounding boxes where augmentation is applied to. Range: [0, 1] (default=0.3). 317 /// \par Example 318 /// \code 319 /// /* Define operations */ 320 /// std::shared_ptr<TensorTransform> flip_op = std::make_shared<vision::RandomHorizontalFlip>(0.5); 321 /// std::shared_ptr<TensorTransform> bbox_aug_op = std::make_shared<vision::BoundingBoxAugment>(flip_op, 0.1); 322 /// 323 /// /* dataset is an instance of Dataset object */ 324 /// dataset = dataset->Map({bbox_aug_op}, // operations 325 /// {"image", "bbox"}); // input columns 326 /// \endcode 327 explicit BoundingBoxAugment(const std::shared_ptr<TensorTransform> &transform, float ratio = 0.3); 328 329 /// \brief Constructor. 330 /// \param[in] transform Object pointer to the TensorTransform operation. 331 /// \param[in] ratio Ratio of bounding boxes where augmentation is applied to. Range: [0, 1] (default=0.3). 332 /// \par Example 333 /// \code 334 /// /* Define operations */ 335 /// vision::RandomColor random_color_op = vision::RandomColor(0.5, 1.0); 336 /// vision::BoundingBoxAugment bbox_aug_op = vision::BoundingBoxAugment(random_color_op, 0.8); 337 /// 338 /// /* dataset is an instance of Dataset object */ 339 /// dataset = dataset->Map({bbox_aug_op}, // operations 340 /// {"image", "bbox"}); // input columns 341 /// \endcode 342 explicit BoundingBoxAugment(const std::reference_wrapper<TensorTransform> &transform, float ratio = 0.3); 343 344 /// \brief Destructor. 345 ~BoundingBoxAugment() override = default; 346 347 protected: 348 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 349 /// \return Shared pointer to TensorOperation object. 350 std::shared_ptr<TensorOperation> Parse() override; 351 352 private: 353 struct Data; 354 std::shared_ptr<Data> data_; 355 }; 356 357 /// \brief Change the color space of the image. 358 class DATASET_API ConvertColor final : public TensorTransform { 359 public: 360 /// \brief Constructor. 361 /// \param[in] convert_mode The mode of image channel conversion. 362 /// \par Example 363 /// \code 364 /// /* dataset is an instance of Dataset object */ 365 /// dataset = dataset->Map({std::make_shared<vision::Decode>(), 366 /// std::make_shared<vision::ConvertColor>(ConvertMode::COLOR_BGR2RGB)}, // operations 367 /// {"image"}); // input columns 368 /// \endcode 369 explicit ConvertColor(ConvertMode convert_mode); 370 371 /// \brief Destructor. 372 ~ConvertColor() override = default; 373 374 protected: 375 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 376 /// \return Shared pointer to TensorOperation object. 377 std::shared_ptr<TensorOperation> Parse() override; 378 379 private: 380 struct Data; 381 std::shared_ptr<Data> data_; 382 }; 383 384 /// \brief Mask a random section of each image with the corresponding part of another randomly 385 /// selected image in that batch. 386 class DATASET_API CutMixBatch final : public TensorTransform { 387 public: 388 /// \brief Constructor. 389 /// \param[in] image_batch_format The format of the batch. 390 /// \param[in] alpha The hyperparameter of beta distribution (default = 1.0). 391 /// \param[in] prob The probability by which CutMix is applied to each image (default = 1.0). 392 /// \par Example 393 /// \code 394 /// /* dataset is an instance of Dataset object */ 395 /// dataset = dataset->Batch(5); 396 /// dataset = dataset->Map({std::make_shared<vision::CutMixBatch>(ImageBatchFormat::kNHWC)}, // operations 397 /// {"image", "label"}); // input columns 398 /// \endcode 399 explicit CutMixBatch(ImageBatchFormat image_batch_format, float alpha = 1.0, float prob = 1.0); 400 401 /// \brief Destructor. 402 ~CutMixBatch() override = default; 403 404 protected: 405 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 406 /// \return Shared pointer to TensorOperation object. 407 std::shared_ptr<TensorOperation> Parse() override; 408 409 private: 410 struct Data; 411 std::shared_ptr<Data> data_; 412 }; 413 414 /// \brief Randomly cut (mask) out a given number of square patches from the input image. 415 class DATASET_API CutOut final : public TensorTransform { 416 public: 417 /// \brief Constructor. 418 /// \param[in] length Integer representing the side length of each square patch. 419 /// \param[in] num_patches Integer representing the number of patches to be cut out of an image. 420 /// \param[in] is_hwc A boolean to indicate whether the input image is in HWC format (true) or CHW 421 /// format (false) (default = true). 422 /// \par Example 423 /// \code 424 /// /* dataset is an instance of Dataset object */ 425 /// dataset = dataset->Map({std::make_shared<vision::Decode>(), 426 /// std::make_shared<vision::CutOut>(1, 4, true)}, // operations 427 /// {"image"}); // input columns 428 /// \endcode 429 explicit CutOut(int32_t length, int32_t num_patches = 1, bool is_hwc = true); 430 431 /// \brief Destructor. 432 ~CutOut() override = default; 433 434 protected: 435 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 436 /// \return Shared pointer to TensorOperation object. 437 std::shared_ptr<TensorOperation> Parse() override; 438 439 private: 440 struct Data; 441 std::shared_ptr<Data> data_; 442 }; 443 444 /// \brief Decode the input video. 445 class DATASET_API DecodeVideo final : public TensorTransform { 446 public: 447 /// \brief Constructor. It will decode a vector containing a raw video tensor into a vector containing two tensors. 448 /// The raw video tensor in the input vector should be 1D array of UINT8. 449 /// The first tensor in the output vector is a visual tensor, the shape is <T,H,W,C>, the type is DE_UINT8. Pixel 450 /// order is RGB. The second tensor in the output vector is an audio tensor, the shape is <C, L>. 451 /// \par Example 452 /// \code 453 /// /* Read video file into tensor */ 454 /// mindspore::MSTensor video; 455 /// ASSERT_OK(mindspore::dataset::vision::ReadFile("/path/to/video/file", &video)); 456 /// std::vector<mindspore::MSTensor> input_tensor; 457 /// std::vector<mindspore::MSTensor> output_tensor; 458 /// input_tensor.push_back(video); 459 /// auto decode_video = vision::DecodeVideo(); 460 /// auto transform = Execute(decode_video); 461 /// Status rc = transform(input_tensor, &output_tensor); 462 /// \endcode 463 DecodeVideo(); 464 465 /// \brief Destructor. 466 ~DecodeVideo() = default; 467 468 protected: 469 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 470 /// \return Shared pointer to TensorOperation object. 471 std::shared_ptr<TensorOperation> Parse() override; 472 }; 473 474 /// \brief Encode the image as JPEG data. 475 /// \param[in] image The image to be encoded. 476 /// \param[out] output The Tensor data. 477 /// \param[in] quality The quality for the output tensor, in range of [1, 100]. Default: 75. 478 /// \return The status code. 479 Status DATASET_API EncodeJpeg(const mindspore::MSTensor &image, mindspore::MSTensor *output, int quality = 75); 480 481 /// \brief Encode the image as PNG data. 482 /// \param[in] image The image to be encoded. 483 /// \param[out] output The Tensor data. 484 /// \param[in] compression_level The compression_level for encoding, in range of [0, 9]. Default: 6. 485 /// \return The status code. 486 Status DATASET_API EncodePng(const mindspore::MSTensor &image, mindspore::MSTensor *output, int compression_level = 6); 487 488 /// \brief Apply histogram equalization on the input image. 489 class DATASET_API Equalize final : public TensorTransform { 490 public: 491 /// \brief Constructor. 492 /// \par Example 493 /// \code 494 /// /* dataset is an instance of Dataset object */ 495 /// dataset = dataset->Map({std::make_shared<vision::Decode>(), 496 /// std::make_shared<vision::Equalize>()}, // operations 497 /// {"image"}); // input columns 498 /// \endcode 499 Equalize(); 500 501 /// \brief Destructor. 502 ~Equalize() override = default; 503 504 protected: 505 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 506 /// \return Shared pointer to TensorOperation object. 507 std::shared_ptr<TensorOperation> Parse() override; 508 }; 509 510 /// \brief Erase the input image with given value. 511 class DATASET_API Erase final : public TensorTransform { 512 public: 513 /// \brief Constructor. 514 /// \param[in] top Vertical ordinate of the upper left corner of erased region. 515 /// \param[in] left Horizontal ordinate of the upper left corner of erased region. 516 /// \param[in] height Height of erased region. 517 /// \param[in] width Width of erased region. 518 /// \param[in] value Pixel value used to pad the erased area. 519 /// If a single integer is provided, it will be used for all RGB channels. 520 /// If a sequence of length 3 is provided, it will be used for R, G, B channels respectively. Default: 0. 521 /// \param[in] inplace Whether to erase inplace. Default: False. 522 /// \par Example 523 /// \code 524 /// /* dataset is an instance of Dataset object */ 525 /// dataset = dataset->Map({std::make_shared<vision::Decode>(), 526 /// std::make_shared<vision::Erase>(10, 10, 10, 10)}, // operations 527 /// {"image"}); // input columns 528 /// \endcode 529 Erase(int32_t top, int32_t left, int32_t height, int32_t width, const std::vector<float> &value = {0., 0., 0.}, 530 bool inplace = false); 531 532 /// \brief Destructor. 533 ~Erase() override = default; 534 535 protected: 536 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 537 /// \return Shared pointer to TensorOperation object. 538 std::shared_ptr<TensorOperation> Parse() override; 539 540 private: 541 struct Data; 542 std::shared_ptr<Data> data_; 543 }; 544 545 /// \brief Get the number of input image channels. 546 /// \param[in] image Tensor of the image. 547 /// \param[out] channels Channels of the image. 548 /// \return The status code. 549 Status DATASET_API GetImageNumChannels(const mindspore::MSTensor &image, dsize_t *channels); 550 551 /// \brief Get the size of input image. 552 /// \param[in] image Tensor of the image. 553 /// \param[out] size Size of the image as [height, width]. 554 /// \return The status code. 555 Status DATASET_API GetImageSize(const mindspore::MSTensor &image, std::vector<dsize_t> *size); 556 557 /// \brief Flip the input image horizontally. 558 class DATASET_API HorizontalFlip final : public TensorTransform { 559 public: 560 /// \brief Constructor. 561 /// \par Example 562 /// \code 563 /// /* dataset is an instance of Dataset object */ 564 /// dataset = dataset->Map({std::make_shared<vision::Decode>(), 565 /// std::make_shared<vision::HorizontalFlip>()}, // operations 566 /// {"image"}); // input columns 567 /// \endcode 568 HorizontalFlip(); 569 570 /// \brief Destructor. 571 ~HorizontalFlip() override = default; 572 573 protected: 574 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 575 /// \return Shared pointer to TensorOperation object. 576 std::shared_ptr<TensorOperation> Parse() override; 577 }; 578 579 /// \brief Apply invert on the input image in RGB mode. 580 class DATASET_API Invert final : public TensorTransform { 581 public: 582 /// \brief Constructor. 583 /// \par Example 584 /// \code 585 /// /* dataset is an instance of Dataset object */ 586 /// dataset = dataset->Map({std::make_shared<vision::Decode>(), 587 /// std::make_shared<vision::Invert>()}, // operations 588 /// {"image"}); // input columns 589 /// \endcode 590 Invert(); 591 592 /// \brief Destructor. 593 ~Invert() override = default; 594 595 protected: 596 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 597 /// \return Shared pointer to TensorOperation object. 598 std::shared_ptr<TensorOperation> Parse() override; 599 }; 600 601 /// \brief Apply MixUp transformation on an input batch of images and labels. The labels must be in 602 /// one-hot format and Batch must be called before calling this function. 603 class DATASET_API MixUpBatch final : public TensorTransform { 604 public: 605 /// \brief Constructor. 606 /// \param[in] alpha hyperparameter of beta distribution (default = 1.0). 607 /// \par Example 608 /// \code 609 /// /* dataset is an instance of Dataset object */ 610 /// dataset = dataset->Batch(5); 611 /// dataset = dataset->Map({std::make_shared<vision::MixUpBatch>()}, // operations 612 /// {"image"}); // input columns 613 /// \endcode 614 explicit MixUpBatch(float alpha = 1.0); 615 616 /// \brief Destructor. 617 ~MixUpBatch() override = default; 618 619 protected: 620 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 621 /// \return Shared pointer to TensorOperation object. 622 std::shared_ptr<TensorOperation> Parse() override; 623 624 private: 625 struct Data; 626 std::shared_ptr<Data> data_; 627 }; 628 629 /// \brief Normalize the input image with respect to mean and standard deviation and pads an extra 630 /// channel with value zero. 631 class DATASET_API NormalizePad final : public TensorTransform { 632 public: 633 /// \brief Constructor. 634 /// \param[in] mean A vector of mean values for each channel, with respect to channel order. 635 /// The mean values must be in range [0.0, 255.0]. 636 /// \param[in] std A vector of standard deviations for each channel, with respect to channel order. 637 /// The standard deviation values must be in range (0.0, 255.0]. 638 /// \param[in] dtype The output datatype of Tensor. 639 /// The standard deviation values must be "float32" or "float16"(default = "float32"). 640 /// \param[in] is_hwc A boolean to indicate whether the input image is in HWC format (true) or CHW 641 /// format (false) (default = true). 642 /// \par Example 643 /// \code 644 /// /* Define operations */ 645 /// auto decode_op = vision::Decode(); 646 /// auto normalize_pad_op = vision::NormalizePad({121.0, 115.0, 100.0}, {70.0, 68.0, 71.0}); 647 /// 648 /// /* dataset is an instance of Dataset object */ 649 /// dataset = dataset->Map({decode_op, normalize_pad_op}, // operations 650 /// {"image"}); // input columns 651 /// \endcode 652 NormalizePad(const std::vector<float> &mean, const std::vector<float> &std, const std::string &dtype = "float32", 653 bool is_hwc = true) NormalizePad(mean,std,StringToChar (dtype),is_hwc)654 : NormalizePad(mean, std, StringToChar(dtype), is_hwc) {} 655 656 NormalizePad(const std::vector<float> &mean, const std::vector<float> &std, const std::vector<char> &dtype, 657 bool is_hwc = true); 658 659 /// \brief Destructor. 660 ~NormalizePad() override = default; 661 662 protected: 663 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 664 /// \return Shared pointer to TensorOperation object. 665 std::shared_ptr<TensorOperation> Parse() override; 666 667 private: 668 struct Data; 669 std::shared_ptr<Data> data_; 670 }; 671 672 /// \brief Pad the image to a fixed size. 673 class DATASET_API PadToSize final : public TensorTransform { 674 public: 675 /// \brief Constructor. 676 /// \param[in] size A two element vector representing the target size to pad, in order of [height, width]. 677 /// \param[in] offset A two element vector representing the lengths to pad on the top and left, 678 /// in order of [top, left]. Default: {}, means to pad symmetrically, keeping the original image in center. 679 /// \param[in] fill_value A vector representing the pixel intensity of the borders. Only valid if the 680 /// padding_mode is BorderType.kConstant. If 1 value is provided, it is used for all RGB channels. 681 /// If 3 values are provided, it is used to fill R, G, B channels respectively. Default: {0}. 682 /// \param[in] padding_mode The method of padding, which can be one of BorderType.kConstant, BorderType.kEdge, 683 /// BorderType.kReflect or BorderType.kSymmetric. Default: BorderType.kConstant. 684 /// - BorderType.kConstant, pads with a constant value. 685 /// - BorderType.kEdge, pads with the last value at the edge of the image. 686 /// - BorderType.kReflect, pads with reflection of the image omitting the last value on the edge. 687 /// - BorderType.kSymmetric, pads with reflection of the image repeating the last value on the edge. 688 /// \par Example 689 /// \code 690 /// /* Define operations */ 691 /// auto decode_op = vision::Decode(); 692 /// auto pad_to_size_op = vision::PadToSize({256, 256}, {10, 20}, {255, 255, 255}); 693 /// 694 /// /* dataset is an instance of Dataset object */ 695 /// dataset = dataset->Map({decode_op, pad_to_size_op}, // operations 696 /// {"image"}); // input columns 697 /// \endcode 698 explicit PadToSize(const std::vector<int32_t> &size, const std::vector<int32_t> &offset = {}, 699 const std::vector<uint8_t> &fill_value = {0}, BorderType padding_mode = BorderType::kConstant); 700 701 /// \brief Destructor. 702 ~PadToSize() override = default; 703 704 protected: 705 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 706 /// \return Shared pointer to TensorOperation object. 707 std::shared_ptr<TensorOperation> Parse() override; 708 709 private: 710 struct Data; 711 std::shared_ptr<Data> data_; 712 }; 713 714 /// \brief Perform perspective transform on the image. 715 class DATASET_API Perspective final : public TensorTransform { 716 public: 717 /// \brief Constructor. 718 /// \param[in] start_points List containing four lists of two integers corresponding to four 719 /// corners [top-left, top-right, bottom-right, bottom-left] of the original image. 720 /// \param[in] end_points List containing four lists of two integers corresponding to four 721 /// corners [top-left, top-right, bottom-right, bottom-left] of the transformed image. 722 /// \param[in] interpolation An enum for the mode of interpolation. Default: InterpolationMode::kLinear. 723 /// - InterpolationMode::kNearestNeighbour, Interpolation method is nearest-neighbor interpolation. 724 /// - InterpolationMode::kLinear, Interpolation method is blinear interpolation. 725 /// - InterpolationMode::kCubic, Interpolation method is bicubic interpolation. 726 /// - InterpolationMode::kArea, Interpolation method is pixel area interpolation. 727 /// - InterpolationMode::kCubicPil, Interpolation method is bicubic interpolation like implemented in pillow. 728 /// \par Example 729 /// \code 730 /// /* Define operations */ 731 /// auto decode_op = vision::Decode(); 732 /// std::vector<std::vector<int32_t>> start_points = {{0, 0}, {1, 0}, {1, 1}, {0, 1}}; 733 /// std::vector<std::vector<int32_t>> end_points = {{0, 2}, {2, 0}, {2, 2}, {0, 2}}; 734 /// auto perspective_op = vision::Perspective(start_points, end_points, InterpolationMode::kLinear); 735 /// 736 /// /* dataset is an instance of Dataset object */ 737 /// dataset = dataset->Map({decode_op, perspective_op}, // operations 738 /// {"image"}); // input columns 739 /// \endcode 740 Perspective(const std::vector<std::vector<int32_t>> &start_points, 741 const std::vector<std::vector<int32_t>> &end_points, InterpolationMode interpolation); 742 743 /// \brief Destructor. 744 ~Perspective() override = default; 745 746 protected: 747 /// \brief Function to convert TensorTransform object into a TensorOperation object. 748 /// \return Shared pointer to TensorOperation object. 749 std::shared_ptr<TensorOperation> Parse() override; 750 751 private: 752 struct Data; 753 std::shared_ptr<Data> data_; 754 }; 755 756 /// \brief Posterize an image by reducing the number of bits for each color channel. 757 class DATASET_API Posterize final : public TensorTransform { 758 public: 759 /// \brief Constructor. 760 /// \param[in] bits The number of bits to keep for each channel, 761 /// should be in range of [0, 8]. 762 /// \par Example 763 /// \code 764 /// /* Define operations */ 765 /// auto decode_op = vision::Decode(); 766 /// auto posterize_op = vision::Posterize(8); 767 /// 768 /// /* dataset is an instance of Dataset object */ 769 /// dataset = dataset->Map({decode_op, posterize_op}, // operations 770 /// {"image"}); // input columns 771 /// \endcode 772 explicit Posterize(uint8_t bits); 773 774 /// \brief Destructor. 775 ~Posterize() override = default; 776 777 protected: 778 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 779 /// \return Shared pointer to TensorOperation object. 780 std::shared_ptr<TensorOperation> Parse() override; 781 782 private: 783 struct Data; 784 std::shared_ptr<Data> data_; 785 }; 786 787 /// \brief Apply RandAugment data augmentation method. 788 class DATASET_API RandAugment final : public TensorTransform { 789 public: 790 /// \brief Constructor. 791 /// \param[in] num_ops Number of augmentation transformations to apply sequentially. Default: 2. 792 /// \param[in] magnitude Magnitude for all the transformations. Default: 9. 793 /// \param[in] num_magnitude_bins The number of different magnitude values. Default: 31. 794 /// \param[in] interpolation An enum for the mode of interpolation. Default: InterpolationMode::kNearestNeighbour. 795 /// - InterpolationMode::kLinear, Interpolation method is blinear interpolation. 796 /// - InterpolationMode::kNearestNeighbour, Interpolation method is nearest-neighbor interpolation. 797 /// - InterpolationMode::kCubic, Interpolation method is bicubic interpolation. 798 /// \param[in] fill_value A vector representing the pixel intensity of the borders. Default: {0, 0, 0}. 799 /// \par Example 800 /// \code 801 /// /* Define operations */ 802 /// auto decode_op = vision::Decode(); 803 /// auto rand_augment_op = vision::RandAugment(); 804 /// /* dataset is an instance of Dataset object */ 805 /// dataset = dataset->Map({decode_op, rand_augment_op}, // operations 806 /// {"image"}); // input columns 807 /// \endcode 808 explicit RandAugment(int32_t num_ops = 2, int32_t magnitude = 9, int32_t num_magnitude_bins = 31, 809 InterpolationMode interpolation = InterpolationMode::kNearestNeighbour, 810 const std::vector<uint8_t> &fill_value = {0, 0, 0}); 811 812 /// \brief Destructor. 813 ~RandAugment() override = default; 814 815 protected: 816 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 817 /// \return Shared pointer to TensorOperation object. 818 std::shared_ptr<TensorOperation> Parse() override; 819 820 private: 821 struct Data; 822 std::shared_ptr<Data> data_; 823 }; 824 825 /// \brief Automatically adjust the contrast of the image with a given probability. 826 class DATASET_API RandomAutoContrast final : public TensorTransform { 827 public: 828 /// \brief Constructor. 829 /// \param[in] cutoff Percent of the lightest and darkest pixels to be cut off from 830 /// the histogram of the input image. The value must be in range of [0.0, 50.0) (default=0.0). 831 /// \param[in] ignore The background pixel values to be ignored, each of which must be 832 /// in range of [0, 255] (default={}). 833 /// \param[in] prob A float representing the probability of AutoContrast, which must be 834 /// in range of [0, 1] (default=0.5). 835 /// \par Example 836 /// \code 837 /// /* Define operations */ 838 /// auto decode_op = vision::Decode(); 839 /// auto random_auto_contrast_op = vision::RandomAutoContrast(5.0); 840 /// 841 /// /* dataset is an instance of Dataset object */ 842 /// dataset = dataset->Map({decode_op, random_auto_contrast_op}, // operations 843 /// {"image"}); // input columns 844 /// \endcode 845 explicit RandomAutoContrast(float cutoff = 0.0, const std::vector<uint32_t> &ignore = {}, float prob = 0.5); 846 847 /// \brief Destructor. 848 ~RandomAutoContrast() override = default; 849 850 protected: 851 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 852 /// \return Shared pointer to TensorOperation object. 853 std::shared_ptr<TensorOperation> Parse() override; 854 855 private: 856 struct Data; 857 std::shared_ptr<Data> data_; 858 }; 859 860 /// \brief Randomly adjust the sharpness of the input image with a given probability. 861 class DATASET_API RandomAdjustSharpness final : public TensorTransform { 862 public: 863 /// \brief Constructor. 864 /// \param[in] degree A float representing sharpness adjustment degree, which must be non negative. 865 /// \param[in] prob A float representing the probability of the image being sharpness adjusted, which 866 /// must in range of [0, 1] (default=0.5). 867 /// \par Example 868 /// \code 869 /// /* Define operations */ 870 /// auto decode_op = vision::Decode(); 871 /// auto random_adjust_sharpness_op = vision::RandomAdjustSharpness(30.0); 872 /// 873 /// /* dataset is an instance of Dataset object */ 874 /// dataset = dataset->Map({decode_op, random_adjust_sharpness_op}, // operations 875 /// {"image"}); // input columns 876 /// \endcode 877 explicit RandomAdjustSharpness(float degree, float prob = 0.5); 878 879 /// \brief Destructor. 880 ~RandomAdjustSharpness() override = default; 881 882 protected: 883 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 884 /// \return Shared pointer to TensorOperation object. 885 std::shared_ptr<TensorOperation> Parse() override; 886 887 private: 888 struct Data; 889 std::shared_ptr<Data> data_; 890 }; 891 892 /// \brief Blend an image with its grayscale version with random weights 893 /// t and 1 - t generated from a given range. If the range is trivial 894 /// then the weights are determinate and t equals to the bound of the interval. 895 class DATASET_API RandomColor final : public TensorTransform { 896 public: 897 /// \brief Constructor. 898 /// \param[in] t_lb Lower bound random weights. 899 /// \param[in] t_ub Upper bound random weights. 900 /// \par Example 901 /// \code 902 /// /* Define operations */ 903 /// auto decode_op = vision::Decode(); 904 /// auto random_color_op = vision::RandomColor(5.0, 50.0); 905 /// 906 /// /* dataset is an instance of Dataset object */ 907 /// dataset = dataset->Map({decode_op, random_color_op}, // operations 908 /// {"image"}); // input columns 909 /// \endcode 910 RandomColor(float t_lb, float t_ub); 911 912 /// \brief Destructor. 913 ~RandomColor() override = default; 914 915 protected: 916 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 917 /// \return Shared pointer to TensorOperation object. 918 std::shared_ptr<TensorOperation> Parse() override; 919 920 private: 921 struct Data; 922 std::shared_ptr<Data> data_; 923 }; 924 925 /// \brief Randomly adjust the brightness, contrast, saturation, and hue of the input image. 926 class DATASET_API RandomColorAdjust final : public TensorTransform { 927 public: 928 /// \brief Constructor. 929 /// \param[in] brightness Brightness adjustment factor. Must be a vector of one or two values 930 /// if it is a vector of two values it needs to be in the form of [min, max] (Default={1, 1}). 931 /// \param[in] contrast Contrast adjustment factor. Must be a vector of one or two values 932 /// if it is a vector of two values, it needs to be in the form of [min, max] (Default={1, 1}). 933 /// \param[in] saturation Saturation adjustment factor. Must be a vector of one or two values 934 /// if it is a vector of two values, it needs to be in the form of [min, max] (Default={1, 1}). 935 /// \param[in] hue Hue adjustment factor. Must be a vector of one or two values 936 /// if it is a vector of two values, it must be in the form of [min, max] where -0.5 <= min <= max <= 0.5 937 /// (Default={0, 0}). 938 /// \par Example 939 /// \code 940 /// /* Define operations */ 941 /// auto decode_op = vision::Decode(); 942 /// auto random_color_adjust_op = vision::RandomColorAdjust({1.0, 5.0}, {10.0, 20.0}, {40.0, 40.0}); 943 /// 944 /// /* dataset is an instance of Dataset object */ 945 /// dataset = dataset->Map({decode_op, random_color_adjust_op}, // operations 946 /// {"image"}); // input columns 947 /// \endcode 948 explicit RandomColorAdjust(const std::vector<float> &brightness = {1.0, 1.0}, 949 const std::vector<float> &contrast = {1.0, 1.0}, 950 const std::vector<float> &saturation = {1.0, 1.0}, 951 const std::vector<float> &hue = {0.0, 0.0}); 952 953 /// \brief Destructor. 954 ~RandomColorAdjust() override = default; 955 956 protected: 957 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 958 /// \return Shared pointer to TensorOperation object. 959 std::shared_ptr<TensorOperation> Parse() override; 960 961 private: 962 struct Data; 963 std::shared_ptr<Data> data_; 964 }; 965 966 /// \brief Crop the input image at a random location. 967 class DATASET_API RandomCrop final : public TensorTransform { 968 public: 969 /// \brief Constructor. 970 /// \param[in] size A vector representing the output size of the cropped image. 971 /// If the size is a single value, a squared crop of size (size, size) is returned. 972 /// If the size has 2 values, it should be (height, width). 973 /// \param[in] padding A vector representing the number of pixels to pad the image. 974 /// If the vector has one value, it pads all sides of the image with that value. 975 /// If the vector has two values, it pads left and right with the first and 976 /// top and bottom with the second value. 977 /// If the vector has four values, it pads left, top, right, and bottom with 978 /// those values respectively. 979 /// \param[in] pad_if_needed A boolean indicating that whether to pad the image 980 /// if either side is smaller than the given output size. 981 /// \param[in] fill_value A vector representing the pixel intensity of the borders if the padding_mode is 982 /// BorderType.kConstant. If 1 value is provided, it is used for all RGB channels. 983 /// If 3 values are provided, it is used to fill R, G, B channels respectively. 984 /// \param[in] padding_mode The method of padding (default=BorderType::kConstant).It can be any of 985 /// [BorderType::kConstant, BorderType::kEdge, BorderType::kReflect, BorderType::kSymmetric]. 986 /// - BorderType::kConstant, Fill the border with constant values. 987 /// - BorderType::kEdge, Fill the border with the last value on the edge. 988 /// - BorderType::kReflect, Reflect the values on the edge omitting the last value of edge. 989 /// - BorderType::kSymmetric, Reflect the values on the edge repeating the last value of edge. 990 /// \note If the input image is more than one, then make sure that the image size is the same. 991 /// \par Example 992 /// \code 993 /// /* Define operations */ 994 /// auto decode_op = vision::Decode(); 995 /// auto random_crop_op = vision::RandomCrop({255, 255}, {10, 10, 10, 10}); 996 /// 997 /// /* dataset is an instance of Dataset object */ 998 /// dataset = dataset->Map({decode_op, random_crop_op}, // operations 999 /// {"image"}); // input columns 1000 /// \endcode 1001 explicit RandomCrop(const std::vector<int32_t> &size, const std::vector<int32_t> &padding = {0, 0, 0, 0}, 1002 bool pad_if_needed = false, const std::vector<uint8_t> &fill_value = {0, 0, 0}, 1003 BorderType padding_mode = BorderType::kConstant); 1004 1005 /// \brief Destructor. 1006 ~RandomCrop() override = default; 1007 1008 protected: 1009 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 1010 /// \return Shared pointer to TensorOperation object. 1011 std::shared_ptr<TensorOperation> Parse() override; 1012 1013 private: 1014 struct Data; 1015 std::shared_ptr<Data> data_; 1016 }; 1017 1018 /// \brief Equivalent to RandomResizedCrop TensorTransform, but crop the image before decoding. 1019 class DATASET_API RandomCropDecodeResize final : public TensorTransform { 1020 public: 1021 /// \brief Constructor. 1022 /// \param[in] size A vector representing the output size of the cropped image. 1023 /// If the size is a single value, a squared crop of size (size, size) is returned. 1024 /// If the size has 2 values, it should be (height, width). 1025 /// \param[in] scale Range [min, max) of respective size of the 1026 /// original size to be cropped (default=(0.08, 1.0)). 1027 /// \param[in] ratio Range [min, max) of aspect ratio to be 1028 /// cropped (default=(3. / 4., 4. / 3.)). 1029 /// \param[in] interpolation An enum for the mode of interpolation. 1030 /// - InterpolationMode::kLinear, Interpolation method is blinear interpolation. 1031 /// - InterpolationMode::kNearestNeighbour, Interpolation method is nearest-neighbor interpolation. 1032 /// - InterpolationMode::kCubic, Interpolation method is bicubic interpolation. 1033 /// - InterpolationMode::kArea, Interpolation method is pixel area interpolation. 1034 /// - InterpolationMode::kCubicPil, Interpolation method is bicubic interpolation like implemented in pillow. 1035 /// \param[in] max_attempts The maximum number of attempts to propose a valid crop_area (default=10). 1036 /// If exceeded, fall back to use center_crop instead. 1037 /// \par Example 1038 /// \code 1039 /// /* Define operations */ 1040 /// auto random_op = vision::RandomCropDecodeResize({255, 255}, {0.1, 0.5}); 1041 /// 1042 /// /* dataset is an instance of Dataset object */ 1043 /// dataset = dataset->Map({random_op}, // operations 1044 /// {"image"}); // input columns 1045 /// \endcode 1046 explicit RandomCropDecodeResize(const std::vector<int32_t> &size, const std::vector<float> &scale = {0.08, 1.0}, 1047 const std::vector<float> &ratio = {3. / 4., 4. / 3.}, 1048 InterpolationMode interpolation = InterpolationMode::kLinear, 1049 int32_t max_attempts = 10); 1050 1051 /// \brief Destructor. 1052 ~RandomCropDecodeResize() override = default; 1053 1054 protected: 1055 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 1056 /// \return Shared pointer to TensorOperation object. 1057 std::shared_ptr<TensorOperation> Parse() override; 1058 1059 private: 1060 struct Data; 1061 std::shared_ptr<Data> data_; 1062 }; 1063 1064 /// \brief Crop the input image at a random location and adjust bounding boxes accordingly. 1065 /// If the cropped area is out of bbox, the returned bbox will be empty. 1066 class DATASET_API RandomCropWithBBox final : public TensorTransform { 1067 public: 1068 /// \brief Constructor. 1069 /// \param[in] size A vector representing the output size of the cropped image. 1070 /// If the size is a single value, a squared crop of size (size, size) is returned. 1071 /// If the size has 2 values, it should be (height, width). 1072 /// \param[in] padding A vector representing the number of pixels to pad the image 1073 /// If the vector has one value, it pads all sides of the image with that value. 1074 /// If the vector has two values, it pads left and right with the first and 1075 /// top and bottom with the second value. 1076 /// If the vector has four values, it pads left, top, right, and bottom with 1077 /// those values respectively. 1078 /// \param[in] pad_if_needed A boolean indicating that whether to pad the image 1079 /// if either side is smaller than the given output size. 1080 /// \param[in] fill_value A vector representing the pixel intensity of the borders. Only valid 1081 /// if the padding_mode is BorderType.kConstant. If 1 value is provided, it is used for all 1082 /// RGB channels. If 3 values are provided, it is used to fill R, G, B channels respectively. 1083 /// \param[in] padding_mode The method of padding (default=BorderType::kConstant).It can be any of 1084 /// [BorderType::kConstant, BorderType::kEdge, BorderType::kReflect, BorderType::kSymmetric]. 1085 /// - BorderType::kConstant, Fill the border with constant values. 1086 /// - BorderType::kEdge, Fill the border with the last value on the edge. 1087 /// - BorderType::kReflect, Reflect the values on the edge omitting the last value of edge. 1088 /// - BorderType::kSymmetric, Reflect the values on the edge repeating the last value of edge. 1089 /// \par Example 1090 /// \code 1091 /// /* Define operations */ 1092 /// auto random_op = vision::RandomCropWithBBox({224, 224}, {0, 0, 0, 0}); 1093 /// 1094 /// /* dataset is an instance of Dataset object */ 1095 /// dataset = dataset->Map({random_op}, // operations 1096 /// {"image", "bbox"}); // input columns 1097 /// \endcode 1098 explicit RandomCropWithBBox(const std::vector<int32_t> &size, const std::vector<int32_t> &padding = {0, 0, 0, 0}, 1099 bool pad_if_needed = false, const std::vector<uint8_t> &fill_value = {0, 0, 0}, 1100 BorderType padding_mode = BorderType::kConstant); 1101 1102 /// \brief Destructor. 1103 ~RandomCropWithBBox() override = default; 1104 1105 protected: 1106 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 1107 /// \return Shared pointer to TensorOperation object. 1108 std::shared_ptr<TensorOperation> Parse() override; 1109 1110 private: 1111 struct Data; 1112 std::shared_ptr<Data> data_; 1113 }; 1114 1115 /// \brief Randomly apply histogram equalization on the input image with a given probability. 1116 class DATASET_API RandomEqualize final : public TensorTransform { 1117 public: 1118 /// \brief Constructor. 1119 /// \param[in] prob A float representing the probability of equalization, which 1120 /// must be in range of [0, 1] (default=0.5). 1121 /// \par Example 1122 /// \code 1123 /// /* Define operations */ 1124 /// auto decode_op = vision::Decode(); 1125 /// auto random_op = vision::RandomEqualize(0.5); 1126 /// 1127 /// /* dataset is an instance of Dataset object */ 1128 /// dataset = dataset->Map({decode_op, random_op}, // operations 1129 /// {"image"}); // input columns 1130 /// \endcode 1131 explicit RandomEqualize(float prob = 0.5); 1132 1133 /// \brief Destructor. 1134 ~RandomEqualize() override = default; 1135 1136 protected: 1137 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 1138 /// \return Shared pointer to TensorOperation object. 1139 std::shared_ptr<TensorOperation> Parse() override; 1140 1141 private: 1142 struct Data; 1143 std::shared_ptr<Data> data_; 1144 }; 1145 1146 /// \brief Randomly flip the input image horizontally with a given probability. 1147 class DATASET_API RandomHorizontalFlip final : public TensorTransform { 1148 public: 1149 /// \brief Constructor. 1150 /// \param[in] prob A float representing the probability of flip. 1151 /// \par Example 1152 /// \code 1153 /// /* Define operations */ 1154 /// auto decode_op = vision::Decode(); 1155 /// auto random_op = vision::RandomHorizontalFlip(0.8); 1156 /// 1157 /// /* dataset is an instance of Dataset object */ 1158 /// dataset = dataset->Map({decode_op, random_op}, // operations 1159 /// {"image"}); // input columns 1160 /// \endcode 1161 explicit RandomHorizontalFlip(float prob = 0.5); 1162 1163 /// \brief Destructor. 1164 ~RandomHorizontalFlip() override = default; 1165 1166 protected: 1167 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 1168 /// \return Shared pointer to TensorOperation object. 1169 std::shared_ptr<TensorOperation> Parse() override; 1170 1171 private: 1172 struct Data; 1173 std::shared_ptr<Data> data_; 1174 }; 1175 1176 /// \brief Randomly flip the input image horizontally with a given probability and adjust bounding boxes accordingly. 1177 class DATASET_API RandomHorizontalFlipWithBBox final : public TensorTransform { 1178 public: 1179 /// \brief Constructor. 1180 /// \param[in] prob A float representing the probability of flip. 1181 /// \par Example 1182 /// \code 1183 /// /* Define operations */ 1184 /// auto random_op = vision::RandomHorizontalFlipWithBBox(1.0); 1185 /// 1186 /// /* dataset is an instance of Dataset object */ 1187 /// dataset = dataset->Map({random_op}, // operations 1188 /// {"image", "bbox"}); // input columns 1189 /// \endcode 1190 explicit RandomHorizontalFlipWithBBox(float prob = 0.5); 1191 1192 /// \brief Destructor. 1193 ~RandomHorizontalFlipWithBBox() override = default; 1194 1195 protected: 1196 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 1197 /// \return Shared pointer to TensorOperation object. 1198 std::shared_ptr<TensorOperation> Parse() override; 1199 1200 private: 1201 struct Data; 1202 std::shared_ptr<Data> data_; 1203 }; 1204 1205 /// \brief Randomly invert the input image with a given probability. 1206 class DATASET_API RandomInvert final : public TensorTransform { 1207 public: 1208 /// \brief Constructor. 1209 /// \param[in] prob A float representing the probability of the image being inverted, which 1210 /// must be in range of [0, 1] (default=0.5). 1211 /// \par Example 1212 /// \code 1213 /// /* Define operations */ 1214 /// auto decode_op = vision::Decode(); 1215 /// auto random_op = vision::RandomInvert(0.8); 1216 /// 1217 /// /* dataset is an instance of Dataset object */ 1218 /// dataset = dataset->Map({decode_op, random_op}, // operations 1219 /// {"image"}); // input columns 1220 /// \endcode 1221 explicit RandomInvert(float prob = 0.5); 1222 1223 /// \brief Destructor. 1224 ~RandomInvert() override = default; 1225 1226 protected: 1227 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 1228 /// \return Shared pointer to TensorOperation object. 1229 std::shared_ptr<TensorOperation> Parse() override; 1230 1231 private: 1232 struct Data; 1233 std::shared_ptr<Data> data_; 1234 }; 1235 1236 /// \brief Add AlexNet-style PCA-based noise to an image. 1237 class DATASET_API RandomLighting final : public TensorTransform { 1238 public: 1239 /// \brief Constructor. 1240 /// \param[in] alpha A float representing the intensity of the image (default=0.05). 1241 /// \par Example 1242 /// \code 1243 /// /* Define operations */ 1244 /// auto decode_op = vision::Decode(); 1245 /// auto random_op = vision::RandomLighting(0.1); 1246 /// 1247 /// /* dataset is an instance of Dataset object */ 1248 /// dataset = dataset->Map({decode_op, random_op}, // operations 1249 /// {"image"}); // input columns 1250 /// \endcode 1251 explicit RandomLighting(float alpha = 0.05); 1252 1253 /// \brief Destructor. 1254 ~RandomLighting() override = default; 1255 1256 protected: 1257 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 1258 /// \return Shared pointer to TensorOperation object. 1259 std::shared_ptr<TensorOperation> Parse() override; 1260 1261 private: 1262 struct Data; 1263 std::shared_ptr<Data> data_; 1264 }; 1265 1266 /// \brief Reduce the number of bits for each color channel randomly. 1267 class DATASET_API RandomPosterize final : public TensorTransform { 1268 public: 1269 /// \brief Constructor. 1270 /// \param[in] bit_range Range of random posterize to compress image. 1271 /// uint8_t vector representing the minimum and maximum bit in range of [1,8] (Default={4, 8}). 1272 /// \par Example 1273 /// \code 1274 /// /* Define operations */ 1275 /// auto decode_op = vision::Decode(); 1276 /// auto random_op = vision::RandomPosterize({4, 8}); 1277 /// 1278 /// /* dataset is an instance of Dataset object */ 1279 /// dataset = dataset->Map({decode_op, random_op}, // operations 1280 /// {"image"}); // input columns 1281 /// \endcode 1282 explicit RandomPosterize(const std::vector<uint8_t> &bit_range = {4, 8}); 1283 1284 /// \brief Destructor. 1285 ~RandomPosterize() override = default; 1286 1287 protected: 1288 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 1289 /// \return Shared pointer to TensorOperation object. 1290 std::shared_ptr<TensorOperation> Parse() override; 1291 1292 private: 1293 struct Data; 1294 std::shared_ptr<Data> data_; 1295 }; 1296 1297 /// \brief Resize the input image using a randomly selected interpolation mode. 1298 class DATASET_API RandomResize final : public TensorTransform { 1299 public: 1300 /// \brief Constructor. 1301 /// \param[in] size A vector representing the output size of the resized image. 1302 /// If the size is a single value, the smaller edge of the image will be resized to this value with 1303 /// the same image aspect ratio. If the size has 2 values, it should be (height, width). 1304 /// \par Example 1305 /// \code 1306 /// /* Define operations */ 1307 /// auto decode_op = vision::Decode(); 1308 /// auto random_op = vision::RandomResize({32, 32}); 1309 /// 1310 /// /* dataset is an instance of Dataset object */ 1311 /// dataset = dataset->Map({decode_op, random_op}, // operations 1312 /// {"image"}); // input columns 1313 /// \endcode 1314 explicit RandomResize(const std::vector<int32_t> &size); 1315 1316 /// \brief Destructor. 1317 ~RandomResize() override = default; 1318 1319 protected: 1320 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 1321 /// \return Shared pointer to TensorOperation object. 1322 std::shared_ptr<TensorOperation> Parse() override; 1323 1324 private: 1325 struct Data; 1326 std::shared_ptr<Data> data_; 1327 }; 1328 1329 /// \brief Resize the input image using a randomly selected interpolation mode and adjust 1330 /// bounding boxes accordingly. 1331 class DATASET_API RandomResizeWithBBox final : public TensorTransform { 1332 public: 1333 /// \brief Constructor. 1334 /// \param[in] size A vector representing the output size of the resized image. 1335 /// If the size is a single value, the smaller edge of the image will be resized to this value with 1336 /// the same image aspect ratio. If the size has 2 values, it should be (height, width). 1337 /// \par Example 1338 /// \code 1339 /// /* Define operations */ 1340 /// auto random_op = vision::RandomResizeWithBBox({50, 50}); 1341 /// 1342 /// /* dataset is an instance of Dataset object */ 1343 /// dataset = dataset->Map({random_op}, // operations 1344 /// {"image", "bbox"}); // input columns 1345 /// \endcode 1346 explicit RandomResizeWithBBox(const std::vector<int32_t> &size); 1347 1348 /// \brief Destructor. 1349 ~RandomResizeWithBBox() override = default; 1350 1351 protected: 1352 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 1353 /// \return Shared pointer to TensorOperation object. 1354 std::shared_ptr<TensorOperation> Parse() override; 1355 1356 private: 1357 struct Data; 1358 std::shared_ptr<Data> data_; 1359 }; 1360 1361 /// \brief Crop the input image to a random size and aspect ratio. 1362 class DATASET_API RandomResizedCrop final : public TensorTransform { 1363 public: 1364 /// \brief Constructor. 1365 /// \param[in] size A vector representing the output size of the cropped image. 1366 /// If the size is a single value, a squared crop of size (size, size) is returned. 1367 /// If the size has 2 values, it should be (height, width). 1368 /// \param[in] scale Range [min, max) of respective size of the original 1369 /// size to be cropped (default=(0.08, 1.0)). 1370 /// \param[in] ratio Range [min, max) of aspect ratio to be cropped 1371 /// (default=(3. / 4., 4. / 3.)). 1372 /// \param[in] interpolation Image interpolation mode (default=InterpolationMode::kLinear). 1373 /// - InterpolationMode::kLinear, Interpolation method is blinear interpolation. 1374 /// - InterpolationMode::kNearestNeighbour, Interpolation method is nearest-neighbor interpolation. 1375 /// - InterpolationMode::kCubic, Interpolation method is bicubic interpolation. 1376 /// - InterpolationMode::kArea, Interpolation method is pixel area interpolation. 1377 /// - InterpolationMode::kCubicPil, Interpolation method is bicubic interpolation like implemented in pillow. 1378 /// \param[in] max_attempts The maximum number of attempts to propose a valid. 1379 /// crop_area (default=10). If exceeded, fall back to use center_crop instead. 1380 /// \note If the input image is more than one, then make sure that the image size is the same. 1381 /// \par Example 1382 /// \code 1383 /// /* Define operations */ 1384 /// auto decode_op = vision::Decode(); 1385 /// auto random_op = vision::RandomResizedCrop({32, 32}, {0.08, 1.0}); 1386 /// 1387 /// /* dataset is an instance of Dataset object */ 1388 /// dataset = dataset->Map({decode_op, random_op}, // operations 1389 /// {"image"}); // input columns 1390 /// \endcode 1391 explicit RandomResizedCrop(const std::vector<int32_t> &size, const std::vector<float> &scale = {0.08, 1.0}, 1392 const std::vector<float> &ratio = {3. / 4., 4. / 3.}, 1393 InterpolationMode interpolation = InterpolationMode::kLinear, int32_t max_attempts = 10); 1394 1395 /// \brief Destructor. 1396 ~RandomResizedCrop() override = default; 1397 1398 protected: 1399 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 1400 /// \return Shared pointer to TensorOperation object. 1401 std::shared_ptr<TensorOperation> Parse() override; 1402 1403 private: 1404 struct Data; 1405 std::shared_ptr<Data> data_; 1406 }; 1407 1408 /// \brief Crop the input image to a random size and aspect ratio. 1409 /// If cropped area is out of bbox, the return bbox will be empty. 1410 class DATASET_API RandomResizedCropWithBBox final : public TensorTransform { 1411 public: 1412 /// \brief Constructor. 1413 /// \param[in] size A vector representing the output size of the cropped image. 1414 /// If the size is a single value, a squared crop of size (size, size) is returned. 1415 /// If the size has 2 values, it should be (height, width). 1416 /// \param[in] scale Range [min, max) of respective size of the original 1417 /// size to be cropped (default=(0.08, 1.0)). 1418 /// \param[in] ratio Range [min, max) of aspect ratio to be cropped 1419 /// (default=(3. / 4., 4. / 3.)). 1420 /// \param[in] interpolation Image interpolation mode (default=InterpolationMode::kLinear). 1421 /// - InterpolationMode::kLinear, Interpolation method is blinear interpolation. 1422 /// - InterpolationMode::kNearestNeighbour, Interpolation method is nearest-neighbor interpolation. 1423 /// - InterpolationMode::kCubic, Interpolation method is bicubic interpolation. 1424 /// - InterpolationMode::kArea, Interpolation method is pixel area interpolation. 1425 /// - InterpolationMode::kCubicPil, Interpolation method is bicubic interpolation like implemented in pillow. 1426 /// \param[in] max_attempts The maximum number of attempts to propose a valid 1427 /// crop_area (default=10). If exceeded, fall back to use center_crop instead. 1428 /// \par Example 1429 /// \code 1430 /// /* Define operations */ 1431 /// auto random_op = vision::RandomResizedCropWithBBox({50, 50}, {0.05, 0.5}, {0.2, 0.4}, 1432 /// InterpolationMode::kCubic); 1433 /// 1434 /// /* dataset is an instance of Dataset object */ 1435 /// dataset = dataset->Map({random_op}, // operations 1436 /// {"image", "bbox"}); // input columns 1437 /// \endcode 1438 explicit RandomResizedCropWithBBox(const std::vector<int32_t> &size, const std::vector<float> &scale = {0.08, 1.0}, 1439 const std::vector<float> &ratio = {3. / 4., 4. / 3.}, 1440 InterpolationMode interpolation = InterpolationMode::kLinear, 1441 int32_t max_attempts = 10); 1442 1443 /// \brief Destructor. 1444 ~RandomResizedCropWithBBox() override = default; 1445 1446 protected: 1447 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 1448 /// \return Shared pointer to TensorOperation object. 1449 std::shared_ptr<TensorOperation> Parse() override; 1450 1451 private: 1452 struct Data; 1453 std::shared_ptr<Data> data_; 1454 }; 1455 1456 /// \brief Rotate the image according to parameters. 1457 class DATASET_API RandomRotation final : public TensorTransform { 1458 public: 1459 /// \brief Constructor. 1460 /// \param[in] degrees A float vector of size 2, representing the starting and ending degrees. 1461 /// \param[in] resample An enum for the mode of interpolation. 1462 /// - InterpolationMode::kLinear, Interpolation method is blinear interpolation. 1463 /// - InterpolationMode::kNearestNeighbour, Interpolation method is nearest-neighbor interpolation. 1464 /// - InterpolationMode::kCubic, Interpolation method is bicubic interpolation. 1465 /// - InterpolationMode::kArea, Interpolation method is pixel area interpolation. 1466 /// - InterpolationMode::kCubicPil, Interpolation method is bicubic interpolation like implemented in pillow. 1467 /// \param[in] expand A boolean representing whether the image is expanded after rotation. 1468 /// \param[in] center A float vector of size 2 or empty, representing the x and y center of rotation 1469 /// or the center of the image. 1470 /// \param[in] fill_value A vector representing the value to fill the area outside the transform 1471 /// in the output image. If 1 value is provided, it is used for all RGB channels. 1472 /// If 3 values are provided, it is used to fill R, G, B channels respectively. 1473 /// \par Example 1474 /// \code 1475 /// /* Define operations */ 1476 /// auto decode_op = vision::Decode(); 1477 /// auto random_op = vision::RandomRotation({30, 60}, InterpolationMode::kNearestNeighbour); 1478 /// 1479 /// /* dataset is an instance of Dataset object */ 1480 /// dataset = dataset->Map({decode_op, random_op}, // operations 1481 /// {"image"}); // input columns 1482 /// \endcode 1483 explicit RandomRotation(const std::vector<float> °rees, 1484 InterpolationMode resample = InterpolationMode::kNearestNeighbour, bool expand = false, 1485 const std::vector<float> ¢er = {}, const std::vector<uint8_t> &fill_value = {0, 0, 0}); 1486 1487 /// \brief Destructor. 1488 ~RandomRotation() override = default; 1489 1490 protected: 1491 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 1492 /// \return Shared pointer to TensorOperation object. 1493 std::shared_ptr<TensorOperation> Parse() override; 1494 1495 private: 1496 struct Data; 1497 std::shared_ptr<Data> data_; 1498 }; 1499 1500 /// \brief Choose a random sub-policy from a list to be applied on the input image. A sub-policy is a list of tuples 1501 /// (operation, prob), where operation is a TensorTransform operation and prob is the probability that this 1502 /// operation will be applied. Once a sub-policy is selected, each operation within the sub-policy with be 1503 /// applied in sequence according to its probability. 1504 class DATASET_API RandomSelectSubpolicy final : public TensorTransform { 1505 public: 1506 /// \brief Constructor. 1507 /// \param[in] policy Vector of sub-policies to choose from, in which the TensorTransform objects are raw pointers. 1508 /// \par Example 1509 /// \code 1510 /// /* Define operations */ 1511 /// auto invert_op(new vision::Invert()); 1512 /// auto equalize_op(new vision::Equalize()); 1513 /// 1514 /// std::vector<std::pair<TensorTransform *, double>> policy = {{invert_op, 0.5}, {equalize_op, 0.4}}; 1515 /// vision::RandomSelectSubpolicy random_select_subpolicy_op = vision::RandomSelectSubpolicy({policy}); 1516 /// 1517 /// /* dataset is an instance of Dataset object */ 1518 /// dataset = dataset->Map({random_select_subpolicy_op}, // operations 1519 /// {"image"}); // input columns 1520 /// \endcode 1521 explicit RandomSelectSubpolicy(const std::vector<std::vector<std::pair<TensorTransform *, double>>> &policy); 1522 1523 /// \brief Constructor. 1524 /// \param[in] policy Vector of sub-policies to choose from, in which the TensorTransform objects are shared pointers. 1525 /// \par Example 1526 /// \code 1527 /// /* Define operations */ 1528 /// std::shared_ptr<TensorTransform> invert_op(new vision::Invert()); 1529 /// std::shared_ptr<TensorTransform> equalize_op(new vision::Equalize()); 1530 /// std::shared_ptr<TensorTransform> resize_op(new vision::Resize({15, 15})); 1531 /// 1532 /// auto random_select_subpolicy_op = vision::RandomSelectSubpolicy({ 1533 /// {{invert_op, 0.5}, {equalize_op, 0.4}}, 1534 /// {{resize_op, 0.1}} 1535 /// }); 1536 /// 1537 /// /* dataset is an instance of Dataset object */ 1538 /// dataset = dataset->Map({random_select_subpolicy_op}, // operations 1539 /// {"image"}); // input columns 1540 /// \endcode 1541 explicit RandomSelectSubpolicy( 1542 const std::vector<std::vector<std::pair<std::shared_ptr<TensorTransform>, double>>> &policy); 1543 1544 /// \brief Constructor. 1545 /// \param[in] policy Vector of sub-policies to choose from, in which the TensorTransform objects are object pointers. 1546 /// \par Example 1547 /// \code 1548 /// /* Define operations */ 1549 /// vision::Invert invert_op = vision::Invert(); 1550 /// vision::Equalize equalize_op = vision::Equalize(); 1551 /// vision::Resize resize_op = vision::Resize({15, 15}); 1552 /// 1553 /// auto random_select_subpolicy_op = vision::RandomSelectSubpolicy({ 1554 /// {{invert_op, 0.5}, {equalize_op, 0.4}}, 1555 /// {{resize_op, 0.1}} 1556 /// }); 1557 /// 1558 /// /* dataset is an instance of Dataset object */ 1559 /// dataset = dataset->Map({random_select_subpolicy_op}, // operations 1560 /// {"image"}); // input columns 1561 /// \endcode 1562 explicit RandomSelectSubpolicy( 1563 const std::vector<std::vector<std::pair<std::reference_wrapper<TensorTransform>, double>>> &policy); 1564 1565 /// \brief Destructor. 1566 ~RandomSelectSubpolicy() override = default; 1567 1568 protected: 1569 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 1570 /// \return Shared pointer to TensorOperation object. 1571 std::shared_ptr<TensorOperation> Parse() override; 1572 1573 private: 1574 struct Data; 1575 std::shared_ptr<Data> data_; 1576 }; 1577 1578 /// \brief Adjust the sharpness of the input image by a fixed or random degree. 1579 class DATASET_API RandomSharpness final : public TensorTransform { 1580 public: 1581 /// \brief Constructor. 1582 /// \param[in] degrees A float vector of size 2, representing the range of random sharpness 1583 /// adjustment degrees. It should be in (min, max) format. If min=max, then it is a 1584 /// single fixed magnitude operation (default = (0.1, 1.9)). 1585 /// \par Example 1586 /// \code 1587 /// /* Define operations */ 1588 /// auto decode_op = vision::Decode(); 1589 /// auto random_op = vision::RandomSharpness({0.1, 1.5}); 1590 /// 1591 /// /* dataset is an instance of Dataset object */ 1592 /// dataset = dataset->Map({decode_op, random_op}, // operations 1593 /// {"image"}); // input columns 1594 /// \endcode 1595 explicit RandomSharpness(const std::vector<float> °rees = {0.1, 1.9}); 1596 1597 /// \brief Destructor. 1598 ~RandomSharpness() override = default; 1599 1600 protected: 1601 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 1602 /// \return Shared pointer to TensorOperation object. 1603 std::shared_ptr<TensorOperation> Parse() override; 1604 1605 private: 1606 struct Data; 1607 std::shared_ptr<Data> data_; 1608 }; 1609 1610 /// \brief Invert pixels randomly within a specified range. 1611 class DATASET_API RandomSolarize final : public TensorTransform { 1612 public: 1613 /// \brief Constructor. 1614 /// \param[in] threshold A vector with two elements specifying the pixel range to invert. 1615 /// Threshold values should always be in (min, max) format. 1616 /// If min=max, it will to invert all pixels above min(max). 1617 /// \par Example 1618 /// \code 1619 /// /* Define operations */ 1620 /// auto decode_op = vision::Decode(); 1621 /// auto random_op = vision::RandomSharpness({0, 255}); 1622 /// 1623 /// /* dataset is an instance of Dataset object */ 1624 /// dataset = dataset->Map({decode_op, random_op}, // operations 1625 /// {"image"}); // input columns 1626 /// \endcode 1627 explicit RandomSolarize(const std::vector<uint8_t> &threshold = {0, 255}); 1628 1629 /// \brief Destructor. 1630 ~RandomSolarize() override = default; 1631 1632 protected: 1633 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 1634 /// \return Shared pointer to TensorOperation object. 1635 std::shared_ptr<TensorOperation> Parse() override; 1636 1637 private: 1638 struct Data; 1639 std::shared_ptr<Data> data_; 1640 }; 1641 1642 /// \brief Randomly flip the input image vertically with a given probability. 1643 class DATASET_API RandomVerticalFlip final : public TensorTransform { 1644 public: 1645 /// \brief Constructor. 1646 /// \param[in] prob A float representing the probability of flip. 1647 /// \par Example 1648 /// \code 1649 /// /* Define operations */ 1650 /// auto decode_op = vision::Decode(); 1651 /// auto random_op = vision::RandomVerticalFlip(); 1652 /// 1653 /// /* dataset is an instance of Dataset object */ 1654 /// dataset = dataset->Map({decode_op, random_op}, // operations 1655 /// {"image"}); // input columns 1656 /// \endcode 1657 explicit RandomVerticalFlip(float prob = 0.5); 1658 1659 /// \brief Destructor. 1660 ~RandomVerticalFlip() override = default; 1661 1662 protected: 1663 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 1664 /// \return Shared pointer to TensorOperation object. 1665 std::shared_ptr<TensorOperation> Parse() override; 1666 1667 private: 1668 struct Data; 1669 std::shared_ptr<Data> data_; 1670 }; 1671 1672 /// \brief Randomly flip the input image vertically with a given probability and adjust bounding boxes accordingly. 1673 class DATASET_API RandomVerticalFlipWithBBox final : public TensorTransform { 1674 public: 1675 /// \brief Constructor. 1676 /// \param[in] prob A float representing the probability of flip. 1677 /// \par Example 1678 /// \code 1679 /// /* Define operations */ 1680 /// auto random_op = vision::RandomVerticalFlipWithBBox(); 1681 /// 1682 /// /* dataset is an instance of Dataset object */ 1683 /// dataset = dataset->Map({random_op}, // operations 1684 /// {"image", "bbox"}); // input columns 1685 /// \endcode 1686 explicit RandomVerticalFlipWithBBox(float prob = 0.5); 1687 1688 /// \brief Destructor. 1689 ~RandomVerticalFlipWithBBox() override = default; 1690 1691 protected: 1692 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 1693 /// \return Shared pointer to TensorOperation object. 1694 std::shared_ptr<TensorOperation> Parse() override; 1695 1696 private: 1697 struct Data; 1698 std::shared_ptr<Data> data_; 1699 }; 1700 1701 /// \brief Reads a file in binary mode. 1702 /// \param[in] filename The path to the file to be read. 1703 /// \param[out] output The binary data. 1704 /// \return The status code. 1705 Status DATASET_API ReadFile(const std::string &filename, mindspore::MSTensor *output); 1706 1707 /// \brief Read a image file and decode it into one or three channels data. 1708 /// \param[in] filename The path to the file to be read. 1709 /// \param[out] output The Tensor data. 1710 /// \param[in] mode The read mode used for optionally converting the image, can be one of 1711 /// [ImageReadMode::kUNCHANGED, ImageReadMode::kGRAYSCALE, ImageReadMode::kCOLOR]. Default: 1712 /// ImageReadMode::kUNCHANGED. 1713 /// - ImageReadMode::kUNCHANGED, remain the output in the original format. 1714 /// - ImageReadMode::kGRAYSCALE, convert the output into one channel grayscale data. 1715 /// - ImageReadMode::kCOLOR, convert the output into three channels RGB color data. 1716 /// \return The status code. 1717 Status DATASET_API ReadImage(const std::string &filename, mindspore::MSTensor *output, 1718 ImageReadMode mode = ImageReadMode::kUNCHANGED); 1719 1720 /// \brief Read the video, audio, metadata from a video file. It supports AVI, H264, H265, MOV, MP4, WMV file formats. 1721 /// \param[in] filename The path to the videoe file to be read. 1722 /// \param[out] video_output The video frames of the video file. 1723 /// \param[out] audio_output The audio frames of the video file. 1724 /// \param[out] metadata_output The metadata contains video_fps, audio_fps. 1725 /// \param[in] start_pts The start presentation timestamp of the video. Default: 0.0. 1726 /// \param[in] end_pts The end presentation timestamp of the video. Default: 2147483647.0. 1727 /// \param[in] pts_unit The unit for the timestamps, can be one of ["pts", "sec"]. Default: "pts". 1728 /// \return The status code. 1729 Status DATASET_API ReadVideo(const std::string &filename, mindspore::MSTensor *video_output, 1730 mindspore::MSTensor *audio_output, std::map<std::string, std::string> *metadata_output, 1731 float start_pts = 0.0, float end_pts = 2147483647.0, const std::string &pts_unit = "pts"); 1732 1733 /// \brief Read the timestamps and frame rate of a video file. It supports AVI, H264, H265, MOV, MP4, WMV files. 1734 /// \param[in] filename The path to the videoe file to be read. 1735 /// \param[out] output The tuple(video_timestamps, video_fps) of the video. 1736 /// \param[in] pts_unit The unit for the timestamps, can be one of ["pts", "sec"]. Default: "pts". 1737 /// \return The status code. 1738 Status DATASET_API ReadVideoTimestamps(const std::string &filename, std::tuple<std::vector<float>, float> *output, 1739 const std::string &pts_unit = "pts"); 1740 1741 /// \brief Crop the given image and zoom to the specified size. 1742 class DATASET_API ResizedCrop final : public TensorTransform { 1743 public: 1744 /// \brief Constructor. 1745 /// \param[in] top Horizontal ordinate of the upper left corner of the crop image. 1746 /// \param[in] left Vertical ordinate of the upper left corner of the crop image. 1747 /// \param[in] height Height of cropped image. 1748 /// \param[in] width Width of cropped image. 1749 /// \param[in] size A vector representing the output size of the image. 1750 /// If the size is a single value, a squared resized of size (size, size) is returned. 1751 /// If the size has 2 values, it should be (height, width). 1752 /// \param[in] interpolation Image interpolation mode. Default: InterpolationMode::kLinear. 1753 /// - InterpolationMode::kLinear, Interpolation method is blinear interpolation. 1754 /// - InterpolationMode::kNearestNeighbour, Interpolation method is nearest-neighbor interpolation. 1755 /// - InterpolationMode::kCubic, Interpolation method is bicubic interpolation. 1756 /// - InterpolationMode::kArea, Interpolation method is pixel area interpolation. 1757 /// - InterpolationMode::kCubicPil, Interpolation method is bicubic interpolation like implemented in pillow. 1758 /// \note If the input image is more than one, then make sure that the image size is the same. 1759 /// \par Example 1760 /// \code 1761 /// /* Define operations */ 1762 /// auto decode_op = vision::Decode(); 1763 /// auto resized_crop_op = vision::ResizedCrop(128, 128, 256, 256, {128, 128}); 1764 /// 1765 /// /* dataset is an instance of Dataset object */ 1766 /// dataset = dataset->Map({decode_op, resized_crop_op}, // operations 1767 /// {"image"}); // input columns 1768 /// \endcode 1769 ResizedCrop(int32_t top, int32_t left, int32_t height, int32_t width, const std::vector<int32_t> &size, 1770 InterpolationMode interpolation = InterpolationMode::kLinear); 1771 1772 /// \brief Destructor. 1773 ~ResizedCrop() override = default; 1774 1775 protected: 1776 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 1777 /// \return Shared pointer to TensorOperation object. 1778 std::shared_ptr<TensorOperation> Parse() override; 1779 1780 private: 1781 struct Data; 1782 std::shared_ptr<Data> data_; 1783 }; 1784 1785 /// \brief Resize the input image to the given size and adjust bounding boxes accordingly. 1786 class DATASET_API ResizeWithBBox final : public TensorTransform { 1787 public: 1788 /// \brief Constructor. 1789 /// \param[in] size The output size of the resized image. 1790 /// If the size is an integer, smaller edge of the image will be resized to this value with the same image aspect 1791 /// ratio. If the size is a sequence of length 2, it should be (height, width). 1792 /// \param[in] interpolation An enum for the mode of interpolation (default=InterpolationMode::kLinear). 1793 /// - InterpolationMode::kLinear, Interpolation method is blinear interpolation. 1794 /// - InterpolationMode::kNearestNeighbour, Interpolation method is nearest-neighbor interpolation. 1795 /// - InterpolationMode::kCubic, Interpolation method is bicubic interpolation. 1796 /// - InterpolationMode::kArea, Interpolation method is pixel area interpolation. 1797 /// - InterpolationMode::kCubicPil, Interpolation method is bicubic interpolation like implemented in pillow. 1798 /// \par Example 1799 /// \code 1800 /// /* Define operations */ 1801 /// auto random_op = vision::ResizeWithBBox({100, 100}, InterpolationMode::kNearestNeighbour); 1802 /// 1803 /// /* dataset is an instance of Dataset object */ 1804 /// dataset = dataset->Map({random_op}, // operations 1805 /// {"image", "bbox"}); // input columns 1806 /// \endcode 1807 explicit ResizeWithBBox(const std::vector<int32_t> &size, 1808 InterpolationMode interpolation = InterpolationMode::kLinear); 1809 1810 /// \brief Destructor. 1811 ~ResizeWithBBox() override = default; 1812 1813 protected: 1814 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 1815 /// \return Shared pointer to TensorOperation object. 1816 std::shared_ptr<TensorOperation> Parse() override; 1817 1818 private: 1819 struct Data; 1820 std::shared_ptr<Data> data_; 1821 }; 1822 1823 /// \brief Change the format of input tensor from 4-channel RGBA to 3-channel BGR. 1824 class DATASET_API RGBA2BGR final : public TensorTransform { 1825 public: 1826 /// \brief Constructor. 1827 /// \par Example 1828 /// \code 1829 /// /* Define operations */ 1830 /// auto decode_op = vision::Decode(); 1831 /// auto rgb2bgr_op = vision::RGBA2BGR(); 1832 /// 1833 /// /* dataset is an instance of Dataset object */ 1834 /// dataset = dataset->Map({decode_op, rgb2bgr_op}, // operations 1835 /// {"image"}); // input columns 1836 /// \endcode 1837 RGBA2BGR(); 1838 1839 /// \brief Destructor. 1840 ~RGBA2BGR() override = default; 1841 1842 protected: 1843 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 1844 /// \return Shared pointer to TensorOperation object. 1845 std::shared_ptr<TensorOperation> Parse() override; 1846 }; 1847 1848 /// \brief Change the input 4 channel RGBA tensor to 3 channel RGB. 1849 class DATASET_API RGBA2RGB final : public TensorTransform { 1850 public: 1851 /// \brief Constructor. 1852 /// \par Example 1853 /// \code 1854 /// /* Define operations */ 1855 /// auto decode_op = vision::Decode(); 1856 /// auto rgba2rgb_op = vision::RGBA2RGB(); 1857 /// 1858 /// /* dataset is an instance of Dataset object */ 1859 /// dataset = dataset->Map({decode_op, rgba2rgb_op}, // operations 1860 /// {"image"}); // input columns 1861 /// \endcode 1862 RGBA2RGB(); 1863 1864 /// \brief Destructor. 1865 ~RGBA2RGB() override = default; 1866 1867 protected: 1868 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 1869 /// \return Shared pointer to TensorOperation object. 1870 std::shared_ptr<TensorOperation> Parse() override; 1871 }; 1872 1873 /// \note Slice the tensor to multiple patches in horizontal and vertical directions. 1874 class DATASET_API SlicePatches final : public TensorTransform { 1875 public: 1876 /// \brief Constructor. 1877 /// \param[in] num_height The number of patches in vertical direction (default=1). 1878 /// \param[in] num_width The number of patches in horizontal direction (default=1). 1879 /// \param[in] slice_mode An enum for the mode of slice (default=SliceMode::kPad). 1880 /// \param[in] fill_value A value representing the pixel to fill the padding area in right and 1881 /// bottom border if slice_mode is kPad. Then padded tensor could be just sliced to multiple patches (default=0). 1882 /// \note The usage scenerio is suitable to tensor with large height and width. The tensor will keep the same 1883 /// if set both num_height and num_width to 1. And the number of output tensors is equal to num_height*num_width. 1884 /// \par Example 1885 /// \code 1886 /// /* Define operations */ 1887 /// auto decode_op = vision::Decode(); 1888 /// auto slice_patch_op = vision::SlicePatches(255, 255); 1889 /// 1890 /// /* dataset is an instance of Dataset object */ 1891 /// dataset = dataset->Map({decode_op, slice_patch_op}, // operations 1892 /// {"image"}); // input columns 1893 /// \endcode 1894 explicit SlicePatches(int32_t num_height = 1, int32_t num_width = 1, SliceMode slice_mode = SliceMode::kPad, 1895 uint8_t fill_value = 0); 1896 1897 /// \brief Destructor. 1898 ~SlicePatches() override = default; 1899 1900 protected: 1901 /// \brief Function to convert TensorTransform object into a TensorOperation object. 1902 /// \return Shared pointer to TensorOperation object. 1903 std::shared_ptr<TensorOperation> Parse() override; 1904 1905 private: 1906 struct Data; 1907 std::shared_ptr<Data> data_; 1908 }; 1909 1910 /// \brief Invert pixels within a specified range. 1911 class DATASET_API Solarize final : public TensorTransform { 1912 public: 1913 /// \brief Constructor. 1914 /// \param[in] threshold A vector with two elements specifying the pixel range to invert. 1915 /// Threshold values should always be in (min, max) format. 1916 /// If min=max, it will to invert all pixels above min(max). 1917 /// \par Example 1918 /// \code 1919 /// /* Define operations */ 1920 /// auto decode_op = vision::Decode(); 1921 /// auto solarize_op = vision::Solarize({0, 255}); 1922 /// 1923 /// /* dataset is an instance of Dataset object */ 1924 /// dataset = dataset->Map({decode_op, solarize_op}, // operations 1925 /// {"image"}); // input columns 1926 /// \endcode 1927 explicit Solarize(const std::vector<float> &threshold); 1928 1929 /// \brief Destructor. 1930 ~Solarize() override = default; 1931 1932 protected: 1933 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 1934 /// \return Shared pointer to TensorOperation object. 1935 std::shared_ptr<TensorOperation> Parse() override; 1936 1937 private: 1938 struct Data; 1939 std::shared_ptr<Data> data_; 1940 }; 1941 1942 /// \brief Divide the pixel values by 255 and convert from HWC format to CHW format with required datatype. 1943 class DATASET_API ToTensor final : public TensorTransform { 1944 public: 1945 /// \brief Constructor. 1946 /// \param[in] output_type The type of the output tensor of type mindspore::DataType or String 1947 /// (default=mindspore::DataType::kNumberTypeFloat32). 1948 /// \par Example 1949 /// \code 1950 /// /* Define operations */ 1951 /// auto to_tensor_op = vision::ToTensor(); 1952 /// 1953 /// /* dataset is an instance of Dataset object */ 1954 /// dataset = dataset->Map({to_tensor_op}, // operations 1955 /// {"image"}); // input columns 1956 /// \endcode 1957 ToTensor(); 1958 explicit ToTensor(std::string output_type); 1959 explicit ToTensor(mindspore::DataType output_type); 1960 1961 /// \brief Destructor. 1962 ~ToTensor() override = default; 1963 1964 protected: 1965 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 1966 /// \return Shared pointer to TensorOperation object. 1967 std::shared_ptr<TensorOperation> Parse() override; 1968 1969 private: 1970 struct Data; 1971 std::shared_ptr<Data> data_; 1972 }; 1973 1974 /// \brief Dataset-independent data-augmentation with TrivialAugment Wide. 1975 class DATASET_API TrivialAugmentWide final : public TensorTransform { 1976 public: 1977 /// \brief Constructor. 1978 /// \param[in] num_magnitude_bins The number of different magnitude values. Default: 31. 1979 /// \param[in] interpolation An enum for the mode of interpolation. Default: InterpolationMode::kNearestNeighbour. 1980 /// - InterpolationMode::kNearestNeighbour, Interpolation method is nearest-neighbor interpolation. 1981 /// - InterpolationMode::kLinear, Interpolation method is blinear interpolation. 1982 /// - InterpolationMode::kCubic, Interpolation method is bicubic interpolation. 1983 /// - InterpolationMode::kArea, Interpolation method is pixel area interpolation. 1984 /// \param[in] fill_value A vector representing the pixel intensity of the borders. Default: {0, 0, 0}. 1985 /// \par Example 1986 /// \code 1987 /// /* Define operations */ 1988 /// auto decode_op = vision::Decode(); 1989 /// auto trivial_augment_wide_op = vision::TrivialAugmentWide(); 1990 /// /* dataset is an instance of Dataset object */ 1991 /// dataset = dataset->Map({decode_op, trivial_augment_wide_op}, // operations 1992 /// {"image"}); // input columns 1993 /// \endcode 1994 explicit TrivialAugmentWide(int32_t num_magnitude_bins = 31, 1995 InterpolationMode interpolation = InterpolationMode::kNearestNeighbour, 1996 const std::vector<uint8_t> &fill_value = {0, 0, 0}); 1997 1998 /// \brief Destructor. 1999 ~TrivialAugmentWide() override = default; 2000 2001 protected: 2002 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 2003 /// \return Shared pointer to TensorOperation object. 2004 std::shared_ptr<TensorOperation> Parse() override; 2005 2006 private: 2007 struct Data; 2008 std::shared_ptr<Data> data_; 2009 }; 2010 2011 /// \brief Randomly perform transformations, as selected from input transform list, on the input tensor. 2012 class DATASET_API UniformAugment final : public TensorTransform { 2013 public: 2014 /// \brief Constructor. 2015 /// \param[in] transforms Raw pointer to vector of TensorTransform operations. 2016 /// \param[in] num_ops An integer representing the number of operations to be selected and applied. 2017 /// \par Example 2018 /// \code 2019 /// /* Define operations */ 2020 /// auto resize_op(new vision::Resize({30, 30})); 2021 /// auto random_crop_op(new vision::RandomCrop({28, 28})); 2022 /// auto center_crop_op(new vision::CenterCrop({16, 16})); 2023 /// auto uniform_op(new vision::UniformAugment({random_crop_op, center_crop_op}, 2)); 2024 /// 2025 /// /* dataset is an instance of Dataset object */ 2026 /// dataset = dataset->Map({resize_op, uniform_op}, // operations 2027 /// {"image"}); // input columns 2028 /// \endcode 2029 explicit UniformAugment(const std::vector<TensorTransform *> &transforms, int32_t num_ops = 2); 2030 2031 /// \brief Constructor. 2032 /// \param[in] transforms Smart pointer to vector of TensorTransform operations. 2033 /// \param[in] num_ops An integer representing the number of operations to be selected and applied. 2034 /// \par Example 2035 /// \code 2036 /// /* Define operations */ 2037 /// std::shared_ptr<TensorTransform> resize_op(new vision::Resize({30, 30})); 2038 /// std::shared_ptr<TensorTransform> random_crop_op(new vision::RandomCrop({28, 28})); 2039 /// std::shared_ptr<TensorTransform> center_crop_op(new vision::CenterCrop({16, 16})); 2040 /// std::shared_ptr<TensorTransform> uniform_op(new vision::UniformAugment({random_crop_op, center_crop_op}, 2)); 2041 /// 2042 /// /* dataset is an instance of Dataset object */ 2043 /// dataset = dataset->Map({resize_op, uniform_op}, // operations 2044 /// {"image"}); // input columns 2045 /// \endcode 2046 explicit UniformAugment(const std::vector<std::shared_ptr<TensorTransform>> &transforms, int32_t num_ops = 2); 2047 2048 /// \brief Constructor. 2049 /// \param[in] transforms Object pointer to vector of TensorTransform operations. 2050 /// \param[in] num_ops An integer representing the number of operations to be selected and applied. 2051 /// \par Example 2052 /// \code 2053 /// /* Define operations */ 2054 /// vision::Resize resize_op = vision::Resize({30, 30}); 2055 /// vision::RandomCrop random_crop_op = vision::RandomCrop({28, 28}); 2056 /// vision::CenterCrop center_crop_op = vision::CenterCrop({16, 16}); 2057 /// vision::UniformAugment uniform_op = vision::UniformAugment({random_crop_op, center_crop_op}, 2); 2058 /// 2059 /// /* dataset is an instance of Dataset object */ 2060 /// dataset = dataset->Map({resize_op, uniform_op}, // operations 2061 /// {"image"}); // input columns 2062 /// \endcode 2063 explicit UniformAugment(const std::vector<std::reference_wrapper<TensorTransform>> &transforms, int32_t num_ops = 2); 2064 2065 /// \brief Destructor. 2066 ~UniformAugment() override = default; 2067 2068 protected: 2069 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 2070 /// \return Shared pointer to TensorOperation object. 2071 std::shared_ptr<TensorOperation> Parse() override; 2072 2073 private: 2074 struct Data; 2075 std::shared_ptr<Data> data_; 2076 }; 2077 2078 /// \brief Flip the input image vertically. 2079 class DATASET_API VerticalFlip final : public TensorTransform { 2080 public: 2081 /// \brief Constructor. 2082 /// \par Example 2083 /// \code 2084 /// /* Define operations */ 2085 /// auto decode_op = vision::Decode(); 2086 /// auto flip_op = vision::VerticalFlip(); 2087 /// 2088 /// /* dataset is an instance of Dataset object */ 2089 /// dataset = dataset->Map({decode_op, flip_op}, // operations 2090 /// {"image"}); // input columns 2091 /// \endcode 2092 VerticalFlip(); 2093 2094 /// \brief Destructor. 2095 ~VerticalFlip() override = default; 2096 2097 protected: 2098 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 2099 /// \return Shared pointer to TensorOperation object. 2100 std::shared_ptr<TensorOperation> Parse() override; 2101 }; 2102 2103 /// \brief Write the one dimension uint8 data into a file using binary mode. 2104 /// \param[in] filename The path to the file to be written. 2105 /// \param[in] data The tensor data. 2106 /// \return The status code. 2107 Status DATASET_API WriteFile(const std::string &filename, const mindspore::MSTensor &data); 2108 2109 /// \brief Write the image data into a JPEG file. 2110 /// \param[in] filename The path to the file to be written. 2111 /// \param[in] image The data tensor. 2112 /// \param[in] quality The quality for JPEG file, in range of [1, 100]. Default: 75. 2113 /// \return The status code. 2114 Status DATASET_API WriteJpeg(const std::string &filename, const mindspore::MSTensor &image, int quality = 75); 2115 2116 /// \brief Write the image into a PNG file. 2117 /// \param[in] filename The path to the file to be written. 2118 /// \param[in] image The data tensor. 2119 /// \param[in] compression_level The compression level for PNG file, in range of [0, 9]. Default: 6. 2120 /// \return The status code. 2121 Status DATASET_API WritePng(const std::string &filename, const mindspore::MSTensor &image, int compression_level = 6); 2122 } // namespace vision 2123 } // namespace dataset 2124 } // namespace mindspore 2125 #endif // MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_DATASET_VISION_H_ 2126