• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2019 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "minddata/dataset/engine/datasetops/source/sampler/random_sampler.h"
18 
19 #include <algorithm>
20 #include <limits>
21 #include <memory>
22 
23 #include "minddata/dataset/util/random.h"
24 
25 namespace mindspore {
26 namespace dataset {
RandomSamplerRT(bool replacement,int64_t num_samples,bool reshuffle_each_epoch,int64_t samples_per_tensor)27 RandomSamplerRT::RandomSamplerRT(bool replacement, int64_t num_samples, bool reshuffle_each_epoch,
28                                  int64_t samples_per_tensor)
29     : SamplerRT(num_samples, samples_per_tensor),
30       seed_(GetSeed()),
31       replacement_(replacement),
32       next_id_(0),
33       dist(nullptr),
34       reshuffle_each_epoch_(reshuffle_each_epoch) {}
35 
GetNextSample(TensorRow * out)36 Status RandomSamplerRT::GetNextSample(TensorRow *out) {
37   RETURN_UNEXPECTED_IF_NULL(out);
38   if (next_id_ > num_samples_) {
39     RETURN_STATUS_UNEXPECTED(
40       "[Internal ERROR] Sampler index must be less than or equal to num_samples(total rows in dataset), but got" +
41       std::to_string(next_id_) + ", num_samplers:" + std::to_string(num_samples_));
42   } else if (next_id_ == num_samples_) {
43     (*out) = TensorRow(TensorRow::kFlagEOE);
44   } else {
45     if (HasChildSampler()) {
46       RETURN_IF_NOT_OK(child_[0]->GetNextSample(&child_ids_));
47     }
48 
49     std::shared_ptr<Tensor> sampleIds;
50     int64_t last_id = std::min(samples_per_tensor_ + next_id_, num_samples_);
51     RETURN_IF_NOT_OK(CreateSamplerTensor(&sampleIds, last_id - next_id_));
52     auto id_ptr = sampleIds->begin<int64_t>();
53 
54     for (int64_t i = 0; i < (last_id - next_id_); i++) {
55       int64_t sampled_id = 0;
56       if (replacement_) {
57         sampled_id = (*dist)(rnd_);
58       } else {
59         sampled_id = shuffled_ids_[static_cast<size_t>(i + next_id_)];
60       }
61 
62       if (HasChildSampler()) {
63         RETURN_IF_NOT_OK(GetAssociatedChildId(&sampled_id, sampled_id));
64       }
65 
66       *(id_ptr + static_cast<ptrdiff_t>(i)) = sampled_id;
67     }
68     next_id_ = last_id;
69     (*out) = {sampleIds};
70   }
71   return Status::OK();
72 }
73 
InitSampler()74 Status RandomSamplerRT::InitSampler() {
75   if (is_initialized) {
76     return Status::OK();
77   }
78   // Special value of 0 for num_samples means that the user wants to sample the entire set of data.
79   // If the user asked to sample more rows than exists in the dataset, adjust the num_samples accordingly.
80   if (num_samples_ == 0 || num_samples_ > num_rows_) {
81     num_samples_ = num_rows_;
82   }
83   CHECK_FAIL_RETURN_UNEXPECTED(
84     num_samples_ > 0 && num_rows_ > 0,
85     "[Internal ERROR] num_samples and num_rows must be greater than 0, but got num_samples: " +
86       std::to_string(num_samples_) + ", num_rows: " + std::to_string(num_rows_));
87   samples_per_tensor_ = samples_per_tensor_ > num_samples_ ? num_samples_ : samples_per_tensor_;
88   rnd_.seed(seed_);
89 
90   if (!replacement_) {
91     shuffled_ids_.reserve(num_rows_);
92     for (int64_t i = 0; i < num_rows_; i++) {
93       shuffled_ids_.push_back(i);
94     }
95     std::shuffle(shuffled_ids_.begin(), shuffled_ids_.end(), rnd_);
96   } else {
97     dist = std::make_unique<std::uniform_int_distribution<int64_t>>(0, num_rows_ - 1);
98   }
99 
100   is_initialized = true;
101   return Status::OK();
102 }
103 
ResetSampler(const bool failover_reset)104 Status RandomSamplerRT::ResetSampler(const bool failover_reset) {
105   CHECK_FAIL_RETURN_UNEXPECTED(failover_reset || next_id_ == num_samples_,
106                                "[Internal ERROR] ResetSampler() called early or late.");
107   next_id_ = 0;
108 
109   if (reshuffle_each_epoch_) {
110     seed_++;
111   }
112 
113   rnd_.seed(seed_);
114 
115   if (!replacement_ && reshuffle_each_epoch_) {
116     std::shuffle(shuffled_ids_.begin(), shuffled_ids_.end(), rnd_);
117   }
118 
119   if (HasChildSampler()) {
120     RETURN_IF_NOT_OK(child_[0]->ResetSampler(failover_reset));
121   }
122 
123   return Status::OK();
124 }
125 
SamplerPrint(std::ostream & out,bool show_all) const126 void RandomSamplerRT::SamplerPrint(std::ostream &out, bool show_all) const {
127   out << "\nSampler: RandomSampler";
128   if (show_all) {
129     // Call the super class for displaying any common detailed info
130     SamplerRT::SamplerPrint(out, show_all);
131     // Then add our own info if any
132   }
133 }
134 
to_json(nlohmann::json * out_json)135 Status RandomSamplerRT::to_json(nlohmann::json *out_json) {
136   RETURN_UNEXPECTED_IF_NULL(out_json);
137   nlohmann::json args;
138   RETURN_IF_NOT_OK(SamplerRT::to_json(&args));
139   args["sampler_name"] = "RandomSampler";
140   args["replacement"] = replacement_;
141   args["reshuffle_each_epoch"] = reshuffle_each_epoch_;
142 
143   *out_json = args;
144   return Status::OK();
145 }
146 }  // namespace dataset
147 }  // namespace mindspore
148