1 /**
2 * Copyright 2019 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 #include "minddata/dataset/engine/datasetops/source/sampler/random_sampler.h"
17
18 #include <limits>
19 #include <memory>
20 #include "minddata/dataset/util/random.h"
21
22 namespace mindspore {
23 namespace dataset {
RandomSamplerRT(bool replacement,int64_t num_samples,bool reshuffle_each_epoch,int64_t samples_per_tensor)24 RandomSamplerRT::RandomSamplerRT(bool replacement, int64_t num_samples, bool reshuffle_each_epoch,
25 int64_t samples_per_tensor)
26 : SamplerRT(num_samples, samples_per_tensor),
27 seed_(GetSeed()),
28 replacement_(replacement),
29 next_id_(0),
30 dist(nullptr),
31 reshuffle_each_epoch_(reshuffle_each_epoch) {}
32
GetNextSample(TensorRow * out)33 Status RandomSamplerRT::GetNextSample(TensorRow *out) {
34 if (next_id_ > num_samples_) {
35 RETURN_STATUS_UNEXPECTED("Sampler index must be less than or equal to num_samples(total rows in dataset), but got" +
36 std::to_string(next_id_) + ", num_samplers:" + std::to_string(num_samples_));
37 } else if (next_id_ == num_samples_) {
38 (*out) = TensorRow(TensorRow::kFlagEOE);
39 } else {
40 if (HasChildSampler()) {
41 RETURN_IF_NOT_OK(child_[0]->GetNextSample(&child_ids_));
42 }
43
44 std::shared_ptr<Tensor> sampleIds;
45 int64_t last_id = std::min(samples_per_tensor_ + next_id_, num_samples_);
46 RETURN_IF_NOT_OK(CreateSamplerTensor(&sampleIds, last_id - next_id_));
47 auto id_ptr = sampleIds->begin<int64_t>();
48
49 for (int64_t i = 0; i < (last_id - next_id_); i++) {
50 int64_t sampled_id = 0;
51 if (replacement_) {
52 sampled_id = (*dist)(rnd_);
53 } else {
54 sampled_id = shuffled_ids_[static_cast<size_t>(i + next_id_)];
55 }
56
57 if (HasChildSampler()) {
58 RETURN_IF_NOT_OK(GetAssociatedChildId(&sampled_id, sampled_id));
59 }
60
61 *(id_ptr + static_cast<ptrdiff_t>(i)) = sampled_id;
62 }
63 next_id_ = last_id;
64 (*out) = {sampleIds};
65 }
66 return Status::OK();
67 }
68
InitSampler()69 Status RandomSamplerRT::InitSampler() {
70 if (is_initialized) {
71 return Status::OK();
72 }
73 // Special value of 0 for num_samples means that the user wants to sample the entire set of data.
74 // If the user asked to sample more rows than exists in the dataset, adjust the num_samples accordingly.
75 if (num_samples_ == 0 || num_samples_ > num_rows_) {
76 num_samples_ = num_rows_;
77 }
78 CHECK_FAIL_RETURN_UNEXPECTED(
79 num_samples_ > 0 && num_rows_ > 0,
80 "Invalid parameter, num_samples and num_rows must be greater than 0, but got num_samples: " +
81 std::to_string(num_samples_) + ", num_rows: " + std::to_string(num_rows_));
82 samples_per_tensor_ = samples_per_tensor_ > num_samples_ ? num_samples_ : samples_per_tensor_;
83 rnd_.seed(seed_);
84
85 if (!replacement_) {
86 shuffled_ids_.reserve(num_rows_);
87 for (int64_t i = 0; i < num_rows_; i++) {
88 shuffled_ids_.push_back(i);
89 }
90 std::shuffle(shuffled_ids_.begin(), shuffled_ids_.end(), rnd_);
91 } else {
92 dist = std::make_unique<std::uniform_int_distribution<int64_t>>(0, num_rows_ - 1);
93 }
94
95 is_initialized = true;
96 return Status::OK();
97 }
98
ResetSampler()99 Status RandomSamplerRT::ResetSampler() {
100 CHECK_FAIL_RETURN_UNEXPECTED(next_id_ == num_samples_, "[Internal ERROR] Reset() Sampler called early or late.");
101 next_id_ = 0;
102
103 if (reshuffle_each_epoch_) {
104 seed_++;
105 }
106
107 rnd_.seed(seed_);
108
109 if (!replacement_ && reshuffle_each_epoch_) {
110 std::shuffle(shuffled_ids_.begin(), shuffled_ids_.end(), rnd_);
111 }
112
113 if (HasChildSampler()) {
114 RETURN_IF_NOT_OK(child_[0]->ResetSampler());
115 }
116
117 return Status::OK();
118 }
119
SamplerPrint(std::ostream & out,bool show_all) const120 void RandomSamplerRT::SamplerPrint(std::ostream &out, bool show_all) const {
121 out << "\nSampler: RandomSampler";
122 if (show_all) {
123 // Call the super class for displaying any common detailed info
124 SamplerRT::SamplerPrint(out, show_all);
125 // Then add our own info if any
126 }
127 }
128
to_json(nlohmann::json * out_json)129 Status RandomSamplerRT::to_json(nlohmann::json *out_json) {
130 nlohmann::json args;
131 RETURN_IF_NOT_OK(SamplerRT::to_json(&args));
132 args["sampler_name"] = "RandomSampler";
133 args["replacement"] = replacement_;
134 args["reshuffle_each_epoch"] = reshuffle_each_epoch_;
135
136 *out_json = args;
137 return Status::OK();
138 }
139 } // namespace dataset
140 } // namespace mindspore
141