• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2019 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #include "minddata/dataset/engine/datasetops/source/sampler/pk_sampler.h"
17 #include <algorithm>
18 #include <memory>
19 #include "minddata/dataset/util/random.h"
20 
21 namespace mindspore {
22 namespace dataset {
PKSamplerRT(int64_t num_val,bool shuffle,int64_t num_samples,int64_t samples_per_tensor)23 PKSamplerRT::PKSamplerRT(int64_t num_val, bool shuffle, int64_t num_samples, int64_t samples_per_tensor)
24     : SamplerRT(num_samples, samples_per_tensor),
25       shuffle_(shuffle),
26       seed_(GetSeed()),
27       next_id_(0),
28       samples_per_class_(num_val) {}
29 
InitSampler()30 Status PKSamplerRT::InitSampler() {
31   if (is_initialized) {
32     return Status::OK();
33   }
34   labels_.reserve(label_to_ids_.size());
35   for (const auto &pair : label_to_ids_) {
36     if (!pair.second.empty()) {
37       labels_.push_back(pair.first);
38     }
39   }
40   rnd_.seed(seed_++);
41 
42   // The special handshake gives the list of classes and id's, but it did not set the num_rows_ to
43   // capture the total number of possible sample ids.
44   // Compute that here for this case to find the total number of samples that are available to return.
45   // (in this case, samples per class * total classes).
46   if (samples_per_class_ > std::numeric_limits<int64_t>::max() / static_cast<int64_t>(labels_.size())) {
47     RETURN_STATUS_UNEXPECTED("Overflow in counting  num_rows");
48   }
49   num_rows_ = samples_per_class_ * static_cast<int64_t>(labels_.size());
50 
51   // The user may have chosen to sample less than the total amount.
52   // Special value of 0 for num_samples means that the user wants to sample the entire set of data.
53   // If the user asked to sample more rows than exists in the dataset, adjust the num_samples accordingly.
54   if (num_samples_ == 0 || num_samples_ > num_rows_) {
55     num_samples_ = num_rows_;
56   }
57 
58   samples_per_tensor_ = (samples_per_tensor_ > num_samples_) ? num_samples_ : samples_per_tensor_;
59   if (shuffle_ == true) {
60     std::shuffle(labels_.begin(), labels_.end(), rnd_);
61   } else {
62     std::sort(labels_.begin(), labels_.end());
63   }
64   CHECK_FAIL_RETURN_UNEXPECTED(
65     num_samples_ > 0, "Invalid parameter, num_class or num samples per class must be greater than 0, but got " +
66                         std::to_string(num_samples_));
67   is_initialized = true;
68   return Status::OK();
69 }
70 
GetNextSample(TensorRow * out)71 Status PKSamplerRT::GetNextSample(TensorRow *out) {
72   if (next_id_ > num_samples_ || num_samples_ == 0) {
73     RETURN_STATUS_UNEXPECTED(
74       "Sampler index must be less than or equal to num_samples(total rows in dataset), but got: " +
75       std::to_string(next_id_) + ", num_samplers:" + std::to_string(num_samples_));
76   } else if (next_id_ == num_samples_) {
77     (*out) = TensorRow(TensorRow::kFlagEOE);
78   } else {
79     if (HasChildSampler()) {
80       RETURN_IF_NOT_OK(child_[0]->GetNextSample(&child_ids_));
81     }
82 
83     std::shared_ptr<Tensor> sample_ids;
84     int64_t last_id = (samples_per_tensor_ + next_id_ > num_samples_) ? num_samples_ : samples_per_tensor_ + next_id_;
85     RETURN_IF_NOT_OK(CreateSamplerTensor(&sample_ids, last_id - next_id_));
86     auto id_ptr = sample_ids->begin<int64_t>();
87     CHECK_FAIL_RETURN_UNEXPECTED(samples_per_class_ != 0, "Invalid Parameter, num samples per class can't be zero.");
88     while (next_id_ < last_id && id_ptr != sample_ids->end<int64_t>()) {
89       int64_t cls_id = next_id_++ / samples_per_class_;
90       const std::vector<int64_t> &samples = label_to_ids_[labels_[cls_id]];
91       int64_t rnd_ind = std::uniform_int_distribution<int64_t>(0, samples.size() - 1)(rnd_);
92       int64_t sampled_id = samples[rnd_ind];
93 
94       if (HasChildSampler()) {
95         RETURN_IF_NOT_OK(GetAssociatedChildId(&sampled_id, sampled_id));
96       }
97 
98       *id_ptr = sampled_id;
99       ++id_ptr;
100     }
101 
102     (*out) = {sample_ids};
103   }
104   return Status::OK();
105 }
106 
ResetSampler()107 Status PKSamplerRT::ResetSampler() {
108   CHECK_FAIL_RETURN_UNEXPECTED(next_id_ == num_samples_, "[Internal ERROR] Reset() Sampler called early or late.");
109   next_id_ = 0;
110   rnd_.seed(seed_++);
111 
112   if (HasChildSampler()) {
113     RETURN_IF_NOT_OK(child_[0]->ResetSampler());
114   }
115 
116   return Status::OK();
117 }
118 
HandshakeRandomAccessOp(const RandomAccessOp * op)119 Status PKSamplerRT::HandshakeRandomAccessOp(const RandomAccessOp *op) {
120   RETURN_UNEXPECTED_IF_NULL(op);
121   RETURN_IF_NOT_OK(op->GetClassIds(&label_to_ids_));
122   RETURN_IF_NOT_OK(InitSampler());
123   return Status::OK();
124 }
125 
SamplerPrint(std::ostream & out,bool show_all) const126 void PKSamplerRT::SamplerPrint(std::ostream &out, bool show_all) const {
127   out << "\nSampler: PKSampler";
128   if (show_all) {
129     // Call the super class for displaying any common detailed info
130     SamplerRT::SamplerPrint(out, show_all);
131     // Then add our own info if any
132   }
133 }
134 
to_json(nlohmann::json * out_json)135 Status PKSamplerRT::to_json(nlohmann::json *out_json) {
136   nlohmann::json args;
137   RETURN_IF_NOT_OK(SamplerRT::to_json(&args));
138   args["sampler_name"] = "PKSampler";
139   args["num_val"] = samples_per_class_;
140   args["shuffle"] = shuffle_;
141   *out_json = args;
142   return Status::OK();
143 }
144 }  // namespace dataset
145 }  // namespace mindspore
146