1 /** 2 * Copyright 2019 Huawei Technologies Co., Ltd 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DATASETOPS_SOURCE_SAMPLER_WEIGHTED_RANDOM_SAMPLER_H_ 17 #define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DATASETOPS_SOURCE_SAMPLER_WEIGHTED_RANDOM_SAMPLER_H_ 18 19 #include <deque> 20 #include <limits> 21 #include <memory> 22 #include <vector> 23 24 #include "minddata/dataset/engine/datasetops/source/sampler/sampler.h" 25 26 namespace mindspore { 27 namespace dataset { 28 // Samples elements from id `0, 1, ..., weights.size()-1` with given probabilities (weights). 29 class WeightedRandomSamplerRT : public SamplerRT { 30 public: 31 // Constructor. 32 // @param weights A lift of sample weights. 33 // @param num_samples Number of samples to be drawn. 34 // @param replacement Determine if samples are drawn with/without replacement. 35 // @param samples_per_tensor The number of ids we draw on each call to GetNextSample(). 36 // When samples_per_tensor=0, GetNextSample() will draw all the sample ids and return them at once. 37 WeightedRandomSamplerRT(const std::vector<double> &weights, int64_t num_samples, bool replacement, 38 int64_t samples_per_tensor = std::numeric_limits<int64_t>::max()); 39 40 // Destructor. 41 ~WeightedRandomSamplerRT() = default; 42 43 // Initialize the sampler. 44 // @param op (Not used in this sampler) 45 // @return Status 46 Status InitSampler() override; 47 48 /// \brief Reset the internal variable(s) to the initial state and reshuffle the indices. 49 /// \param[in] failover_reset A boolean to show whether we are resetting the pipeline 50 /// \return Status The status code returned 51 Status ResetSampler(const bool failover_reset = false) override; 52 53 // Get the sample ids. 54 // @param[out] TensorRow where the sample ids will be placed. 55 // @note the sample ids (int64_t) will be placed in one Tensor 56 Status GetNextSample(TensorRow *out) override; 57 58 // Printer for debugging purposes. 59 // @param out - output stream to write to 60 // @param show_all - bool to show detailed vs summary 61 void SamplerPrint(std::ostream &out, bool show_all) const override; 62 63 /// \brief Get the arguments of node 64 /// \param[out] out_json JSON string of all attributes 65 /// \return Status of the function 66 Status to_json(nlohmann::json *out_json) override; 67 68 private: 69 // A list of weights for each sample. 70 std::vector<double> weights_; 71 72 // A flag indicating if samples are drawn with/without replacement. 73 bool replacement_; 74 75 // Current sample id. 76 int64_t sample_id_; 77 78 // Random engine and device 79 std::mt19937 rand_gen_; 80 81 // Discrete distribution for generating weighted random numbers with replacement. 82 std::unique_ptr<std::discrete_distribution<int64_t>> discrete_dist_; 83 84 // Exponential distribution for generating weighted random numbers without replacement. 85 // based on "Accelerating weighted random sampling without replacement" by Kirill Muller. 86 std::unique_ptr<std::exponential_distribution<>> exp_dist_; 87 88 // Initialized the computation for generating weighted random numbers without replacement 89 // using onepass method. 90 void InitOnePassSampling(); 91 92 // Store the random weighted ids generated by onepass method in `InitOnePassSampling` 93 std::deque<int64_t> onepass_ids_; 94 }; 95 } // namespace dataset 96 } // namespace mindspore 97 98 #endif 99