1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 #ifndef TENSORFLOW_CORE_KERNELS_RANGE_SAMPLER_H_ 17 #define TENSORFLOW_CORE_KERNELS_RANGE_SAMPLER_H_ 18 19 #include <vector> 20 21 #include "tensorflow/core/lib/core/status.h" 22 #include "tensorflow/core/lib/gtl/array_slice.h" 23 #include "tensorflow/core/lib/random/distribution_sampler.h" 24 #include "tensorflow/core/lib/random/random_distributions.h" 25 #include "tensorflow/core/lib/random/weighted_picker.h" 26 #include "tensorflow/core/platform/logging.h" 27 #include "tensorflow/core/platform/mutex.h" 28 #include "tensorflow/core/platform/thread_annotations.h" 29 #include "tensorflow/core/platform/types.h" 30 31 namespace tensorflow { 32 33 class Env; 34 35 // Abstract subclass for sampling from the set of non-negative integers 36 // [0, range) 37 class RangeSampler { 38 public: RangeSampler(int64 range)39 explicit RangeSampler(int64 range) : range_(range) { CHECK_GT(range_, 0); } 40 virtual ~RangeSampler(); 41 42 // Sample a single value 43 virtual int64 Sample(random::SimplePhilox* rnd) const = 0; 44 45 // The probability that a single call to Sample() returns the given value. 46 // Assumes that value is in [0, range). No range checking is done. 47 virtual float Probability(int64 value) const = 0; 48 49 // Fill "batch" with samples from the distribution. 50 // If unique=true, then we re-pick each element until we get a 51 // value distinct from all previously picked values in the batch. 52 void SampleBatch(random::SimplePhilox* rnd, bool unique, 53 gtl::MutableArraySlice<int64> batch) const; 54 55 // Fill "batch" with samples from the distribution, and report 56 // "expected counts". 57 // 58 // The "expected count" of a value is an estimate of the expected 59 // number of occurrences of the value in the batch returned by a 60 // call to this function with the given parameters. If unique=true, 61 // the expected count is an inclusion probability. For details on 62 // this estimation, see the comment to "ExpectedCountHelper" in the 63 // .cc file. 64 // 65 // Expected counts for the elements of the returned "batch" are reported 66 // in the aligned array "batch_expected_count". 67 // 68 // The user can optionally provide "extras", containing values in the range. 69 // The expected counts for the extras are reported in the aligned array 70 // "extras_expected_count". 71 // 72 // "batch_expected_count" must have size equal to 0 or to the size of "batch". 73 // "extras" and "extras_expected_count" must have equal size. 74 void SampleBatchGetExpectedCount( 75 random::SimplePhilox* rnd, bool unique, 76 gtl::MutableArraySlice<int64> batch, 77 gtl::MutableArraySlice<float> batch_expected_count, 78 gtl::ArraySlice<int64> extras, 79 gtl::MutableArraySlice<float> extras_expected_count) const; 80 81 // Same as SampleBatchGetExpectedCount (see above), but with avoided values. 82 // We repick to avoid all of the values in "avoided_values". 83 // "avoided_values" is only supported with unique=true. If 84 // unique=false, then avoided_values must be empty. 85 virtual void SampleBatchGetExpectedCountAvoid( 86 random::SimplePhilox* rnd, bool unique, 87 gtl::MutableArraySlice<int64> batch, 88 gtl::MutableArraySlice<float> batch_expected_count, 89 gtl::ArraySlice<int64> extras, 90 gtl::MutableArraySlice<float> extras_expected_count, 91 gtl::ArraySlice<int64> avoided_values) const; 92 93 // Does this sampler need to be updated with values, e.g. UnigramSampler NeedsUpdates()94 virtual bool NeedsUpdates() const { return false; } 95 96 // Updates the underlying distribution Update(gtl::ArraySlice<int64> values)97 virtual void Update(gtl::ArraySlice<int64> values) { 98 LOG(FATAL) << "Update not supported for this sampler type."; 99 } 100 range()101 int64 range() { return range_; } 102 103 protected: 104 const int64 range_; 105 }; 106 107 // An AllSampler only samples batches of size equal to range. 108 // It returns the entire range. 109 // It cannot sample single values. 110 class AllSampler : public RangeSampler { 111 public: 112 explicit AllSampler(int64 range); 113 ~AllSampler()114 ~AllSampler() override {} 115 Sample(random::SimplePhilox * rnd)116 int64 Sample(random::SimplePhilox* rnd) const override { 117 LOG(FATAL) << "Should not be called"; 118 return 0; 119 } 120 Probability(int64 value)121 float Probability(int64 value) const override { 122 LOG(FATAL) << "Should not be called"; 123 return 0; 124 } 125 126 void SampleBatchGetExpectedCountAvoid( 127 random::SimplePhilox* rnd, bool unique, 128 gtl::MutableArraySlice<int64> batch, 129 gtl::MutableArraySlice<float> batch_expected_count, 130 gtl::ArraySlice<int64> extras, 131 gtl::MutableArraySlice<float> extras_expected_count, 132 gtl::ArraySlice<int64> avoided_values) const override; 133 }; 134 135 class UniformSampler : public RangeSampler { 136 public: 137 explicit UniformSampler(int64 range); 138 ~UniformSampler()139 ~UniformSampler() override {} 140 141 int64 Sample(random::SimplePhilox* rnd) const override; 142 143 float Probability(int64 value) const override; 144 145 private: 146 const float inv_range_; 147 }; 148 149 class LogUniformSampler : public RangeSampler { 150 public: 151 explicit LogUniformSampler(int64 range); 152 ~LogUniformSampler()153 ~LogUniformSampler() override {} 154 155 int64 Sample(random::SimplePhilox* rnd) const override; 156 157 float Probability(int64 value) const override; 158 159 private: 160 const double log_range_; 161 }; 162 163 // Thread-unsafe unigram sampler 164 class ThreadUnsafeUnigramSampler : public RangeSampler { 165 public: 166 explicit ThreadUnsafeUnigramSampler(int64 range); ~ThreadUnsafeUnigramSampler()167 ~ThreadUnsafeUnigramSampler() override {} 168 169 int64 Sample(random::SimplePhilox* rnd) const override; 170 171 float Probability(int64 value) const override; 172 NeedsUpdates()173 bool NeedsUpdates() const override { return true; } 174 void Update(gtl::ArraySlice<int64> values) override; 175 176 private: 177 random::WeightedPicker picker_; 178 }; 179 180 // Thread-safe unigram sampler 181 class UnigramSampler : public RangeSampler { 182 public: 183 explicit UnigramSampler(int64 range); ~UnigramSampler()184 ~UnigramSampler() override {} 185 186 int64 Sample(random::SimplePhilox* rnd) const override; 187 188 float Probability(int64 value) const override; 189 190 // Overriding at a high level results in far fewer lock acquisitions. 191 void SampleBatchGetExpectedCountAvoid( 192 random::SimplePhilox* rnd, bool unique, 193 gtl::MutableArraySlice<int64> batch, 194 gtl::MutableArraySlice<float> batch_expected_count, 195 gtl::ArraySlice<int64> extras, 196 gtl::MutableArraySlice<float> extras_expected_count, 197 gtl::ArraySlice<int64> avoided_values) const override; 198 NeedsUpdates()199 bool NeedsUpdates() const override { return true; } 200 void Update(gtl::ArraySlice<int64> values) override; 201 202 private: 203 ThreadUnsafeUnigramSampler unsafe_sampler_ TF_GUARDED_BY(mu_); 204 mutable mutex mu_; 205 }; 206 207 // A unigram sampler that uses a fixed unigram distribution read from a 208 // file or passed in as an in-memory array instead of building up the 209 // distribution from data on the fly. There is also an option to skew the 210 // distribution by applying a distortion power to the weights. 211 class FixedUnigramSampler : public RangeSampler { 212 public: 213 // The vocab_file is assumed to be a CSV, with the last entry of each row a 214 // value representing the counts or probabilities for the corresponding ID. 215 FixedUnigramSampler(Env* env, int64 range, const string& vocab_file, 216 float distortion, int32 num_reserved_ids, 217 int32 num_shards, int32 shard); 218 219 FixedUnigramSampler(int64 range, const std::vector<float>& unigrams, 220 float distortion, int32 num_reserved_ids, 221 int32 num_shards, int32 shard); 222 223 float Probability(int64 value) const override; 224 225 int64 Sample(random::SimplePhilox* rnd) const override; 226 227 private: 228 // Underlying distribution sampler. 229 std::unique_ptr<random::DistributionSampler> dist_sampler_; 230 // Weights for individual samples. The probability of a sample i is defined 231 // as weights_.at(i) / total_weight_. 232 std::vector<float> weights_; 233 // The total weights of all samples. 234 float total_weight_; 235 // Sharding information of the sampler. The whole vocabulary is sharded 236 // into num_shards_ smaller ranges and each sampler is responsible for one 237 // such smaller range, identified by the shard number. 238 int32 num_shards_; 239 int32 shard_; 240 241 // Fill the sampler with the appropriate number of reserved IDs. 242 void FillReservedIds(int32 num_reserved_ids); 243 // Load IDs to sample from a CSV file. It is assumed that the last item of 244 // each row contains a count or probability for the corresponding ID. 245 Status LoadFromFile(Env* env, const string& vocab_file, float distortion); 246 // Load from an in-memory array. 247 void LoadFromUnigrams(const std::vector<float>& unigrams, float distortion); 248 }; 249 250 } // namespace tensorflow 251 252 #endif // TENSORFLOW_CORE_KERNELS_RANGE_SAMPLER_H_ 253