1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #ifndef TENSORFLOW_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_
17 #define TENSORFLOW_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_
18
19 #define _USE_MATH_DEFINES
20 #include <math.h>
21 #include <cmath>
22 #undef _USE_MATH_DEFINES
23
24 #include <string.h>
25 #include <algorithm>
26
27 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
28 #include "tensorflow/core/lib/random/philox_random.h"
29
30 namespace tensorflow {
31 namespace random {
32
33 // Helper function to convert a 16-bit integer to a half between [0..1).
34 PHILOX_DEVICE_INLINE Eigen::half Uint16ToHalf(uint16 x);
35 // Helper function to convert a 32-bit integer to a float between [0..1).
36 PHILOX_DEVICE_INLINE float Uint32ToFloat(uint32 x);
37 // Helper function to convert two 32-bit integers to a double between [0..1).
38 PHILOX_DEVICE_INLINE double Uint64ToDouble(uint32 x0, uint32 x1);
39
40 // A class that generates uniform distribution random numbers from the
41 // underlying random integer generator.
42 // Arguments:
43 // Generator: a generator type that returns a number of uint32 upon each
44 // invocation. It needs to define kResultElementCount for the
45 // sample count for each invocation, and ResultType for the
46 // actual returned sample type.
47 // RealType: the data type of the real numbers that will be returned by the
48 // distribution. This could be either float or double for now.
49 // This class is meant to be implemented through specialization. The default
50 // is not defined by design.
51 template <class Generator, typename RealType>
52 class UniformDistribution;
53
54 template <class Generator>
55 class UniformDistribution<Generator, Eigen::half> {
56 public:
57 // The number of elements that will be returned.
58 static const int kResultElementCount = Generator::kResultElementCount;
59 // Cost of generation of a single element (in cycles).
60 static const int kElementCost = 3;
61 // Indicate that this distribution may take variable number of samples
62 // during the runtime.
63 static const bool kVariableSamplesPerOutput = false;
64 typedef Array<Eigen::half, kResultElementCount> ResultType;
65 typedef Eigen::half ResultElementType;
66
67 PHILOX_DEVICE_INLINE
operator()68 ResultType operator()(Generator* gen) {
69 typename Generator::ResultType sample = (*gen)();
70 ResultType result;
71 for (int i = 0; i < kResultElementCount; ++i) {
72 result[i] = Uint16ToHalf(sample[i]); // Truncate the upper 16 bits.
73 }
74 return result;
75 }
76 };
77
78 template <class Generator>
79 class UniformDistribution<Generator, float> {
80 public:
81 // The number of elements that will be returned.
82 static const int kResultElementCount = Generator::kResultElementCount;
83 // Cost of generation of a single element (in cycles).
84 static const int kElementCost = 3;
85 // Indicate that this distribution may take variable number of samples
86 // during the runtime.
87 static const bool kVariableSamplesPerOutput = false;
88 typedef Array<float, kResultElementCount> ResultType;
89 typedef float ResultElementType;
90
91 PHILOX_DEVICE_INLINE
operator()92 ResultType operator()(Generator* gen) {
93 typename Generator::ResultType sample = (*gen)();
94 ResultType result;
95 for (int i = 0; i < kResultElementCount; ++i) {
96 result[i] = Uint32ToFloat(sample[i]);
97 }
98 return result;
99 }
100 };
101
102 template <class Generator>
103 class UniformDistribution<Generator, double> {
104 public:
105 // The number of elements that will be returned.
106 static const int kResultElementCount = Generator::kResultElementCount / 2;
107 // Cost of generation of a single element (in cycles).
108 static const int kElementCost = 3;
109 // Indicate that this distribution may take variable number of samples
110 // during the runtime.
111 static const bool kVariableSamplesPerOutput = false;
112 typedef Array<double, kResultElementCount> ResultType;
113 typedef double ResultElementType;
114
115 PHILOX_DEVICE_INLINE
operator()116 ResultType operator()(Generator* gen) {
117 typename Generator::ResultType sample = (*gen)();
118 ResultType result;
119 for (int i = 0; i < kResultElementCount; ++i) {
120 result[i] = Uint64ToDouble(sample[2 * i], sample[2 * i + 1]);
121 }
122 return result;
123 }
124 };
125
126 template <class Generator>
127 class UniformDistribution<Generator, int32> {
128 public:
129 // The number of elements that will be returned.
130 static const int kResultElementCount = Generator::kResultElementCount;
131 // Cost of generation of a single element (in cycles).
132 static const int kElementCost = 3;
133 // Indicate that this distribution may take variable number of samples
134 // during the runtime.
135 static const bool kVariableSamplesPerOutput = false;
136 typedef Array<int32, kResultElementCount> ResultType;
137 typedef int32 ResultElementType;
138
139 // Must have lo < hi
UniformDistribution(int32 lo,int32 hi)140 UniformDistribution(int32 lo, int32 hi) : lo_(lo), range_(hi - lo) {}
141
142 PHILOX_DEVICE_INLINE
operator()143 ResultType operator()(Generator* gen) {
144 typename Generator::ResultType sample = (*gen)();
145 ResultType result;
146 for (int i = 0; i < kResultElementCount; ++i) {
147 result[i] = lo_ + static_cast<int32>(sample[i] % range_);
148 }
149 return result;
150 }
151
152 private:
153 // Note that lo_ is intentionally signed while range_ is intentionally
154 // unsigned. This is because hi - lo can overflow signed integers if
155 // lo < 0 < hi, but always fits in unsigned.
156 int32 lo_;
157 uint32 range_;
158 };
159
160 template <class Generator>
161 class UniformDistribution<Generator, int64> {
162 public:
163 // The number of elements that will be returned.
164 static const int kResultElementCount = Generator::kResultElementCount / 2;
165 // Cost of generation of a single element (in cycles).
166 static const int kElementCost = 3;
167 // Indicate that this distribution may take variable number of samples
168 // during the runtime.
169 static const bool kVariableSamplesPerOutput = false;
170 typedef Array<int64, kResultElementCount> ResultType;
171 typedef int64 ResultElementType;
172
173 // Must have lo < hi
UniformDistribution(int64 lo,int64 hi)174 UniformDistribution(int64 lo, int64 hi) : lo_(lo), range_(hi - lo) {}
175
176 PHILOX_DEVICE_INLINE
operator()177 ResultType operator()(Generator* gen) {
178 typename Generator::ResultType sample = (*gen)();
179 ResultType result;
180 for (int i = 0; i < kResultElementCount; ++i) {
181 auto bits = sample[2 * i] | static_cast<uint64>(sample[2 * i + 1]) << 32;
182 result[i] = lo_ + static_cast<int64>(bits % range_);
183 }
184 return result;
185 }
186
187 private:
188 // Note that lo_ is intentionally signed while range_ is intentionally
189 // unsigned. This is because hi - lo can overflow signed integers if
190 // lo < 0 < hi, but always fits in unsigned.
191 int64 lo_;
192 uint64 range_;
193 };
194
195 // A class that adapts the underlying native multiple samples to return a single
196 // sample at a time.
197 template <class Generator>
198 class SingleSampleAdapter {
199 public:
200 // The number of elements that will be returned.
201 static const int kResultElementCount = 1;
202 // The number of elements that will be returned by the underlying generator.
203 static const int kNativeElementCount = Generator::kResultElementCount;
204 typedef typename Generator::ResultElementType ResultType;
205 typedef typename Generator::ResultElementType ResultElementType;
206
207 PHILOX_DEVICE_INLINE
SingleSampleAdapter(Generator * gen)208 explicit SingleSampleAdapter(Generator* gen)
209 : generator_(gen), used_result_index_(Generator::kResultElementCount) {}
210
211 PHILOX_DEVICE_INLINE
operator()212 ResultType operator()() {
213 if (used_result_index_ == Generator::kResultElementCount) {
214 unused_results_ = (*generator_)();
215 used_result_index_ = 0;
216 }
217
218 return unused_results_[used_result_index_++];
219 }
220
221 PHILOX_DEVICE_INLINE
Skip(uint64 num_skips)222 void Skip(uint64 num_skips) {
223 if (!num_skips) {
224 return;
225 }
226 int num_unused_results = kNativeElementCount - used_result_index_;
227 if (num_skips <= num_unused_results) {
228 used_result_index_ += num_skips;
229 return;
230 }
231 num_skips -= num_unused_results;
232 used_result_index_ = kNativeElementCount;
233 SkipFromGenerator(num_skips / kNativeElementCount);
234 num_skips = num_skips % kNativeElementCount;
235 if (num_skips) {
236 unused_results_ = (*generator_)();
237 used_result_index_ = num_skips;
238 }
239 }
240
241 private:
242 // This implementation iteratively skips over `num_skips` samples
243 // from `generator_`. There is an O(1) implementation for PhiloxRandom
244 // in random_distributions.cc.
245 PHILOX_DEVICE_INLINE
SkipFromGenerator(uint64 num_skips)246 void SkipFromGenerator(uint64 num_skips) {
247 while (num_skips--) {
248 (*generator_)();
249 }
250 }
251
252 Generator* generator_;
253 typename Generator::ResultType unused_results_;
254 int used_result_index_;
255 };
256
257 // A class that generates unit normal distribution random numbers from the
258 // underlying random integer generator.
259 // Arguments:
260 // Generator: a generator type that returns a number of uint32 upon each
261 // each invocation. It needs to define kResultElementCount for the
262 // sample count for each invocation, and ResultType for actual
263 // returned sample type.
264 // RealType: the data type of the real numbers that will be returned by the
265 // distribution. This could be either float or double for now.
266 // This class is meant to be implemented through specialization. The default
267 // is not defined by design.
268 template <class Generator, typename RealType>
269 class NormalDistribution;
270
271 PHILOX_DEVICE_INLINE
272 void BoxMullerFloat(uint32 x0, uint32 x1, float* f0, float* f1);
273
274 PHILOX_DEVICE_INLINE
275 void BoxMullerDouble(uint32 x0, uint32 x1, uint32 x2, uint32 x3, double* d0,
276 double* d1);
277
278 // Exactly like the float version, except that we convert to half afterwards;
279 // since we don't have half-precision sin/cos even on GPUs, there's nothing to
280 // gain from working in half internally.
281 template <class Generator>
282 class NormalDistribution<Generator, Eigen::half> {
283 public:
284 // The number of elements that will be returned.
285 static const int kResultElementCount = Generator::kResultElementCount;
286 // Cost of generation of a single element (in cycles).
287 static const int kElementCost = 70;
288 // Indicate that this distribution may take variable number of samples
289 // during the runtime.
290 static const bool kVariableSamplesPerOutput = false;
291 typedef Array<Eigen::half, kResultElementCount> ResultType;
292 typedef Eigen::half ResultElementType;
293
294 PHILOX_DEVICE_INLINE
operator()295 ResultType operator()(Generator* gen) {
296 typename Generator::ResultType sample = (*gen)();
297 ResultType result;
298 for (int i = 0; i < kResultElementCount; i += 2) {
299 float f[2];
300 BoxMullerFloat(sample[i], sample[i + 1], &f[0], &f[1]);
301 result[i] = Eigen::half(f[0]);
302 result[i + 1] = Eigen::half(f[1]);
303 }
304 return result;
305 }
306 };
307
308 template <class Generator>
309 class NormalDistribution<Generator, float> {
310 public:
311 // The number of elements that will be returned.
312 static const int kResultElementCount = Generator::kResultElementCount;
313 // Cost of generation of a single element (in cycles).
314 static const int kElementCost = 70;
315 // Indicate that this distribution may take variable number of samples
316 // during the runtime.
317 static const bool kVariableSamplesPerOutput = false;
318 typedef Array<float, kResultElementCount> ResultType;
319 typedef float ResultElementType;
320
321 PHILOX_DEVICE_INLINE
operator()322 ResultType operator()(Generator* gen) {
323 typename Generator::ResultType sample = (*gen)();
324 ResultType result;
325 for (int i = 0; i < kResultElementCount; i += 2) {
326 BoxMullerFloat(sample[i], sample[i + 1], &result[i], &result[i + 1]);
327 }
328 return result;
329 }
330 };
331
332 template <class Generator>
333 class NormalDistribution<Generator, double> {
334 public:
335 // The number of elements that will be returned.
336 static const int kResultElementCount = Generator::kResultElementCount / 2;
337 // Cost of generation of a single element (in cycles).
338 static const int kElementCost = 70;
339 // Indicate that this distribution may take variable number of samples
340 // during the runtime.
341 static const bool kVariableSamplesPerOutput = false;
342 typedef Array<double, kResultElementCount> ResultType;
343 typedef double ResultElementType;
344
345 PHILOX_DEVICE_INLINE
operator()346 ResultType operator()(Generator* gen) {
347 typename Generator::ResultType sample = (*gen)();
348 ResultType result;
349 for (int i = 0; i < kResultElementCount; i += 2) {
350 const int i2 = 2 * i;
351 BoxMullerDouble(sample[i2], sample[i2 + 1], sample[i2 + 2],
352 sample[i2 + 3], &result[i], &result[i + 1]);
353 }
354 return result;
355 }
356 };
357
358 // A class that returns standard normal distribution between
359 // [-kTruncateValue, kTruncateValue].
360 // Arguments:
361 // Generator: a generator type that returns a number of uint32 upon each
362 // each invocation. It needs to define kResultElementCount for the
363 // sample count for each invocation, and ResultType for actual
364 // returned sample type.
365 // RealType: the data type of the real numbers that will be returned by the
366 // distribution. This could be either float or double for now.
367 // This class is meant to be implemented through specialization. The default
368 // is not defined by design.
369 template <class SingleSampleGenerator, typename RealType>
370 class TruncatedNormalDistribution;
371
372 // Exactly like the float version, except that we convert to half afterwards;
373 // since we don't have half-precision sin/cos even on GPUs, there's nothing to
374 // gain from working in half internally.
375 template <class SingleSampleGenerator>
376 class TruncatedNormalDistribution<SingleSampleGenerator, Eigen::half> {
377 public:
378 // The number of elements that will be returned.
379 static const int kResultElementCount =
380 SingleSampleGenerator::kNativeElementCount;
381 // Cost of generation of a single element (in cycles).
382 static const int kElementCost = 90;
383 // Indicate that this distribution may take variable number of samples
384 // during the runtime.
385 static const bool kVariableSamplesPerOutput = true;
386 // The threshold where the normal distribution is truncated.
387 const float kTruncateValue = 2.0f;
388
389 typedef Array<Eigen::half, kResultElementCount> ResultType;
390 typedef Eigen::half ResultElementType;
391
392 PHILOX_DEVICE_INLINE
operator()393 ResultType operator()(SingleSampleGenerator* gen) {
394 ResultType results;
395 int index = 0;
396 while (true) {
397 // Repeatedly take samples from the normal distribution, until we have
398 // the desired number of elements that fall within the pre-defined cutoff
399 // threshold.
400 const uint32 x0 = (*gen)();
401 const uint32 x1 = (*gen)();
402 float f[2];
403 BoxMullerFloat(x0, x1, &f[0], &f[1]);
404
405 for (int i = 0; i < 2; ++i) {
406 if (Eigen::numext::abs(f[i]) < kTruncateValue) {
407 results[index++] = Eigen::half(f[i]);
408 if (index >= kResultElementCount) {
409 return results;
410 }
411 }
412 }
413 }
414 }
415 };
416
417 // Partial specialization for float.
418 template <class SingleSampleGenerator>
419 class TruncatedNormalDistribution<SingleSampleGenerator, float> {
420 public:
421 // The number of elements that will be returned.
422 static const int kResultElementCount =
423 SingleSampleGenerator::kNativeElementCount;
424 // Cost of generation of a single element (in cycles).
425 static const int kElementCost = 90;
426 // Indicate that this distribution may take variable number of samples
427 // during the runtime.
428 static const bool kVariableSamplesPerOutput = true;
429 // The threshold where the normal distribution is truncated.
430 const float kTruncateValue = 2.0f;
431
432 typedef Array<float, kResultElementCount> ResultType;
433 typedef float ResultElementType;
434
435 PHILOX_DEVICE_INLINE
operator()436 ResultType operator()(SingleSampleGenerator* gen) {
437 ResultType results;
438 int index = 0;
439 while (true) {
440 // Repeatedly take samples from the normal distribution, until we have
441 // the desired number of elements that fall within the pre-defined cutoff
442 // threshold.
443 const uint32 x0 = (*gen)();
444 const uint32 x1 = (*gen)();
445 float f[2];
446 BoxMullerFloat(x0, x1, &f[0], &f[1]);
447
448 for (int i = 0; i < 2; ++i) {
449 if (Eigen::numext::abs(f[i]) < kTruncateValue) {
450 results[index++] = f[i];
451 if (index >= kResultElementCount) {
452 return results;
453 }
454 }
455 }
456 }
457 }
458 };
459
460 // Partial specialization for double.
461 template <class SingleSampleGenerator>
462 class TruncatedNormalDistribution<SingleSampleGenerator, double> {
463 public:
464 // The number of elements that will be returned.
465 static const int kResultElementCount =
466 (SingleSampleGenerator::kNativeElementCount > 1)
467 ? SingleSampleGenerator::kNativeElementCount / 2
468 : 1;
469 // Cost of generation of a single element (in cycles).
470 static const int kElementCost = 90;
471 // Indicate that this distribution may take variable number of samples
472 // during the runtime.
473 static const bool kVariableSamplesPerOutput = true;
474 typedef Array<double, kResultElementCount> ResultType;
475 typedef double ResultElementType;
476 const double kTruncateValue = 2.0;
477
478 PHILOX_DEVICE_INLINE
operator()479 ResultType operator()(SingleSampleGenerator* gen) {
480 ResultType results;
481 int index = 0;
482 while (1) {
483 const uint32 x0 = (*gen)();
484 const uint32 x1 = (*gen)();
485 const uint32 x2 = (*gen)();
486 const uint32 x3 = (*gen)();
487 double d[2];
488 BoxMullerDouble(x0, x1, x2, x3, &d[0], &d[1]);
489
490 for (int i = 0; i < 2; ++i) {
491 if (Eigen::numext::abs(d[i]) < kTruncateValue) {
492 results[index++] = d[i];
493 if (index >= kResultElementCount) {
494 return results;
495 }
496 }
497 }
498 }
499 }
500 };
501
502 // Helper function to convert two 32-bit uniform integers to two floats
503 // under the unit normal distribution.
504 PHILOX_DEVICE_INLINE
BoxMullerFloat(uint32 x0,uint32 x1,float * f0,float * f1)505 void BoxMullerFloat(uint32 x0, uint32 x1, float* f0, float* f1) {
506 // This function implements the Box-Muller transform:
507 // http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform#Basic_form
508 // Do not send a really small number to log().
509 // We cannot mark "epsilon" as "static const" because NVCC would complain
510 const float epsilon = 1.0e-7f;
511 float u1 = Uint32ToFloat(x0);
512 if (u1 < epsilon) {
513 u1 = epsilon;
514 }
515 const float v1 = 2.0f * M_PI * Uint32ToFloat(x1);
516 const float u2 = Eigen::numext::sqrt(-2.0f * Eigen::numext::log(u1));
517 #if defined(TENSORFLOW_USE_SYCL) || !defined(__linux__)
518 *f0 = Eigen::numext::sin(v1);
519 *f1 = Eigen::numext::cos(v1);
520 #else
521 sincosf(v1, f0, f1);
522 #endif
523 *f0 *= u2;
524 *f1 *= u2;
525 }
526
527 // Helper function to convert four 32-bit uniform integers to two doubles
528 // under the unit normal distribution.
529 PHILOX_DEVICE_INLINE
BoxMullerDouble(uint32 x0,uint32 x1,uint32 x2,uint32 x3,double * d0,double * d1)530 void BoxMullerDouble(uint32 x0, uint32 x1, uint32 x2, uint32 x3, double* d0,
531 double* d1) {
532 // This function implements the Box-Muller transform:
533 // http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform#Basic_form
534 // Do not send a really small number to log().
535 // We cannot mark "epsilon" as "static const" because NVCC would complain
536 const double epsilon = 1.0e-7;
537 double u1 = Uint64ToDouble(x0, x1);
538 if (u1 < epsilon) {
539 u1 = epsilon;
540 }
541 const double v1 = 2 * M_PI * Uint64ToDouble(x2, x3);
542 const double u2 = Eigen::numext::sqrt(-2.0 * Eigen::numext::log(u1));
543 #if defined(TENSORFLOW_USE_SYCL) || !defined(__linux__)
544 *d0 = Eigen::numext::sin(v1);
545 *d1 = Eigen::numext::cos(v1);
546 #else
547 sincos(v1, d0, d1);
548 #endif
549 *d0 *= u2;
550 *d1 *= u2;
551 }
552
553 // Helper function to convert an 16-bit integer to a half between [0..1).
Uint16ToHalf(uint16 x)554 PHILOX_DEVICE_INLINE Eigen::half Uint16ToHalf(uint16 x) {
555 // IEEE754 halfs are formatted as follows (MSB first):
556 // sign(1) exponent(5) mantissa(10)
557 // Conceptually construct the following:
558 // sign == 0
559 // exponent == 15 -- an excess 15 representation of a zero exponent
560 // mantissa == 10 random bits
561 const uint16 man = x & 0x3ffu; // 10 bit mantissa
562 const uint16 exp = static_cast<uint16>(15);
563 const uint16 val = (exp << 10) | man;
564
565 Eigen::half result;
566 result.x = val;
567 return result - Eigen::half(1.0);
568 }
569
570 // Helper function to convert an 32-bit integer to a float between [0..1).
Uint32ToFloat(uint32 x)571 PHILOX_DEVICE_INLINE float Uint32ToFloat(uint32 x) {
572 // IEEE754 floats are formatted as follows (MSB first):
573 // sign(1) exponent(8) mantissa(23)
574 // Conceptually construct the following:
575 // sign == 0
576 // exponent == 127 -- an excess 127 representation of a zero exponent
577 // mantissa == 23 random bits
578 const uint32 man = x & 0x7fffffu; // 23 bit mantissa
579 const uint32 exp = static_cast<uint32>(127);
580 const uint32 val = (exp << 23) | man;
581
582 // Assumes that endian-ness is same for float and uint32.
583 float result;
584 memcpy(&result, &val, sizeof(val));
585 return result - 1.0f;
586 }
587
588 // Helper function to convert two 32-bit integers to a double between [0..1).
Uint64ToDouble(uint32 x0,uint32 x1)589 PHILOX_DEVICE_INLINE double Uint64ToDouble(uint32 x0, uint32 x1) {
590 // IEEE754 doubles are formatted as follows (MSB first):
591 // sign(1) exponent(11) mantissa(52)
592 // Conceptually construct the following:
593 // sign == 0
594 // exponent == 1023 -- an excess 1023 representation of a zero exponent
595 // mantissa == 52 random bits
596 const uint32 mhi = x0 & 0xfffffu; // upper 20 bits of mantissa
597 const uint32 mlo = x1; // lower 32 bits of mantissa
598 const uint64 man = (static_cast<uint64>(mhi) << 32) | mlo; // mantissa
599 const uint64 exp = static_cast<uint64>(1023);
600 const uint64 val = (exp << 52) | man;
601 // Assumes that endian-ness is same for double and uint64.
602 double result;
603 memcpy(&result, &val, sizeof(val));
604 return result - 1.0;
605 }
606
607 } // namespace random
608 } // namespace tensorflow
609
610 #endif // TENSORFLOW_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_
611