1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #ifndef TENSORFLOW_CORE_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_
17 #define TENSORFLOW_CORE_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_
18
19 #define _USE_MATH_DEFINES
20 #include <math.h>
21
22 #include <cmath>
23 #undef _USE_MATH_DEFINES
24
25 #include <string.h>
26 #include <tensorflow/core/lib/bfloat16/bfloat16.h>
27
28 #include <algorithm>
29 #include <type_traits>
30 #include <unsupported/Eigen/CXX11/Tensor>
31
32 #include "philox_random.h"
33
34 namespace tensorflow {
35 namespace random {
36
37 // Helper function to convert a 16-bit integer to a half between [0..1).
38 PHILOX_DEVICE_INLINE Eigen::half Uint16ToHalf(uint16 x);
39 // Helper function to convert a 16-bit integer to a bfloat16 between [0..1).
40 PHILOX_DEVICE_INLINE bfloat16 Uint16ToGfloat16(uint16 x);
41 // Helper function to convert a 32-bit integer to a float between [0..1).
42 PHILOX_DEVICE_INLINE float Uint32ToFloat(uint32 x);
43 // Helper function to convert two 32-bit integers to a double between [0..1).
44 PHILOX_DEVICE_INLINE double Uint64ToDouble(uint32 x0, uint32 x1);
45
46 // Computes a + b. Requires that the result is representable in the destination
47 // type and that b is not maximal (i.e. b + 1 is not 0). Notably, the addend b
48 // need *not* be representable in that type. (The condition on b excludes the
49 // extremal case INT_MIN + UINT_MAX = INT_MAX, which this function cannot
50 // compute.)
51 template <typename Int>
SignedAdd(Int a,typename std::make_unsigned<Int>::type b)52 PHILOX_DEVICE_INLINE Int SignedAdd(Int a, typename std::make_unsigned<Int>::type b) {
53 // Implementation note: both b_div_2 and b - b_div_2 are positive and
54 // representatble as Int.
55 auto b_div_2 = b >> 1;
56 return a + static_cast<Int>(b_div_2) + static_cast<Int>(b - b_div_2);
57 }
58
59 // A class that generates uniform distribution random numbers from the
60 // underlying random integer generator.
61 // Arguments:
62 // Generator: a generator type that returns a number of uint32 upon each
63 // invocation. It needs to define kResultElementCount for the
64 // sample count for each invocation, and ResultType for the
65 // actual returned sample type.
66 // RealType: the data type of the real numbers that will be returned by the
67 // distribution. This could be either float or double for now.
68 // This class is meant to be implemented through specialization. The default
69 // is not defined by design.
70 template <class Generator, typename RealType>
71 class UniformDistribution;
72
73 template <class Generator>
74 class UniformDistribution<Generator, Eigen::half> {
75 public:
76 // The number of elements that will be returned.
77 static const int kResultElementCount = Generator::kResultElementCount;
78 // Cost of generation of a single element (in cycles).
79 static const int kElementCost = 3;
80 // Indicate that this distribution may take variable number of samples
81 // during the runtime.
82 static const bool kVariableSamplesPerOutput = false;
83 typedef Array<Eigen::half, kResultElementCount> ResultType;
84 typedef Eigen::half ResultElementType;
85
86 PHILOX_DEVICE_INLINE
operator()87 ResultType operator()(Generator* gen) {
88 typename Generator::ResultType sample = (*gen)();
89 ResultType result;
90 for (int i = 0; i < kResultElementCount; ++i) {
91 result[i] = Uint16ToHalf(sample[i]); // Truncate the upper 16 bits.
92 }
93 return result;
94 }
95 };
96
97 template <class Generator>
98 class UniformDistribution<Generator, bfloat16> {
99 public:
100 // The number of elements that will be returned.
101 static const int kResultElementCount = Generator::kResultElementCount;
102 // Cost of generation of a single element (in cycles).
103 static const int kElementCost = 3;
104 // Indicate that this distribution may take variable number of samples
105 // during the runtime.
106 static const bool kVariableSamplesPerOutput = false;
107 typedef Array<bfloat16, kResultElementCount> ResultType;
108 typedef bfloat16 ResultElementType;
109
110 PHILOX_DEVICE_INLINE
operator()111 ResultType operator()(Generator* gen) {
112 typename Generator::ResultType sample = (*gen)();
113 ResultType result;
114 for (int i = 0; i < kResultElementCount; ++i) {
115 result[i] = Uint16ToGfloat16(sample[i]);
116 }
117 return result;
118 }
119 };
120
121 template <class Generator>
122 class UniformDistribution<Generator, float> {
123 public:
124 // The number of elements that will be returned.
125 static const int kResultElementCount = Generator::kResultElementCount;
126 // Cost of generation of a single element (in cycles).
127 static const int kElementCost = 3;
128 // Indicate that this distribution may take variable number of samples
129 // during the runtime.
130 static const bool kVariableSamplesPerOutput = false;
131 typedef Array<float, kResultElementCount> ResultType;
132 typedef float ResultElementType;
133
134 PHILOX_DEVICE_INLINE
operator()135 ResultType operator()(Generator* gen) {
136 typename Generator::ResultType sample = (*gen)();
137 ResultType result;
138 for (int i = 0; i < kResultElementCount; ++i) {
139 result[i] = Uint32ToFloat(sample[i]);
140 }
141 return result;
142 }
143 };
144
145 template <class Generator>
146 class UniformDistribution<Generator, double> {
147 public:
148 // The number of elements that will be returned.
149 static const int kResultElementCount = Generator::kResultElementCount / 2;
150 // Cost of generation of a single element (in cycles).
151 static const int kElementCost = 3;
152 // Indicate that this distribution may take variable number of samples
153 // during the runtime.
154 static const bool kVariableSamplesPerOutput = false;
155 typedef Array<double, kResultElementCount> ResultType;
156 typedef double ResultElementType;
157
158 PHILOX_DEVICE_INLINE
operator()159 ResultType operator()(Generator* gen) {
160 typename Generator::ResultType sample = (*gen)();
161 ResultType result;
162 for (int i = 0; i < kResultElementCount; ++i) {
163 result[i] = Uint64ToDouble(sample[2 * i], sample[2 * i + 1]);
164 }
165 return result;
166 }
167 };
168
169 template <class Generator>
170 class UniformDistribution<Generator, int32> {
171 public:
172 // The number of elements that will be returned.
173 static const int kResultElementCount = Generator::kResultElementCount;
174 // Cost of generation of a single element (in cycles).
175 static const int kElementCost = 3;
176 // Indicate that this distribution may take variable number of samples
177 // during the runtime.
178 static const bool kVariableSamplesPerOutput = false;
179 typedef Array<int32, kResultElementCount> ResultType;
180 typedef int32 ResultElementType;
181
182 // Must have lo < hi
UniformDistribution(int32 lo,int32 hi)183 UniformDistribution(int32 lo, int32 hi)
184 : lo_(lo), range_(static_cast<uint32>(hi) - static_cast<uint32>(lo)) {}
185
186 PHILOX_DEVICE_INLINE
operator()187 ResultType operator()(Generator* gen) {
188 typename Generator::ResultType sample = (*gen)();
189 ResultType result;
190 for (int i = 0; i < kResultElementCount; ++i) {
191 result[i] = SignedAdd(lo_, sample[i] % range_);
192 }
193 return result;
194 }
195
196 private:
197 // Note that lo_ is intentionally signed while range_ is intentionally
198 // unsigned. This is because hi - lo can overflow signed integers if
199 // lo < 0 < hi, but always fits in unsigned.
200 int32 lo_;
201 uint32 range_;
202 };
203
204 template <class Generator>
205 class UniformDistribution<Generator, int64> {
206 public:
207 // The number of elements that will be returned.
208 static const int kResultElementCount = Generator::kResultElementCount / 2;
209 // Cost of generation of a single element (in cycles).
210 static const int kElementCost = 3;
211 // Indicate that this distribution may take variable number of samples
212 // during the runtime.
213 static const bool kVariableSamplesPerOutput = false;
214 typedef Array<int64, kResultElementCount> ResultType;
215 typedef int64 ResultElementType;
216
217 // Must have lo < hi
UniformDistribution(int64 lo,int64 hi)218 UniformDistribution(int64 lo, int64 hi)
219 : lo_(lo), range_(static_cast<uint64>(hi) - static_cast<uint64>(lo)) {}
220
221 PHILOX_DEVICE_INLINE
operator()222 ResultType operator()(Generator* gen) {
223 typename Generator::ResultType sample = (*gen)();
224 ResultType result;
225 for (int i = 0; i < kResultElementCount; ++i) {
226 auto bits = sample[2 * i] | static_cast<uint64>(sample[2 * i + 1]) << 32;
227 result[i] = SignedAdd(lo_, bits % range_);
228 }
229 return result;
230 }
231
232 private:
233 // Note that lo_ is intentionally signed while range_ is intentionally
234 // unsigned. This is because hi - lo can overflow signed integers if
235 // lo < 0 < hi, but always fits in unsigned.
236 int64 lo_;
237 uint64 range_;
238 };
239
240 // A class that adapts the underlying native multiple samples to return a single
241 // sample at a time.
242 template <class Generator>
243 class SingleSampleAdapter {
244 public:
245 // The number of elements that will be returned.
246 static const int kResultElementCount = 1;
247 // The number of elements that will be returned by the underlying generator.
248 static const int kNativeElementCount = Generator::kResultElementCount;
249 typedef typename Generator::ResultElementType ResultType;
250 typedef typename Generator::ResultElementType ResultElementType;
251
252 PHILOX_DEVICE_INLINE
SingleSampleAdapter(Generator * gen)253 explicit SingleSampleAdapter(Generator* gen)
254 : generator_(gen), used_result_index_(Generator::kResultElementCount) {}
255
256 PHILOX_DEVICE_INLINE
operator()257 ResultType operator()() {
258 if (used_result_index_ == Generator::kResultElementCount) {
259 unused_results_ = (*generator_)();
260 used_result_index_ = 0;
261 }
262
263 return unused_results_[used_result_index_++];
264 }
265
266 PHILOX_DEVICE_INLINE
Skip(uint64 num_skips)267 void Skip(uint64 num_skips) {
268 if (!num_skips) {
269 return;
270 }
271 int num_unused_results = kNativeElementCount - used_result_index_;
272 if (num_skips <= num_unused_results) {
273 used_result_index_ += num_skips;
274 return;
275 }
276 num_skips -= num_unused_results;
277 used_result_index_ = kNativeElementCount;
278 SkipFromGenerator(num_skips / kNativeElementCount);
279 num_skips = num_skips % kNativeElementCount;
280 if (num_skips) {
281 unused_results_ = (*generator_)();
282 used_result_index_ = num_skips;
283 }
284 }
285
286 private:
287 // This implementation iteratively skips over `num_skips` samples
288 // from `generator_`. There is an O(1) implementation for PhiloxRandom
289 // in random_distributions.cc.
290 PHILOX_DEVICE_INLINE
SkipFromGenerator(uint64 num_skips)291 void SkipFromGenerator(uint64 num_skips) {
292 while (num_skips--) {
293 (*generator_)();
294 }
295 }
296
297 Generator* generator_;
298 typename Generator::ResultType unused_results_;
299 int used_result_index_;
300 };
301
302 // A class that generates unit normal distribution random numbers from the
303 // underlying random integer generator.
304 // Arguments:
305 // Generator: a generator type that returns a number of uint32 upon each
306 // each invocation. It needs to define kResultElementCount for the
307 // sample count for each invocation, and ResultType for actual
308 // returned sample type.
309 // RealType: the data type of the real numbers that will be returned by the
310 // distribution. This could be either float or double for now.
311 // This class is meant to be implemented through specialization. The default
312 // is not defined by design.
313 template <class Generator, typename RealType>
314 class NormalDistribution;
315
316 PHILOX_DEVICE_INLINE
317 void BoxMullerFloat(uint32 x0, uint32 x1, float* f0, float* f1);
318
319 PHILOX_DEVICE_INLINE
320 void BoxMullerDouble(uint32 x0, uint32 x1, uint32 x2, uint32 x3, double* d0, double* d1);
321
322 // Exactly like the float version, except that we convert to half afterwards;
323 // since we don't have half-precision sin/cos even on GPUs, there's nothing to
324 // gain from working in half internally.
325 template <class Generator>
326 class NormalDistribution<Generator, Eigen::half> {
327 public:
328 // The number of elements that will be returned.
329 static const int kResultElementCount = Generator::kResultElementCount;
330 // Cost of generation of a single element (in cycles).
331 static const int kElementCost = 70;
332 // Indicate that this distribution may take variable number of samples
333 // during the runtime.
334 static const bool kVariableSamplesPerOutput = false;
335 typedef Array<Eigen::half, kResultElementCount> ResultType;
336 typedef Eigen::half ResultElementType;
337
338 PHILOX_DEVICE_INLINE
operator()339 ResultType operator()(Generator* gen) {
340 typename Generator::ResultType sample = (*gen)();
341 ResultType result;
342 for (int i = 0; i < kResultElementCount; i += 2) {
343 float f[2];
344 BoxMullerFloat(sample[i], sample[i + 1], &f[0], &f[1]);
345 result[i] = Eigen::half(f[0]);
346 result[i + 1] = Eigen::half(f[1]);
347 }
348 return result;
349 }
350 };
351
352 template <class Generator>
353 class NormalDistribution<Generator, bfloat16> {
354 public:
355 // The number of elements that will be returned.
356 static const int kResultElementCount = Generator::kResultElementCount;
357 // Cost of generation of a single element (in cycles).
358 static const int kElementCost = 70;
359 // Indicate that this distribution may take variable number of samples
360 // during the runtime.
361 static const bool kVariableSamplesPerOutput = false;
362 typedef Array<bfloat16, kResultElementCount> ResultType;
363 typedef bfloat16 ResultElementType;
364
365 PHILOX_DEVICE_INLINE
operator()366 ResultType operator()(Generator* gen) {
367 typename Generator::ResultType sample = (*gen)();
368 ResultType result;
369 static_assert(kResultElementCount % 2 == 0, "kResultElementCount should be an even number");
370 for (int i = 0; i < kResultElementCount; i += 2) {
371 float f[2];
372 // Box-Muller transform requires processing 2 elements at a time.
373 BoxMullerFloat(sample[i], sample[i + 1], &f[0], &f[1]);
374 result[i] = bfloat16(f[0]);
375 result[i + 1] = bfloat16(f[1]);
376 }
377 return result;
378 }
379 };
380
381 template <class Generator>
382 class NormalDistribution<Generator, float> {
383 public:
384 // The number of elements that will be returned.
385 static const int kResultElementCount = Generator::kResultElementCount;
386 // Cost of generation of a single element (in cycles).
387 static const int kElementCost = 70;
388 // Indicate that this distribution may take variable number of samples
389 // during the runtime.
390 static const bool kVariableSamplesPerOutput = false;
391 typedef Array<float, kResultElementCount> ResultType;
392 typedef float ResultElementType;
393
394 PHILOX_DEVICE_INLINE
operator()395 ResultType operator()(Generator* gen) {
396 typename Generator::ResultType sample = (*gen)();
397 ResultType result;
398 for (int i = 0; i < kResultElementCount; i += 2) {
399 BoxMullerFloat(sample[i], sample[i + 1], &result[i], &result[i + 1]);
400 }
401 return result;
402 }
403 };
404
405 template <class Generator>
406 class NormalDistribution<Generator, double> {
407 public:
408 // The number of elements that will be returned.
409 static const int kResultElementCount = Generator::kResultElementCount / 2;
410 // Cost of generation of a single element (in cycles).
411 static const int kElementCost = 70;
412 // Indicate that this distribution may take variable number of samples
413 // during the runtime.
414 static const bool kVariableSamplesPerOutput = false;
415 typedef Array<double, kResultElementCount> ResultType;
416 typedef double ResultElementType;
417
418 PHILOX_DEVICE_INLINE
operator()419 ResultType operator()(Generator* gen) {
420 typename Generator::ResultType sample = (*gen)();
421 ResultType result;
422 for (int i = 0; i < kResultElementCount; i += 2) {
423 const int i2 = 2 * i;
424 BoxMullerDouble(sample[i2], sample[i2 + 1], sample[i2 + 2], sample[i2 + 3], &result[i],
425 &result[i + 1]);
426 }
427 return result;
428 }
429 };
430
431 // A class that returns standard normal distribution between
432 // [-kTruncateValue, kTruncateValue].
433 // Arguments:
434 // Generator: a generator type that returns a number of uint32 upon each
435 // each invocation. It needs to define kResultElementCount for the
436 // sample count for each invocation, and ResultType for actual
437 // returned sample type.
438 // RealType: the data type of the real numbers that will be returned by the
439 // distribution. This could be either float or double for now.
440 // This class is meant to be implemented through specialization. The default
441 // is not defined by design.
442 template <class SingleSampleGenerator, typename RealType>
443 class TruncatedNormalDistribution;
444
445 // Exactly like the float version, except that we convert to half afterwards;
446 // since we don't have half-precision sin/cos even on GPUs, there's nothing to
447 // gain from working in half internally.
448 template <class SingleSampleGenerator>
449 class TruncatedNormalDistribution<SingleSampleGenerator, Eigen::half> {
450 public:
451 // The number of elements that will be returned.
452 static const int kResultElementCount = SingleSampleGenerator::kNativeElementCount;
453 // Cost of generation of a single element (in cycles).
454 static const int kElementCost = 90;
455 // Indicate that this distribution may take variable number of samples
456 // during the runtime.
457 static const bool kVariableSamplesPerOutput = true;
458 // The threshold where the normal distribution is truncated.
459 const float kTruncateValue = 2.0f;
460
461 typedef Array<Eigen::half, kResultElementCount> ResultType;
462 typedef Eigen::half ResultElementType;
463
464 PHILOX_DEVICE_INLINE
operator()465 ResultType operator()(SingleSampleGenerator* gen) {
466 ResultType results;
467 int index = 0;
468 while (true) {
469 // Repeatedly take samples from the normal distribution, until we have
470 // the desired number of elements that fall within the pre-defined cutoff
471 // threshold.
472 const uint32 x0 = (*gen)();
473 const uint32 x1 = (*gen)();
474 float f[2];
475 BoxMullerFloat(x0, x1, &f[0], &f[1]);
476
477 for (int i = 0; i < 2; ++i) {
478 if (Eigen::numext::abs(f[i]) < kTruncateValue) {
479 results[index++] = Eigen::half(f[i]);
480 if (index >= kResultElementCount) {
481 return results;
482 }
483 }
484 }
485 }
486 }
487 };
488
489 template <class SingleSampleGenerator>
490 class TruncatedNormalDistribution<SingleSampleGenerator, bfloat16> {
491 public:
492 // The number of elements that will be returned.
493 static const int kResultElementCount = SingleSampleGenerator::kNativeElementCount;
494 // Cost of generation of a single element (in cycles).
495 static const int kElementCost = 90;
496 // Indicate that this distribution may take variable number of samples
497 // during the runtime.
498 static const bool kVariableSamplesPerOutput = true;
499 // The threshold where the normal distribution is truncated.
500 const float kTruncateValue = 2.0f;
501
502 typedef Array<bfloat16, kResultElementCount> ResultType;
503 typedef bfloat16 ResultElementType;
504
505 PHILOX_DEVICE_INLINE
operator()506 ResultType operator()(SingleSampleGenerator* gen) {
507 ResultType results;
508 int index = 0;
509 while (true) {
510 // Repeatedly take samples from the normal distribution, until we have
511 // the desired number of elements that fall within the pre-defined cutoff
512 // threshold.
513 const uint32 x0 = (*gen)();
514 const uint32 x1 = (*gen)();
515 float f[2];
516 BoxMullerFloat(x0, x1, &f[0], &f[1]);
517
518 for (int i = 0; i < 2; ++i) {
519 if (Eigen::numext::abs(f[i]) < kTruncateValue) {
520 results[index++] = bfloat16(f[i]);
521 if (index >= kResultElementCount) {
522 return results;
523 }
524 }
525 }
526 }
527 }
528 };
529
530 // Partial specialization for float.
531 template <class SingleSampleGenerator>
532 class TruncatedNormalDistribution<SingleSampleGenerator, float> {
533 public:
534 // The number of elements that will be returned.
535 static const int kResultElementCount = SingleSampleGenerator::kNativeElementCount;
536 // Cost of generation of a single element (in cycles).
537 static const int kElementCost = 90;
538 // Indicate that this distribution may take variable number of samples
539 // during the runtime.
540 static const bool kVariableSamplesPerOutput = true;
541 // The threshold where the normal distribution is truncated.
542 const float kTruncateValue = 2.0f;
543
544 typedef Array<float, kResultElementCount> ResultType;
545 typedef float ResultElementType;
546
547 PHILOX_DEVICE_INLINE
operator()548 ResultType operator()(SingleSampleGenerator* gen) {
549 ResultType results;
550 int index = 0;
551 while (true) {
552 // Repeatedly take samples from the normal distribution, until we have
553 // the desired number of elements that fall within the pre-defined cutoff
554 // threshold.
555 const uint32 x0 = (*gen)();
556 const uint32 x1 = (*gen)();
557 float f[2];
558 BoxMullerFloat(x0, x1, &f[0], &f[1]);
559
560 for (int i = 0; i < 2; ++i) {
561 if (Eigen::numext::abs(f[i]) < kTruncateValue) {
562 results[index++] = f[i];
563 if (index >= kResultElementCount) {
564 return results;
565 }
566 }
567 }
568 }
569 }
570 };
571
572 // Partial specialization for double.
573 template <class SingleSampleGenerator>
574 class TruncatedNormalDistribution<SingleSampleGenerator, double> {
575 public:
576 // The number of elements that will be returned.
577 static const int kResultElementCount = (SingleSampleGenerator::kNativeElementCount > 1)
578 ? SingleSampleGenerator::kNativeElementCount / 2
579 : 1;
580 // Cost of generation of a single element (in cycles).
581 static const int kElementCost = 90;
582 // Indicate that this distribution may take variable number of samples
583 // during the runtime.
584 static const bool kVariableSamplesPerOutput = true;
585 typedef Array<double, kResultElementCount> ResultType;
586 typedef double ResultElementType;
587 const double kTruncateValue = 2.0;
588
589 PHILOX_DEVICE_INLINE
operator()590 ResultType operator()(SingleSampleGenerator* gen) {
591 ResultType results;
592 int index = 0;
593 while (1) {
594 const uint32 x0 = (*gen)();
595 const uint32 x1 = (*gen)();
596 const uint32 x2 = (*gen)();
597 const uint32 x3 = (*gen)();
598 double d[2];
599 BoxMullerDouble(x0, x1, x2, x3, &d[0], &d[1]);
600
601 for (int i = 0; i < 2; ++i) {
602 if (Eigen::numext::abs(d[i]) < kTruncateValue) {
603 results[index++] = d[i];
604 if (index >= kResultElementCount) {
605 return results;
606 }
607 }
608 }
609 }
610 }
611 };
612
613 // Helper function to convert two 32-bit uniform integers to two floats
614 // under the unit normal distribution.
615 PHILOX_DEVICE_INLINE
BoxMullerFloat(uint32 x0,uint32 x1,float * f0,float * f1)616 void BoxMullerFloat(uint32 x0, uint32 x1, float* f0, float* f1) {
617 // This function implements the Box-Muller transform:
618 // http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform#Basic_form
619 // Do not send a really small number to log().
620 // We cannot mark "epsilon" as "static const" because NVCC would complain
621 const float epsilon = 1.0e-7f;
622 float u1 = Uint32ToFloat(x0);
623 if (u1 < epsilon) {
624 u1 = epsilon;
625 }
626 const float v1 = 2.0f * M_PI * Uint32ToFloat(x1);
627 const float u2 = Eigen::numext::sqrt(-2.0f * Eigen::numext::log(u1));
628 #if defined(TENSORFLOW_USE_SYCL) || !defined(__linux__)
629 *f0 = Eigen::numext::sin(v1);
630 *f1 = Eigen::numext::cos(v1);
631 #else
632 sincosf(v1, f0, f1);
633 #endif
634 *f0 *= u2;
635 *f1 *= u2;
636 }
637
638 // Helper function to convert four 32-bit uniform integers to two doubles
639 // under the unit normal distribution.
640 PHILOX_DEVICE_INLINE
BoxMullerDouble(uint32 x0,uint32 x1,uint32 x2,uint32 x3,double * d0,double * d1)641 void BoxMullerDouble(uint32 x0, uint32 x1, uint32 x2, uint32 x3, double* d0, double* d1) {
642 // This function implements the Box-Muller transform:
643 // http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform#Basic_form
644 // Do not send a really small number to log().
645 // We cannot mark "epsilon" as "static const" because NVCC would complain
646 const double epsilon = 1.0e-7;
647 double u1 = Uint64ToDouble(x0, x1);
648 if (u1 < epsilon) {
649 u1 = epsilon;
650 }
651 const double v1 = 2 * M_PI * Uint64ToDouble(x2, x3);
652 const double u2 = Eigen::numext::sqrt(-2.0 * Eigen::numext::log(u1));
653 #if defined(TENSORFLOW_USE_SYCL) || !defined(__linux__)
654 *d0 = Eigen::numext::sin(v1);
655 *d1 = Eigen::numext::cos(v1);
656 #else
657 sincos(v1, d0, d1);
658 #endif
659 *d0 *= u2;
660 *d1 *= u2;
661 }
662
663 // Helper function to convert an 16-bit integer to a half between [0..1).
Uint16ToHalf(uint16 x)664 PHILOX_DEVICE_INLINE Eigen::half Uint16ToHalf(uint16 x) {
665 // IEEE754 halfs are formatted as follows (MSB first):
666 // sign(1) exponent(5) mantissa(10)
667 // Conceptually construct the following:
668 // sign == 0
669 // exponent == 15 -- an excess 15 representation of a zero exponent
670 // mantissa == 10 random bits
671 const uint16 man = x & 0x3ffu; // 10 bit mantissa
672 const uint16 exp = static_cast<uint16>(15);
673 const uint16 val = (exp << 10) | man;
674
675 Eigen::half result;
676 result.x = val;
677 return result - Eigen::half(1.0);
678 }
679
680 // Helper function to convert an 16-bit integer to a bfloat16 between [0..1).
681 // This can create a uniform distribution of values between [0..1).
Uint16ToGfloat16(uint16 x)682 PHILOX_DEVICE_INLINE bfloat16 Uint16ToGfloat16(uint16 x) {
683 // bfloat are formatted as follows (MSB first):
684 // sign(1) exponent(8) mantissa(7)
685 // Conceptually construct the following:
686 // sign == 0
687 // exponent == 127 -- an excess 127 representation of a zero exponent
688 // mantissa == 7 random bits
689 const uint16 man = x & 0x7fu; // 7 bit mantissa
690 const uint16 exp = static_cast<uint16>(127);
691 const uint16 val = (exp << 7) | man;
692
693 bfloat16 result;
694 memcpy(&result, &val, sizeof(val));
695 // The mantissa has an implicit leading 1, so the above code creates a value
696 // in [1, 2). The minus will not cause a rounding that makes the result 1.
697 // Instead it will just be close to 1.
698 return result - bfloat16(1.0);
699 }
700
701 // Helper function to convert an 32-bit integer to a float between [0..1).
Uint32ToFloat(uint32 x)702 PHILOX_DEVICE_INLINE float Uint32ToFloat(uint32 x) {
703 // IEEE754 floats are formatted as follows (MSB first):
704 // sign(1) exponent(8) mantissa(23)
705 // Conceptually construct the following:
706 // sign == 0
707 // exponent == 127 -- an excess 127 representation of a zero exponent
708 // mantissa == 23 random bits
709 const uint32 man = x & 0x7fffffu; // 23 bit mantissa
710 const uint32 exp = static_cast<uint32>(127);
711 const uint32 val = (exp << 23) | man;
712
713 // Assumes that endian-ness is same for float and uint32.
714 float result;
715 memcpy(&result, &val, sizeof(val));
716 return result - 1.0f;
717 }
718
719 // Helper function to convert two 32-bit integers to a double between [0..1).
Uint64ToDouble(uint32 x0,uint32 x1)720 PHILOX_DEVICE_INLINE double Uint64ToDouble(uint32 x0, uint32 x1) {
721 // IEEE754 doubles are formatted as follows (MSB first):
722 // sign(1) exponent(11) mantissa(52)
723 // Conceptually construct the following:
724 // sign == 0
725 // exponent == 1023 -- an excess 1023 representation of a zero exponent
726 // mantissa == 52 random bits
727 const uint32 mhi = x0 & 0xfffffu; // upper 20 bits of mantissa
728 const uint32 mlo = x1; // lower 32 bits of mantissa
729 const uint64 man = (static_cast<uint64>(mhi) << 32) | mlo; // mantissa
730 const uint64 exp = static_cast<uint64>(1023);
731 const uint64 val = (exp << 52) | man;
732 // Assumes that endian-ness is same for double and uint64.
733 double result;
734 memcpy(&result, &val, sizeof(val));
735 return result - 1.0;
736 }
737
738 } // namespace random
739 } // namespace tensorflow
740
741 #endif // TENSORFLOW_CORE_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_
742