1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #ifndef TENSORFLOW_CORE_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_
17 #define TENSORFLOW_CORE_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_
18
19 #include <string.h>
20
21 #include <algorithm>
22 #include <cmath>
23 #include <type_traits>
24
25 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
26 #include "tensorflow/core/lib/random/philox_random.h"
27 #include "tensorflow/core/platform/types.h"
28
29 namespace tensorflow {
30 namespace random {
31
32 // Helper function to convert a 16-bit integer to a half between [0..1).
33 PHILOX_DEVICE_INLINE Eigen::half Uint16ToHalf(uint16 x);
34 // Helper function to convert a 16-bit integer to a bfloat16 between [0..1).
35 PHILOX_DEVICE_INLINE bfloat16 Uint16ToGfloat16(uint16 x);
36 // Helper function to convert a 32-bit integer to a float between [0..1).
37 PHILOX_DEVICE_INLINE float Uint32ToFloat(uint32 x);
38 // Helper function to convert two 32-bit integers to a double between [0..1).
39 PHILOX_DEVICE_INLINE double Uint64ToDouble(uint32 x0, uint32 x1);
40
41 // Computes a + b. Requires that the result is representable in the destination
42 // type and that b is not maximal (i.e. b + 1 is not 0). Notably, the addend b
43 // need *not* be representable in that type. (The condition on b excludes the
44 // extremal case INT_MIN + UINT_MAX = INT_MAX, which this function cannot
45 // compute.)
46 template <typename Int>
SignedAdd(Int a,typename std::make_unsigned<Int>::type b)47 PHILOX_DEVICE_INLINE Int SignedAdd(Int a,
48 typename std::make_unsigned<Int>::type b) {
49 // Implementation note: both b_div_2 and b - b_div_2 are positive and
50 // representable as Int.
51 auto b_div_2 = b >> 1;
52 return a + static_cast<Int>(b_div_2) + static_cast<Int>(b - b_div_2);
53 }
54
55 // A class that generates uniform distribution random numbers from the
56 // underlying random integer generator.
57 // Arguments:
58 // Generator: a generator type that returns a number of uint32 upon each
59 // invocation. It needs to define kResultElementCount for the
60 // sample count for each invocation, and ResultType for the
61 // actual returned sample type.
62 // RealType: the data type of the real numbers that will be returned by the
63 // distribution. This could be either float or double for now.
64 // This class is meant to be implemented through specialization. The default
65 // is not defined by design.
66 template <class Generator, typename RealType>
67 class UniformDistribution;
68
69 template <class Generator>
70 class UniformDistribution<Generator, Eigen::half> {
71 public:
72 // The number of elements that will be returned.
73 static constexpr int kResultElementCount = Generator::kResultElementCount;
74 // Cost of generation of a single element (in cycles).
75 static constexpr int kElementCost = 3;
76 // Indicate that this distribution may take variable number of samples
77 // during the runtime.
78 static constexpr bool kVariableSamplesPerOutput = false;
79 typedef Array<Eigen::half, kResultElementCount> ResultType;
80 typedef Eigen::half ResultElementType;
81
82 PHILOX_DEVICE_INLINE
operator()83 ResultType operator()(Generator* gen) {
84 typename Generator::ResultType sample = (*gen)();
85 ResultType result;
86 for (int i = 0; i < kResultElementCount; ++i) {
87 result[i] = Uint16ToHalf(sample[i]); // Truncate the upper 16 bits.
88 }
89 return result;
90 }
91 };
92
93 template <class Generator>
94 class UniformDistribution<Generator, bfloat16> {
95 public:
96 // The number of elements that will be returned.
97 static constexpr int kResultElementCount = Generator::kResultElementCount;
98 // Cost of generation of a single element (in cycles).
99 static constexpr int kElementCost = 3;
100 // Indicate that this distribution may take variable number of samples
101 // during the runtime.
102 static constexpr bool kVariableSamplesPerOutput = false;
103 typedef Array<bfloat16, kResultElementCount> ResultType;
104 typedef bfloat16 ResultElementType;
105
106 PHILOX_DEVICE_INLINE
operator()107 ResultType operator()(Generator* gen) {
108 typename Generator::ResultType sample = (*gen)();
109 ResultType result;
110 for (int i = 0; i < kResultElementCount; ++i) {
111 result[i] = Uint16ToGfloat16(sample[i]);
112 }
113 return result;
114 }
115 };
116
117 template <class Generator>
118 class UniformDistribution<Generator, float> {
119 public:
120 // The number of elements that will be returned.
121 static constexpr int kResultElementCount = Generator::kResultElementCount;
122 // Cost of generation of a single element (in cycles).
123 static constexpr int kElementCost = 3;
124 // Indicate that this distribution may take variable number of samples
125 // during the runtime.
126 static constexpr bool kVariableSamplesPerOutput = false;
127 typedef Array<float, kResultElementCount> ResultType;
128 typedef float ResultElementType;
129
130 PHILOX_DEVICE_INLINE
operator()131 ResultType operator()(Generator* gen) {
132 typename Generator::ResultType sample = (*gen)();
133 ResultType result;
134 for (int i = 0; i < kResultElementCount; ++i) {
135 result[i] = Uint32ToFloat(sample[i]);
136 }
137 return result;
138 }
139 };
140
141 template <class Generator>
142 class UniformDistribution<Generator, double> {
143 public:
144 // The number of elements that will be returned.
145 static constexpr int kResultElementCount = Generator::kResultElementCount / 2;
146 // Cost of generation of a single element (in cycles).
147 static constexpr int kElementCost = 3;
148 // Indicate that this distribution may take variable number of samples
149 // during the runtime.
150 static constexpr bool kVariableSamplesPerOutput = false;
151 typedef Array<double, kResultElementCount> ResultType;
152 typedef double ResultElementType;
153
154 PHILOX_DEVICE_INLINE
operator()155 ResultType operator()(Generator* gen) {
156 typename Generator::ResultType sample = (*gen)();
157 ResultType result;
158 for (int i = 0; i < kResultElementCount; ++i) {
159 result[i] = Uint64ToDouble(sample[2 * i], sample[2 * i + 1]);
160 }
161 return result;
162 }
163 };
164
165 template <class Generator>
166 class UniformDistribution<Generator, int32> {
167 public:
168 // The number of elements that will be returned.
169 static constexpr int kResultElementCount = Generator::kResultElementCount;
170 // Cost of generation of a single element (in cycles).
171 static constexpr int kElementCost = 3;
172 // Indicate that this distribution may take variable number of samples
173 // during the runtime.
174 static constexpr bool kVariableSamplesPerOutput = false;
175 typedef Array<int32, kResultElementCount> ResultType;
176 typedef int32 ResultElementType;
177
178 // Must have lo < hi
UniformDistribution(int32 lo,int32 hi)179 UniformDistribution(int32 lo, int32 hi)
180 : lo_(lo), range_(static_cast<uint32>(hi) - static_cast<uint32>(lo)) {}
181
182 PHILOX_DEVICE_INLINE
operator()183 ResultType operator()(Generator* gen) {
184 typename Generator::ResultType sample = (*gen)();
185 ResultType result;
186 for (int i = 0; i < kResultElementCount; ++i) {
187 result[i] = SignedAdd(lo_, sample[i] % range_);
188 }
189 return result;
190 }
191
192 private:
193 // Note that lo_ is intentionally signed while range_ is intentionally
194 // unsigned. This is because hi - lo can overflow signed integers if
195 // lo < 0 < hi, but always fits in unsigned.
196 int32 lo_;
197 uint32 range_;
198 };
199
200 template <class Generator>
201 class UniformDistribution<Generator, int64> {
202 public:
203 // The number of elements that will be returned.
204 static constexpr int kResultElementCount = Generator::kResultElementCount / 2;
205 // Cost of generation of a single element (in cycles).
206 static constexpr int kElementCost = 3;
207 // Indicate that this distribution may take variable number of samples
208 // during the runtime.
209 static constexpr bool kVariableSamplesPerOutput = false;
210 typedef Array<int64, kResultElementCount> ResultType;
211 typedef int64 ResultElementType;
212
213 // Must have lo < hi
UniformDistribution(int64 lo,int64 hi)214 UniformDistribution(int64 lo, int64 hi)
215 : lo_(lo), range_(static_cast<uint64>(hi) - static_cast<uint64>(lo)) {}
216
217 PHILOX_DEVICE_INLINE
operator()218 ResultType operator()(Generator* gen) {
219 typename Generator::ResultType sample = (*gen)();
220 ResultType result;
221 for (int i = 0; i < kResultElementCount; ++i) {
222 auto bits = sample[2 * i] | static_cast<uint64>(sample[2 * i + 1]) << 32;
223 result[i] = SignedAdd(lo_, bits % range_);
224 }
225 return result;
226 }
227
228 private:
229 // Note that lo_ is intentionally signed while range_ is intentionally
230 // unsigned. This is because hi - lo can overflow signed integers if
231 // lo < 0 < hi, but always fits in unsigned.
232 int64 lo_;
233 uint64 range_;
234 };
235
236 // Similar to `UniformDistribution`, except that instead of generating numbers
237 // in the range [low, high), it generates numbers covering the whole range of
238 // the integer type.
239 template <typename Generator, typename IntType>
240 class UniformFullIntDistribution;
241
242 template <typename Generator, typename IntType>
243 class UniformFullIntDistribution32 {
244 public:
245 // The number of elements that will be returned.
246 static constexpr int kResultElementCount = Generator::kResultElementCount;
247 // Cost of generation of a single element (in cycles).
248 static constexpr int kElementCost = 3;
249 // Indicate that this distribution may take variable number of samples
250 // during the runtime.
251 static constexpr bool kVariableSamplesPerOutput = false;
252 typedef Array<IntType, kResultElementCount> ResultType;
253 typedef IntType ResultElementType;
254
255 PHILOX_DEVICE_INLINE
operator()256 ResultType operator()(Generator* gen) {
257 typename Generator::ResultType sample = (*gen)();
258 ResultType result;
259 for (int i = 0; i < kResultElementCount; ++i) {
260 result[i] = sample[i];
261 }
262 return result;
263 }
264 };
265
266 template <typename Generator, typename IntType>
267 class UniformFullIntDistribution64 {
268 public:
269 // The number of elements that will be returned.
270 static constexpr int kResultElementCount = Generator::kResultElementCount / 2;
271 // Cost of generation of a single element (in cycles).
272 static constexpr int kElementCost = 3;
273 // Indicate that this distribution may take variable number of samples
274 // during the runtime.
275 static constexpr bool kVariableSamplesPerOutput = false;
276 typedef Array<IntType, kResultElementCount> ResultType;
277 typedef IntType ResultElementType;
278
279 PHILOX_DEVICE_INLINE
operator()280 ResultType operator()(Generator* gen) {
281 typename Generator::ResultType sample = (*gen)();
282 ResultType result;
283 for (int i = 0; i < kResultElementCount; ++i) {
284 result[i] = sample[2 * i] | static_cast<uint64>(sample[2 * i + 1]) << 32;
285 }
286 return result;
287 }
288 };
289
290 template <typename Generator>
291 class UniformFullIntDistribution<Generator, int32>
292 : public UniformFullIntDistribution32<Generator, int32> {};
293 template <typename Generator>
294 class UniformFullIntDistribution<Generator, uint32>
295 : public UniformFullIntDistribution32<Generator, uint32> {};
296 template <typename Generator>
297 class UniformFullIntDistribution<Generator, int64>
298 : public UniformFullIntDistribution64<Generator, int64> {};
299 template <typename Generator>
300 class UniformFullIntDistribution<Generator, uint64>
301 : public UniformFullIntDistribution64<Generator, uint64> {};
302
303 // A class that adapts the underlying native multiple samples to return a single
304 // sample at a time.
305 template <class Generator>
306 class SingleSampleAdapter {
307 public:
308 // The number of elements that will be returned.
309 static constexpr int kResultElementCount = 1;
310 // The number of elements that will be returned by the underlying generator.
311 static constexpr int kNativeElementCount = Generator::kResultElementCount;
312 typedef typename Generator::ResultElementType ResultType;
313 typedef typename Generator::ResultElementType ResultElementType;
314
315 PHILOX_DEVICE_INLINE
SingleSampleAdapter(Generator * gen)316 explicit SingleSampleAdapter(Generator* gen)
317 : generator_(gen), used_result_index_(Generator::kResultElementCount) {}
318
319 PHILOX_DEVICE_INLINE
operator()320 ResultType operator()() {
321 if (used_result_index_ == Generator::kResultElementCount) {
322 unused_results_ = (*generator_)();
323 used_result_index_ = 0;
324 }
325
326 return unused_results_[used_result_index_++];
327 }
328
329 PHILOX_DEVICE_INLINE
Skip(uint64 num_skips)330 void Skip(uint64 num_skips) {
331 if (!num_skips) {
332 return;
333 }
334 int num_unused_results = kNativeElementCount - used_result_index_;
335 if (num_skips <= num_unused_results) {
336 used_result_index_ += num_skips;
337 return;
338 }
339 num_skips -= num_unused_results;
340 used_result_index_ = kNativeElementCount;
341 SkipFromGenerator(num_skips / kNativeElementCount);
342 num_skips = num_skips % kNativeElementCount;
343 if (num_skips) {
344 unused_results_ = (*generator_)();
345 used_result_index_ = num_skips;
346 }
347 }
348
349 private:
350 // This implementation iteratively skips over `num_skips` samples
351 // from `generator_`. There is an O(1) implementation for PhiloxRandom
352 // in random_distributions.cc.
353 PHILOX_DEVICE_INLINE
SkipFromGenerator(uint64 num_skips)354 void SkipFromGenerator(uint64 num_skips) {
355 while (num_skips--) {
356 (*generator_)();
357 }
358 }
359
360 Generator* generator_;
361 typename Generator::ResultType unused_results_;
362 int used_result_index_;
363 };
364
365 // A class that generates unit normal distribution random numbers from the
366 // underlying random integer generator.
367 // Arguments:
368 // Generator: a generator type that returns a number of uint32 upon each
369 // each invocation. It needs to define kResultElementCount for the
370 // sample count for each invocation, and ResultType for actual
371 // returned sample type.
372 // RealType: the data type of the real numbers that will be returned by the
373 // distribution. This could be either float or double for now.
374 // This class is meant to be implemented through specialization. The default
375 // is not defined by design.
376 template <class Generator, typename RealType>
377 class NormalDistribution;
378
379 PHILOX_DEVICE_INLINE
380 void BoxMullerFloat(uint32 x0, uint32 x1, float* f0, float* f1);
381
382 PHILOX_DEVICE_INLINE
383 void BoxMullerDouble(uint32 x0, uint32 x1, uint32 x2, uint32 x3, double* d0,
384 double* d1);
385
386 // Exactly like the float version, except that we convert to half afterwards;
387 // since we don't have half-precision sin/cos even on GPUs, there's nothing to
388 // gain from working in half internally.
389 template <class Generator>
390 class NormalDistribution<Generator, Eigen::half> {
391 public:
392 // The number of elements that will be returned.
393 static constexpr int kResultElementCount = Generator::kResultElementCount;
394 // Cost of generation of a single element (in cycles).
395 static constexpr int kElementCost = 70;
396 // Indicate that this distribution may take variable number of samples
397 // during the runtime.
398 static constexpr bool kVariableSamplesPerOutput = false;
399 typedef Array<Eigen::half, kResultElementCount> ResultType;
400 typedef Eigen::half ResultElementType;
401
402 PHILOX_DEVICE_INLINE
operator()403 ResultType operator()(Generator* gen) {
404 typename Generator::ResultType sample = (*gen)();
405 ResultType result;
406 for (int i = 0; i < kResultElementCount; i += 2) {
407 float f[2];
408 BoxMullerFloat(sample[i], sample[i + 1], &f[0], &f[1]);
409 result[i] = Eigen::half(f[0]);
410 result[i + 1] = Eigen::half(f[1]);
411 }
412 return result;
413 }
414 };
415
416 template <class Generator>
417 class NormalDistribution<Generator, bfloat16> {
418 public:
419 // The number of elements that will be returned.
420 static constexpr int kResultElementCount = Generator::kResultElementCount;
421 // Cost of generation of a single element (in cycles).
422 static constexpr int kElementCost = 70;
423 // Indicate that this distribution may take variable number of samples
424 // during the runtime.
425 static constexpr bool kVariableSamplesPerOutput = false;
426 typedef Array<bfloat16, kResultElementCount> ResultType;
427 typedef bfloat16 ResultElementType;
428
429 PHILOX_DEVICE_INLINE
operator()430 ResultType operator()(Generator* gen) {
431 typename Generator::ResultType sample = (*gen)();
432 ResultType result;
433 static_assert(kResultElementCount % 2 == 0,
434 "kResultElementCount should be an even number");
435 for (int i = 0; i < kResultElementCount; i += 2) {
436 float f[2];
437 // Box-Muller transform requires processing 2 elements at a time.
438 BoxMullerFloat(sample[i], sample[i + 1], &f[0], &f[1]);
439 result[i] = bfloat16(f[0]);
440 result[i + 1] = bfloat16(f[1]);
441 }
442 return result;
443 }
444 };
445
446 template <class Generator>
447 class NormalDistribution<Generator, float> {
448 public:
449 // The number of elements that will be returned.
450 static constexpr int kResultElementCount = Generator::kResultElementCount;
451 // Cost of generation of a single element (in cycles).
452 static constexpr int kElementCost = 70;
453 // Indicate that this distribution may take variable number of samples
454 // during the runtime.
455 static constexpr bool kVariableSamplesPerOutput = false;
456 typedef Array<float, kResultElementCount> ResultType;
457 typedef float ResultElementType;
458
459 PHILOX_DEVICE_INLINE
operator()460 ResultType operator()(Generator* gen) {
461 typename Generator::ResultType sample = (*gen)();
462 ResultType result;
463 for (int i = 0; i < kResultElementCount; i += 2) {
464 BoxMullerFloat(sample[i], sample[i + 1], &result[i], &result[i + 1]);
465 }
466 return result;
467 }
468 };
469
470 template <class Generator>
471 class NormalDistribution<Generator, double> {
472 public:
473 // The number of elements that will be returned.
474 static constexpr int kResultElementCount = Generator::kResultElementCount / 2;
475 // Cost of generation of a single element (in cycles).
476 static constexpr int kElementCost = 70;
477 // Indicate that this distribution may take variable number of samples
478 // during the runtime.
479 static constexpr bool kVariableSamplesPerOutput = false;
480 typedef Array<double, kResultElementCount> ResultType;
481 typedef double ResultElementType;
482
483 PHILOX_DEVICE_INLINE
operator()484 ResultType operator()(Generator* gen) {
485 typename Generator::ResultType sample = (*gen)();
486 ResultType result;
487 for (int i = 0; i < kResultElementCount; i += 2) {
488 const int i2 = 2 * i;
489 BoxMullerDouble(sample[i2], sample[i2 + 1], sample[i2 + 2],
490 sample[i2 + 3], &result[i], &result[i + 1]);
491 }
492 return result;
493 }
494 };
495
496 // A class that returns standard normal distribution between
497 // [-kTruncateValue, kTruncateValue].
498 // Arguments:
499 // Generator: a generator type that returns a number of uint32 upon each
500 // each invocation. It needs to define kResultElementCount for the
501 // sample count for each invocation, and ResultType for actual
502 // returned sample type.
503 // RealType: the data type of the real numbers that will be returned by the
504 // distribution. This could be either float or double for now.
505 // This class is meant to be implemented through specialization. The default
506 // is not defined by design.
507 template <class SingleSampleGenerator, typename RealType>
508 class TruncatedNormalDistribution;
509
510 // Exactly like the float version, except that we convert to half afterwards;
511 // since we don't have half-precision sin/cos even on GPUs, there's nothing to
512 // gain from working in half internally.
513 template <class SingleSampleGenerator>
514 class TruncatedNormalDistribution<SingleSampleGenerator, Eigen::half> {
515 public:
516 // The number of elements that will be returned.
517 static constexpr int kResultElementCount =
518 SingleSampleGenerator::kNativeElementCount;
519 // Cost of generation of a single element (in cycles).
520 static constexpr int kElementCost = 90;
521 // Indicate that this distribution may take variable number of samples
522 // during the runtime.
523 static constexpr bool kVariableSamplesPerOutput = true;
524 // The threshold where the normal distribution is truncated.
525 const float kTruncateValue = 2.0f;
526
527 typedef Array<Eigen::half, kResultElementCount> ResultType;
528 typedef Eigen::half ResultElementType;
529
530 PHILOX_DEVICE_INLINE
operator()531 ResultType operator()(SingleSampleGenerator* gen) {
532 ResultType results;
533 int index = 0;
534 while (true) {
535 // Repeatedly take samples from the normal distribution, until we have
536 // the desired number of elements that fall within the pre-defined cutoff
537 // threshold.
538 const uint32 x0 = (*gen)();
539 const uint32 x1 = (*gen)();
540 float f[2];
541 BoxMullerFloat(x0, x1, &f[0], &f[1]);
542
543 if (Eigen::numext::abs(f[0]) < kTruncateValue) {
544 results[index++] = Eigen::half(f[0]);
545 if (index >= kResultElementCount) {
546 return results;
547 }
548 }
549 if (Eigen::numext::abs(f[1]) < kTruncateValue) {
550 results[index++] = Eigen::half(f[1]);
551 if (index >= kResultElementCount) {
552 return results;
553 }
554 }
555 }
556 }
557 };
558
559 template <class SingleSampleGenerator>
560 class TruncatedNormalDistribution<SingleSampleGenerator, bfloat16> {
561 public:
562 // The number of elements that will be returned.
563 static constexpr int kResultElementCount =
564 SingleSampleGenerator::kNativeElementCount;
565 // Cost of generation of a single element (in cycles).
566 static constexpr int kElementCost = 90;
567 // Indicate that this distribution may take variable number of samples
568 // during the runtime.
569 static constexpr bool kVariableSamplesPerOutput = true;
570 // The threshold where the normal distribution is truncated.
571 const float kTruncateValue = 2.0f;
572
573 typedef Array<bfloat16, kResultElementCount> ResultType;
574 typedef bfloat16 ResultElementType;
575
576 PHILOX_DEVICE_INLINE
operator()577 ResultType operator()(SingleSampleGenerator* gen) {
578 ResultType results;
579 int index = 0;
580 while (true) {
581 // Repeatedly take samples from the normal distribution, until we have
582 // the desired number of elements that fall within the pre-defined cutoff
583 // threshold.
584 const uint32 x0 = (*gen)();
585 const uint32 x1 = (*gen)();
586 float f[2];
587 BoxMullerFloat(x0, x1, &f[0], &f[1]);
588
589 if (Eigen::numext::abs(f[0]) < kTruncateValue) {
590 results[index++] = bfloat16(f[0]);
591 if (index >= kResultElementCount) {
592 return results;
593 }
594 }
595 if (Eigen::numext::abs(f[1]) < kTruncateValue) {
596 results[index++] = bfloat16(f[1]);
597 if (index >= kResultElementCount) {
598 return results;
599 }
600 }
601 }
602 }
603 };
604
605 // Partial specialization for float.
606 template <class SingleSampleGenerator>
607 class TruncatedNormalDistribution<SingleSampleGenerator, float> {
608 public:
609 // The number of elements that will be returned.
610 static constexpr int kResultElementCount =
611 SingleSampleGenerator::kNativeElementCount;
612 // Cost of generation of a single element (in cycles).
613 static constexpr int kElementCost = 90;
614 // Indicate that this distribution may take variable number of samples
615 // during the runtime.
616 static constexpr bool kVariableSamplesPerOutput = true;
617 // The threshold where the normal distribution is truncated.
618 const float kTruncateValue = 2.0f;
619
620 typedef Array<float, kResultElementCount> ResultType;
621 typedef float ResultElementType;
622
623 PHILOX_DEVICE_INLINE
operator()624 ResultType operator()(SingleSampleGenerator* gen) {
625 ResultType results;
626 int index = 0;
627 while (true) {
628 // Repeatedly take samples from the normal distribution, until we have
629 // the desired number of elements that fall within the pre-defined cutoff
630 // threshold.
631 const uint32 x0 = (*gen)();
632 const uint32 x1 = (*gen)();
633 float f[2];
634 BoxMullerFloat(x0, x1, &f[0], &f[1]);
635
636 if (Eigen::numext::abs(f[0]) < kTruncateValue) {
637 results[index++] = f[0];
638 if (index >= kResultElementCount) {
639 return results;
640 }
641 }
642 if (Eigen::numext::abs(f[1]) < kTruncateValue) {
643 results[index++] = f[1];
644 if (index >= kResultElementCount) {
645 return results;
646 }
647 }
648 }
649 }
650 };
651
652 // Partial specialization for double.
653 template <class SingleSampleGenerator>
654 class TruncatedNormalDistribution<SingleSampleGenerator, double> {
655 public:
656 // The number of elements that will be returned.
657 static constexpr int kResultElementCount =
658 (SingleSampleGenerator::kNativeElementCount > 1)
659 ? SingleSampleGenerator::kNativeElementCount / 2
660 : 1;
661 // Cost of generation of a single element (in cycles).
662 static constexpr int kElementCost = 90;
663 // Indicate that this distribution may take variable number of samples
664 // during the runtime.
665 static constexpr bool kVariableSamplesPerOutput = true;
666 typedef Array<double, kResultElementCount> ResultType;
667 typedef double ResultElementType;
668 const double kTruncateValue = 2.0;
669
670 PHILOX_DEVICE_INLINE
operator()671 ResultType operator()(SingleSampleGenerator* gen) {
672 ResultType results;
673 int index = 0;
674 while (1) {
675 const uint32 x0 = (*gen)();
676 const uint32 x1 = (*gen)();
677 const uint32 x2 = (*gen)();
678 const uint32 x3 = (*gen)();
679 double d[2];
680 BoxMullerDouble(x0, x1, x2, x3, &d[0], &d[1]);
681
682 if (Eigen::numext::abs(d[0]) < kTruncateValue) {
683 results[index++] = d[0];
684 if (index >= kResultElementCount) {
685 return results;
686 }
687 }
688 if (Eigen::numext::abs(d[1]) < kTruncateValue) {
689 results[index++] = d[1];
690 if (index >= kResultElementCount) {
691 return results;
692 }
693 }
694 }
695 }
696 };
697
698 // Helper function to convert two 32-bit uniform integers to two floats
699 // under the unit normal distribution.
700 PHILOX_DEVICE_INLINE
BoxMullerFloat(uint32 x0,uint32 x1,float * f0,float * f1)701 void BoxMullerFloat(uint32 x0, uint32 x1, float* f0, float* f1) {
702 // This function implements the Box-Muller transform:
703 // http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform#Basic_form
704 // Do not send a really small number to log().
705 // We cannot mark "epsilon" as "static const" because NVCC would complain
706 const float epsilon = 1.0e-7f;
707 float u1 = Uint32ToFloat(x0);
708 if (u1 < epsilon) {
709 u1 = epsilon;
710 }
711 const float v1 = 2.0f * M_PI * Uint32ToFloat(x1);
712 const float u2 = Eigen::numext::sqrt(-2.0f * Eigen::numext::log(u1));
713 #if !defined(__linux__)
714 *f0 = Eigen::numext::sin(v1);
715 *f1 = Eigen::numext::cos(v1);
716 #else
717 sincosf(v1, f0, f1);
718 #endif
719 *f0 *= u2;
720 *f1 *= u2;
721 }
722
723 // Helper function to convert four 32-bit uniform integers to two doubles
724 // under the unit normal distribution.
725 PHILOX_DEVICE_INLINE
BoxMullerDouble(uint32 x0,uint32 x1,uint32 x2,uint32 x3,double * d0,double * d1)726 void BoxMullerDouble(uint32 x0, uint32 x1, uint32 x2, uint32 x3, double* d0,
727 double* d1) {
728 // This function implements the Box-Muller transform:
729 // http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform#Basic_form
730 // Do not send a really small number to log().
731 // We cannot mark "epsilon" as "static const" because NVCC would complain
732 const double epsilon = 1.0e-7;
733 double u1 = Uint64ToDouble(x0, x1);
734 if (u1 < epsilon) {
735 u1 = epsilon;
736 }
737 const double v1 = 2 * M_PI * Uint64ToDouble(x2, x3);
738 const double u2 = Eigen::numext::sqrt(-2.0 * Eigen::numext::log(u1));
739 #if !defined(__linux__)
740 *d0 = Eigen::numext::sin(v1);
741 *d1 = Eigen::numext::cos(v1);
742 #else
743 sincos(v1, d0, d1);
744 #endif
745 *d0 *= u2;
746 *d1 *= u2;
747 }
748
749 // Helper function to convert an 16-bit integer to a half between [0..1).
Uint16ToHalf(uint16 x)750 PHILOX_DEVICE_INLINE Eigen::half Uint16ToHalf(uint16 x) {
751 // IEEE754 halfs are formatted as follows (MSB first):
752 // sign(1) exponent(5) mantissa(10)
753 // Conceptually construct the following:
754 // sign == 0
755 // exponent == 15 -- an excess 15 representation of a zero exponent
756 // mantissa == 10 random bits
757 const uint16 man = x & 0x3ffu; // 10 bit mantissa
758 const uint16 exp = static_cast<uint16>(15);
759 const uint16 val = (exp << 10) | man;
760
761 Eigen::half result;
762 result.x = val;
763 return result - Eigen::half(1.0);
764 }
765
766 // Helper function to convert an 16-bit integer to a bfloat16 between [0..1).
767 // This can create a uniform distribution of values between [0..1).
Uint16ToGfloat16(uint16 x)768 PHILOX_DEVICE_INLINE bfloat16 Uint16ToGfloat16(uint16 x) {
769 // bfloat are formatted as follows (MSB first):
770 // sign(1) exponent(8) mantissa(7)
771 // Conceptually construct the following:
772 // sign == 0
773 // exponent == 127 -- an excess 127 representation of a zero exponent
774 // mantissa == 7 random bits
775 const uint16 man = x & 0x7fu; // 7 bit mantissa
776 const uint16 exp = static_cast<uint16>(127);
777 const uint16 val = (exp << 7) | man;
778
779 bfloat16 result;
780 memcpy(&result, &val, sizeof(val));
781 // The mantissa has an implicit leading 1, so the above code creates a value
782 // in [1, 2). The minus will not cause a rounding that makes the result 1.
783 // Instead it will just be close to 1.
784 return result - bfloat16(1.0);
785 }
786
787 // Helper function to convert an 32-bit integer to a float between [0..1).
Uint32ToFloat(uint32 x)788 PHILOX_DEVICE_INLINE float Uint32ToFloat(uint32 x) {
789 // IEEE754 floats are formatted as follows (MSB first):
790 // sign(1) exponent(8) mantissa(23)
791 // Conceptually construct the following:
792 // sign == 0
793 // exponent == 127 -- an excess 127 representation of a zero exponent
794 // mantissa == 23 random bits
795 const uint32 man = x & 0x7fffffu; // 23 bit mantissa
796 const uint32 exp = static_cast<uint32>(127);
797 const uint32 val = (exp << 23) | man;
798
799 // Assumes that endian-ness is same for float and uint32.
800 float result;
801 memcpy(&result, &val, sizeof(val));
802 return result - 1.0f;
803 }
804
805 // Helper function to convert two 32-bit integers to a double between [0..1).
Uint64ToDouble(uint32 x0,uint32 x1)806 PHILOX_DEVICE_INLINE double Uint64ToDouble(uint32 x0, uint32 x1) {
807 // IEEE754 doubles are formatted as follows (MSB first):
808 // sign(1) exponent(11) mantissa(52)
809 // Conceptually construct the following:
810 // sign == 0
811 // exponent == 1023 -- an excess 1023 representation of a zero exponent
812 // mantissa == 52 random bits
813 const uint32 mhi = x0 & 0xfffffu; // upper 20 bits of mantissa
814 const uint32 mlo = x1; // lower 32 bits of mantissa
815 const uint64 man = (static_cast<uint64>(mhi) << 32) | mlo; // mantissa
816 const uint64 exp = static_cast<uint64>(1023);
817 const uint64 val = (exp << 52) | man;
818 // Assumes that endian-ness is same for double and uint64.
819 double result;
820 memcpy(&result, &val, sizeof(val));
821 return result - 1.0;
822 }
823
824 } // namespace random
825 } // namespace tensorflow
826
827 #endif // TENSORFLOW_CORE_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_
828