1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #ifndef TENSORFLOW_CORE_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_
17 #define TENSORFLOW_CORE_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_
18
19 #include <string.h>
20
21 #include <cmath>
22
23 #include <algorithm>
24 #include <type_traits>
25
26 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
27 #include "tensorflow/core/lib/bfloat16/bfloat16.h"
28 #include "tensorflow/core/lib/random/philox_random.h"
29
30 namespace tensorflow {
31 namespace random {
32
33 // Helper function to convert a 16-bit integer to a half between [0..1).
34 PHILOX_DEVICE_INLINE Eigen::half Uint16ToHalf(uint16 x);
35 // Helper function to convert a 16-bit integer to a bfloat16 between [0..1).
36 PHILOX_DEVICE_INLINE bfloat16 Uint16ToGfloat16(uint16 x);
37 // Helper function to convert a 32-bit integer to a float between [0..1).
38 PHILOX_DEVICE_INLINE float Uint32ToFloat(uint32 x);
39 // Helper function to convert two 32-bit integers to a double between [0..1).
40 PHILOX_DEVICE_INLINE double Uint64ToDouble(uint32 x0, uint32 x1);
41
42 // Computes a + b. Requires that the result is representable in the destination
43 // type and that b is not maximal (i.e. b + 1 is not 0). Notably, the addend b
44 // need *not* be representable in that type. (The condition on b excludes the
45 // extremal case INT_MIN + UINT_MAX = INT_MAX, which this function cannot
46 // compute.)
47 template <typename Int>
SignedAdd(Int a,typename std::make_unsigned<Int>::type b)48 PHILOX_DEVICE_INLINE Int SignedAdd(Int a,
49 typename std::make_unsigned<Int>::type b) {
50 // Implementation note: both b_div_2 and b - b_div_2 are positive and
51 // representatble as Int.
52 auto b_div_2 = b >> 1;
53 return a + static_cast<Int>(b_div_2) + static_cast<Int>(b - b_div_2);
54 }
55
56 // A class that generates uniform distribution random numbers from the
57 // underlying random integer generator.
58 // Arguments:
59 // Generator: a generator type that returns a number of uint32 upon each
60 // invocation. It needs to define kResultElementCount for the
61 // sample count for each invocation, and ResultType for the
62 // actual returned sample type.
63 // RealType: the data type of the real numbers that will be returned by the
64 // distribution. This could be either float or double for now.
65 // This class is meant to be implemented through specialization. The default
66 // is not defined by design.
67 template <class Generator, typename RealType>
68 class UniformDistribution;
69
70 template <class Generator>
71 class UniformDistribution<Generator, Eigen::half> {
72 public:
73 // The number of elements that will be returned.
74 static const int kResultElementCount = Generator::kResultElementCount;
75 // Cost of generation of a single element (in cycles).
76 static const int kElementCost = 3;
77 // Indicate that this distribution may take variable number of samples
78 // during the runtime.
79 static const bool kVariableSamplesPerOutput = false;
80 typedef Array<Eigen::half, kResultElementCount> ResultType;
81 typedef Eigen::half ResultElementType;
82
83 PHILOX_DEVICE_INLINE
operator()84 ResultType operator()(Generator* gen) {
85 typename Generator::ResultType sample = (*gen)();
86 ResultType result;
87 for (int i = 0; i < kResultElementCount; ++i) {
88 result[i] = Uint16ToHalf(sample[i]); // Truncate the upper 16 bits.
89 }
90 return result;
91 }
92 };
93
94 template <class Generator>
95 class UniformDistribution<Generator, bfloat16> {
96 public:
97 // The number of elements that will be returned.
98 static const int kResultElementCount = Generator::kResultElementCount;
99 // Cost of generation of a single element (in cycles).
100 static const int kElementCost = 3;
101 // Indicate that this distribution may take variable number of samples
102 // during the runtime.
103 static const bool kVariableSamplesPerOutput = false;
104 typedef Array<bfloat16, kResultElementCount> ResultType;
105 typedef bfloat16 ResultElementType;
106
107 PHILOX_DEVICE_INLINE
operator()108 ResultType operator()(Generator* gen) {
109 typename Generator::ResultType sample = (*gen)();
110 ResultType result;
111 for (int i = 0; i < kResultElementCount; ++i) {
112 result[i] = Uint16ToGfloat16(sample[i]);
113 }
114 return result;
115 }
116 };
117
118 template <class Generator>
119 class UniformDistribution<Generator, float> {
120 public:
121 // The number of elements that will be returned.
122 static const int kResultElementCount = Generator::kResultElementCount;
123 // Cost of generation of a single element (in cycles).
124 static const int kElementCost = 3;
125 // Indicate that this distribution may take variable number of samples
126 // during the runtime.
127 static const bool kVariableSamplesPerOutput = false;
128 typedef Array<float, kResultElementCount> ResultType;
129 typedef float ResultElementType;
130
131 PHILOX_DEVICE_INLINE
operator()132 ResultType operator()(Generator* gen) {
133 typename Generator::ResultType sample = (*gen)();
134 ResultType result;
135 for (int i = 0; i < kResultElementCount; ++i) {
136 result[i] = Uint32ToFloat(sample[i]);
137 }
138 return result;
139 }
140 };
141
142 template <class Generator>
143 class UniformDistribution<Generator, double> {
144 public:
145 // The number of elements that will be returned.
146 static const int kResultElementCount = Generator::kResultElementCount / 2;
147 // Cost of generation of a single element (in cycles).
148 static const int kElementCost = 3;
149 // Indicate that this distribution may take variable number of samples
150 // during the runtime.
151 static const bool kVariableSamplesPerOutput = false;
152 typedef Array<double, kResultElementCount> ResultType;
153 typedef double ResultElementType;
154
155 PHILOX_DEVICE_INLINE
operator()156 ResultType operator()(Generator* gen) {
157 typename Generator::ResultType sample = (*gen)();
158 ResultType result;
159 for (int i = 0; i < kResultElementCount; ++i) {
160 result[i] = Uint64ToDouble(sample[2 * i], sample[2 * i + 1]);
161 }
162 return result;
163 }
164 };
165
166 template <class Generator>
167 class UniformDistribution<Generator, int32> {
168 public:
169 // The number of elements that will be returned.
170 static const int kResultElementCount = Generator::kResultElementCount;
171 // Cost of generation of a single element (in cycles).
172 static const int kElementCost = 3;
173 // Indicate that this distribution may take variable number of samples
174 // during the runtime.
175 static const bool kVariableSamplesPerOutput = false;
176 typedef Array<int32, kResultElementCount> ResultType;
177 typedef int32 ResultElementType;
178
179 // Must have lo < hi
UniformDistribution(int32 lo,int32 hi)180 UniformDistribution(int32 lo, int32 hi)
181 : lo_(lo), range_(static_cast<uint32>(hi) - static_cast<uint32>(lo)) {}
182
183 PHILOX_DEVICE_INLINE
operator()184 ResultType operator()(Generator* gen) {
185 typename Generator::ResultType sample = (*gen)();
186 ResultType result;
187 for (int i = 0; i < kResultElementCount; ++i) {
188 result[i] = SignedAdd(lo_, sample[i] % range_);
189 }
190 return result;
191 }
192
193 private:
194 // Note that lo_ is intentionally signed while range_ is intentionally
195 // unsigned. This is because hi - lo can overflow signed integers if
196 // lo < 0 < hi, but always fits in unsigned.
197 int32 lo_;
198 uint32 range_;
199 };
200
201 template <class Generator>
202 class UniformDistribution<Generator, int64> {
203 public:
204 // The number of elements that will be returned.
205 static const int kResultElementCount = Generator::kResultElementCount / 2;
206 // Cost of generation of a single element (in cycles).
207 static const int kElementCost = 3;
208 // Indicate that this distribution may take variable number of samples
209 // during the runtime.
210 static const bool kVariableSamplesPerOutput = false;
211 typedef Array<int64, kResultElementCount> ResultType;
212 typedef int64 ResultElementType;
213
214 // Must have lo < hi
UniformDistribution(int64 lo,int64 hi)215 UniformDistribution(int64 lo, int64 hi)
216 : lo_(lo), range_(static_cast<uint64>(hi) - static_cast<uint64>(lo)) {}
217
218 PHILOX_DEVICE_INLINE
operator()219 ResultType operator()(Generator* gen) {
220 typename Generator::ResultType sample = (*gen)();
221 ResultType result;
222 for (int i = 0; i < kResultElementCount; ++i) {
223 auto bits = sample[2 * i] | static_cast<uint64>(sample[2 * i + 1]) << 32;
224 result[i] = SignedAdd(lo_, bits % range_);
225 }
226 return result;
227 }
228
229 private:
230 // Note that lo_ is intentionally signed while range_ is intentionally
231 // unsigned. This is because hi - lo can overflow signed integers if
232 // lo < 0 < hi, but always fits in unsigned.
233 int64 lo_;
234 uint64 range_;
235 };
236
237 // Similar to `UniformDistribution`, except that instead of generating numbers
238 // in the range [low, high), it generates numbers covering the whole range of
239 // the integer type.
240 template <typename Generator, typename IntType>
241 class UniformFullIntDistribution;
242
243 template <typename Generator, typename IntType>
244 class UniformFullIntDistribution32 {
245 public:
246 // The number of elements that will be returned.
247 static const int kResultElementCount = Generator::kResultElementCount;
248 // Cost of generation of a single element (in cycles).
249 static const int kElementCost = 3;
250 // Indicate that this distribution may take variable number of samples
251 // during the runtime.
252 static const bool kVariableSamplesPerOutput = false;
253 typedef Array<IntType, kResultElementCount> ResultType;
254 typedef IntType ResultElementType;
255
256 PHILOX_DEVICE_INLINE
operator()257 ResultType operator()(Generator* gen) {
258 typename Generator::ResultType sample = (*gen)();
259 ResultType result;
260 for (int i = 0; i < kResultElementCount; ++i) {
261 result[i] = sample[i];
262 }
263 return result;
264 }
265 };
266
267 template <typename Generator, typename IntType>
268 class UniformFullIntDistribution64 {
269 public:
270 // The number of elements that will be returned.
271 static const int kResultElementCount = Generator::kResultElementCount / 2;
272 // Cost of generation of a single element (in cycles).
273 static const int kElementCost = 3;
274 // Indicate that this distribution may take variable number of samples
275 // during the runtime.
276 static const bool kVariableSamplesPerOutput = false;
277 typedef Array<IntType, kResultElementCount> ResultType;
278 typedef IntType ResultElementType;
279
280 PHILOX_DEVICE_INLINE
operator()281 ResultType operator()(Generator* gen) {
282 typename Generator::ResultType sample = (*gen)();
283 ResultType result;
284 for (int i = 0; i < kResultElementCount; ++i) {
285 result[i] = sample[2 * i] | static_cast<uint64>(sample[2 * i + 1]) << 32;
286 }
287 return result;
288 }
289 };
290
291 template <typename Generator>
292 class UniformFullIntDistribution<Generator, int32>
293 : public UniformFullIntDistribution32<Generator, int32> {};
294 template <typename Generator>
295 class UniformFullIntDistribution<Generator, uint32>
296 : public UniformFullIntDistribution32<Generator, uint32> {};
297 template <typename Generator>
298 class UniformFullIntDistribution<Generator, int64>
299 : public UniformFullIntDistribution64<Generator, int64> {};
300 template <typename Generator>
301 class UniformFullIntDistribution<Generator, uint64>
302 : public UniformFullIntDistribution64<Generator, uint64> {};
303
304 // A class that adapts the underlying native multiple samples to return a single
305 // sample at a time.
306 template <class Generator>
307 class SingleSampleAdapter {
308 public:
309 // The number of elements that will be returned.
310 static const int kResultElementCount = 1;
311 // The number of elements that will be returned by the underlying generator.
312 static const int kNativeElementCount = Generator::kResultElementCount;
313 typedef typename Generator::ResultElementType ResultType;
314 typedef typename Generator::ResultElementType ResultElementType;
315
316 PHILOX_DEVICE_INLINE
SingleSampleAdapter(Generator * gen)317 explicit SingleSampleAdapter(Generator* gen)
318 : generator_(gen), used_result_index_(Generator::kResultElementCount) {}
319
320 PHILOX_DEVICE_INLINE
operator()321 ResultType operator()() {
322 if (used_result_index_ == Generator::kResultElementCount) {
323 unused_results_ = (*generator_)();
324 used_result_index_ = 0;
325 }
326
327 return unused_results_[used_result_index_++];
328 }
329
330 PHILOX_DEVICE_INLINE
Skip(uint64 num_skips)331 void Skip(uint64 num_skips) {
332 if (!num_skips) {
333 return;
334 }
335 int num_unused_results = kNativeElementCount - used_result_index_;
336 if (num_skips <= num_unused_results) {
337 used_result_index_ += num_skips;
338 return;
339 }
340 num_skips -= num_unused_results;
341 used_result_index_ = kNativeElementCount;
342 SkipFromGenerator(num_skips / kNativeElementCount);
343 num_skips = num_skips % kNativeElementCount;
344 if (num_skips) {
345 unused_results_ = (*generator_)();
346 used_result_index_ = num_skips;
347 }
348 }
349
350 private:
351 // This implementation iteratively skips over `num_skips` samples
352 // from `generator_`. There is an O(1) implementation for PhiloxRandom
353 // in random_distributions.cc.
354 PHILOX_DEVICE_INLINE
SkipFromGenerator(uint64 num_skips)355 void SkipFromGenerator(uint64 num_skips) {
356 while (num_skips--) {
357 (*generator_)();
358 }
359 }
360
361 Generator* generator_;
362 typename Generator::ResultType unused_results_;
363 int used_result_index_;
364 };
365
366 // A class that generates unit normal distribution random numbers from the
367 // underlying random integer generator.
368 // Arguments:
369 // Generator: a generator type that returns a number of uint32 upon each
370 // each invocation. It needs to define kResultElementCount for the
371 // sample count for each invocation, and ResultType for actual
372 // returned sample type.
373 // RealType: the data type of the real numbers that will be returned by the
374 // distribution. This could be either float or double for now.
375 // This class is meant to be implemented through specialization. The default
376 // is not defined by design.
377 template <class Generator, typename RealType>
378 class NormalDistribution;
379
380 PHILOX_DEVICE_INLINE
381 void BoxMullerFloat(uint32 x0, uint32 x1, float* f0, float* f1);
382
383 PHILOX_DEVICE_INLINE
384 void BoxMullerDouble(uint32 x0, uint32 x1, uint32 x2, uint32 x3, double* d0,
385 double* d1);
386
387 // Exactly like the float version, except that we convert to half afterwards;
388 // since we don't have half-precision sin/cos even on GPUs, there's nothing to
389 // gain from working in half internally.
390 template <class Generator>
391 class NormalDistribution<Generator, Eigen::half> {
392 public:
393 // The number of elements that will be returned.
394 static const int kResultElementCount = Generator::kResultElementCount;
395 // Cost of generation of a single element (in cycles).
396 static const int kElementCost = 70;
397 // Indicate that this distribution may take variable number of samples
398 // during the runtime.
399 static const bool kVariableSamplesPerOutput = false;
400 typedef Array<Eigen::half, kResultElementCount> ResultType;
401 typedef Eigen::half ResultElementType;
402
403 PHILOX_DEVICE_INLINE
operator()404 ResultType operator()(Generator* gen) {
405 typename Generator::ResultType sample = (*gen)();
406 ResultType result;
407 for (int i = 0; i < kResultElementCount; i += 2) {
408 float f[2];
409 BoxMullerFloat(sample[i], sample[i + 1], &f[0], &f[1]);
410 result[i] = Eigen::half(f[0]);
411 result[i + 1] = Eigen::half(f[1]);
412 }
413 return result;
414 }
415 };
416
417 template <class Generator>
418 class NormalDistribution<Generator, bfloat16> {
419 public:
420 // The number of elements that will be returned.
421 static const int kResultElementCount = Generator::kResultElementCount;
422 // Cost of generation of a single element (in cycles).
423 static const int kElementCost = 70;
424 // Indicate that this distribution may take variable number of samples
425 // during the runtime.
426 static const bool kVariableSamplesPerOutput = false;
427 typedef Array<bfloat16, kResultElementCount> ResultType;
428 typedef bfloat16 ResultElementType;
429
430 PHILOX_DEVICE_INLINE
operator()431 ResultType operator()(Generator* gen) {
432 typename Generator::ResultType sample = (*gen)();
433 ResultType result;
434 static_assert(kResultElementCount % 2 == 0,
435 "kResultElementCount should be an even number");
436 for (int i = 0; i < kResultElementCount; i += 2) {
437 float f[2];
438 // Box-Muller transform requires processing 2 elements at a time.
439 BoxMullerFloat(sample[i], sample[i + 1], &f[0], &f[1]);
440 result[i] = bfloat16(f[0]);
441 result[i + 1] = bfloat16(f[1]);
442 }
443 return result;
444 }
445 };
446
447 template <class Generator>
448 class NormalDistribution<Generator, float> {
449 public:
450 // The number of elements that will be returned.
451 static const int kResultElementCount = Generator::kResultElementCount;
452 // Cost of generation of a single element (in cycles).
453 static const int kElementCost = 70;
454 // Indicate that this distribution may take variable number of samples
455 // during the runtime.
456 static const bool kVariableSamplesPerOutput = false;
457 typedef Array<float, kResultElementCount> ResultType;
458 typedef float ResultElementType;
459
460 PHILOX_DEVICE_INLINE
operator()461 ResultType operator()(Generator* gen) {
462 typename Generator::ResultType sample = (*gen)();
463 ResultType result;
464 for (int i = 0; i < kResultElementCount; i += 2) {
465 BoxMullerFloat(sample[i], sample[i + 1], &result[i], &result[i + 1]);
466 }
467 return result;
468 }
469 };
470
471 template <class Generator>
472 class NormalDistribution<Generator, double> {
473 public:
474 // The number of elements that will be returned.
475 static const int kResultElementCount = Generator::kResultElementCount / 2;
476 // Cost of generation of a single element (in cycles).
477 static const int kElementCost = 70;
478 // Indicate that this distribution may take variable number of samples
479 // during the runtime.
480 static const bool kVariableSamplesPerOutput = false;
481 typedef Array<double, kResultElementCount> ResultType;
482 typedef double ResultElementType;
483
484 PHILOX_DEVICE_INLINE
operator()485 ResultType operator()(Generator* gen) {
486 typename Generator::ResultType sample = (*gen)();
487 ResultType result;
488 for (int i = 0; i < kResultElementCount; i += 2) {
489 const int i2 = 2 * i;
490 BoxMullerDouble(sample[i2], sample[i2 + 1], sample[i2 + 2],
491 sample[i2 + 3], &result[i], &result[i + 1]);
492 }
493 return result;
494 }
495 };
496
497 // A class that returns standard normal distribution between
498 // [-kTruncateValue, kTruncateValue].
499 // Arguments:
500 // Generator: a generator type that returns a number of uint32 upon each
501 // each invocation. It needs to define kResultElementCount for the
502 // sample count for each invocation, and ResultType for actual
503 // returned sample type.
504 // RealType: the data type of the real numbers that will be returned by the
505 // distribution. This could be either float or double for now.
506 // This class is meant to be implemented through specialization. The default
507 // is not defined by design.
508 template <class SingleSampleGenerator, typename RealType>
509 class TruncatedNormalDistribution;
510
511 // Exactly like the float version, except that we convert to half afterwards;
512 // since we don't have half-precision sin/cos even on GPUs, there's nothing to
513 // gain from working in half internally.
514 template <class SingleSampleGenerator>
515 class TruncatedNormalDistribution<SingleSampleGenerator, Eigen::half> {
516 public:
517 // The number of elements that will be returned.
518 static const int kResultElementCount =
519 SingleSampleGenerator::kNativeElementCount;
520 // Cost of generation of a single element (in cycles).
521 static const int kElementCost = 90;
522 // Indicate that this distribution may take variable number of samples
523 // during the runtime.
524 static const bool kVariableSamplesPerOutput = true;
525 // The threshold where the normal distribution is truncated.
526 const float kTruncateValue = 2.0f;
527
528 typedef Array<Eigen::half, kResultElementCount> ResultType;
529 typedef Eigen::half ResultElementType;
530
531 PHILOX_DEVICE_INLINE
operator()532 ResultType operator()(SingleSampleGenerator* gen) {
533 ResultType results;
534 int index = 0;
535 while (true) {
536 // Repeatedly take samples from the normal distribution, until we have
537 // the desired number of elements that fall within the pre-defined cutoff
538 // threshold.
539 const uint32 x0 = (*gen)();
540 const uint32 x1 = (*gen)();
541 float f[2];
542 BoxMullerFloat(x0, x1, &f[0], &f[1]);
543
544 if (Eigen::numext::abs(f[0]) < kTruncateValue) {
545 results[index++] = Eigen::half(f[0]);
546 if (index >= kResultElementCount) {
547 return results;
548 }
549 }
550 if (Eigen::numext::abs(f[1]) < kTruncateValue) {
551 results[index++] = Eigen::half(f[1]);
552 if (index >= kResultElementCount) {
553 return results;
554 }
555 }
556 }
557 }
558 };
559
560 template <class SingleSampleGenerator>
561 class TruncatedNormalDistribution<SingleSampleGenerator, bfloat16> {
562 public:
563 // The number of elements that will be returned.
564 static const int kResultElementCount =
565 SingleSampleGenerator::kNativeElementCount;
566 // Cost of generation of a single element (in cycles).
567 static const int kElementCost = 90;
568 // Indicate that this distribution may take variable number of samples
569 // during the runtime.
570 static const bool kVariableSamplesPerOutput = true;
571 // The threshold where the normal distribution is truncated.
572 const float kTruncateValue = 2.0f;
573
574 typedef Array<bfloat16, kResultElementCount> ResultType;
575 typedef bfloat16 ResultElementType;
576
577 PHILOX_DEVICE_INLINE
operator()578 ResultType operator()(SingleSampleGenerator* gen) {
579 ResultType results;
580 int index = 0;
581 while (true) {
582 // Repeatedly take samples from the normal distribution, until we have
583 // the desired number of elements that fall within the pre-defined cutoff
584 // threshold.
585 const uint32 x0 = (*gen)();
586 const uint32 x1 = (*gen)();
587 float f[2];
588 BoxMullerFloat(x0, x1, &f[0], &f[1]);
589
590 if (Eigen::numext::abs(f[0]) < kTruncateValue) {
591 results[index++] = bfloat16(f[0]);
592 if (index >= kResultElementCount) {
593 return results;
594 }
595 }
596 if (Eigen::numext::abs(f[1]) < kTruncateValue) {
597 results[index++] = bfloat16(f[1]);
598 if (index >= kResultElementCount) {
599 return results;
600 }
601 }
602 }
603 }
604 };
605
606 // Partial specialization for float.
607 template <class SingleSampleGenerator>
608 class TruncatedNormalDistribution<SingleSampleGenerator, float> {
609 public:
610 // The number of elements that will be returned.
611 static const int kResultElementCount =
612 SingleSampleGenerator::kNativeElementCount;
613 // Cost of generation of a single element (in cycles).
614 static const int kElementCost = 90;
615 // Indicate that this distribution may take variable number of samples
616 // during the runtime.
617 static const bool kVariableSamplesPerOutput = true;
618 // The threshold where the normal distribution is truncated.
619 const float kTruncateValue = 2.0f;
620
621 typedef Array<float, kResultElementCount> ResultType;
622 typedef float ResultElementType;
623
624 PHILOX_DEVICE_INLINE
operator()625 ResultType operator()(SingleSampleGenerator* gen) {
626 ResultType results;
627 int index = 0;
628 while (true) {
629 // Repeatedly take samples from the normal distribution, until we have
630 // the desired number of elements that fall within the pre-defined cutoff
631 // threshold.
632 const uint32 x0 = (*gen)();
633 const uint32 x1 = (*gen)();
634 float f[2];
635 BoxMullerFloat(x0, x1, &f[0], &f[1]);
636
637 if (Eigen::numext::abs(f[0]) < kTruncateValue) {
638 results[index++] = f[0];
639 if (index >= kResultElementCount) {
640 return results;
641 }
642 }
643 if (Eigen::numext::abs(f[1]) < kTruncateValue) {
644 results[index++] = f[1];
645 if (index >= kResultElementCount) {
646 return results;
647 }
648 }
649 }
650 }
651 };
652
653 // Partial specialization for double.
654 template <class SingleSampleGenerator>
655 class TruncatedNormalDistribution<SingleSampleGenerator, double> {
656 public:
657 // The number of elements that will be returned.
658 static const int kResultElementCount =
659 (SingleSampleGenerator::kNativeElementCount > 1)
660 ? SingleSampleGenerator::kNativeElementCount / 2
661 : 1;
662 // Cost of generation of a single element (in cycles).
663 static const int kElementCost = 90;
664 // Indicate that this distribution may take variable number of samples
665 // during the runtime.
666 static const bool kVariableSamplesPerOutput = true;
667 typedef Array<double, kResultElementCount> ResultType;
668 typedef double ResultElementType;
669 const double kTruncateValue = 2.0;
670
671 PHILOX_DEVICE_INLINE
operator()672 ResultType operator()(SingleSampleGenerator* gen) {
673 ResultType results;
674 int index = 0;
675 while (1) {
676 const uint32 x0 = (*gen)();
677 const uint32 x1 = (*gen)();
678 const uint32 x2 = (*gen)();
679 const uint32 x3 = (*gen)();
680 double d[2];
681 BoxMullerDouble(x0, x1, x2, x3, &d[0], &d[1]);
682
683 if (Eigen::numext::abs(d[0]) < kTruncateValue) {
684 results[index++] = d[0];
685 if (index >= kResultElementCount) {
686 return results;
687 }
688 }
689 if (Eigen::numext::abs(d[1]) < kTruncateValue) {
690 results[index++] = d[1];
691 if (index >= kResultElementCount) {
692 return results;
693 }
694 }
695 }
696 }
697 };
698
699 // Helper function to convert two 32-bit uniform integers to two floats
700 // under the unit normal distribution.
701 PHILOX_DEVICE_INLINE
BoxMullerFloat(uint32 x0,uint32 x1,float * f0,float * f1)702 void BoxMullerFloat(uint32 x0, uint32 x1, float* f0, float* f1) {
703 // This function implements the Box-Muller transform:
704 // http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform#Basic_form
705 // Do not send a really small number to log().
706 // We cannot mark "epsilon" as "static const" because NVCC would complain
707 const float epsilon = 1.0e-7f;
708 float u1 = Uint32ToFloat(x0);
709 if (u1 < epsilon) {
710 u1 = epsilon;
711 }
712 const float v1 = 2.0f * M_PI * Uint32ToFloat(x1);
713 const float u2 = Eigen::numext::sqrt(-2.0f * Eigen::numext::log(u1));
714 #if defined(TENSORFLOW_USE_SYCL) || !defined(__linux__)
715 *f0 = Eigen::numext::sin(v1);
716 *f1 = Eigen::numext::cos(v1);
717 #else
718 sincosf(v1, f0, f1);
719 #endif
720 *f0 *= u2;
721 *f1 *= u2;
722 }
723
724 // Helper function to convert four 32-bit uniform integers to two doubles
725 // under the unit normal distribution.
726 PHILOX_DEVICE_INLINE
BoxMullerDouble(uint32 x0,uint32 x1,uint32 x2,uint32 x3,double * d0,double * d1)727 void BoxMullerDouble(uint32 x0, uint32 x1, uint32 x2, uint32 x3, double* d0,
728 double* d1) {
729 // This function implements the Box-Muller transform:
730 // http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform#Basic_form
731 // Do not send a really small number to log().
732 // We cannot mark "epsilon" as "static const" because NVCC would complain
733 const double epsilon = 1.0e-7;
734 double u1 = Uint64ToDouble(x0, x1);
735 if (u1 < epsilon) {
736 u1 = epsilon;
737 }
738 const double v1 = 2 * M_PI * Uint64ToDouble(x2, x3);
739 const double u2 = Eigen::numext::sqrt(-2.0 * Eigen::numext::log(u1));
740 #if defined(TENSORFLOW_USE_SYCL) || !defined(__linux__)
741 *d0 = Eigen::numext::sin(v1);
742 *d1 = Eigen::numext::cos(v1);
743 #else
744 sincos(v1, d0, d1);
745 #endif
746 *d0 *= u2;
747 *d1 *= u2;
748 }
749
750 // Helper function to convert an 16-bit integer to a half between [0..1).
Uint16ToHalf(uint16 x)751 PHILOX_DEVICE_INLINE Eigen::half Uint16ToHalf(uint16 x) {
752 // IEEE754 halfs are formatted as follows (MSB first):
753 // sign(1) exponent(5) mantissa(10)
754 // Conceptually construct the following:
755 // sign == 0
756 // exponent == 15 -- an excess 15 representation of a zero exponent
757 // mantissa == 10 random bits
758 const uint16 man = x & 0x3ffu; // 10 bit mantissa
759 const uint16 exp = static_cast<uint16>(15);
760 const uint16 val = (exp << 10) | man;
761
762 Eigen::half result;
763 result.x = val;
764 return result - Eigen::half(1.0);
765 }
766
767 // Helper function to convert an 16-bit integer to a bfloat16 between [0..1).
768 // This can create a uniform distribution of values between [0..1).
Uint16ToGfloat16(uint16 x)769 PHILOX_DEVICE_INLINE bfloat16 Uint16ToGfloat16(uint16 x) {
770 // bfloat are formatted as follows (MSB first):
771 // sign(1) exponent(8) mantissa(7)
772 // Conceptually construct the following:
773 // sign == 0
774 // exponent == 127 -- an excess 127 representation of a zero exponent
775 // mantissa == 7 random bits
776 const uint16 man = x & 0x7fu; // 7 bit mantissa
777 const uint16 exp = static_cast<uint16>(127);
778 const uint16 val = (exp << 7) | man;
779
780 bfloat16 result;
781 memcpy(&result, &val, sizeof(val));
782 // The mantissa has an implicit leading 1, so the above code creates a value
783 // in [1, 2). The minus will not cause a rounding that makes the result 1.
784 // Instead it will just be close to 1.
785 return result - bfloat16(1.0);
786 }
787
788 // Helper function to convert an 32-bit integer to a float between [0..1).
Uint32ToFloat(uint32 x)789 PHILOX_DEVICE_INLINE float Uint32ToFloat(uint32 x) {
790 // IEEE754 floats are formatted as follows (MSB first):
791 // sign(1) exponent(8) mantissa(23)
792 // Conceptually construct the following:
793 // sign == 0
794 // exponent == 127 -- an excess 127 representation of a zero exponent
795 // mantissa == 23 random bits
796 const uint32 man = x & 0x7fffffu; // 23 bit mantissa
797 const uint32 exp = static_cast<uint32>(127);
798 const uint32 val = (exp << 23) | man;
799
800 // Assumes that endian-ness is same for float and uint32.
801 float result;
802 memcpy(&result, &val, sizeof(val));
803 return result - 1.0f;
804 }
805
806 // Helper function to convert two 32-bit integers to a double between [0..1).
Uint64ToDouble(uint32 x0,uint32 x1)807 PHILOX_DEVICE_INLINE double Uint64ToDouble(uint32 x0, uint32 x1) {
808 // IEEE754 doubles are formatted as follows (MSB first):
809 // sign(1) exponent(11) mantissa(52)
810 // Conceptually construct the following:
811 // sign == 0
812 // exponent == 1023 -- an excess 1023 representation of a zero exponent
813 // mantissa == 52 random bits
814 const uint32 mhi = x0 & 0xfffffu; // upper 20 bits of mantissa
815 const uint32 mlo = x1; // lower 32 bits of mantissa
816 const uint64 man = (static_cast<uint64>(mhi) << 32) | mlo; // mantissa
817 const uint64 exp = static_cast<uint64>(1023);
818 const uint64 val = (exp << 52) | man;
819 // Assumes that endian-ness is same for double and uint64.
820 double result;
821 memcpy(&result, &val, sizeof(val));
822 return result - 1.0;
823 }
824
825 } // namespace random
826 } // namespace tensorflow
827
828 #endif // TENSORFLOW_CORE_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_
829