• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_CORE_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_
17 #define TENSORFLOW_CORE_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_
18 
19 #include <string.h>
20 
21 #include <cmath>
22 
23 #include <algorithm>
24 #include <type_traits>
25 
26 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
27 #include "tensorflow/core/lib/bfloat16/bfloat16.h"
28 #include "tensorflow/core/lib/random/philox_random.h"
29 
30 namespace tensorflow {
31 namespace random {
32 
33 // Helper function to convert a 16-bit integer to a half between [0..1).
34 PHILOX_DEVICE_INLINE Eigen::half Uint16ToHalf(uint16 x);
35 // Helper function to convert a 16-bit integer to a bfloat16 between [0..1).
36 PHILOX_DEVICE_INLINE bfloat16 Uint16ToGfloat16(uint16 x);
37 // Helper function to convert a 32-bit integer to a float between [0..1).
38 PHILOX_DEVICE_INLINE float Uint32ToFloat(uint32 x);
39 // Helper function to convert two 32-bit integers to a double between [0..1).
40 PHILOX_DEVICE_INLINE double Uint64ToDouble(uint32 x0, uint32 x1);
41 
42 // Computes a + b. Requires that the result is representable in the destination
43 // type and that b is not maximal (i.e. b + 1 is not 0). Notably, the addend b
44 // need *not* be representable in that type. (The condition on b excludes the
45 // extremal case INT_MIN + UINT_MAX = INT_MAX, which this function cannot
46 // compute.)
47 template <typename Int>
SignedAdd(Int a,typename std::make_unsigned<Int>::type b)48 PHILOX_DEVICE_INLINE Int SignedAdd(Int a,
49                                    typename std::make_unsigned<Int>::type b) {
50   // Implementation note: both b_div_2 and b - b_div_2 are positive and
51   // representatble as Int.
52   auto b_div_2 = b >> 1;
53   return a + static_cast<Int>(b_div_2) + static_cast<Int>(b - b_div_2);
54 }
55 
56 // A class that generates uniform distribution random numbers from the
57 // underlying random integer generator.
58 // Arguments:
59 //   Generator: a generator type that returns a number of uint32 upon each
60 //              invocation. It needs to define kResultElementCount for the
61 //              sample count for each invocation, and ResultType for the
62 //              actual returned sample type.
63 //   RealType: the data type of the real numbers that will be returned by the
64 //             distribution. This could be either float or double for now.
65 // This class is meant to be implemented through specialization. The default
66 // is not defined by design.
67 template <class Generator, typename RealType>
68 class UniformDistribution;
69 
70 template <class Generator>
71 class UniformDistribution<Generator, Eigen::half> {
72  public:
73   // The number of elements that will be returned.
74   static const int kResultElementCount = Generator::kResultElementCount;
75   // Cost of generation of a single element (in cycles).
76   static const int kElementCost = 3;
77   // Indicate that this distribution may take variable number of samples
78   // during the runtime.
79   static const bool kVariableSamplesPerOutput = false;
80   typedef Array<Eigen::half, kResultElementCount> ResultType;
81   typedef Eigen::half ResultElementType;
82 
83   PHILOX_DEVICE_INLINE
operator()84   ResultType operator()(Generator* gen) {
85     typename Generator::ResultType sample = (*gen)();
86     ResultType result;
87     for (int i = 0; i < kResultElementCount; ++i) {
88       result[i] = Uint16ToHalf(sample[i]);  // Truncate the upper 16 bits.
89     }
90     return result;
91   }
92 };
93 
94 template <class Generator>
95 class UniformDistribution<Generator, bfloat16> {
96  public:
97   // The number of elements that will be returned.
98   static const int kResultElementCount = Generator::kResultElementCount;
99   // Cost of generation of a single element (in cycles).
100   static const int kElementCost = 3;
101   // Indicate that this distribution may take variable number of samples
102   // during the runtime.
103   static const bool kVariableSamplesPerOutput = false;
104   typedef Array<bfloat16, kResultElementCount> ResultType;
105   typedef bfloat16 ResultElementType;
106 
107   PHILOX_DEVICE_INLINE
operator()108   ResultType operator()(Generator* gen) {
109     typename Generator::ResultType sample = (*gen)();
110     ResultType result;
111     for (int i = 0; i < kResultElementCount; ++i) {
112       result[i] = Uint16ToGfloat16(sample[i]);
113     }
114     return result;
115   }
116 };
117 
118 template <class Generator>
119 class UniformDistribution<Generator, float> {
120  public:
121   // The number of elements that will be returned.
122   static const int kResultElementCount = Generator::kResultElementCount;
123   // Cost of generation of a single element (in cycles).
124   static const int kElementCost = 3;
125   // Indicate that this distribution may take variable number of samples
126   // during the runtime.
127   static const bool kVariableSamplesPerOutput = false;
128   typedef Array<float, kResultElementCount> ResultType;
129   typedef float ResultElementType;
130 
131   PHILOX_DEVICE_INLINE
operator()132   ResultType operator()(Generator* gen) {
133     typename Generator::ResultType sample = (*gen)();
134     ResultType result;
135     for (int i = 0; i < kResultElementCount; ++i) {
136       result[i] = Uint32ToFloat(sample[i]);
137     }
138     return result;
139   }
140 };
141 
142 template <class Generator>
143 class UniformDistribution<Generator, double> {
144  public:
145   // The number of elements that will be returned.
146   static const int kResultElementCount = Generator::kResultElementCount / 2;
147   // Cost of generation of a single element (in cycles).
148   static const int kElementCost = 3;
149   // Indicate that this distribution may take variable number of samples
150   // during the runtime.
151   static const bool kVariableSamplesPerOutput = false;
152   typedef Array<double, kResultElementCount> ResultType;
153   typedef double ResultElementType;
154 
155   PHILOX_DEVICE_INLINE
operator()156   ResultType operator()(Generator* gen) {
157     typename Generator::ResultType sample = (*gen)();
158     ResultType result;
159     for (int i = 0; i < kResultElementCount; ++i) {
160       result[i] = Uint64ToDouble(sample[2 * i], sample[2 * i + 1]);
161     }
162     return result;
163   }
164 };
165 
166 template <class Generator>
167 class UniformDistribution<Generator, int32> {
168  public:
169   // The number of elements that will be returned.
170   static const int kResultElementCount = Generator::kResultElementCount;
171   // Cost of generation of a single element (in cycles).
172   static const int kElementCost = 3;
173   // Indicate that this distribution may take variable number of samples
174   // during the runtime.
175   static const bool kVariableSamplesPerOutput = false;
176   typedef Array<int32, kResultElementCount> ResultType;
177   typedef int32 ResultElementType;
178 
179   // Must have lo < hi
UniformDistribution(int32 lo,int32 hi)180   UniformDistribution(int32 lo, int32 hi)
181       : lo_(lo), range_(static_cast<uint32>(hi) - static_cast<uint32>(lo)) {}
182 
183   PHILOX_DEVICE_INLINE
operator()184   ResultType operator()(Generator* gen) {
185     typename Generator::ResultType sample = (*gen)();
186     ResultType result;
187     for (int i = 0; i < kResultElementCount; ++i) {
188       result[i] = SignedAdd(lo_, sample[i] % range_);
189     }
190     return result;
191   }
192 
193  private:
194   // Note that lo_ is intentionally signed while range_ is intentionally
195   // unsigned.  This is because hi - lo can overflow signed integers if
196   // lo < 0 < hi, but always fits in unsigned.
197   int32 lo_;
198   uint32 range_;
199 };
200 
201 template <class Generator>
202 class UniformDistribution<Generator, int64> {
203  public:
204   // The number of elements that will be returned.
205   static const int kResultElementCount = Generator::kResultElementCount / 2;
206   // Cost of generation of a single element (in cycles).
207   static const int kElementCost = 3;
208   // Indicate that this distribution may take variable number of samples
209   // during the runtime.
210   static const bool kVariableSamplesPerOutput = false;
211   typedef Array<int64, kResultElementCount> ResultType;
212   typedef int64 ResultElementType;
213 
214   // Must have lo < hi
UniformDistribution(int64 lo,int64 hi)215   UniformDistribution(int64 lo, int64 hi)
216       : lo_(lo), range_(static_cast<uint64>(hi) - static_cast<uint64>(lo)) {}
217 
218   PHILOX_DEVICE_INLINE
operator()219   ResultType operator()(Generator* gen) {
220     typename Generator::ResultType sample = (*gen)();
221     ResultType result;
222     for (int i = 0; i < kResultElementCount; ++i) {
223       auto bits = sample[2 * i] | static_cast<uint64>(sample[2 * i + 1]) << 32;
224       result[i] = SignedAdd(lo_, bits % range_);
225     }
226     return result;
227   }
228 
229  private:
230   // Note that lo_ is intentionally signed while range_ is intentionally
231   // unsigned.  This is because hi - lo can overflow signed integers if
232   // lo < 0 < hi, but always fits in unsigned.
233   int64 lo_;
234   uint64 range_;
235 };
236 
237 // Similar to `UniformDistribution`, except that instead of generating numbers
238 // in the range [low, high), it generates numbers covering the whole range of
239 // the integer type.
240 template <typename Generator, typename IntType>
241 class UniformFullIntDistribution;
242 
243 template <typename Generator, typename IntType>
244 class UniformFullIntDistribution32 {
245  public:
246   // The number of elements that will be returned.
247   static const int kResultElementCount = Generator::kResultElementCount;
248   // Cost of generation of a single element (in cycles).
249   static const int kElementCost = 3;
250   // Indicate that this distribution may take variable number of samples
251   // during the runtime.
252   static const bool kVariableSamplesPerOutput = false;
253   typedef Array<IntType, kResultElementCount> ResultType;
254   typedef IntType ResultElementType;
255 
256   PHILOX_DEVICE_INLINE
operator()257   ResultType operator()(Generator* gen) {
258     typename Generator::ResultType sample = (*gen)();
259     ResultType result;
260     for (int i = 0; i < kResultElementCount; ++i) {
261       result[i] = sample[i];
262     }
263     return result;
264   }
265 };
266 
267 template <typename Generator, typename IntType>
268 class UniformFullIntDistribution64 {
269  public:
270   // The number of elements that will be returned.
271   static const int kResultElementCount = Generator::kResultElementCount / 2;
272   // Cost of generation of a single element (in cycles).
273   static const int kElementCost = 3;
274   // Indicate that this distribution may take variable number of samples
275   // during the runtime.
276   static const bool kVariableSamplesPerOutput = false;
277   typedef Array<IntType, kResultElementCount> ResultType;
278   typedef IntType ResultElementType;
279 
280   PHILOX_DEVICE_INLINE
operator()281   ResultType operator()(Generator* gen) {
282     typename Generator::ResultType sample = (*gen)();
283     ResultType result;
284     for (int i = 0; i < kResultElementCount; ++i) {
285       result[i] = sample[2 * i] | static_cast<uint64>(sample[2 * i + 1]) << 32;
286     }
287     return result;
288   }
289 };
290 
291 template <typename Generator>
292 class UniformFullIntDistribution<Generator, int32>
293     : public UniformFullIntDistribution32<Generator, int32> {};
294 template <typename Generator>
295 class UniformFullIntDistribution<Generator, uint32>
296     : public UniformFullIntDistribution32<Generator, uint32> {};
297 template <typename Generator>
298 class UniformFullIntDistribution<Generator, int64>
299     : public UniformFullIntDistribution64<Generator, int64> {};
300 template <typename Generator>
301 class UniformFullIntDistribution<Generator, uint64>
302     : public UniformFullIntDistribution64<Generator, uint64> {};
303 
304 // A class that adapts the underlying native multiple samples to return a single
305 // sample at a time.
306 template <class Generator>
307 class SingleSampleAdapter {
308  public:
309   // The number of elements that will be returned.
310   static const int kResultElementCount = 1;
311   // The number of elements that will be returned by the underlying generator.
312   static const int kNativeElementCount = Generator::kResultElementCount;
313   typedef typename Generator::ResultElementType ResultType;
314   typedef typename Generator::ResultElementType ResultElementType;
315 
316   PHILOX_DEVICE_INLINE
SingleSampleAdapter(Generator * gen)317   explicit SingleSampleAdapter(Generator* gen)
318       : generator_(gen), used_result_index_(Generator::kResultElementCount) {}
319 
320   PHILOX_DEVICE_INLINE
operator()321   ResultType operator()() {
322     if (used_result_index_ == Generator::kResultElementCount) {
323       unused_results_ = (*generator_)();
324       used_result_index_ = 0;
325     }
326 
327     return unused_results_[used_result_index_++];
328   }
329 
330   PHILOX_DEVICE_INLINE
Skip(uint64 num_skips)331   void Skip(uint64 num_skips) {
332     if (!num_skips) {
333       return;
334     }
335     int num_unused_results = kNativeElementCount - used_result_index_;
336     if (num_skips <= num_unused_results) {
337       used_result_index_ += num_skips;
338       return;
339     }
340     num_skips -= num_unused_results;
341     used_result_index_ = kNativeElementCount;
342     SkipFromGenerator(num_skips / kNativeElementCount);
343     num_skips = num_skips % kNativeElementCount;
344     if (num_skips) {
345       unused_results_ = (*generator_)();
346       used_result_index_ = num_skips;
347     }
348   }
349 
350  private:
351   // This implementation iteratively skips over `num_skips` samples
352   // from `generator_`. There is an O(1) implementation for PhiloxRandom
353   // in random_distributions.cc.
354   PHILOX_DEVICE_INLINE
SkipFromGenerator(uint64 num_skips)355   void SkipFromGenerator(uint64 num_skips) {
356     while (num_skips--) {
357       (*generator_)();
358     }
359   }
360 
361   Generator* generator_;
362   typename Generator::ResultType unused_results_;
363   int used_result_index_;
364 };
365 
366 // A class that generates unit normal distribution random numbers from the
367 // underlying random integer generator.
368 // Arguments:
369 //   Generator: a generator type that returns a number of uint32 upon each
370 //              each invocation. It needs to define kResultElementCount for the
371 //              sample count for each invocation, and ResultType for actual
372 //              returned sample type.
373 //   RealType: the data type of the real numbers that will be returned by the
374 //             distribution. This could be either float or double for now.
375 // This class is meant to be implemented through specialization. The default
376 // is not defined by design.
377 template <class Generator, typename RealType>
378 class NormalDistribution;
379 
380 PHILOX_DEVICE_INLINE
381 void BoxMullerFloat(uint32 x0, uint32 x1, float* f0, float* f1);
382 
383 PHILOX_DEVICE_INLINE
384 void BoxMullerDouble(uint32 x0, uint32 x1, uint32 x2, uint32 x3, double* d0,
385                      double* d1);
386 
387 // Exactly like the float version, except that we convert to half afterwards;
388 // since we don't have half-precision sin/cos even on GPUs, there's nothing to
389 // gain from working in half internally.
390 template <class Generator>
391 class NormalDistribution<Generator, Eigen::half> {
392  public:
393   // The number of elements that will be returned.
394   static const int kResultElementCount = Generator::kResultElementCount;
395   // Cost of generation of a single element (in cycles).
396   static const int kElementCost = 70;
397   // Indicate that this distribution may take variable number of samples
398   // during the runtime.
399   static const bool kVariableSamplesPerOutput = false;
400   typedef Array<Eigen::half, kResultElementCount> ResultType;
401   typedef Eigen::half ResultElementType;
402 
403   PHILOX_DEVICE_INLINE
operator()404   ResultType operator()(Generator* gen) {
405     typename Generator::ResultType sample = (*gen)();
406     ResultType result;
407     for (int i = 0; i < kResultElementCount; i += 2) {
408       float f[2];
409       BoxMullerFloat(sample[i], sample[i + 1], &f[0], &f[1]);
410       result[i] = Eigen::half(f[0]);
411       result[i + 1] = Eigen::half(f[1]);
412     }
413     return result;
414   }
415 };
416 
417 template <class Generator>
418 class NormalDistribution<Generator, bfloat16> {
419  public:
420   // The number of elements that will be returned.
421   static const int kResultElementCount = Generator::kResultElementCount;
422   // Cost of generation of a single element (in cycles).
423   static const int kElementCost = 70;
424   // Indicate that this distribution may take variable number of samples
425   // during the runtime.
426   static const bool kVariableSamplesPerOutput = false;
427   typedef Array<bfloat16, kResultElementCount> ResultType;
428   typedef bfloat16 ResultElementType;
429 
430   PHILOX_DEVICE_INLINE
operator()431   ResultType operator()(Generator* gen) {
432     typename Generator::ResultType sample = (*gen)();
433     ResultType result;
434     static_assert(kResultElementCount % 2 == 0,
435                   "kResultElementCount should be an even number");
436     for (int i = 0; i < kResultElementCount; i += 2) {
437       float f[2];
438       // Box-Muller transform requires processing 2 elements at a time.
439       BoxMullerFloat(sample[i], sample[i + 1], &f[0], &f[1]);
440       result[i] = bfloat16(f[0]);
441       result[i + 1] = bfloat16(f[1]);
442     }
443     return result;
444   }
445 };
446 
447 template <class Generator>
448 class NormalDistribution<Generator, float> {
449  public:
450   // The number of elements that will be returned.
451   static const int kResultElementCount = Generator::kResultElementCount;
452   // Cost of generation of a single element (in cycles).
453   static const int kElementCost = 70;
454   // Indicate that this distribution may take variable number of samples
455   // during the runtime.
456   static const bool kVariableSamplesPerOutput = false;
457   typedef Array<float, kResultElementCount> ResultType;
458   typedef float ResultElementType;
459 
460   PHILOX_DEVICE_INLINE
operator()461   ResultType operator()(Generator* gen) {
462     typename Generator::ResultType sample = (*gen)();
463     ResultType result;
464     for (int i = 0; i < kResultElementCount; i += 2) {
465       BoxMullerFloat(sample[i], sample[i + 1], &result[i], &result[i + 1]);
466     }
467     return result;
468   }
469 };
470 
471 template <class Generator>
472 class NormalDistribution<Generator, double> {
473  public:
474   // The number of elements that will be returned.
475   static const int kResultElementCount = Generator::kResultElementCount / 2;
476   // Cost of generation of a single element (in cycles).
477   static const int kElementCost = 70;
478   // Indicate that this distribution may take variable number of samples
479   // during the runtime.
480   static const bool kVariableSamplesPerOutput = false;
481   typedef Array<double, kResultElementCount> ResultType;
482   typedef double ResultElementType;
483 
484   PHILOX_DEVICE_INLINE
operator()485   ResultType operator()(Generator* gen) {
486     typename Generator::ResultType sample = (*gen)();
487     ResultType result;
488     for (int i = 0; i < kResultElementCount; i += 2) {
489       const int i2 = 2 * i;
490       BoxMullerDouble(sample[i2], sample[i2 + 1], sample[i2 + 2],
491                       sample[i2 + 3], &result[i], &result[i + 1]);
492     }
493     return result;
494   }
495 };
496 
497 // A class that returns standard normal distribution between
498 // [-kTruncateValue, kTruncateValue].
499 // Arguments:
500 //   Generator: a generator type that returns a number of uint32 upon each
501 //              each invocation. It needs to define kResultElementCount for the
502 //              sample count for each invocation, and ResultType for actual
503 //              returned sample type.
504 //   RealType: the data type of the real numbers that will be returned by the
505 //             distribution. This could be either float or double for now.
506 // This class is meant to be implemented through specialization. The default
507 // is not defined by design.
508 template <class SingleSampleGenerator, typename RealType>
509 class TruncatedNormalDistribution;
510 
511 // Exactly like the float version, except that we convert to half afterwards;
512 // since we don't have half-precision sin/cos even on GPUs, there's nothing to
513 // gain from working in half internally.
514 template <class SingleSampleGenerator>
515 class TruncatedNormalDistribution<SingleSampleGenerator, Eigen::half> {
516  public:
517   // The number of elements that will be returned.
518   static const int kResultElementCount =
519       SingleSampleGenerator::kNativeElementCount;
520   // Cost of generation of a single element (in cycles).
521   static const int kElementCost = 90;
522   // Indicate that this distribution may take variable number of samples
523   // during the runtime.
524   static const bool kVariableSamplesPerOutput = true;
525   // The threshold where the normal distribution is truncated.
526   const float kTruncateValue = 2.0f;
527 
528   typedef Array<Eigen::half, kResultElementCount> ResultType;
529   typedef Eigen::half ResultElementType;
530 
531   PHILOX_DEVICE_INLINE
operator()532   ResultType operator()(SingleSampleGenerator* gen) {
533     ResultType results;
534     int index = 0;
535     while (true) {
536       // Repeatedly take samples from the normal distribution, until we have
537       // the desired number of elements that fall within the pre-defined cutoff
538       // threshold.
539       const uint32 x0 = (*gen)();
540       const uint32 x1 = (*gen)();
541       float f[2];
542       BoxMullerFloat(x0, x1, &f[0], &f[1]);
543 
544       if (Eigen::numext::abs(f[0]) < kTruncateValue) {
545         results[index++] = Eigen::half(f[0]);
546         if (index >= kResultElementCount) {
547           return results;
548         }
549       }
550       if (Eigen::numext::abs(f[1]) < kTruncateValue) {
551         results[index++] = Eigen::half(f[1]);
552         if (index >= kResultElementCount) {
553           return results;
554         }
555       }
556     }
557   }
558 };
559 
560 template <class SingleSampleGenerator>
561 class TruncatedNormalDistribution<SingleSampleGenerator, bfloat16> {
562  public:
563   // The number of elements that will be returned.
564   static const int kResultElementCount =
565       SingleSampleGenerator::kNativeElementCount;
566   // Cost of generation of a single element (in cycles).
567   static const int kElementCost = 90;
568   // Indicate that this distribution may take variable number of samples
569   // during the runtime.
570   static const bool kVariableSamplesPerOutput = true;
571   // The threshold where the normal distribution is truncated.
572   const float kTruncateValue = 2.0f;
573 
574   typedef Array<bfloat16, kResultElementCount> ResultType;
575   typedef bfloat16 ResultElementType;
576 
577   PHILOX_DEVICE_INLINE
operator()578   ResultType operator()(SingleSampleGenerator* gen) {
579     ResultType results;
580     int index = 0;
581     while (true) {
582       // Repeatedly take samples from the normal distribution, until we have
583       // the desired number of elements that fall within the pre-defined cutoff
584       // threshold.
585       const uint32 x0 = (*gen)();
586       const uint32 x1 = (*gen)();
587       float f[2];
588       BoxMullerFloat(x0, x1, &f[0], &f[1]);
589 
590       if (Eigen::numext::abs(f[0]) < kTruncateValue) {
591         results[index++] = bfloat16(f[0]);
592         if (index >= kResultElementCount) {
593           return results;
594         }
595       }
596       if (Eigen::numext::abs(f[1]) < kTruncateValue) {
597         results[index++] = bfloat16(f[1]);
598         if (index >= kResultElementCount) {
599           return results;
600         }
601       }
602     }
603   }
604 };
605 
606 // Partial specialization for float.
607 template <class SingleSampleGenerator>
608 class TruncatedNormalDistribution<SingleSampleGenerator, float> {
609  public:
610   // The number of elements that will be returned.
611   static const int kResultElementCount =
612       SingleSampleGenerator::kNativeElementCount;
613   // Cost of generation of a single element (in cycles).
614   static const int kElementCost = 90;
615   // Indicate that this distribution may take variable number of samples
616   // during the runtime.
617   static const bool kVariableSamplesPerOutput = true;
618   // The threshold where the normal distribution is truncated.
619   const float kTruncateValue = 2.0f;
620 
621   typedef Array<float, kResultElementCount> ResultType;
622   typedef float ResultElementType;
623 
624   PHILOX_DEVICE_INLINE
operator()625   ResultType operator()(SingleSampleGenerator* gen) {
626     ResultType results;
627     int index = 0;
628     while (true) {
629       // Repeatedly take samples from the normal distribution, until we have
630       // the desired number of elements that fall within the pre-defined cutoff
631       // threshold.
632       const uint32 x0 = (*gen)();
633       const uint32 x1 = (*gen)();
634       float f[2];
635       BoxMullerFloat(x0, x1, &f[0], &f[1]);
636 
637       if (Eigen::numext::abs(f[0]) < kTruncateValue) {
638         results[index++] = f[0];
639         if (index >= kResultElementCount) {
640           return results;
641         }
642       }
643       if (Eigen::numext::abs(f[1]) < kTruncateValue) {
644         results[index++] = f[1];
645         if (index >= kResultElementCount) {
646           return results;
647         }
648       }
649     }
650   }
651 };
652 
653 // Partial specialization for double.
654 template <class SingleSampleGenerator>
655 class TruncatedNormalDistribution<SingleSampleGenerator, double> {
656  public:
657   // The number of elements that will be returned.
658   static const int kResultElementCount =
659       (SingleSampleGenerator::kNativeElementCount > 1)
660           ? SingleSampleGenerator::kNativeElementCount / 2
661           : 1;
662   // Cost of generation of a single element (in cycles).
663   static const int kElementCost = 90;
664   // Indicate that this distribution may take variable number of samples
665   // during the runtime.
666   static const bool kVariableSamplesPerOutput = true;
667   typedef Array<double, kResultElementCount> ResultType;
668   typedef double ResultElementType;
669   const double kTruncateValue = 2.0;
670 
671   PHILOX_DEVICE_INLINE
operator()672   ResultType operator()(SingleSampleGenerator* gen) {
673     ResultType results;
674     int index = 0;
675     while (1) {
676       const uint32 x0 = (*gen)();
677       const uint32 x1 = (*gen)();
678       const uint32 x2 = (*gen)();
679       const uint32 x3 = (*gen)();
680       double d[2];
681       BoxMullerDouble(x0, x1, x2, x3, &d[0], &d[1]);
682 
683       if (Eigen::numext::abs(d[0]) < kTruncateValue) {
684         results[index++] = d[0];
685         if (index >= kResultElementCount) {
686           return results;
687         }
688       }
689       if (Eigen::numext::abs(d[1]) < kTruncateValue) {
690         results[index++] = d[1];
691         if (index >= kResultElementCount) {
692           return results;
693         }
694       }
695     }
696   }
697 };
698 
699 // Helper function to convert two 32-bit uniform integers to two floats
700 // under the unit normal distribution.
701 PHILOX_DEVICE_INLINE
BoxMullerFloat(uint32 x0,uint32 x1,float * f0,float * f1)702 void BoxMullerFloat(uint32 x0, uint32 x1, float* f0, float* f1) {
703   // This function implements the Box-Muller transform:
704   // http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform#Basic_form
705   // Do not send a really small number to log().
706   // We cannot mark "epsilon" as "static const" because NVCC would complain
707   const float epsilon = 1.0e-7f;
708   float u1 = Uint32ToFloat(x0);
709   if (u1 < epsilon) {
710     u1 = epsilon;
711   }
712   const float v1 = 2.0f * M_PI * Uint32ToFloat(x1);
713   const float u2 = Eigen::numext::sqrt(-2.0f * Eigen::numext::log(u1));
714 #if defined(TENSORFLOW_USE_SYCL) || !defined(__linux__)
715   *f0 = Eigen::numext::sin(v1);
716   *f1 = Eigen::numext::cos(v1);
717 #else
718   sincosf(v1, f0, f1);
719 #endif
720   *f0 *= u2;
721   *f1 *= u2;
722 }
723 
724 // Helper function to convert four 32-bit uniform integers to two doubles
725 // under the unit normal distribution.
726 PHILOX_DEVICE_INLINE
BoxMullerDouble(uint32 x0,uint32 x1,uint32 x2,uint32 x3,double * d0,double * d1)727 void BoxMullerDouble(uint32 x0, uint32 x1, uint32 x2, uint32 x3, double* d0,
728                      double* d1) {
729   // This function implements the Box-Muller transform:
730   // http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform#Basic_form
731   // Do not send a really small number to log().
732   // We cannot mark "epsilon" as "static const" because NVCC would complain
733   const double epsilon = 1.0e-7;
734   double u1 = Uint64ToDouble(x0, x1);
735   if (u1 < epsilon) {
736     u1 = epsilon;
737   }
738   const double v1 = 2 * M_PI * Uint64ToDouble(x2, x3);
739   const double u2 = Eigen::numext::sqrt(-2.0 * Eigen::numext::log(u1));
740 #if defined(TENSORFLOW_USE_SYCL) || !defined(__linux__)
741   *d0 = Eigen::numext::sin(v1);
742   *d1 = Eigen::numext::cos(v1);
743 #else
744   sincos(v1, d0, d1);
745 #endif
746   *d0 *= u2;
747   *d1 *= u2;
748 }
749 
750 // Helper function to convert an 16-bit integer to a half between [0..1).
Uint16ToHalf(uint16 x)751 PHILOX_DEVICE_INLINE Eigen::half Uint16ToHalf(uint16 x) {
752   // IEEE754 halfs are formatted as follows (MSB first):
753   //    sign(1) exponent(5) mantissa(10)
754   // Conceptually construct the following:
755   //    sign == 0
756   //    exponent == 15  -- an excess 15 representation of a zero exponent
757   //    mantissa == 10 random bits
758   const uint16 man = x & 0x3ffu;  // 10 bit mantissa
759   const uint16 exp = static_cast<uint16>(15);
760   const uint16 val = (exp << 10) | man;
761 
762   Eigen::half result;
763   result.x = val;
764   return result - Eigen::half(1.0);
765 }
766 
767 // Helper function to convert an 16-bit integer to a bfloat16 between [0..1).
768 // This can create a uniform distribution of values between [0..1).
Uint16ToGfloat16(uint16 x)769 PHILOX_DEVICE_INLINE bfloat16 Uint16ToGfloat16(uint16 x) {
770   // bfloat are formatted as follows (MSB first):
771   //    sign(1) exponent(8) mantissa(7)
772   // Conceptually construct the following:
773   //    sign == 0
774   //    exponent == 127  -- an excess 127 representation of a zero exponent
775   //    mantissa == 7 random bits
776   const uint16 man = x & 0x7fu;  // 7 bit mantissa
777   const uint16 exp = static_cast<uint16>(127);
778   const uint16 val = (exp << 7) | man;
779 
780   bfloat16 result;
781   memcpy(&result, &val, sizeof(val));
782   // The mantissa has an implicit leading 1, so the above code creates a value
783   // in [1, 2). The minus will not cause a rounding that makes the result 1.
784   // Instead it will just be close to 1.
785   return result - bfloat16(1.0);
786 }
787 
788 // Helper function to convert an 32-bit integer to a float between [0..1).
Uint32ToFloat(uint32 x)789 PHILOX_DEVICE_INLINE float Uint32ToFloat(uint32 x) {
790   // IEEE754 floats are formatted as follows (MSB first):
791   //    sign(1) exponent(8) mantissa(23)
792   // Conceptually construct the following:
793   //    sign == 0
794   //    exponent == 127  -- an excess 127 representation of a zero exponent
795   //    mantissa == 23 random bits
796   const uint32 man = x & 0x7fffffu;  // 23 bit mantissa
797   const uint32 exp = static_cast<uint32>(127);
798   const uint32 val = (exp << 23) | man;
799 
800   // Assumes that endian-ness is same for float and uint32.
801   float result;
802   memcpy(&result, &val, sizeof(val));
803   return result - 1.0f;
804 }
805 
806 // Helper function to convert two 32-bit integers to a double between [0..1).
Uint64ToDouble(uint32 x0,uint32 x1)807 PHILOX_DEVICE_INLINE double Uint64ToDouble(uint32 x0, uint32 x1) {
808   // IEEE754 doubles are formatted as follows (MSB first):
809   //    sign(1) exponent(11) mantissa(52)
810   // Conceptually construct the following:
811   //    sign == 0
812   //    exponent == 1023  -- an excess 1023 representation of a zero exponent
813   //    mantissa == 52 random bits
814   const uint32 mhi = x0 & 0xfffffu;  // upper 20 bits of mantissa
815   const uint32 mlo = x1;             // lower 32 bits of mantissa
816   const uint64 man = (static_cast<uint64>(mhi) << 32) | mlo;  // mantissa
817   const uint64 exp = static_cast<uint64>(1023);
818   const uint64 val = (exp << 52) | man;
819   // Assumes that endian-ness is same for double and uint64.
820   double result;
821   memcpy(&result, &val, sizeof(val));
822   return result - 1.0;
823 }
824 
825 }  // namespace random
826 }  // namespace tensorflow
827 
828 #endif  // TENSORFLOW_CORE_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_
829