1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 // Implement the Philox algorithm to generate random numbers in parallel. 17 // Salmon et al. SC 2011. Parallel random numbers: as easy as 1, 2, 3. 18 // http://www.thesalmons.org/john/random123/papers/random123sc11.pdf 19 20 #ifndef TENSORFLOW_CORE_LIB_RANDOM_PHILOX_RANDOM_H_ 21 #define TENSORFLOW_CORE_LIB_RANDOM_PHILOX_RANDOM_H_ 22 23 #include <stdlib.h> 24 25 #include "tensorflow/core/platform/types.h" 26 27 // Function qualifiers that need to work on both CPU and GPU. 28 #if defined(__CUDACC__) || defined(__HIPCC__) 29 // For nvcc. 30 #define PHILOX_DEVICE_FUNC __host__ __device__ 31 #define PHILOX_INLINE __inline__ 32 #else 33 // For non-nvcc. 34 #define PHILOX_DEVICE_FUNC 35 #define PHILOX_INLINE inline 36 #endif 37 #define PHILOX_DEVICE_INLINE PHILOX_DEVICE_FUNC PHILOX_INLINE 38 39 #include <math.h> 40 41 namespace tensorflow { 42 namespace random { 43 44 // A class that represents an inline array. It can be used on both CPU and GPU, 45 // and also trivially copyable between CPU and GPU. 46 // Arguments: 47 // T: the array element type; 48 // ElementCount: the fixed size of the array; 49 template <typename T, int ElementCount> 50 class Array { 51 public: 52 static constexpr int kElementCount = ElementCount; Array()53 PHILOX_DEVICE_INLINE Array() { 54 for (int i = 0; i < ElementCount; ++i) { 55 data_[i] = T(0); 56 } 57 } 58 59 PHILOX_DEVICE_INLINE const T& operator[](int index) const { 60 return data_[index]; 61 } 62 63 PHILOX_DEVICE_INLINE T& operator[](int index) { return data_[index]; } 64 size()65 size_t size() const { return ElementCount; } 66 67 private: 68 T data_[ElementCount]; 69 }; 70 71 // A class that encapsulates all the states for a random number generator using 72 // the philox_4x32_10 algorithm. Each invocation returns a 128-bit random bits 73 // in the form of four uint32. 74 // There are multiple variants of this algorithm, we picked the 4x32_10 version 75 // that is most suited for our applications. 76 // Since this class is meant to be copied between CPU to GPU, it maintains a 77 // value semantics. 78 // 79 // For example: To use this class and populate an array of 1024 randoms on CPU 80 // with two threads, 81 // 82 // void Fill(PhiloxRandom rnd, uint32* output, int start, int limit) { 83 // assert(start % 4 == 0); 84 // assert(limit % 4 == 0); 85 // rnd.Skip(start / 4); 86 // for (int i = start; i < limit; i += 4) { 87 // auto sample = rnd(); 88 // ... copy sample[0..3] to output[i..i+3] 89 // } 90 // } 91 // 92 // PhiloxRandom rng(seed); 93 // PhiloxRandom rng_copy = rng; 94 // rng.Skip(1000/4); 95 // 96 // ... schedule Fill(rng_copy, output, 0, 512) in thread 1; 97 // ... schedule Fill(rng_copy, output, 512, 1024) in thread 2; 98 // ... wait for thread 1 & 2 to finish executing Fill(). 99 // 100 // NOTE: 101 // 1. PhiloxRandom is trivially copyable. 102 // 2. PhiloxRandom is compilable by gcc and nvcc. 103 class PhiloxRandom { 104 public: 105 using ResultType = Array<uint32, 4>; 106 using ResultElementType = uint32; 107 // The number of elements that will be returned. 108 static constexpr int kResultElementCount = 4; 109 // Cost of generation of a single element (in cycles). 110 static constexpr int kElementCost = 10; 111 // The type for the 64-bit key stored in the form of two 32-bit uint 112 // that are used in the diffusion process. 113 using Key = Array<uint32, 2>; 114 115 PHILOX_DEVICE_INLINE PhiloxRandom()116 PhiloxRandom() {} 117 118 PHILOX_DEVICE_INLINE PhiloxRandom(uint64 seed)119 explicit PhiloxRandom(uint64 seed) { 120 key_[0] = static_cast<uint32>(seed); 121 key_[1] = static_cast<uint32>(seed >> 32); 122 } 123 124 PHILOX_DEVICE_INLINE PhiloxRandom(uint64 seed_lo,uint64 seed_hi)125 explicit PhiloxRandom(uint64 seed_lo, uint64 seed_hi) { 126 key_[0] = static_cast<uint32>(seed_lo); 127 key_[1] = static_cast<uint32>(seed_lo >> 32); 128 counter_[2] = static_cast<uint32>(seed_hi); 129 counter_[3] = static_cast<uint32>(seed_hi >> 32); 130 } 131 132 PHILOX_DEVICE_INLINE PhiloxRandom(ResultType counter,Key key)133 PhiloxRandom(ResultType counter, Key key) : counter_(counter), key_(key) {} 134 135 PHILOX_DEVICE_INLINE counter()136 ResultType const& counter() const { return counter_; } 137 138 PHILOX_DEVICE_INLINE key()139 Key const& key() const { return key_; } 140 141 // Skip the specified number of samples of 128-bits in the current stream. 142 PHILOX_DEVICE_INLINE Skip(uint64 count)143 void Skip(uint64 count) { 144 const uint32 count_lo = static_cast<uint32>(count); 145 uint32 count_hi = static_cast<uint32>(count >> 32); 146 147 counter_[0] += count_lo; 148 if (counter_[0] < count_lo) { 149 ++count_hi; 150 } 151 152 counter_[1] += count_hi; 153 if (counter_[1] < count_hi) { 154 if (++counter_[2] == 0) { 155 ++counter_[3]; 156 } 157 } 158 } 159 160 // Returns a group of four random numbers using the underlying Philox 161 // algorithm. operator()162 PHILOX_DEVICE_INLINE ResultType operator()() { 163 ResultType counter = counter_; 164 Key key = key_; 165 166 // Run the single rounds for ten times. Manually unrolling the loop 167 // for better performance. 168 counter = ComputeSingleRound(counter, key); 169 RaiseKey(&key); 170 counter = ComputeSingleRound(counter, key); 171 RaiseKey(&key); 172 counter = ComputeSingleRound(counter, key); 173 RaiseKey(&key); 174 counter = ComputeSingleRound(counter, key); 175 RaiseKey(&key); 176 counter = ComputeSingleRound(counter, key); 177 RaiseKey(&key); 178 counter = ComputeSingleRound(counter, key); 179 RaiseKey(&key); 180 counter = ComputeSingleRound(counter, key); 181 RaiseKey(&key); 182 counter = ComputeSingleRound(counter, key); 183 RaiseKey(&key); 184 counter = ComputeSingleRound(counter, key); 185 RaiseKey(&key); 186 counter = ComputeSingleRound(counter, key); 187 188 SkipOne(); 189 190 return counter; 191 } 192 193 private: 194 // We use the same constants as recommended by the original paper. 195 static constexpr uint32 kPhiloxW32A = 0x9E3779B9; 196 static constexpr uint32 kPhiloxW32B = 0xBB67AE85; 197 static constexpr uint32 kPhiloxM4x32A = 0xD2511F53; 198 static constexpr uint32 kPhiloxM4x32B = 0xCD9E8D57; 199 200 // Helper function to skip the next sample of 128-bits in the current stream. SkipOne()201 PHILOX_DEVICE_INLINE void SkipOne() { 202 if (++counter_[0] == 0) { 203 if (++counter_[1] == 0) { 204 if (++counter_[2] == 0) { 205 ++counter_[3]; 206 } 207 } 208 } 209 } 210 211 // Helper function to return the lower and higher 32-bits from two 32-bit 212 // integer multiplications. 213 PHILOX_DEVICE_INLINE MultiplyHighLow(uint32 a,uint32 b,uint32 * result_low,uint32 * result_high)214 static void MultiplyHighLow(uint32 a, uint32 b, uint32* result_low, 215 uint32* result_high) { 216 #ifndef __CUDA_ARCH__ 217 const uint64 product = static_cast<uint64>(a) * b; 218 *result_low = static_cast<uint32>(product); 219 *result_high = static_cast<uint32>(product >> 32); 220 #else 221 *result_low = a * b; 222 *result_high = __umulhi(a, b); 223 #endif 224 } 225 226 // Helper function for a single round of the underlying Philox algorithm. ComputeSingleRound(const ResultType & counter,const Key & key)227 PHILOX_DEVICE_INLINE static ResultType ComputeSingleRound( 228 const ResultType& counter, const Key& key) { 229 uint32 lo0; 230 uint32 hi0; 231 MultiplyHighLow(kPhiloxM4x32A, counter[0], &lo0, &hi0); 232 233 uint32 lo1; 234 uint32 hi1; 235 MultiplyHighLow(kPhiloxM4x32B, counter[2], &lo1, &hi1); 236 237 ResultType result; 238 result[0] = hi1 ^ counter[1] ^ key[0]; 239 result[1] = lo1; 240 result[2] = hi0 ^ counter[3] ^ key[1]; 241 result[3] = lo0; 242 return result; 243 } 244 RaiseKey(Key * key)245 PHILOX_DEVICE_INLINE void RaiseKey(Key* key) { 246 (*key)[0] += kPhiloxW32A; 247 (*key)[1] += kPhiloxW32B; 248 } 249 250 private: 251 ResultType counter_; 252 Key key_; 253 }; 254 255 } // namespace random 256 } // namespace tensorflow 257 258 #endif // TENSORFLOW_CORE_LIB_RANDOM_PHILOX_RANDOM_H_ 259