/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// See docs in ../ops/random_ops.cc.

#define EIGEN_USE_THREADS

#include <algorithm>
#include <cmath>
#include <memory>

#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_util.h"
#include "tensorflow/core/kernels/random_op_cpu.h"
#include "tensorflow/core/lib/hash/crc32c.h"
#include "tensorflow/core/lib/random/random_distributions.h"
#include "tensorflow/core/lib/random/simple_philox.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/util/guarded_philox_random.h"
#include "tensorflow/core/util/work_sharder.h"

#if EIGEN_COMP_GNUC && __cplusplus > 199711L
#define DISABLE_FLOAT_EQUALITY_WARNING \
  _Pragma("GCC diagnostic push")       \
      _Pragma("GCC diagnostic ignored \"-Wfloat-equal\"")
#define ENABLE_FLOAT_EQUALITY_WARNING _Pragma("GCC diagnostic pop")
#else
#define DISABLE_FLOAT_EQUALITY_WARNING
#define ENABLE_FLOAT_EQUALITY_WARNING
#endif

namespace tensorflow {

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;

namespace {

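// Makes a TensorShape from the contents of the `shape` tensor and allocates
// output `index` of the op with that shape.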
static Status AllocateOutputWithShape(OpKernelContext* ctx, const Tensor& shape,
                                      int index, Tensor** output) {
  TensorShape tensor_shape;
  TF_RETURN_IF_ERROR(tensor::MakeShape(shape, &tensor_shape));
  return ctx->allocate_output(index, tensor_shape, output);
}

// For now, use the same interface as RandomOp, so we can choose either one
// at run time.
template <typename Device, class Distribution>
class PhiloxRandomOp : public OpKernel {
 public:
  typedef typename Distribution::ResultElementType T;
  explicit PhiloxRandomOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
    OP_REQUIRES_OK(ctx, generator_.Init(ctx));
  }

  void Compute(OpKernelContext* ctx) override {
    const Tensor& shape = ctx->input(0);
    Tensor* output;
    OP_REQUIRES_OK(ctx, AllocateOutputWithShape(ctx, shape, 0, &output));
    auto output_flat = output->flat<T>();
    functor::FillPhiloxRandom<Device, Distribution>()(
        ctx, ctx->eigen_device<Device>(), /*key=*/nullptr, /*counter=*/nullptr,
        // Multiplier 256 is the same as in FillPhiloxRandomTask; do not change
        // it just here.
        generator_.ReserveRandomOutputs(output_flat.size(), 256),
        output_flat.data(), output_flat.size(), Distribution());
  }

 private:
  GuardedPhiloxRandom generator_;
};

template <typename Device, class IntType>
class RandomUniformIntOp : public OpKernel {
 public:
  explicit RandomUniformIntOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
    OP_REQUIRES_OK(ctx, generator_.Init(ctx));
  }

  void Compute(OpKernelContext* ctx) override {
    const Tensor& shape = ctx->input(0);
    const Tensor& minval = ctx->input(1);
    const Tensor& maxval = ctx->input(2);
    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(minval.shape()),
                errors::InvalidArgument("minval must be 0-D, got shape ",
                                        minval.shape().DebugString()));
    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(maxval.shape()),
                errors::InvalidArgument("maxval must be 0-D, got shape ",
                                        maxval.shape().DebugString()));

    // Allocate output, and exit early if possible
    Tensor* output;
    OP_REQUIRES_OK(ctx, AllocateOutputWithShape(ctx, shape, 0, &output));
    if (output->NumElements() == 0) return;

    // Verify that minval < maxval.  This check intentionally happens after the
    // early exit for empty output: requesting zero samples from an impossible
    // range is fine.
    IntType lo = minval.scalar<IntType>()();
    IntType hi = maxval.scalar<IntType>()();
    OP_REQUIRES(
        ctx, lo < hi,
        errors::InvalidArgument("Need minval < maxval, got ", lo, " >= ", hi));

    // Build distribution
    typedef random::UniformDistribution<random::PhiloxRandom, IntType>
        Distribution;
    Distribution dist(lo, hi);
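    // Note: dist draws integers from the half-open interval [lo, hi),
    // matching the RandomUniformInt contract of [minval, maxval).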

    auto output_flat = output->flat<IntType>();
    functor::FillPhiloxRandom<Device, Distribution>()(
        ctx, ctx->eigen_device<Device>(), /*key=*/nullptr, /*counter=*/nullptr,
        // Multiplier 256 is the same as in FillPhiloxRandomTask; do not change
        // it just here.
        generator_.ReserveRandomOutputs(output_flat.size(), 256),
        output_flat.data(), output_flat.size(), dist);
  }

 private:
  GuardedPhiloxRandom generator_;
};

// Samples from one or more gamma distributions. All internal computations are
// done with double precision for numerical stability.
template <typename T>
class RandomGammaOp : public OpKernel {
 public:
  explicit RandomGammaOp(OpKernelConstruction* context) : OpKernel(context) {
    OP_REQUIRES_OK(context, generator_.Init(context));
  }

  void Compute(OpKernelContext* ctx) override {
    const Tensor& shape_t = ctx->input(0);
    const Tensor& alpha_t = ctx->input(1);

    OP_REQUIRES(ctx,
                TensorShapeUtils::IsVector(shape_t.shape()) &&
                    (shape_t.dtype() == DataType::DT_INT32 ||
                     shape_t.dtype() == DataType::DT_INT64),
                errors::InvalidArgument(
                    "shape must be a vector of {int32,int64}, got shape: ",
                    shape_t.DebugString()));
    TensorShape samples_shape;
    if (shape_t.dtype() == DataType::DT_INT32) {
      auto vec = shape_t.flat<int32>();
      OP_REQUIRES_OK(ctx, TensorShapeUtils::MakeShape(vec.data(), vec.size(),
                                                      &samples_shape));
    } else if (shape_t.dtype() == DataType::DT_INT64) {
      auto vec = shape_t.flat<int64_t>();
      OP_REQUIRES_OK(ctx, TensorShapeUtils::MakeShape(vec.data(), vec.size(),
                                                      &samples_shape));
    }
    const int64_t samples_per_alpha = samples_shape.num_elements();

    OP_REQUIRES_OK(ctx, samples_shape.AppendShapeWithStatus(alpha_t.shape()));
    // Allocate output samples.
    Tensor* samples_t = nullptr;
    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, samples_shape, &samples_t));

    if (samples_shape.num_elements() == 0) return;

    using random::PhiloxRandom;

    typedef random::NormalDistribution<PhiloxRandom, double> Normal;
    typedef random::UniformDistribution<PhiloxRandom, double> Uniform;
#define UNIFORM(X)                                    \
  if (uniform_remaining == 0) {                       \
    uniform_remaining = Uniform::kResultElementCount; \
    uniform_result = uniform(&gen);                   \
  }                                                   \
  uniform_remaining--;                                \
  double X = uniform_result[uniform_remaining]
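// UNIFORM(X) declares a double X holding the next uniform variate: it refills
// uniform_result with a batch of Uniform::kResultElementCount values only
// when the cached batch is exhausted, then consumes one entry per use.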

    // Each attempt is 95+% successful, and requires 1-2 normal + 1 uniform
    static constexpr int kReservedSamplesPerOutput = 256;

    const auto alpha_flat = alpha_t.flat<T>().data();
    const int64_t num_alphas = alpha_t.NumElements();
    OP_REQUIRES(ctx, num_alphas > 0,
                errors::InvalidArgument(
                    "Input alpha should have non-zero element count, got: ",
                    num_alphas));
    auto samples_flat = samples_t->flat<T>().data();
    PhiloxRandom rng = generator_.ReserveRandomOutputs(
        samples_per_alpha * num_alphas, kReservedSamplesPerOutput);

    // We partition work first across alphas then across samples-per-alpha to
    // avoid a couple flops which can be done on a per-alpha basis.

    auto DoWork = [samples_per_alpha, num_alphas, &rng, samples_flat,
                   alpha_flat](int64_t start_output, int64_t limit_output) {
      using Eigen::numext::exp;
      using Eigen::numext::log;
      using Eigen::numext::log1p;
      using Eigen::numext::pow;

      // Capturing "rng" by-value would only make a copy for the _shared_
      // lambda.  Since we want to let each worker have its own copy, we pass
      // "rng" by reference and explicitly do a copy assignment.

      Normal normal;
      Uniform uniform;
      typename Normal::ResultType norm_result;
      typename Uniform::ResultType uniform_result;
      for (int64_t output_idx = start_output; output_idx < limit_output;
           /* output_idx incremented within inner loop below */) {
        int64_t alpha_idx = output_idx / samples_per_alpha;

        // Instead of +alpha_idx for each sample, we offset the pointer once.
        T* const samples_alpha_offset = samples_flat + alpha_idx;

        // Several calculations can be done on a per-alpha basis.
        const double alpha = static_cast<double>(alpha_flat[alpha_idx]);

        DISABLE_FLOAT_EQUALITY_WARNING
        if (alpha == static_cast<double>(1.0)) {
          ENABLE_FLOAT_EQUALITY_WARNING
          // Sample from an exponential distribution.
          for (int64_t sample_idx = output_idx % samples_per_alpha;
               sample_idx < samples_per_alpha && output_idx < limit_output;
               sample_idx++, output_idx++) {
            // As we want data stable regardless of sharding
            // (including eventually on GPU), we skip on a per-sample basis.
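            // Skip() advances the Philox counter so that each output index
            // draws from its own disjoint block of kReservedSamplesPerOutput
            // reserved samples, independent of how work is sharded.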
            PhiloxRandom gen = rng;
            gen.Skip(kReservedSamplesPerOutput * output_idx);
            int16_t uniform_remaining = 0;
            UNIFORM(u);
            const double res = -log1p(-u);
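            // -log1p(-u) evaluates -log(1 - u) without cancellation for small
            // u; this is inverse-CDF sampling of Exp(1), i.e. Gamma(1, 1).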
            samples_alpha_offset[sample_idx * num_alphas] = static_cast<T>(res);
          }       // for (sample_idx)
        } else {  // if alpha != 1.0
          // Transformation-rejection from pairs of uniform and normal random
          // variables. http://dl.acm.org/citation.cfm?id=358414
          //
          // The algorithm has an acceptance rate of ~95% for small alpha (~1),
          // and higher accept rates for higher alpha, so runtime is
          // O(NumAlphas * NumSamples * k) with k ~ 1 / 0.95.
          //
          // For alpha<1, we add one to d=alpha-1/3, and multiply the final
          // result by uniform()^(1/alpha)
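          //
          // Sketch of the sampler implemented below (notation as in the
          // paper): d = alpha - 1/3 (one is added first when alpha < 1),
          // c = 1 / (3 * sqrt(d)); repeat { x ~ N(0,1); v = (1 + c*x)^3;
          // retry if v <= 0; u ~ U(0,1); accept if u < 1 - 0.0331*x^4 or
          // log(u) < x^2/2 + d*(1 - v + log(v)) }; the sample is d * v,
          // scaled by an extra uniform^(1/alpha) when alpha < 1.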
          const bool alpha_less_than_one = alpha < 1;
          const double d = alpha + (alpha_less_than_one ? 2.0 / 3 : -1.0 / 3);
          const double c = 1.0 / 3 / sqrt(d);

          // Compute the rest of the samples for the current alpha value.
          for (int64_t sample_idx = output_idx % samples_per_alpha;
               sample_idx < samples_per_alpha && output_idx < limit_output;
               sample_idx++, output_idx++) {
            // Since each sample may use a variable number of normal/uniform
            // samples, and we want data stable regardless of sharding
            // (including eventually on GPU), we skip on a per-sample basis.
            PhiloxRandom gen = rng;
            gen.Skip(kReservedSamplesPerOutput * output_idx);
            int16_t norm_remaining = 0;
            int16_t uniform_remaining = 0;

            // Keep trying until we don't reject a sample. In practice, we will
            // only reject ~5% at worst, for low alpha near 1.
            while (true) {
              if (norm_remaining == 0) {
                norm_remaining = Normal::kResultElementCount;
                norm_result = normal(&gen);
              }
              norm_remaining--;
              const double x = norm_result[norm_remaining];
              double v = 1 + c * x;
              if (v <= 0) {
                continue;
              }
              v = v * v * v;
              UNIFORM(u);
              // The first option in the if is a "squeeze" short-circuit to
              // dodge the two logs. Magic constant sourced from the paper
              // linked above. Upward of .91 of the area covered by the log
              // inequality is covered by the squeeze as well (larger coverage
              // for smaller values of alpha).
              if ((u < 1 - 0.0331 * (x * x) * (x * x)) ||
                  (log(u) < 0.5 * x * x + d * (1 - v + log(v)))) {
                double res = d * v;
                if (alpha_less_than_one) {
                  UNIFORM(b);
                  res *= pow(b, 1 / alpha);
                }
                samples_alpha_offset[sample_idx * num_alphas] =
                    static_cast<T>(res);
                break;
              }
            }  // while: true
          }    // for: sample_idx
        }      // if (alpha == 1.0)
      }        // for: output_idx
    };         // DoWork
#undef UNIFORM
    // Two calls to log only occur for ~10% of samples reaching the log line.
    //   2 x 100 (64-bit cycles per log) x 0.10 = ~20.
    // Other ops: sqrt, +, *, /, %... something like 15 of these, at 3-6 cycles
    // each = ~60.
    // All of this /0.95 due to the rejection possibility = ~85.
    static const int kElementCost = 85 + 2 * Normal::kElementCost +
                                    Uniform::kElementCost +
                                    3 * PhiloxRandom::kElementCost;
    auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads());
    Shard(worker_threads.num_threads, worker_threads.workers,
          num_alphas * samples_per_alpha, kElementCost, DoWork);
  }

 private:
  GuardedPhiloxRandom generator_;

  TF_DISALLOW_COPY_AND_ASSIGN(RandomGammaOp);
};

}  // namespace

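// REGISTER(TYPE) instantiates the CPU FillPhiloxRandom functors for TYPE and
// registers the RandomUniform, RandomStandardNormal, TruncatedNormal, and
// RandomGamma kernels for that element type.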
#define REGISTER(TYPE)                                                         \
  template struct functor::FillPhiloxRandom<                                   \
      CPUDevice, random::UniformDistribution<random::PhiloxRandom, TYPE>>;     \
  template struct functor::FillPhiloxRandom<                                   \
      CPUDevice, random::NormalDistribution<random::PhiloxRandom, TYPE>>;      \
  template struct functor::FillPhiloxRandom<                                   \
      CPUDevice,                                                               \
      random::TruncatedNormalDistribution<                                     \
          random::SingleSampleAdapter<random::PhiloxRandom>, TYPE>>;           \
  REGISTER_KERNEL_BUILDER(                                                     \
      Name("RandomUniform")                                                    \
          .Device(DEVICE_CPU)                                                  \
          .HostMemory("shape")                                                 \
          .TypeConstraint<TYPE>("dtype"),                                      \
      PhiloxRandomOp<CPUDevice, random::UniformDistribution<                   \
                                    random::PhiloxRandom, TYPE>>);             \
  REGISTER_KERNEL_BUILDER(                                                     \
      Name("RandomStandardNormal")                                             \
          .Device(DEVICE_CPU)                                                  \
          .HostMemory("shape")                                                 \
          .TypeConstraint<TYPE>("dtype"),                                      \
      PhiloxRandomOp<CPUDevice,                                                \
                     random::NormalDistribution<random::PhiloxRandom, TYPE>>); \
  REGISTER_KERNEL_BUILDER(                                                     \
      Name("TruncatedNormal")                                                  \
          .Device(DEVICE_CPU)                                                  \
          .HostMemory("shape")                                                 \
          .TypeConstraint<TYPE>("dtype"),                                      \
      PhiloxRandomOp<                                                          \
          CPUDevice,                                                           \
          random::TruncatedNormalDistribution<                                 \
              random::SingleSampleAdapter<random::PhiloxRandom>, TYPE>>);      \
  REGISTER_KERNEL_BUILDER(                                                     \
      Name("RandomGamma").Device(DEVICE_CPU).TypeConstraint<TYPE>("T"),        \
      RandomGammaOp<TYPE>)

#define REGISTER_FULL_INT(IntType)           \
  template struct functor::FillPhiloxRandom< \
      CPUDevice,                             \
      random::UniformFullIntDistribution<random::PhiloxRandom, IntType>>

#define REGISTER_INT(IntType)                                                 \
  REGISTER_FULL_INT(IntType);                                                 \
  template struct functor::FillPhiloxRandom<                                  \
      CPUDevice, random::UniformDistribution<random::PhiloxRandom, IntType>>; \
  REGISTER_KERNEL_BUILDER(Name("RandomUniformInt")                            \
                              .Device(DEVICE_CPU)                             \
                              .HostMemory("shape")                            \
                              .HostMemory("minval")                           \
                              .HostMemory("maxval")                           \
                              .TypeConstraint<IntType>("Tout"),               \
                          RandomUniformIntOp<CPUDevice, IntType>);

TF_CALL_half(REGISTER);
TF_CALL_bfloat16(REGISTER);
TF_CALL_float(REGISTER);
TF_CALL_double(REGISTER);
TF_CALL_int32(REGISTER_INT);
TF_CALL_int64(REGISTER_INT);
TF_CALL_uint32(REGISTER_FULL_INT);
TF_CALL_uint64(REGISTER_FULL_INT);

#undef REGISTER
#undef REGISTER_INT
#undef REGISTER_FULL_INT

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

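// GPU registrations mirror the CPU ones: shape (and minval/maxval for
// RandomUniformInt) remain in host memory, and the shape dtype attr "T" is
// constrained to int32.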
#define REGISTER(TYPE)                                                         \
  REGISTER_KERNEL_BUILDER(                                                     \
      Name("RandomUniform")                                                    \
          .Device(DEVICE_GPU)                                                  \
          .HostMemory("shape")                                                 \
          .TypeConstraint<int32>("T")                                          \
          .TypeConstraint<TYPE>("dtype"),                                      \
      PhiloxRandomOp<GPUDevice, random::UniformDistribution<                   \
                                    random::PhiloxRandom, TYPE>>);             \
  REGISTER_KERNEL_BUILDER(                                                     \
      Name("RandomStandardNormal")                                             \
          .Device(DEVICE_GPU)                                                  \
          .HostMemory("shape")                                                 \
          .TypeConstraint<int32>("T")                                          \
          .TypeConstraint<TYPE>("dtype"),                                      \
      PhiloxRandomOp<GPUDevice,                                                \
                     random::NormalDistribution<random::PhiloxRandom, TYPE>>); \
  REGISTER_KERNEL_BUILDER(                                                     \
      Name("TruncatedNormal")                                                  \
          .Device(DEVICE_GPU)                                                  \
          .HostMemory("shape")                                                 \
          .TypeConstraint<int32>("T")                                          \
          .TypeConstraint<TYPE>("dtype"),                                      \
      PhiloxRandomOp<                                                          \
          GPUDevice,                                                           \
          random::TruncatedNormalDistribution<                                 \
              random::SingleSampleAdapter<random::PhiloxRandom>, TYPE>>);

#define REGISTER_FULL_INT(IntType)           \
  template struct functor::FillPhiloxRandom< \
      GPUDevice,                             \
      random::UniformFullIntDistribution<random::PhiloxRandom, IntType>>

#define REGISTER_INT(IntType)                                                 \
  REGISTER_FULL_INT(IntType);                                                 \
  template struct functor::FillPhiloxRandom<                                  \
      GPUDevice, random::UniformDistribution<random::PhiloxRandom, IntType>>; \
  REGISTER_KERNEL_BUILDER(Name("RandomUniformInt")                            \
                              .Device(DEVICE_GPU)                             \
                              .HostMemory("shape")                            \
                              .HostMemory("minval")                           \
                              .HostMemory("maxval")                           \
                              .TypeConstraint<int32>("T")                     \
                              .TypeConstraint<IntType>("Tout"),               \
                          RandomUniformIntOp<GPUDevice, IntType>);

TF_CALL_half(REGISTER);
TF_CALL_float(REGISTER);
TF_CALL_double(REGISTER);
TF_CALL_int32(REGISTER_INT);
TF_CALL_int64(REGISTER_INT);
TF_CALL_uint32(REGISTER_FULL_INT);
TF_CALL_uint64(REGISTER_FULL_INT);

#undef REGISTER
#undef REGISTER_INT
#undef REGISTER_FULL_INT

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

}  // end namespace tensorflow