/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || TENSORFLOW_USE_ROCM

#define EIGEN_USE_GPU

#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/kernels/in_topk_op.h"
#include "tensorflow/core/kernels/reduction_gpu_kernels.cu.h"
#include "tensorflow/core/kernels/reduction_ops.h"
#include "tensorflow/core/util/gpu_kernel_helper.h"

namespace tensorflow {
typedef Eigen::GpuDevice GPUDevice;

namespace functor {

// Compare each prediction in 'predictions' with the target prediction for its
// batch entry, and write the result to 'mask':
//  -1: the target class is out of range, or the prediction or the target
//      prediction is not finite, so the two can't be compared.
//   0: the prediction is smaller than or equal to the target prediction.
//   1: the prediction is strictly larger than the target prediction.
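//
// Example for one row: predictions {0.3, 0.7, 0.5} with target class 2 give
// the mask row {0, 1, 0} (only 0.7 is larger than the target prediction 0.5),
// so the target is in the top 2 but not in the top 1.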
template <typename T, typename TargetT>
__global__ void ComputePredictionMaskKernel(
    const T* __restrict__ predictions,    // dims: [ num_targets x num_classes ]
    const TargetT* __restrict__ targets,  // dims: [ num_targets ]
    int64* __restrict__ mask,             // dims: [ num_targets x num_classes ]
    int num_targets, int num_classes) {
  GPU_1D_KERNEL_LOOP(i, num_targets * num_classes) {
    const int batch_index = i / num_classes;
    TargetT target_idx = ldg(targets + batch_index);

    if (!FastBoundsCheck(target_idx, num_classes)) {
      mask[i] = -1;
      return;
    }

    T prediction = ldg(predictions + i);
    T target_prediction =
        ldg(predictions + batch_index * num_classes + target_idx);

    if (!Eigen::numext::isfinite(prediction) ||
        !Eigen::numext::isfinite(target_prediction)) {
      mask[i] = -1;
    } else {
      mask[i] = prediction > target_prediction ? 1 : 0;
    }
  }
}

// Reduce all prediction masks either to the sum of '1's for each prediction
// larger than the target, or to '-1' if the target class is invalid or some
// predictions in the batch have non-finite values.
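// E.g. reducing a mask row {0, 1, 0, 1} gives 2, while {0, 1, -1, 1} gives -1,
// because -1 is absorbing under MaskSum.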
struct MaskSum {
  __host__ __device__ int64 operator()(const int64& a, const int64& b) const {
    if (a < 0 || b < 0)
      return -1;
    else
      return a + b;
  }
};

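// The GPU reduction helpers use IdentityValue to obtain the identity element
// of the reduction operator. For MaskSum the identity is 0: MaskSum(0, x) == x
// for any x >= 0, and -1 remains absorbing.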
namespace reduction_op_helper {
template <>
struct IdentityValue<int64, MaskSum> {
  int64 operator()() { return 0; }
};

}  // namespace reduction_op_helper

template <typename T, typename TargetT>
struct InTopKFunctor<GPUDevice, T, TargetT> {
  template <int ndims>
  using Dims = Eigen::DSizes<Eigen::Index, ndims>;

  void operator()(OpKernelContext* context,
                  typename TTypes<T, 2>::ConstTensor predictions,
                  typename TTypes<TargetT>::ConstVec targets, const TopKArg k,
                  typename TTypes<bool>::Vec output) {
    const Eigen::Index num_targets = predictions.dimension(0);
    const Eigen::Index num_classes = predictions.dimension(1);

    OP_REQUIRES(
        context, num_targets * num_classes < std::numeric_limits<int>::max(),
        errors::InvalidArgument(
            "Number of targets * number of classes must be less than INT_MAX"));

    if (num_targets == 0 || num_classes == 0) {
      // Result is empty, so shortcut the rest of the function to avoid
      // launching kernels with empty input.
      return;
    }

    // Temporary storage for a mask computed by `ComputePredictionMaskKernel`.
    Tensor predictions_mask;
    OP_REQUIRES_OK(
        context, context->allocate_temp(DT_INT64,
                                        TensorShape({num_targets, num_classes}),
                                        &predictions_mask));

    // Number of predictions for each target that are larger than the target
    // prediction (or -1 if we can't compute this number, because not all
    // predictions are finite or the target class is out of range).
    Tensor num_larger_prediction;
    OP_REQUIRES_OK(context,
                   context->allocate_temp(DT_INT64, TensorShape({num_targets}),
                                          &num_larger_prediction));

    const auto& d = context->eigen_device<GPUDevice>();

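    // GetGpuLaunchConfig sizes the launch for num_targets * num_classes
    // elements; the kernel's grid-stride loop (GPU_1D_KERNEL_LOOP) covers any
    // elements beyond a single pass over the grid.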
    // Compute a mask for all predictions.
    GpuLaunchConfig config = GetGpuLaunchConfig(num_targets * num_classes, d);
    OP_REQUIRES_OK(
        context, GpuLaunchKernel(ComputePredictionMaskKernel<T, TargetT>,
                                 config.block_count, config.thread_per_block, 0,
                                 d.stream(), predictions.data(), targets.data(),
                                 predictions_mask.flat<int64_t>().data(),
                                 num_targets, num_classes));

    // Reduce each row of the prediction mask to the number of predictions
    // larger than the target prediction, or to a negative value if we can't
    // compute an answer.
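    // ReduceImpl treats the mask as a [num_targets, num_classes] matrix and
    // reduces along axis 1 (the class dimension, selected by Dims<1>(1)),
    // producing one count per target.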
    {
      auto in = predictions_mask.matrix<int64_t>();
      auto out = num_larger_prediction.flat<int64_t>();

      ReduceImpl<int64, MaskSum, int64*, int64*, Dims<1>>(
          context, (int64*)out.data(), (int64*)in.data(), in.rank(),
          in.dimension(0), in.rank() >= 2 ? in.dimension(1) : 1,
          in.rank() >= 3 ? in.dimension(2) : 1, out.rank(), Dims<1>(1),
          MaskSum());
    }

    // Compute whether the target prediction is in the top K predictions.
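    // The target is in the top K iff its count is valid (>= 0) and strictly
    // fewer than K predictions are larger than the target prediction; ties
    // with the target prediction do not count as larger.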
    auto cnt = num_larger_prediction.flat<int64_t>();

    if (k.k_tensor != nullptr) {
      if (k.k_tensor->dtype() == DT_INT32) {
        output.device(d) =
            (cnt >= cnt.constant(0)) &&
            (cnt < k.k_tensor->flat<int32>().template cast<int64_t>().broadcast(
                       Dims<1>(num_targets)));
      } else {
        output.device(d) =
            (cnt >= cnt.constant(0)) &&
            (cnt < k.k_tensor->flat<int64_t>().broadcast(Dims<1>(num_targets)));
      }
    } else {
      output.device(d) =
          (cnt >= cnt.constant(0)) && (cnt < targets.constant(k.k_value));
    }
  }
};

}  // namespace functor

// Definition of the GPU implementations declared in in_topk_op.cc.
#define DEFINE_GPU_KERNELS(T, TARGET_T) \
  template struct functor::InTopKFunctor<GPUDevice, T, TARGET_T>;

DEFINE_GPU_KERNELS(float, int32);
DEFINE_GPU_KERNELS(float, int64);

#undef DEFINE_GPU_KERNELS

}  // end namespace tensorflow

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM