/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define EIGEN_USE_GPU
#include "tensorflow/core/kernels/image/non_max_suppression_op.h"

#include <limits>

#include "absl/strings/str_cat.h"
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/numeric_types.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/kernels/gpu_prim.h"
#include "tensorflow/core/platform/statusor.h"
#include "tensorflow/core/util/gpu_kernel_helper.h"
#include "tensorflow/core/util/gpu_launch_config.h"
#include "tensorflow/stream_executor/stream_executor.h"

namespace tensorflow {
namespace {

struct
#if GOOGLE_CUDA
    __align__(16)
#endif
        Box {
  float x1, y1, x2, y2;
};
typedef Eigen::GpuDevice GPUDevice;
typedef Eigen::ThreadPoolDevice CPUDevice;

// This is the width of the bitmask for masking boxes for each thread.  This
// needs to be a power of 2 (a POD width usually) so that division and modulo
// can be implemented as bit operations during host selection.
constexpr int kNmsBoxesPerThread = 8 * sizeof(int);

// Helper to calculate modulo mask and shift bits.
//
// For kNmsBoxesPerThread=32 ModuloMask will be 31, i.e. 0x1F, thus
// i % 32 == i & 31. Similarly ShiftBits will be 5 so that
// i / 32 == i >> 5. Using these bit operations should reduce the stall on the
// host thread.
constexpr int NumBits(int n) { return (n == 0) ? 0 : NumBits(n >> 1) + 1; }
constexpr int kNmsBoxesPerThreadModuloMask = kNmsBoxesPerThread - 1;
constexpr int kNmsBoxesPerThreadShiftBits =
    NumBits(kNmsBoxesPerThreadModuloMask);

constexpr int kNmsBlockDim = 16;
constexpr int kNmsBlockDimMax = 128;
constexpr int kNmsChunkSize = 2000;

template <typename T>
__device__ EIGEN_STRONG_INLINE void Swap(T& a, T& b) {
  T c(a);
  a = b;
  b = c;
}

// Check whether two boxes have an IoU greater than threshold.
template <typename T>
__device__ EIGEN_STRONG_INLINE bool OverThreshold(const Box* a, const Box* b,
                                                  const float a_area,
                                                  const T iou_threshold) {
  const float b_area = (b->x2 - b->x1) * (b->y2 - b->y1);
  if (a_area == 0.0f || b_area == 0.0f) return false;
  const float xx1 = fmaxf(a->x1, b->x1);
  const float yy1 = fmaxf(a->y1, b->y1);
  const float xx2 = fminf(a->x2, b->x2);
  const float yy2 = fminf(a->y2, b->y2);

  // fdimf computes the positive difference between xx2 and xx1.
  const float w = fdimf(xx2, xx1);
  const float h = fdimf(yy2, yy1);
  const float intersection = w * h;

  // Testing for aa/bb > t
  // eq. with aa > bb*t (bb != 0)
  // avoiding divisions.
  const float aa = intersection;
  const float bb = a_area + b_area - intersection;
  const float bt = bb * iou_threshold;
  return aa > bt;
}
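// Note: the comparison above is the usual IoU test
//   intersection / (a_area + b_area - intersection) > iou_threshold
// rearranged so that no division is needed; the union area is multiplied onto
// the threshold instead.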

template <bool flip_box>
__device__ EIGEN_STRONG_INLINE void Flipped(Box& box);

template <>
__device__ EIGEN_STRONG_INLINE void Flipped<false>(Box& box) {}

template <>
__device__ EIGEN_STRONG_INLINE void Flipped<true>(Box& box) {
  if (box.x1 > box.x2) Swap(box.x1, box.x2);
  if (box.y1 > box.y2) Swap(box.y1, box.y2);
}
template <typename T>
__device__ EIGEN_STRONG_INLINE bool CheckBit(T* bit_mask, uint32 bit) {
  constexpr uint32 kNumBits = 8 * sizeof(T);
  return (bit_mask[bit / kNumBits] >> (bit % kNumBits)) & 1;
}
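// Bit layout used below: a bitmask over boxes stores box j's flag in word
// j / kNmsBoxesPerThread at bit j % kNmsBoxesPerThread. NMSKernel writes one
// such row of bit_mask_len words per box (row i marks the lower-scoring boxes
// whose IoU with box i exceeds the threshold), and CheckBit reads a single
// flag back out of such a mask.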

// Produce a global bitmask (result_mask) of selected boxes from the bitmask
// generated by NMSKernel. Abort early if max_boxes boxes are selected. The
// bitmask is num_boxes*bit_mask_len bits indicating whether to keep or remove
// a box.
__global__ void NMSReduce(const int* bitmask, const int bit_mask_len,
                          const int num_boxes, const int max_boxes,
                          char* result_mask) {
  extern __shared__ int local[];
  // Set global mask to accept all boxes.
  for (int box : GpuGridRangeX(bit_mask_len)) {
    local[box] = 0xFFFFFFFF;
  }
  __syncthreads();

  int accepted_boxes = 0;
  for (int box = 0; box < num_boxes - 1; ++box) {
    // If current box is masked by an earlier box, skip it.
    if (!CheckBit(local, box)) {
      continue;
    }
    accepted_boxes += 1;
    int offset = box * bit_mask_len;
    // Update global mask with current box's mask.
    for (int b : GpuGridRangeX(bit_mask_len)) {
      local[b] &= ~bitmask[offset + b];
    }
    __syncthreads();
    if (accepted_boxes > max_boxes) break;
  }

  // Copy the global mask to the result_mask char array, which we use in
  // cub::DeviceSelect later.  In theory we could skip this step and use the
  // bitmask in DeviceSelect directly, but in practice this part of the kernel
  // is very cheap anyway.
  for (int box : GpuGridRangeX(num_boxes)) {
    result_mask[box] = CheckBit(local, box);
  }
}

// For each box, compute a bitmask of boxes which have an overlap with the
// given box above the threshold.
//
// Starting from the highest scoring box, mark any box which has IoU>threshold
// with the given box. Each thread processes kNmsBoxesPerThread boxes per
// stride, and each box has a bitmask of overlaps of length bit_mask_len.
//
// If flip_box is true boxes may have x1>x2 and/or y1>y2. If so change the
// coordinates such that for all boxes x1<x2 and y1<y2. Else boxes should have
// x1<x2 and y1<y2.
template <bool flip_box>
__launch_bounds__(kNmsBlockDim* kNmsBlockDim, 4) __global__
    void NMSKernel(const Box* d_desc_sorted_boxes, const int num_boxes,
                   const float iou_threshold, const int bit_mask_len,
                   int* d_delete_mask) {
  // Storing boxes used by this CUDA block in the shared memory.
  __shared__ Box shared_i_boxes[kNmsBlockDim];
  // Same thing with areas.
  __shared__ float shared_i_areas[kNmsBlockDim];
  // The condition of the for loop is common to all threads in the block.
  // This is necessary to be able to call __syncthreads() inside of the loop.
  for (int i_block_offset = blockIdx.x * blockDim.x; i_block_offset < num_boxes;
       i_block_offset += blockDim.x * gridDim.x) {
    const int i = i_block_offset + threadIdx.x;
    if (i < num_boxes) {
      // One 1D line of threads loads the boxes for the x-dimension.
      if (threadIdx.y == 0) {
        Box box = d_desc_sorted_boxes[i];
        Flipped<flip_box>(box);
        shared_i_boxes[threadIdx.x] = box;
        shared_i_areas[threadIdx.x] = (box.x2 - box.x1) * (box.y2 - box.y1);
      }
    }
    __syncthreads();
    for (int j_thread_offset =
             kNmsBoxesPerThread * (blockIdx.y * blockDim.y + threadIdx.y);
         j_thread_offset < num_boxes;
         j_thread_offset += kNmsBoxesPerThread * blockDim.y * gridDim.y) {
      // Note : We can do everything using multiplication,
      // and use fp16 - we are comparing against a low precision
      // threshold.
      int above_threshold = 0;
      // Make sure that threads are within valid domain.
      bool valid = false;
      // Loop over the next kNmsBoxesPerThread boxes and set the corresponding
      // bit if it is overlapping with the current box.
      for (int ib = 0; ib < kNmsBoxesPerThread; ++ib) {
        // This thread will compare Box i and Box j.
        const int j = j_thread_offset + ib;
        if (i >= j || i >= num_boxes || j >= num_boxes) continue;
        valid = true;
        Box j_box = d_desc_sorted_boxes[j];
        const Box i_box = shared_i_boxes[threadIdx.x];
        Flipped<flip_box>(j_box);
        if (OverThreshold<float>(&i_box, &j_box, shared_i_areas[threadIdx.x],
                                 iou_threshold)) {
          // We have score[j] <= score[i].
          above_threshold |= (1U << ib);
        }
      }
      if (valid) {
        d_delete_mask[i * bit_mask_len + j_thread_offset / kNmsBoxesPerThread] =
            above_threshold;
      }
    }
    __syncthreads();  // Making sure everyone is done reading shared memory.
  }
}
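// Sizing example (illustrative numbers only): with num_boxes = 1024 and
// kNmsBoxesPerThread = 32 (the bit width of an int), bit_mask_len =
// ceil(1024 / 32) = 32, so d_delete_mask holds 1024 * 32 ints, i.e. one
// 1024-bit row of overlap flags per box.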
// Variadic template helpers for selecting elements from multiple arrays at
// the same time using a single index mapping.
template <typename Index>
__device__ EIGEN_STRONG_INLINE void SelectHelper(const Index i_selected,
                                                 const Index i_original) {}

template <typename Index, typename T, typename... Args>
__device__ EIGEN_STRONG_INLINE void SelectHelper(const Index i_selected,
                                                 const Index i_original,
                                                 const T* original, T* selected,
                                                 Args... args) {
  selected[i_selected] = original[i_original];
  SelectHelper(i_selected, i_original, args...);
}

// Helper template to select elements from the original arrays using the index
// mapping and store them into the selected arrays. Each array sharing the same
// mapping needs to be passed as a pair of pointers to the original and
// selected arrays. For selecting 2 arrays the call would be
// IndexMultiSelect(num_elements, indices, original1, selected1, original2,
// selected2).
template <typename Index, typename T, typename... Args>
__global__ void IndexMultiSelect(const int num_elements, const Index* indices,
                                 const T* original, T* selected, Args... args) {
  for (const int idx : GpuGridRangeX(num_elements)) {
    SelectHelper(idx, indices[idx], original, selected, args...);
  }
}

template <typename T>
__global__ void Iota(const int num_elements, const T offset, T* to_fill) {
  for (int idx : GpuGridRangeX(num_elements)) {
    to_fill[idx] = static_cast<T>(idx) + offset;
  }
}

// TensorFlow with nvcc doesn't build with --extended-lambda, so we have to use
// an explicit functor instead of a device lambda.
struct GreaterThanCubOp {
  float threshold_;
  __host__ __device__ __forceinline__ GreaterThanCubOp(float threshold)
      : threshold_(threshold) {}
  __host__ __device__ __forceinline__ bool operator()(const float& val) const {
    return (val > threshold_);
  }
};

// Uses DeviceSelect::If to count number of elements.
//
// (It might be better to use DeviceReduce::Sum with a custom iterator to do
// the count.  But in practice SelectIf is quite fast.)
template <typename Op>
StatusOr<int> CountIf(OpKernelContext* context, const float* dev_array,
                      const Op& op, int num_elements) {
  size_t workspace_size = 0;
  auto cuda_stream = tensorflow::GetGpuStream(context);
  auto device = context->eigen_gpu_device();
  gpuprim::DeviceSelect::If(nullptr, workspace_size,
                            static_cast<float*>(nullptr),
                            static_cast<float*>(nullptr),
                            static_cast<int*>(nullptr), num_elements, op);

  Tensor scratch_output;
  TF_RETURN_IF_ERROR(context->allocate_temp(
      DataType::DT_FLOAT, TensorShape({num_elements}), &scratch_output));

  Tensor workspace;
  TF_RETURN_IF_ERROR(context->allocate_temp(
      DataType::DT_INT8, TensorShape({(int64)workspace_size}), &workspace));

  // num_selected is a host pinned tensor.  The GPU kernel can write to it
  // directly, instead of writing to GPU memory and then copying down to
  // num_selected, saving us a small D2H memcpy.  We've observed that even small
  // D2H copies on the compute stream can have an outsized effect on latency.
  Tensor num_selected;
  AllocatorAttributes pinned_alloc_attrs;
  pinned_alloc_attrs.set_on_host(true);
  pinned_alloc_attrs.set_gpu_compatible(true);
  TF_RETURN_IF_ERROR(context->allocate_temp(
      DataType::DT_INT32, TensorShape({1}), &num_selected, pinned_alloc_attrs));

  gpuEvent_t copy_done;
  TF_RETURN_IF_CUDA_ERROR(
      gpuEventCreateWithFlags(&copy_done, gpuEventDisableTiming));
  TF_RETURN_IF_CUDA_ERROR(gpuprim::DeviceSelect::If(
      workspace.flat<int8>().data(), workspace_size, dev_array,
      scratch_output.flat<float>().data(), num_selected.flat<int32>().data(),
      num_elements, op, cuda_stream));
  TF_RETURN_IF_CUDA_ERROR(gpuEventRecord(copy_done, device.stream()));
  TF_RETURN_IF_CUDA_ERROR(gpuEventSynchronize(copy_done));
  return *num_selected.flat<int32>().data();
}

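// Summary of the steps in DoNMS below: sort the scores in descending order
// with a device radix sort, gather the boxes into that order, optionally count
// how many boxes pass score_threshold via CountIf, run NmsGpu on the sorted
// boxes, then gather the surviving boxes' original indices into the output
// tensor.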
Status DoNMS(OpKernelContext* context, const Tensor& boxes,
             const Tensor& scores, const int64_t max_output_size,
             const float iou_threshold_val, const float score_threshold,
             bool pad_to_max_output, int* num_saved_outputs) {
  int num_boxes = boxes.dim_size(0);
  size_t cub_sort_temp_storage_bytes = 0;
  auto cuda_stream = GetGpuStream(context);
  auto device = context->eigen_gpu_device();
  if (num_boxes == 0) {
    Tensor* output_indices = nullptr;
    TF_RETURN_IF_ERROR(
        context->allocate_output(0, TensorShape({0}), &output_indices));
    return Status::OK();
  }

  // Calling cub with nullptrs as inputs will make it return the workspace
  // size needed for the operation instead of doing the operation. In this
  // specific instance, cub_sort_temp_storage_bytes will contain the necessary
  // workspace size for sorting after the call.
  cudaError_t cuda_ret = gpuprim::DeviceRadixSort::SortPairsDescending(
      nullptr, cub_sort_temp_storage_bytes,
      static_cast<float*>(nullptr),  // scores
      static_cast<float*>(nullptr),  // sorted scores
      static_cast<int*>(nullptr),    // input indices
      static_cast<int*>(nullptr),    // sorted indices
      num_boxes,                     // num items
      0, 8 * sizeof(float),          // sort all bits
      cuda_stream);
  TF_RETURN_IF_CUDA_ERROR(cuda_ret);
  TF_RETURN_IF_CUDA_ERROR(cudaGetLastError());

  Tensor d_cub_sort_buffer;
  TF_RETURN_IF_ERROR(context->allocate_temp(
      DataType::DT_INT8, TensorShape({(int64)cub_sort_temp_storage_bytes}),
      &d_cub_sort_buffer));
  Tensor d_indices;
  TF_RETURN_IF_ERROR(context->allocate_temp(
      DataType::DT_INT32, TensorShape({num_boxes}), &d_indices));
  Tensor d_sorted_indices;
  TF_RETURN_IF_ERROR(context->allocate_temp(
      DataType::DT_INT32, TensorShape({num_boxes}), &d_sorted_indices));
  Tensor d_selected_indices;
  TF_RETURN_IF_ERROR(context->allocate_temp(
      DataType::DT_INT32, TensorShape({num_boxes}), &d_selected_indices));
  Tensor d_sorted_scores;
  TF_RETURN_IF_ERROR(context->allocate_temp(
      DataType::DT_FLOAT, TensorShape({num_boxes}), &d_sorted_scores));
  Tensor d_sorted_boxes;
  TF_RETURN_IF_ERROR(context->allocate_temp(
      DataType::DT_FLOAT, TensorShape({num_boxes, 4}), &d_sorted_boxes));

  // This will return the sorted scores and their indices.
  auto config = GetGpuLaunchConfig(num_boxes, device);
  // Initialize box and score indices.
  TF_CHECK_OK(GpuLaunchKernel(Iota<int>, config.block_count,
                              config.thread_per_block, 0, device.stream(),
                              config.virtual_thread_count, 0,
                              d_indices.flat<int>().data()));
  TF_RETURN_IF_CUDA_ERROR(cudaGetLastError());
  cuda_ret = gpuprim::DeviceRadixSort::SortPairsDescending(
      d_cub_sort_buffer.flat<int8>().data(), cub_sort_temp_storage_bytes,
      scores.flat<float>().data(), d_sorted_scores.flat<float>().data(),
      d_indices.flat<int>().data(), d_sorted_indices.flat<int>().data(),
      num_boxes, 0,
      8 * sizeof(float),  // sort all bits
      cuda_stream);
  TF_RETURN_IF_CUDA_ERROR(cuda_ret);

  // Get pointers for easy access.
  const float4* original_boxes =
      reinterpret_cast<const float4*>(boxes.flat<float>().data());
  float4* sorted_boxes =
      reinterpret_cast<float4*>(d_sorted_boxes.flat<float>().data());
  const int* sorted_indices = d_sorted_indices.flat<int>().data();
  // Gather boxes into score-sorted order using the sorted indices.
  TF_CHECK_OK(GpuLaunchKernel(IndexMultiSelect<int, float4>, config.block_count,
                              config.thread_per_block, 0, device.stream(),
                              config.virtual_thread_count, sorted_indices,
                              original_boxes, sorted_boxes));
  int limited_num_boxes = num_boxes;
  // Filter boxes by score if a score threshold was given (NMS v3 and later).
  if (score_threshold > std::numeric_limits<float>::lowest()) {
    GreaterThanCubOp score_limit(score_threshold);
    TF_ASSIGN_OR_RETURN(limited_num_boxes,
                        CountIf(context, d_sorted_scores.flat<float>().data(),
                                score_limit, num_boxes));
    if (limited_num_boxes == 0) {
      Tensor* output_indices = nullptr;
      VLOG(1) << "Number of boxes above score threshold " << score_threshold
              << " is 0";
      int len_output = pad_to_max_output ? max_output_size : 0;
      *num_saved_outputs = 0;
      TF_RETURN_IF_ERROR(context->allocate_output(0, TensorShape({len_output}),
                                                  &output_indices));
      return Status::OK();
    } else {
      VLOG(2) << "Number of boxes above threshold=" << score_threshold << " is "
              << limited_num_boxes;
    }
  }
  int num_to_keep = 0;
  // There is no guarantee that the boxes are given with x1<x2 and/or y1<y2,
  // so flip boxes if necessary!
  const bool flip_boxes = true;
  auto status = NmsGpu(d_sorted_boxes.flat<float>().data(), limited_num_boxes,
                       iou_threshold_val, d_selected_indices.flat<int>().data(),
                       &num_to_keep, context, max_output_size, flip_boxes);
  TF_RETURN_IF_CUDA_ERROR(cudaGetLastError());
  if (!status.ok()) {
    context->SetStatus(status);
    return status;
  }
  Tensor* output_indices = nullptr;
  int num_outputs = std::min(num_to_keep, (int)max_output_size);  // no padding!
  if (pad_to_max_output && num_outputs != max_output_size) {
    TF_RETURN_IF_ERROR(context->allocate_output(
        0, TensorShape({max_output_size}), &output_indices));
    config = GetGpuLaunchConfig(max_output_size, device);
    TF_CHECK_OK(GpuLaunchKernel(SetZero<int>, config.block_count,
                                config.thread_per_block, 0, device.stream(),
                                config.virtual_thread_count,
                                output_indices->flat<int>().data()));

  } else {
    TF_RETURN_IF_ERROR(context->allocate_output(0, TensorShape({num_outputs}),
                                                &output_indices));
  }
  if (num_outputs == 0) {
    *num_saved_outputs = num_outputs;
    return Status::OK();
  }
  config = GetGpuLaunchConfig(num_outputs, device);
  TF_CHECK_OK(GpuLaunchKernel(
      IndexMultiSelect<int, int>, config.block_count, config.thread_per_block,
      0, device.stream(), config.virtual_thread_count,
      d_selected_indices.flat<int>().data(), sorted_indices,
      (*output_indices).flat<int>().data()));
  TF_RETURN_IF_CUDA_ERROR(cudaGetLastError());
  *num_saved_outputs = num_outputs;
  return Status::OK();
}

Status CheckValidInputs(const Tensor& boxes, const Tensor& scores,
                        const Tensor& max_output_size,
                        const Tensor& iou_threshold) {
  if (!TensorShapeUtils::IsScalar(max_output_size.shape())) {
    return errors::InvalidArgument("max_output_size must be 0-D, got shape ",
                                   max_output_size.shape().DebugString(),
                                   " (Shape must be rank 0 but is ", "rank ",
                                   max_output_size.dims(), ")");
  }
  if (!TensorShapeUtils::IsScalar(iou_threshold.shape())) {
    return errors::InvalidArgument("iou_threshold must be 0-D, got shape ",
                                   iou_threshold.shape().DebugString(),
                                   " (Shape must be rank 0 but is rank ",
                                   iou_threshold.dims(), ")");
  }
  const float iou_threshold_val = iou_threshold.scalar<float>()();
  if (iou_threshold_val < 0 || iou_threshold_val > 1) {
    return errors::InvalidArgument("iou_threshold must be in [0, 1]");
  }
  if (boxes.dims() != 2) {
    return errors::InvalidArgument(
        "boxes must be a rank 2 tensor! (Shape must "
        "be rank 2 but is rank ",
        boxes.dims(), ")");
  }
  int num_boxes = boxes.dim_size(0);
  if (boxes.dim_size(1) != 4) {
    return errors::InvalidArgument(
        "boxes must be Nx4 (Dimension must be 4 but"
        " is ",
        boxes.dim_size(1), ")");
  }
  if (scores.dims() != 1) {
    return errors::InvalidArgument(
        "scores must be a vector! (Shape must be "
        "rank 1 but is rank ",
        scores.dims(), ")");
  }
  if (scores.dim_size(0) != num_boxes) {
    return errors::InvalidArgument(
        "scores has incompatible shape "        // message must be exactly this
        "(Dimensions must be equal, but are ",  // otherwise tests fail!
        num_boxes, " and ", scores.dim_size(0), ")");
  }
  return Status::OK();
}
class NonMaxSuppressionV2GPUOp : public OpKernel {
 public:
  explicit NonMaxSuppressionV2GPUOp(OpKernelConstruction* context)
      : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
    // boxes: [num_boxes, 4]
    const Tensor& boxes = context->input(0);
    // scores: [num_boxes]
    const Tensor& scores = context->input(1);
    // max_output_size: scalar
    const Tensor& max_output_size = context->input(2);
    // iou_threshold: scalar
    const Tensor& iou_threshold = context->input(3);
    auto valid =
        CheckValidInputs(boxes, scores, max_output_size, iou_threshold);
    if (!valid.ok()) {
      context->SetStatus(valid);
      return;
    }
    int num_boxes = boxes.dim_size(0);
    if (num_boxes == 0) {
      Tensor* output_indices = nullptr;
      OP_REQUIRES_OK(context, context->allocate_output(0, TensorShape({0}),
                                                       &output_indices));
      return;
    }
    const float iou_threshold_val = iou_threshold.scalar<float>()();
    const int64_t output_size = max_output_size.scalar<int>()();

    OP_REQUIRES_OK(
        context,
        DoNMS(context, boxes, scores, output_size, iou_threshold_val,
              /*score_threshold is float lowest if score threshold is disabled*/
              std::numeric_limits<float>::lowest(),
              /*pad_to_max_output=*/false, &num_boxes));
  }
};

class NonMaxSuppressionV3GPUOp : public OpKernel {
 public:
  explicit NonMaxSuppressionV3GPUOp(OpKernelConstruction* context)
      : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
    // boxes: [num_boxes, 4]
    const Tensor& boxes = context->input(0);
    // scores: [num_boxes]
    const Tensor& scores = context->input(1);
    // max_output_size: scalar
    const Tensor& max_output_size = context->input(2);
    // iou_threshold: scalar
    const Tensor& iou_threshold = context->input(3);
    auto valid =
        CheckValidInputs(boxes, scores, max_output_size, iou_threshold);
    if (!valid.ok()) {
      context->SetStatus(valid);
      return;
    }

    const Tensor& score_threshold = context->input(4);
    OP_REQUIRES(
        context, TensorShapeUtils::IsScalar(score_threshold.shape()),
        errors::InvalidArgument("score_threshold must be 0-D, got shape ",
                                score_threshold.shape().DebugString()));
    const float score_threshold_val = score_threshold.scalar<float>()();
    int num_boxes = boxes.dim_size(0);
    if (num_boxes == 0) {
      Tensor* output_indices = nullptr;
      OP_REQUIRES_OK(context, context->allocate_output(0, TensorShape({0}),
                                                       &output_indices));
      return;
    }
    const float iou_threshold_val = iou_threshold.scalar<float>()();
    const int64_t output_size = max_output_size.scalar<int>()();
    OP_REQUIRES_OK(context, DoNMS(context, boxes, scores, output_size,
                                  iou_threshold_val, score_threshold_val,
                                  /*pad_to_max_output=*/false, &num_boxes));
  }
};

class NonMaxSuppressionV4GPUOp : public OpKernel {
 public:
  explicit NonMaxSuppressionV4GPUOp(OpKernelConstruction* context)
      : OpKernel(context) {
    OP_REQUIRES_OK(context, context->GetAttr("pad_to_max_output_size",
                                             &pad_to_max_output_size_));
  }

  void Compute(OpKernelContext* context) override {
    // boxes: [num_boxes, 4]
    const Tensor& boxes = context->input(0);
    // scores: [num_boxes]
    const Tensor& scores = context->input(1);
    // max_output_size: scalar
    const Tensor& max_output_size = context->input(2);
    // iou_threshold: scalar
    const Tensor& iou_threshold = context->input(3);
    auto valid =
        CheckValidInputs(boxes, scores, max_output_size, iou_threshold);
    if (!valid.ok()) {
      context->SetStatus(valid);
      return;
    }

    const Tensor& score_threshold = context->input(4);
    OP_REQUIRES(
        context, TensorShapeUtils::IsScalar(score_threshold.shape()),
        errors::InvalidArgument("score_threshold must be 0-D, got shape ",
                                score_threshold.shape().DebugString()));
    const float score_threshold_val = score_threshold.scalar<float>()();

    Tensor* num_outputs_t = nullptr;
    OP_REQUIRES_OK(context,
                   context->allocate_output(1, tensorflow::TensorShape({}),
                                            &num_outputs_t));
    auto device = context->eigen_gpu_device();
    int num_boxes = boxes.dim_size(0);
    if (num_boxes == 0) {
      Tensor* output_indices = nullptr;
      OP_REQUIRES_OK(context, context->allocate_output(0, TensorShape({}),
                                                       &output_indices));
      device.memcpy(num_outputs_t->flat<int>().data(), &num_boxes, sizeof(int));
      return;
    }

    const float iou_threshold_val = iou_threshold.scalar<float>()();
    const int64_t output_size = max_output_size.scalar<int>()();
    int num_outputs = 0;
    OP_REQUIRES_OK(context, DoNMS(context, boxes, scores, output_size,
                                  iou_threshold_val, score_threshold_val,
                                  pad_to_max_output_size_, &num_outputs));
    device.memcpyHostToDevice(num_outputs_t->flat<int>().data(), &num_outputs,
                              sizeof(int));
    return;
  }

 private:
  bool pad_to_max_output_size_;
};

}  // anonymous namespace

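// NmsGpu runs the GPU NMS in three stages: NMSKernel builds the pairwise
// overlap bitmask for the score-sorted boxes, NMSReduce walks the boxes in
// descending score order to produce a keep/remove flag per box, and
// cub::DeviceSelect::Flagged compacts the indices of the kept boxes into
// d_selected_indices.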
Status NmsGpu(const float* d_sorted_boxes_float_ptr, const int num_boxes,
              const float iou_threshold, int* d_selected_indices, int* h_nkeep,
              OpKernelContext* context, const int max_boxes, bool flip_boxes) {
  // Making sure we respect the __align__(16)
  // we promised to the compiler.
  auto iptr = reinterpret_cast<std::uintptr_t>(d_sorted_boxes_float_ptr);
  if ((iptr & 15) != 0) {
    return errors::InvalidArgument("Boxes should be aligned to 16 Bytes.");
  }
  // Allocate bitmask arrays on host and on device.
  Tensor h_num_selected, d_nms_mask;
  const int bit_mask_len =
      (num_boxes + kNmsBoxesPerThread - 1) / kNmsBoxesPerThread;

  int64 max_nms_mask_size = num_boxes * bit_mask_len;
  TF_RETURN_IF_ERROR(context->allocate_temp(
      DataType::DT_INT32, TensorShape({max_nms_mask_size}), &d_nms_mask));
  // Reset data-sensitive tensors.
  auto device = context->eigen_gpu_device();
  auto config = GetGpuLaunchConfig(d_nms_mask.NumElements(), device);
  TF_CHECK_OK(GpuLaunchKernel(SetZero<int>, config.block_count,
                              config.thread_per_block, 0, device.stream(),
                              config.virtual_thread_count,
                              d_nms_mask.flat<int32>().data()));

  // h_num_selected is a host pinned tensor.  The GPU kernel can write to it
  // directly, instead of writing to GPU memory and then copying down to
  // num_selected, saving us a small D2H memcpy.  We've observed that even small
  // D2H copies on the compute stream can have an outsized effect on latency.
  AllocatorAttributes pinned_alloc_attrs;
  pinned_alloc_attrs.set_on_host(true);
  pinned_alloc_attrs.set_gpu_compatible(true);
  TF_RETURN_IF_ERROR(context->allocate_temp(DataType::DT_INT32,
                                            TensorShape({1}), &h_num_selected,
                                            pinned_alloc_attrs));

  int* d_delete_mask = d_nms_mask.flat<int>().data();
  int* h_selected_count = h_num_selected.flat<int>().data();
  const Box* d_sorted_boxes =
      reinterpret_cast<const Box*>(d_sorted_boxes_float_ptr);
  dim3 block_dim, thread_block;
  int num_blocks = (num_boxes + kNmsBlockDim - 1) / kNmsBlockDim;
  num_blocks = std::max(std::min(num_blocks, kNmsBlockDimMax), 1);
  block_dim.x = num_blocks;
  block_dim.y = num_blocks;
  block_dim.z = 1;
  thread_block.x = kNmsBlockDim;
  thread_block.y = kNmsBlockDim;
  thread_block.z = 1;
  if (flip_boxes) {
    TF_CHECK_OK(GpuLaunchKernel(NMSKernel<true>, block_dim, thread_block, 0,
                                device.stream(), d_sorted_boxes, num_boxes,
                                iou_threshold, bit_mask_len, d_delete_mask));
  } else {
    TF_CHECK_OK(GpuLaunchKernel(NMSKernel<false>, block_dim, thread_block, 0,
                                device.stream(), d_sorted_boxes, num_boxes,
                                iou_threshold, bit_mask_len, d_delete_mask));
  }
  TF_RETURN_IF_CUDA_ERROR(cudaGetLastError());
  // Overlapping the CPU compute and the D2H memcpy;
  // both take about the same time.

  config = GetGpuLaunchConfig(num_boxes, device);
  Tensor selected_boxes;
  TF_RETURN_IF_ERROR(context->allocate_temp(
      DataType::DT_INT8, TensorShape({num_boxes}), &selected_boxes));
  Tensor d_indices;
  TF_RETURN_IF_ERROR(context->allocate_temp(
      DataType::DT_INT32, TensorShape({num_boxes}), &d_indices));
  TF_CHECK_OK(GpuLaunchKernel(Iota<int>, config.block_count,
                              config.thread_per_block, 0, device.stream(),
                              config.virtual_thread_count, 0,
                              d_indices.flat<int>().data()));

  char* selected = (char*)(selected_boxes.flat<int8>().data());
  TF_CHECK_OK(GpuLaunchKernel(NMSReduce, 1, 1024, bit_mask_len * sizeof(int),
                              device.stream(), d_delete_mask, bit_mask_len,
                              num_boxes, max_boxes, selected));
  TF_RETURN_IF_CUDA_ERROR(cudaGetLastError());
  // Run cub::DeviceSelect::Flagged to compact the kept indices.
  size_t flagged_buffer_size = 0;
  gpuprim::DeviceSelect::Flagged(static_cast<void*>(nullptr),  // temp_storage
                                 flagged_buffer_size,
                                 static_cast<int*>(nullptr),   // input
                                 static_cast<char*>(nullptr),  // selection flag
                                 static_cast<int*>(nullptr),   // selected items
                                 static_cast<int*>(nullptr),   // num_selected
                                 num_boxes, device.stream());
  Tensor cub_scratch;
  TF_RETURN_IF_ERROR(context->allocate_temp(
      DataType::DT_INT8, TensorShape({(int64)flagged_buffer_size}),
      &cub_scratch));
  Tensor d_num_selected;
  TF_RETURN_IF_ERROR(context->allocate_temp(DataType::DT_INT32,
                                            TensorShape({1}), &d_num_selected));

  gpuprim::DeviceSelect::Flagged(
      (void*)cub_scratch.flat<int8>().data(),  // temp_storage
      flagged_buffer_size,
      d_indices.flat<int>().data(),  // input
      selected,                      // selection flag
      d_selected_indices,            // selected items
      h_selected_count, num_boxes, device.stream());
  gpuEvent_t copy_done;
  TF_RETURN_IF_CUDA_ERROR(
      gpuEventCreateWithFlags(&copy_done, gpuEventDisableTiming));
  TF_RETURN_IF_CUDA_ERROR(gpuEventRecord(copy_done, device.stream()));
  TF_RETURN_IF_CUDA_ERROR(gpuEventSynchronize(copy_done));
  gpuEventDestroy(copy_done);

  *h_nkeep = *h_selected_count;
  return Status::OK();
}

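// The registrations below bind these kernels to the GPU device. The inputs
// listed via HostMemory (max_output_size, iou_threshold and, for V3/V4,
// score_threshold) are placed in host memory so the scalar values can be read
// directly on the CPU in Compute().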
REGISTER_KERNEL_BUILDER(Name("NonMaxSuppressionV2")
                            .TypeConstraint<float>("T")
                            .Device(DEVICE_GPU)
                            .HostMemory("iou_threshold")
                            .HostMemory("max_output_size"),
                        NonMaxSuppressionV2GPUOp);

REGISTER_KERNEL_BUILDER(Name("NonMaxSuppressionV3")
                            .TypeConstraint<float>("T")
                            .Device(DEVICE_GPU)
                            .HostMemory("iou_threshold")
                            .HostMemory("max_output_size")
                            .HostMemory("score_threshold"),
                        NonMaxSuppressionV3GPUOp);

REGISTER_KERNEL_BUILDER(Name("NonMaxSuppressionV4")
                            .TypeConstraint<float>("T")
                            .Device(DEVICE_GPU)
                            .HostMemory("iou_threshold")
                            .HostMemory("max_output_size")
                            .HostMemory("score_threshold"),
                        NonMaxSuppressionV4GPUOp);

}  // namespace tensorflow
#endif