/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define EIGEN_USE_GPU
#include "tensorflow/core/kernels/image/non_max_suppression_op.h"

#include <limits>

#include "absl/strings/str_cat.h"
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/numeric_types.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/kernels/gpu_prim.h"
#include "tensorflow/core/util/gpu_kernel_helper.h"
#include "tensorflow/core/util/gpu_launch_config.h"
#include "tensorflow/stream_executor/stream_executor.h"

struct
#if GOOGLE_CUDA
    __align__(16)
#endif
        Box {
  float x1, y1, x2, y2;
};

namespace tensorflow {
typedef Eigen::GpuDevice GPUDevice;
typedef Eigen::ThreadPoolDevice CPUDevice;

// This is the width of the bitmask for masking boxes for each thread.
// This needs to be a power of 2 (a POD width, usually) so that division and
// modulo can be implemented as bit operations during host selection.
constexpr int kNmsBoxesPerThread = 8 * sizeof(int);
// Helper to calculate the modulo mask and shift bits.
// For kNmsBoxesPerThread=32 the modulo mask will be 31, i.e. 0x1F, so
// i % 32 == i & 31. Similarly the shift will be 5, so that
// i / 32 == i >> 5. Using these bit operations should reduce stalls on the
// host thread.
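// NumBits(n) is the number of bits needed to represent n, e.g. NumBits(31)
// == 5, so kNmsBoxesPerThreadShiftBits below evaluates to 5.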
constexpr int NumBits(int n) { return (n == 0) ? 0 : NumBits(n >> 1) + 1; }
constexpr int kNmsBoxesPerThreadModuloMask = kNmsBoxesPerThread - 1;
constexpr int kNmsBoxesPerThreadShiftBits =
    NumBits(kNmsBoxesPerThreadModuloMask);

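// NMSKernel below is launched with kNmsBlockDim x kNmsBlockDim thread blocks;
// the grid is capped at kNmsBlockDimMax blocks in each dimension (see NmsGpu).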
constexpr int kNmsBlockDim = 16;
constexpr int kNmsBlockDimMax = 128;
constexpr int kNmsChunkSize = 2000;

template <typename T>
__device__ EIGEN_STRONG_INLINE void Swap(T& a, T& b) {
  T c(a);
  a = b;
  b = c;
}

// Check whether two boxes have an IoU greater than threshold.
template <typename T>
__device__ EIGEN_STRONG_INLINE bool OverThreshold(const Box* a, const Box* b,
                                                  const float a_area,
                                                  const T iou_threshold) {
  const float b_area = (b->x2 - b->x1) * (b->y2 - b->y1);
  if (a_area == 0.0f || b_area == 0.0f) return false;
  const float xx1 = fmaxf(a->x1, b->x1);
  const float yy1 = fmaxf(a->y1, b->y1);
  const float xx2 = fminf(a->x2, b->x2);
  const float yy2 = fminf(a->y2, b->y2);

  // fdimf computes the positive difference between xx2 and xx1.
  const float w = fdimf(xx2, xx1);
  const float h = fdimf(yy2, yy1);
  const float intersection = w * h;

  // Testing for aa/bb > t is equivalent to aa > bb*t (bb != 0),
  // which avoids the division.
  const float aa = intersection;
  const float bb = a_area + b_area - intersection;
  const float bt = bb * iou_threshold;
  return aa > bt;
}

template <bool flip_box>
__device__ EIGEN_STRONG_INLINE void Flipped(Box& box);

template <>
__device__ EIGEN_STRONG_INLINE void Flipped<false>(Box& box) {}

template <>
__device__ EIGEN_STRONG_INLINE void Flipped<true>(Box& box) {
  if (box.x1 > box.x2) Swap(box.x1, box.x2);
  if (box.y1 > box.y2) Swap(box.y1, box.y2);
}
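// Returns whether bit `bit` is set in the bitmask array `bit_mask`, where
// each element of `bit_mask` holds 8 * sizeof(T) bits.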
template <typename T>
__device__ EIGEN_STRONG_INLINE bool CheckBit(T* bit_mask, int bit) {
  constexpr int kShiftLen = NumBits(8 * sizeof(T)) - 1;
  constexpr int kRemainderMask = 8 * sizeof(T) - 1;
  int bin = bit >> kShiftLen;
  return (bit_mask[bin] >> (bit & kRemainderMask)) & 1;
}

// Produce a global bitmask (result_mask) of selected boxes from the bitmask
// generated by NMSKernel. Abort early once max_boxes boxes have been selected.
// The bitmask is num_boxes*bit_mask_len bits indicating whether to keep or
// remove a box.
__global__ void NMSReduce(const int* bitmask, const int bit_mask_len,
                          const int num_boxes, const int max_boxes,
                          char* result_mask) {
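  // `local` holds bit_mask_len ints of dynamic shared memory; its size is
  // passed at launch time by NmsGpu below.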
  extern __shared__ int local[];
  // set global mask to accept all boxes
  for (int box : GpuGridRangeX(bit_mask_len)) {
    local[box] = 0xFFFFFFFF;
  }
  __syncthreads();
  int accepted_boxes = 0;
  for (int box = 0; box < num_boxes - 1; ++box) {
    // if current box is masked by an earlier box, skip it.
    if (!CheckBit(local, box)) {
      continue;
    }
    accepted_boxes += 1;
    int offset = box * bit_mask_len;
    // update global mask with current box's mask
    for (int b : GpuGridRangeX(bit_mask_len)) {
      local[b] &= ~bitmask[offset + b];
    }
    __syncthreads();
    if (accepted_boxes > max_boxes) break;
  }
  // copy the global mask to the result_mask char array. A char array is
  // needed for cub::DeviceSelect later.
  for (int box : GpuGridRangeX(num_boxes)) {
    result_mask[box] = CheckBit(local, box);
  }
}

// For each box, compute a bitmask of boxes which have an overlap with the
// given box above the threshold.
//
// Starting from the highest-scoring box, mark any box which has IoU>threshold
// with the given box. Each thread processes kNmsBoxesPerThread boxes per
// stride, and each box has a bitmask of overlaps of length bit_mask_len.
//
// If flip_box is true, boxes may have x1>x2 and/or y1>y2. If so, swap the
// coordinates such that for all boxes x1<x2 and y1<y2. Otherwise boxes are
// assumed to already satisfy x1<x2 and y1<y2.
template <bool flip_box>
__launch_bounds__(kNmsBlockDim* kNmsBlockDim, 4) __global__
    void NMSKernel(const Box* d_desc_sorted_boxes, const int num_boxes,
                   const float iou_threshold, const int bit_mask_len,
                   int* d_delete_mask) {
  // Storing boxes used by this CUDA block in the shared memory.
  __shared__ Box shared_i_boxes[kNmsBlockDim];
  // Same thing with areas
  __shared__ float shared_i_areas[kNmsBlockDim];
  // The condition of the for loop is common to all threads in the block.
  // This is necessary to be able to call __syncthreads() inside of the loop.
  for (int i_block_offset = blockIdx.x * blockDim.x; i_block_offset < num_boxes;
       i_block_offset += blockDim.x * gridDim.x) {
    const int i = i_block_offset + threadIdx.x;
    if (i < num_boxes) {
      // One 1D line of threads loads the boxes for the x dimension.
      if (threadIdx.y == 0) {
        Box box = d_desc_sorted_boxes[i];
        Flipped<flip_box>(box);
        shared_i_boxes[threadIdx.x] = box;
        shared_i_areas[threadIdx.x] = (box.x2 - box.x1) * (box.y2 - box.y1);
      }
    }
    __syncthreads();
    for (int j_thread_offset =
             kNmsBoxesPerThread * (blockIdx.y * blockDim.y + threadIdx.y);
         j_thread_offset < num_boxes;
         j_thread_offset += kNmsBoxesPerThread * blockDim.y * gridDim.y) {
      // Note: we can do everything using multiplication, and use fp16, since
      // we are comparing against a low-precision threshold.
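      // Each set bit ib of above_threshold means that box j_thread_offset+ib
      // overlaps box i above the IoU threshold and should be suppressed.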
      int above_threshold = 0;
      // Make sure that threads are within valid domain.
      bool valid = false;
      // Loop over the next kNmsBoxesPerThread boxes and set corresponding bit
      // if it is overlapping with current box
      for (int ib = 0; ib < kNmsBoxesPerThread; ++ib) {
        // This thread will compare Box i and Box j.
        const int j = j_thread_offset + ib;
        if (i >= j || i >= num_boxes || j >= num_boxes) continue;
        valid = true;
        Box j_box = d_desc_sorted_boxes[j];
        const Box i_box = shared_i_boxes[threadIdx.x];
        Flipped<flip_box>(j_box);
        if (OverThreshold<float>(&i_box, &j_box, shared_i_areas[threadIdx.x],
                                 iou_threshold)) {
          // we have score[j] <= score[i].
          above_threshold |= (1U << ib);
        }
      }
      if (valid) {
        d_delete_mask[i * bit_mask_len + j_thread_offset / kNmsBoxesPerThread] =
            above_threshold;
      }
    }
    __syncthreads();  // making sure everyone is done reading shared memory.
  }
}
// Variadic template helpers for selecting elements from multiple arrays with
// the same index at the same time.
template <typename Index>
__device__ EIGEN_STRONG_INLINE void SelectHelper(const Index i_selected,
                                                 const Index i_original) {}

template <typename Index, typename T, typename... Args>
__device__ EIGEN_STRONG_INLINE void SelectHelper(const Index i_selected,
                                                 const Index i_original,
                                                 const T* original, T* selected,
                                                 Args... args) {
  selected[i_selected] = original[i_original];
  SelectHelper(i_selected, i_original, args...);
}

// Helper template to select elements from the original arrays using the index
// mapping and store them into the selected arrays. Each array sharing the same
// mapping needs to be passed as a pair of pointers to the original and
// selected arrays. For selecting 2 arrays the call would be
// IndexMultiSelect(num_elements, indices, original1, selected1, original2,
// selected2).
template <typename Index, typename T, typename... Args>
__global__ void IndexMultiSelect(const int num_elements, const Index* indices,
                                 const T* original, T* selected, Args... args) {
  for (const int idx : GpuGridRangeX(num_elements)) {
    SelectHelper(idx, indices[idx], original, selected, args...);
  }
}

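// Fill to_fill[idx] = offset + idx for idx in [0, num_elements).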
template <typename T>
__global__ void Iota(const int num_elements, const T offset, T* to_fill) {
  for (int idx : GpuGridRangeX(num_elements)) {
    to_fill[idx] = static_cast<T>(idx) + offset;
  }
}

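// Runs NMS on boxes that are already sorted by score in descending order.
// d_sorted_boxes_float_ptr points to num_boxes boxes of 4 floats each and
// must be 16-byte aligned. Indices of the kept boxes (into the sorted order)
// are written to d_selected_indices and their count to *h_nkeep. If
// flip_boxes is true, box coordinates may be given with x1>x2 and/or y1>y2
// and are flipped before the IoU test.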
Status NmsGpu(const float* d_sorted_boxes_float_ptr, const int num_boxes,
              const float iou_threshold, int* d_selected_indices, int* h_nkeep,
              OpKernelContext* context, const int max_boxes, bool flip_boxes) {
  // Making sure we respect the __align__(16)
  // we promised to the compiler.
  auto iptr = reinterpret_cast<std::uintptr_t>(d_sorted_boxes_float_ptr);
  if ((iptr & 15) != 0) {
    return errors::InvalidArgument("Boxes should be aligned to 16 Bytes.");
  }
  // allocate bitmask arrays on host and on device
  Tensor h_num_selected, d_nms_mask;
  const int bit_mask_len =
      (num_boxes + kNmsBoxesPerThread - 1) / kNmsBoxesPerThread;

  int64 max_nms_mask_size = num_boxes * bit_mask_len;
  TF_RETURN_IF_ERROR(context->allocate_temp(
      DataType::DT_INT32, TensorShape({max_nms_mask_size}), &d_nms_mask));
  // reset data-sensitive tensors
  auto device = context->eigen_gpu_device();
  auto config = GetGpuLaunchConfig(d_nms_mask.NumElements(), device);
  TF_CHECK_OK(GpuLaunchKernel(SetZero<int>, config.block_count,
                              config.thread_per_block, 0, device.stream(),
                              config.virtual_thread_count,
                              d_nms_mask.flat<int32>().data()));

  AllocatorAttributes alloc_attr;
  alloc_attr.set_on_host(true);
  alloc_attr.set_gpu_compatible(true);
  // The size of this buffer could be reduced to kNmsChunkSize*bit_mask_len*2
  // by using it as a ring buffer, but the savings would only be a few MB.
  TF_RETURN_IF_ERROR(context->allocate_temp(
      DataType::DT_INT32, TensorShape({1}), &h_num_selected, alloc_attr));

  int* d_delete_mask = d_nms_mask.flat<int>().data();
  int* h_selected_count = h_num_selected.flat<int>().data();
  const Box* d_sorted_boxes =
      reinterpret_cast<const Box*>(d_sorted_boxes_float_ptr);
  dim3 block_dim, thread_block;
  int num_blocks = (num_boxes + kNmsBlockDim - 1) / kNmsBlockDim;
  num_blocks = std::max(std::min(num_blocks, kNmsBlockDimMax), 1);
  block_dim.x = num_blocks;
  block_dim.y = num_blocks;
  block_dim.z = 1;
  thread_block.x = kNmsBlockDim;
  thread_block.y = kNmsBlockDim;
  thread_block.z = 1;
  if (flip_boxes) {
    TF_CHECK_OK(GpuLaunchKernel(NMSKernel<true>, block_dim, thread_block, 0,
                                device.stream(), d_sorted_boxes, num_boxes,
                                iou_threshold, bit_mask_len, d_delete_mask));
  } else {
    TF_CHECK_OK(GpuLaunchKernel(NMSKernel<false>, block_dim, thread_block, 0,
                                device.stream(), d_sorted_boxes, num_boxes,
                                iou_threshold, bit_mask_len, d_delete_mask));
  }
  TF_RETURN_IF_CUDA_ERROR(cudaGetLastError());
  // Overlapping CPU computes and the D2H memcpy;
  // both take about the same time.

  config = GetGpuLaunchConfig(num_boxes, device);
  Tensor selected_boxes;
  TF_RETURN_IF_ERROR(context->allocate_temp(
      DataType::DT_INT8, TensorShape({num_boxes}), &selected_boxes));
  Tensor d_indices;
  TF_RETURN_IF_ERROR(context->allocate_temp(
      DataType::DT_INT32, TensorShape({num_boxes}), &d_indices));
  TF_CHECK_OK(GpuLaunchKernel(Iota<int>, config.block_count,
                              config.thread_per_block, 0, device.stream(),
                              config.virtual_thread_count, 0,
                              d_indices.flat<int>().data()));

  char* selected = (char*)(selected_boxes.flat<int8>().data());
  TF_CHECK_OK(GpuLaunchKernel(NMSReduce, 1, 1024, bit_mask_len * sizeof(int),
                              device.stream(), d_delete_mask, bit_mask_len,
                              num_boxes, max_boxes, selected));
  TF_RETURN_IF_CUDA_ERROR(cudaGetLastError());
  // Compact the selected indices with cub::DeviceSelect::Flagged. The first
  // call, with a null temp_storage pointer, only queries the required
  // temporary-storage size.
  size_t flagged_buffer_size = 0;
  gpuprim::DeviceSelect::Flagged(static_cast<void*>(nullptr),  // temp_storage
                                 flagged_buffer_size,
                                 static_cast<int*>(nullptr),   // input
                                 static_cast<char*>(nullptr),  // selection flag
                                 static_cast<int*>(nullptr),   // selected items
                                 static_cast<int*>(nullptr),   // num_selected
                                 num_boxes, device.stream());
  Tensor cub_scratch;
  TF_RETURN_IF_ERROR(context->allocate_temp(
      DataType::DT_INT8, TensorShape({(int64)flagged_buffer_size}),
      &cub_scratch));
  Tensor d_num_selected;
  TF_RETURN_IF_ERROR(context->allocate_temp(DataType::DT_INT32,
                                            TensorShape({1}), &d_num_selected));

  gpuprim::DeviceSelect::Flagged(
      (void*)cub_scratch.flat<int8>().data(),  // temp_storage
      flagged_buffer_size,
      d_indices.flat<int>().data(),  // input
      selected,                      // selection flag
      d_selected_indices,            // selected items
      d_num_selected.flat<int>().data(), num_boxes, device.stream());
  gpuEvent_t copy_done;
  TF_RETURN_IF_CUDA_ERROR(
      gpuEventCreateWithFlags(&copy_done, gpuEventDisableTiming));
  device.memcpyDeviceToHost(h_selected_count, d_num_selected.flat<int>().data(),
                            sizeof(int));
  TF_RETURN_IF_CUDA_ERROR(gpuEventRecord(copy_done, device.stream()));
  TF_RETURN_IF_CUDA_ERROR(gpuEventSynchronize(copy_done));
  *h_nkeep = *h_selected_count;
  gpuEventDestroy(copy_done);
  return Status::OK();
}

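// Selection functor for gpuprim::DeviceSelect::If: keeps values strictly
// greater than threshold_.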
struct GreaterThanCubOp {
  float threshold_;
  __host__ __device__ __forceinline__ GreaterThanCubOp(float threshold)
      : threshold_(threshold) {}
  __host__ __device__ __forceinline__ bool operator()(const float& val) const {
    return (val > threshold_);
  }
};

// Use DeviceSelect::If to count the number of elements that satisfy op.
// TODO(sami) Not really a good way. Perhaps consider using thrust?
template <typename Op>
Status CountIf(OpKernelContext* context, const float* dev_array, const Op& op,
               int num_elements, int* result) {
  Tensor scratch_output;
  Tensor workspace;
  Tensor element_count;
  size_t workspace_size = 0;
  auto cuda_stream = tensorflow::GetGpuStream(context);
  auto device = context->eigen_gpu_device();
  gpuprim::DeviceSelect::If(nullptr, workspace_size,
                            static_cast<float*>(nullptr),
                            static_cast<float*>(nullptr),
                            static_cast<int*>(nullptr), num_elements, op);

  TF_RETURN_IF_ERROR(context->allocate_temp(
      DataType::DT_FLOAT, TensorShape({num_elements}), &scratch_output));
  TF_RETURN_IF_ERROR(context->allocate_temp(
      DataType::DT_INT8, TensorShape({(int64)workspace_size}), &workspace));
  TF_RETURN_IF_ERROR(context->allocate_temp(DataType::DT_INT32,
                                            TensorShape({1}), &element_count));
  gpuEvent_t copy_done;
  TF_RETURN_IF_CUDA_ERROR(
      gpuEventCreateWithFlags(&copy_done, gpuEventDisableTiming));
  TF_RETURN_IF_CUDA_ERROR(gpuprim::DeviceSelect::If(
      workspace.flat<int8>().data(), workspace_size, dev_array,
      scratch_output.flat<float>().data(), element_count.flat<int32>().data(),
      num_elements, op, cuda_stream));
  device.memcpyDeviceToHost(result, element_count.flat<int32>().data(),
                            sizeof(int));
  TF_RETURN_IF_CUDA_ERROR(gpuEventRecord(copy_done, device.stream()));
  TF_RETURN_IF_CUDA_ERROR(gpuEventSynchronize(copy_done));
  return Status::OK();
}

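// Sorts the boxes by score in descending order, optionally drops boxes below
// score_threshold, runs NmsGpu on the remainder, and writes the indices of
// the kept boxes (into the original, unsorted order) to output 0. The number
// of kept indices is returned in *num_saved_outputs; if pad_to_max_output is
// true the output is zero-padded up to max_output_size.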
Status DoNMS(OpKernelContext* context, const Tensor& boxes,
             const Tensor& scores, const int64_t max_output_size,
             const float iou_threshold_val, const float score_threshold,
             bool pad_to_max_output, int* num_saved_outputs) {
  int num_boxes = boxes.dim_size(0);
  size_t cub_sort_temp_storage_bytes = 0;
  auto cuda_stream = GetGpuStream(context);
  auto device = context->eigen_gpu_device();
  // Calling cub with nullptrs as inputs will make it return
  // workspace size needed for the operation instead of doing the operation.
  // In this specific instance, cub_sort_temp_storage_bytes will contain the
  // necessary workspace size for sorting after the call.
  if (num_boxes == 0) {
    Tensor* output_indices = nullptr;
    TF_RETURN_IF_ERROR(
        context->allocate_output(0, TensorShape({0}), &output_indices));
    return Status::OK();
  }

  cudaError_t cuda_ret = gpuprim::DeviceRadixSort::SortPairsDescending(
      nullptr, cub_sort_temp_storage_bytes,
      static_cast<float*>(nullptr),  // scores
      static_cast<float*>(nullptr),  // sorted scores
      static_cast<int*>(nullptr),    // input indices
      static_cast<int*>(nullptr),    // sorted indices
      num_boxes,                     // num items
      0, 8 * sizeof(float),          // sort all bits
      cuda_stream);
  TF_RETURN_IF_CUDA_ERROR(cuda_ret);
  TF_RETURN_IF_CUDA_ERROR(cudaGetLastError());

  Tensor d_cub_sort_buffer;
  TF_RETURN_IF_ERROR(context->allocate_temp(
      DataType::DT_INT8, TensorShape({(int64)cub_sort_temp_storage_bytes}),
      &d_cub_sort_buffer));
  Tensor d_indices;
  TF_RETURN_IF_ERROR(context->allocate_temp(
      DataType::DT_INT32, TensorShape({num_boxes}), &d_indices));
  Tensor d_sorted_indices;
  TF_RETURN_IF_ERROR(context->allocate_temp(
      DataType::DT_INT32, TensorShape({num_boxes}), &d_sorted_indices));
  Tensor d_selected_indices;
  TF_RETURN_IF_ERROR(context->allocate_temp(
      DataType::DT_INT32, TensorShape({num_boxes}), &d_selected_indices));
  Tensor d_sorted_scores;
  TF_RETURN_IF_ERROR(context->allocate_temp(
      DataType::DT_FLOAT, TensorShape({num_boxes}), &d_sorted_scores));
  Tensor d_sorted_boxes;
  TF_RETURN_IF_ERROR(context->allocate_temp(
      DataType::DT_FLOAT, TensorShape({num_boxes, 4}), &d_sorted_boxes));

  // this will return sorted scores and their indices
  auto config = GetGpuLaunchConfig(num_boxes, device);
  // initialize box and score indices
  TF_CHECK_OK(GpuLaunchKernel(Iota<int>, config.block_count,
                              config.thread_per_block, 0, device.stream(),
                              config.virtual_thread_count, 0,
                              d_indices.flat<int>().data()));
  TF_RETURN_IF_CUDA_ERROR(cudaGetLastError());
  cuda_ret = gpuprim::DeviceRadixSort::SortPairsDescending(
      d_cub_sort_buffer.flat<int8>().data(), cub_sort_temp_storage_bytes,
      scores.flat<float>().data(), d_sorted_scores.flat<float>().data(),
      d_indices.flat<int>().data(), d_sorted_indices.flat<int>().data(),
      num_boxes, 0,
      8 * sizeof(float),  // sort all bits
      cuda_stream);
  TF_RETURN_IF_CUDA_ERROR(cuda_ret);

  // get pointers for easy access
  const float4* original_boxes =
      reinterpret_cast<const float4*>(boxes.flat<float>().data());
  float4* sorted_boxes =
      reinterpret_cast<float4*>(d_sorted_boxes.flat<float>().data());
  const int* sorted_indices = d_sorted_indices.flat<int>().data();
  // sort boxes using indices
  TF_CHECK_OK(GpuLaunchKernel(IndexMultiSelect<int, float4>, config.block_count,
                              config.thread_per_block, 0, device.stream(),
                              config.virtual_thread_count, sorted_indices,
                              original_boxes, sorted_boxes));
  int limited_num_boxes = num_boxes;
  // filter boxes by score if NMS v3
  if (score_threshold > std::numeric_limits<float>::lowest()) {
    GreaterThanCubOp score_limit(score_threshold);
    TF_RETURN_IF_ERROR(CountIf(context, d_sorted_scores.flat<float>().data(),
                               score_limit, num_boxes, &limited_num_boxes));
    if (limited_num_boxes == 0) {
      Tensor* output_indices = nullptr;
      VLOG(1) << "Number of boxes above score threshold " << score_threshold
              << " is 0";
      int len_output = pad_to_max_output ? max_output_size : 0;
      *num_saved_outputs = 0;
      TF_RETURN_IF_ERROR(context->allocate_output(0, TensorShape({len_output}),
                                                  &output_indices));
      return Status::OK();
    } else {
      VLOG(2) << "Number of boxes above threshold=" << score_threshold << " is "
              << limited_num_boxes;
    }
  }
  int num_to_keep = 0;
  // There is no guarantee that boxes are given in the form x1<x2 and/or y1<y2;
  // flip boxes if necessary.
  const bool flip_boxes = true;
  auto status = NmsGpu(d_sorted_boxes.flat<float>().data(), limited_num_boxes,
                       iou_threshold_val, d_selected_indices.flat<int>().data(),
                       &num_to_keep, context, max_output_size, flip_boxes);
  TF_RETURN_IF_CUDA_ERROR(cudaGetLastError());
  if (!status.ok()) {
    context->SetStatus(status);
    return status;
  }
  Tensor* output_indices = nullptr;
  int num_outputs = std::min(num_to_keep, (int)max_output_size);  // no padding!
  if (pad_to_max_output && num_outputs != max_output_size) {
    TF_RETURN_IF_ERROR(context->allocate_output(
        0, TensorShape({max_output_size}), &output_indices));
    config = GetGpuLaunchConfig(max_output_size, device);
    TF_CHECK_OK(GpuLaunchKernel(SetZero<int>, config.block_count,
                                config.thread_per_block, 0, device.stream(),
                                config.virtual_thread_count,
                                output_indices->flat<int>().data()));

  } else {
    TF_RETURN_IF_ERROR(context->allocate_output(0, TensorShape({num_outputs}),
                                                &output_indices));
  }
  if (num_outputs == 0) {
    *num_saved_outputs = num_outputs;
    return Status::OK();
  }
  config = GetGpuLaunchConfig(num_outputs, device);
  TF_CHECK_OK(GpuLaunchKernel(
      IndexMultiSelect<int, int>, config.block_count, config.thread_per_block,
      0, device.stream(), config.virtual_thread_count,
      d_selected_indices.flat<int>().data(), sorted_indices,
      (*output_indices).flat<int>().data()));
  TF_RETURN_IF_CUDA_ERROR(cudaGetLastError());
  *num_saved_outputs = num_outputs;
  return Status::OK();
}

Status CheckValidInputs(const Tensor& boxes, const Tensor& scores,
                        const Tensor& max_output_size,
                        const Tensor& iou_threshold) {
  if (!TensorShapeUtils::IsScalar(max_output_size.shape())) {
    return errors::InvalidArgument("max_output_size must be 0-D, got shape ",
                                   max_output_size.shape().DebugString(),
                                   " (Shape must be rank 0 but is ", "rank ",
                                   max_output_size.dims(), ")");
  }
  if (!TensorShapeUtils::IsScalar(iou_threshold.shape())) {
    return errors::InvalidArgument("iou_threshold must be 0-D, got shape ",
                                   iou_threshold.shape().DebugString(),
                                   " (Shape must be rank 0 but is rank ",
                                   iou_threshold.dims(), ")");
  }
  const float iou_threshold_val = iou_threshold.scalar<float>()();
  if (iou_threshold_val < 0 || iou_threshold_val > 1) {
    return errors::InvalidArgument("iou_threshold must be in [0, 1]");
  }
  if (boxes.dims() != 2) {
    return errors::InvalidArgument(
        "boxes must be a rank 2 tensor! (Shape must "
        "be rank 2 but is rank ",
        boxes.dims(), ")");
  }
  int num_boxes = boxes.dim_size(0);
  if (boxes.dim_size(1) != 4) {
    return errors::InvalidArgument(
        "boxes must be Nx4 (Dimension must be 4 but"
        " is ",
        boxes.dim_size(1), ")");
  }
  if (scores.dims() != 1) {
    return errors::InvalidArgument(
        "scores must be a vector! (Shape must be "
        "rank 1 but is rank ",
        scores.dims(), ")");
  }
  if (scores.dim_size(0) != num_boxes) {
    return errors::InvalidArgument(
        "scores has incompatible shape "        // message must be exactly this
        "(Dimensions must be equal, but are ",  // otherwise tests fail!
        num_boxes, " and ", scores.dim_size(0), ")");
  }
  return Status::OK();
}

class NonMaxSuppressionV2GPUOp : public OpKernel {
 public:
  explicit NonMaxSuppressionV2GPUOp(OpKernelConstruction* context)
      : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
    // boxes: [num_boxes, 4]
    const Tensor& boxes = context->input(0);
    // scores: [num_boxes]
    const Tensor& scores = context->input(1);
    // max_output_size: scalar
    const Tensor& max_output_size = context->input(2);
    // iou_threshold: scalar
    const Tensor& iou_threshold = context->input(3);
    auto valid =
        CheckValidInputs(boxes, scores, max_output_size, iou_threshold);
    if (!valid.ok()) {
      context->SetStatus(valid);
      return;
    }
    int num_boxes = boxes.dim_size(0);
    if (num_boxes == 0) {
      Tensor* output_indices = nullptr;
      OP_REQUIRES_OK(context, context->allocate_output(0, TensorShape({0}),
                                                       &output_indices));
      return;
    }
    const float iou_threshold_val = iou_threshold.scalar<float>()();
    const int64_t output_size = max_output_size.scalar<int>()();

    OP_REQUIRES_OK(
        context,
        DoNMS(context, boxes, scores, output_size, iou_threshold_val,
              /*score_threshold is float lowest if score threshold is disabled*/
              std::numeric_limits<float>::lowest(),
              /*pad_to_max_output*/ false, &num_boxes));
  }
};

class NonMaxSuppressionV3GPUOp : public OpKernel {
 public:
  explicit NonMaxSuppressionV3GPUOp(OpKernelConstruction* context)
      : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
    // boxes: [num_boxes, 4]
    const Tensor& boxes = context->input(0);
    // scores: [num_boxes]
    const Tensor& scores = context->input(1);
    // max_output_size: scalar
    const Tensor& max_output_size = context->input(2);
    // iou_threshold: scalar
    const Tensor& iou_threshold = context->input(3);
    auto valid =
        CheckValidInputs(boxes, scores, max_output_size, iou_threshold);
    if (!valid.ok()) {
      context->SetStatus(valid);
      return;
    }

    const Tensor& score_threshold = context->input(4);
    OP_REQUIRES(
        context, TensorShapeUtils::IsScalar(score_threshold.shape()),
        errors::InvalidArgument("score_threshold must be 0-D, got shape ",
                                score_threshold.shape().DebugString()));
    const float score_threshold_val = score_threshold.scalar<float>()();
    int num_boxes = boxes.dim_size(0);
    if (num_boxes == 0) {
      Tensor* output_indices = nullptr;
      OP_REQUIRES_OK(context, context->allocate_output(0, TensorShape({0}),
                                                       &output_indices));
      return;
    }
    const float iou_threshold_val = iou_threshold.scalar<float>()();
    const int64_t output_size = max_output_size.scalar<int>()();
    OP_REQUIRES_OK(context, DoNMS(context, boxes, scores, output_size,
                                  iou_threshold_val, score_threshold_val,
                                  /*pad_to_max_output*/ false, &num_boxes));
  }
};

class NonMaxSuppressionV4GPUOp : public OpKernel {
 public:
  explicit NonMaxSuppressionV4GPUOp(OpKernelConstruction* context)
      : OpKernel(context) {
    OP_REQUIRES_OK(context, context->GetAttr("pad_to_max_output_size",
                                             &pad_to_max_output_size_));
  }

  void Compute(OpKernelContext* context) override {
    // boxes: [num_boxes, 4]
    const Tensor& boxes = context->input(0);
    // scores: [num_boxes]
    const Tensor& scores = context->input(1);
    // max_output_size: scalar
    const Tensor& max_output_size = context->input(2);
    // iou_threshold: scalar
    const Tensor& iou_threshold = context->input(3);
    auto valid =
        CheckValidInputs(boxes, scores, max_output_size, iou_threshold);
    if (!valid.ok()) {
      context->SetStatus(valid);
      return;
    }

    const Tensor& score_threshold = context->input(4);
    OP_REQUIRES(
        context, TensorShapeUtils::IsScalar(score_threshold.shape()),
        errors::InvalidArgument("score_threshold must be 0-D, got shape ",
                                score_threshold.shape().DebugString()));
    const float score_threshold_val = score_threshold.scalar<float>()();

    Tensor* num_outputs_t = nullptr;
    OP_REQUIRES_OK(context,
                   context->allocate_output(1, tensorflow::TensorShape({}),
                                            &num_outputs_t));
    auto device = context->eigen_gpu_device();
    int num_boxes = boxes.dim_size(0);
    if (num_boxes == 0) {
      Tensor* output_indices = nullptr;
      OP_REQUIRES_OK(context, context->allocate_output(0, TensorShape({}),
                                                       &output_indices));
      device.memcpyHostToDevice((num_outputs_t->flat<int>().data()), &num_boxes,
                                sizeof(int));
      return;
    }

    const float iou_threshold_val = iou_threshold.scalar<float>()();
    const int64_t output_size = max_output_size.scalar<int>()();
    int num_outputs = 0;
    OP_REQUIRES_OK(context, DoNMS(context, boxes, scores, output_size,
                                  iou_threshold_val, score_threshold_val,
                                  pad_to_max_output_size_, &num_outputs));
    device.memcpyHostToDevice((num_outputs_t->flat<int>().data()), &num_outputs,
                              sizeof(int));
    return;
  }

 private:
  bool pad_to_max_output_size_;
};

REGISTER_KERNEL_BUILDER(Name("NonMaxSuppressionV2")
                            .TypeConstraint<float>("T")
                            .Device(DEVICE_GPU)
                            .HostMemory("iou_threshold")
                            .HostMemory("max_output_size"),
                        NonMaxSuppressionV2GPUOp);

REGISTER_KERNEL_BUILDER(Name("NonMaxSuppressionV3")
                            .TypeConstraint<float>("T")
                            .Device(DEVICE_GPU)
                            .HostMemory("iou_threshold")
                            .HostMemory("max_output_size")
                            .HostMemory("score_threshold"),
                        NonMaxSuppressionV3GPUOp);

REGISTER_KERNEL_BUILDER(Name("NonMaxSuppressionV4")
                            .TypeConstraint<float>("T")
                            .Device(DEVICE_GPU)
                            .HostMemory("iou_threshold")
                            .HostMemory("max_output_size")
                            .HostMemory("score_threshold"),
                        NonMaxSuppressionV4GPUOp);

}  // namespace tensorflow
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM