• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 // See docs in ../ops/image_ops.cc
17 
18 #define EIGEN_USE_THREADS
19 
20 #include "tensorflow/core/kernels/image/non_max_suppression_op.h"
21 
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <functional>
#include <limits>
#include <queue>
#include <vector>
26 
27 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
28 #include "tensorflow/core/framework/bounds_check.h"
29 #include "tensorflow/core/framework/op_kernel.h"
30 #include "tensorflow/core/framework/register_types.h"
31 #include "tensorflow/core/framework/tensor.h"
32 #include "tensorflow/core/framework/tensor_shape.h"
33 #include "tensorflow/core/framework/types.h"
34 #include "tensorflow/core/lib/core/status.h"
35 #include "tensorflow/core/platform/logging.h"
36 
37 namespace tensorflow {
38 namespace {
39 
40 typedef Eigen::ThreadPoolDevice CPUDevice;
41 
CheckScoreSizes(OpKernelContext * context,int num_boxes,const Tensor & scores)42 static inline void CheckScoreSizes(OpKernelContext* context, int num_boxes,
43                                    const Tensor& scores) {
44   // The shape of 'scores' is [num_boxes]
45   OP_REQUIRES(context, scores.dims() == 1,
46               errors::InvalidArgument(
47                   "scores must be 1-D", scores.shape().DebugString(),
48                   " (Shape must be rank 1 but is rank ", scores.dims(), ")"));
49   OP_REQUIRES(
50       context, scores.dim_size(0) == num_boxes,
51       errors::InvalidArgument("scores has incompatible shape (Dimensions must "
52                               "be equal, but are ",
53                               num_boxes, " and ", scores.dim_size(0), ")"));
54 }
55 
ParseAndCheckOverlapSizes(OpKernelContext * context,const Tensor & overlaps,int * num_boxes)56 static inline void ParseAndCheckOverlapSizes(OpKernelContext* context,
57                                              const Tensor& overlaps,
58                                              int* num_boxes) {
59   // the shape of 'overlaps' is [num_boxes, num_boxes]
60   OP_REQUIRES(context, overlaps.dims() == 2,
61               errors::InvalidArgument("overlaps must be 2-D",
62                                       overlaps.shape().DebugString()));
63 
64   *num_boxes = overlaps.dim_size(0);
65   OP_REQUIRES(context, overlaps.dim_size(1) == *num_boxes,
66               errors::InvalidArgument("overlaps must be square",
67                                       overlaps.shape().DebugString()));
68 }
69 
ParseAndCheckBoxSizes(OpKernelContext * context,const Tensor & boxes,int * num_boxes)70 static inline void ParseAndCheckBoxSizes(OpKernelContext* context,
71                                          const Tensor& boxes, int* num_boxes) {
72   // The shape of 'boxes' is [num_boxes, 4]
73   OP_REQUIRES(context, boxes.dims() == 2,
74               errors::InvalidArgument(
75                   "boxes must be 2-D", boxes.shape().DebugString(),
76                   " (Shape must be rank 2 but is rank ", boxes.dims(), ")"));
77   *num_boxes = boxes.dim_size(0);
78   OP_REQUIRES(context, boxes.dim_size(1) == 4,
79               errors::InvalidArgument("boxes must have 4 columns (Dimension "
80                                       "must be 4 but is ",
81                                       boxes.dim_size(1), ")"));
82 }
83 
CheckCombinedNMSScoreSizes(OpKernelContext * context,int num_boxes,const Tensor & scores)84 static inline void CheckCombinedNMSScoreSizes(OpKernelContext* context,
85                                               int num_boxes,
86                                               const Tensor& scores) {
87   // The shape of 'scores' is [batch_size, num_boxes, num_classes]
88   OP_REQUIRES(context, scores.dims() == 3,
89               errors::InvalidArgument("scores must be 3-D",
90                                       scores.shape().DebugString()));
91   OP_REQUIRES(context, scores.dim_size(1) == num_boxes,
92               errors::InvalidArgument("scores has incompatible shape"));
93 }
94 
ParseAndCheckCombinedNMSBoxSizes(OpKernelContext * context,const Tensor & boxes,int * num_boxes,const int num_classes)95 static inline void ParseAndCheckCombinedNMSBoxSizes(OpKernelContext* context,
96                                                     const Tensor& boxes,
97                                                     int* num_boxes,
98                                                     const int num_classes) {
99   // The shape of 'boxes' is [batch_size, num_boxes, q, 4]
100   OP_REQUIRES(context, boxes.dims() == 4,
101               errors::InvalidArgument("boxes must be 4-D",
102                                       boxes.shape().DebugString()));
103 
104   bool box_check = boxes.dim_size(2) == 1 || boxes.dim_size(2) == num_classes;
105   OP_REQUIRES(context, box_check,
106               errors::InvalidArgument(
107                   "third dimension of boxes must be either 1 or num classes"));
108   *num_boxes = boxes.dim_size(1);
109   OP_REQUIRES(context, boxes.dim_size(3) == 4,
110               errors::InvalidArgument("boxes must have 4 columns"));
111 }
112 // Return intersection-over-union overlap between boxes i and j
113 template <typename T>
IOU(typename TTypes<T,2>::ConstTensor boxes,int i,int j)114 static inline float IOU(typename TTypes<T, 2>::ConstTensor boxes, int i,
115                         int j) {
116   const float ymin_i = Eigen::numext::mini<float>(boxes(i, 0), boxes(i, 2));
117   const float xmin_i = Eigen::numext::mini<float>(boxes(i, 1), boxes(i, 3));
118   const float ymax_i = Eigen::numext::maxi<float>(boxes(i, 0), boxes(i, 2));
119   const float xmax_i = Eigen::numext::maxi<float>(boxes(i, 1), boxes(i, 3));
120   const float ymin_j = Eigen::numext::mini<float>(boxes(j, 0), boxes(j, 2));
121   const float xmin_j = Eigen::numext::mini<float>(boxes(j, 1), boxes(j, 3));
122   const float ymax_j = Eigen::numext::maxi<float>(boxes(j, 0), boxes(j, 2));
123   const float xmax_j = Eigen::numext::maxi<float>(boxes(j, 1), boxes(j, 3));
124   const float area_i = (ymax_i - ymin_i) * (xmax_i - xmin_i);
125   const float area_j = (ymax_j - ymin_j) * (xmax_j - xmin_j);
126   if (area_i <= 0 || area_j <= 0) {
127     return 0.0;
128   }
129   const float intersection_ymin = Eigen::numext::maxi<float>(ymin_i, ymin_j);
130   const float intersection_xmin = Eigen::numext::maxi<float>(xmin_i, xmin_j);
131   const float intersection_ymax = Eigen::numext::mini<float>(ymax_i, ymax_j);
132   const float intersection_xmax = Eigen::numext::mini<float>(xmax_i, xmax_j);
133   const float intersection_area =
134       Eigen::numext::maxi<float>(intersection_ymax - intersection_ymin, 0.0) *
135       Eigen::numext::maxi<float>(intersection_xmax - intersection_xmin, 0.0);
136   return intersection_area / (area_i + area_j - intersection_area);
137 }
138 
139 template <typename T>
Overlap(typename TTypes<T,2>::ConstTensor overlaps,int i,int j)140 static inline T Overlap(typename TTypes<T, 2>::ConstTensor overlaps, int i,
141                         int j) {
142   return overlaps(i, j);
143 }
144 
145 template <typename T>
CreateIOUSimilarityFn(const Tensor & boxes)146 static inline std::function<float(int, int)> CreateIOUSimilarityFn(
147     const Tensor& boxes) {
148   typename TTypes<T, 2>::ConstTensor boxes_data = boxes.tensor<T, 2>();
149   return std::bind(&IOU<T>, boxes_data, std::placeholders::_1,
150                    std::placeholders::_2);
151 }
152 
153 template <typename T>
CreateOverlapSimilarityFn(const Tensor & overlaps)154 static inline std::function<T(int, int)> CreateOverlapSimilarityFn(
155     const Tensor& overlaps) {
156   typename TTypes<T, 2>::ConstTensor overlaps_data =
157       overlaps.tensor<float, 2>();
158   return std::bind(&Overlap<T>, overlaps_data, std::placeholders::_1,
159                    std::placeholders::_2);
160 }
161 
162 template <typename T>
DoNonMaxSuppressionOp(OpKernelContext * context,const Tensor & scores,int num_boxes,const Tensor & max_output_size,const T similarity_threshold,const T score_threshold,const T soft_nms_sigma,const std::function<float (int,int)> & similarity_fn,bool return_scores_tensor=false,bool pad_to_max_output_size=false,int * ptr_num_valid_outputs=nullptr)163 void DoNonMaxSuppressionOp(OpKernelContext* context, const Tensor& scores,
164                            int num_boxes, const Tensor& max_output_size,
165                            const T similarity_threshold,
166                            const T score_threshold, const T soft_nms_sigma,
167                            const std::function<float(int, int)>& similarity_fn,
168                            bool return_scores_tensor = false,
169                            bool pad_to_max_output_size = false,
170                            int* ptr_num_valid_outputs = nullptr) {
171   const int output_size = max_output_size.scalar<int>()();
172   OP_REQUIRES(context, output_size >= 0,
173               errors::InvalidArgument("output size must be non-negative"));
174 
175   std::vector<T> scores_data(num_boxes);
176   std::copy_n(scores.flat<T>().data(), num_boxes, scores_data.begin());
177 
178   // Data structure for a selection candidate in NMS.
179   struct Candidate {
180     int box_index;
181     T score;
182     int suppress_begin_index;
183   };
184 
185   auto cmp = [](const Candidate bs_i, const Candidate bs_j) {
186     return ((bs_i.score == bs_j.score) && (bs_i.box_index > bs_j.box_index)) ||
187            bs_i.score < bs_j.score;
188   };
189   std::priority_queue<Candidate, std::deque<Candidate>, decltype(cmp)>
190       candidate_priority_queue(cmp);
191   for (int i = 0; i < scores_data.size(); ++i) {
192     if (scores_data[i] > score_threshold) {
193       candidate_priority_queue.emplace(Candidate({i, scores_data[i], 0}));
194     }
195   }
196 
197   T scale = static_cast<T>(0.0);
198   bool is_soft_nms = soft_nms_sigma > static_cast<T>(0.0);
199   if (is_soft_nms) {
200     scale = static_cast<T>(-0.5) / soft_nms_sigma;
201   }
202 
203   auto suppress_weight = [similarity_threshold, scale,
204                           is_soft_nms](const T sim) {
205     const T weight = Eigen::numext::exp<T>(scale * sim * sim);
206     return is_soft_nms || sim <= similarity_threshold ? weight
207                                                       : static_cast<T>(0.0);
208   };
209 
210   std::vector<int> selected;
211   std::vector<T> selected_scores;
212   float similarity;
213   T original_score;
214   Candidate next_candidate;
215 
216   while (selected.size() < output_size && !candidate_priority_queue.empty()) {
217     next_candidate = candidate_priority_queue.top();
218     original_score = next_candidate.score;
219     candidate_priority_queue.pop();
220 
221     // Overlapping boxes are likely to have similar scores, therefore we
222     // iterate through the previously selected boxes backwards in order to
223     // see if `next_candidate` should be suppressed. We also enforce a property
224     // that a candidate can be suppressed by another candidate no more than
225     // once via `suppress_begin_index` which tracks which previously selected
226     // boxes have already been compared against next_candidate prior to a given
227     // iteration.  These previous selected boxes are then skipped over in the
228     // following loop.
229     bool should_hard_suppress = false;
230     for (int j = static_cast<int>(selected.size()) - 1;
231          j >= next_candidate.suppress_begin_index; --j) {
232       similarity = similarity_fn(next_candidate.box_index, selected[j]);
233 
234       next_candidate.score *= suppress_weight(static_cast<T>(similarity));
235 
236       // First decide whether to perform hard suppression
237       if (!is_soft_nms && static_cast<T>(similarity) > similarity_threshold) {
238         should_hard_suppress = true;
239         break;
240       }
241 
242       // If next_candidate survives hard suppression, apply soft suppression
243       if (next_candidate.score <= score_threshold) break;
244     }
245     // If `next_candidate.score` has not dropped below `score_threshold`
246     // by this point, then we know that we went through all of the previous
247     // selections and can safely update `suppress_begin_index` to
248     // `selected.size()`. If on the other hand `next_candidate.score`
249     // *has* dropped below the score threshold, then since `suppress_weight`
250     // always returns values in [0, 1], further suppression by items that were
251     // not covered in the above for loop would not have caused the algorithm
252     // to select this item. We thus do the same update to
253     // `suppress_begin_index`, but really, this element will not be added back
254     // into the priority queue in the following.
255     next_candidate.suppress_begin_index = selected.size();
256 
257     if (!should_hard_suppress) {
258       if (next_candidate.score == original_score) {
259         // Suppression has not occurred, so select next_candidate
260         selected.push_back(next_candidate.box_index);
261         selected_scores.push_back(next_candidate.score);
262         continue;
263       }
264       if (next_candidate.score > score_threshold) {
265         // Soft suppression has occurred and current score is still greater than
266         // score_threshold; add next_candidate back onto priority queue.
267         candidate_priority_queue.push(next_candidate);
268       }
269     }
270   }
271 
272   int num_valid_outputs = selected.size();
273   if (pad_to_max_output_size) {
274     selected.resize(output_size, 0);
275     selected_scores.resize(output_size, static_cast<T>(0));
276   }
277   if (ptr_num_valid_outputs) {
278     *ptr_num_valid_outputs = num_valid_outputs;
279   }
280 
281   // Allocate output tensors
282   Tensor* output_indices = nullptr;
283   TensorShape output_shape({static_cast<int>(selected.size())});
284   OP_REQUIRES_OK(context,
285                  context->allocate_output(0, output_shape, &output_indices));
286   TTypes<int, 1>::Tensor output_indices_data = output_indices->tensor<int, 1>();
287   std::copy_n(selected.begin(), selected.size(), output_indices_data.data());
288 
289   if (return_scores_tensor) {
290     Tensor* output_scores = nullptr;
291     OP_REQUIRES_OK(context,
292                    context->allocate_output(1, output_shape, &output_scores));
293     typename TTypes<T, 1>::Tensor output_scores_data =
294         output_scores->tensor<T, 1>();
295     std::copy_n(selected_scores.begin(), selected_scores.size(),
296                 output_scores_data.data());
297   }
298 }
299 
// One slot of the per-batch result buffer used by the combined (batched) NMS
// path. The field order matters: instances are created with positional
// aggregate initialisation.
struct ResultCandidate {
  // Index of the selected box; values <= -1 mark a slot that was never filled
  // (such slots are excluded when valid results are counted).
  int box_index;
  // Score of the selected box.
  float score;
  // Class for which the box was selected.
  int class_idx;
  // The four coordinates of the box, copied from columns 0..3 of its row.
  float box_coord[4];
};
306 
DoNMSPerClass(int batch_idx,int class_idx,const float * boxes_data,const float * scores_data,int num_boxes,int q,int num_classes,const int size_per_class,const float score_threshold,const float iou_threshold,std::vector<ResultCandidate> & result_candidate_vec)307 void DoNMSPerClass(int batch_idx, int class_idx, const float* boxes_data,
308                    const float* scores_data, int num_boxes, int q,
309                    int num_classes, const int size_per_class,
310                    const float score_threshold, const float iou_threshold,
311                    std::vector<ResultCandidate>& result_candidate_vec) {
312   std::vector<float> class_scores_data;
313   class_scores_data.reserve(num_boxes);
314   std::vector<float> class_boxes_data;
315   class_boxes_data.reserve(num_boxes * 4);
316 
317   for (int box_idx = 0; box_idx < num_boxes; ++box_idx) {
318     class_scores_data.push_back(scores_data[box_idx * num_classes + class_idx]);
319     for (int cid = 0; cid < 4; ++cid) {
320       if (q > 1) {
321         class_boxes_data.push_back(
322             boxes_data[(box_idx * q + class_idx) * 4 + cid]);
323       } else {
324         class_boxes_data.push_back(boxes_data[box_idx * 4 + cid]);
325       }
326     }
327   }
328 
329   // Do NMS, get the candidate indices of form vector<int>
330   // Data structure for selection candidate in NMS.
331   struct Candidate {
332     int box_index;
333     float score;
334   };
335   auto cmp = [](const Candidate bs_i, const Candidate bs_j) {
336     return bs_i.score < bs_j.score;
337   };
338   std::priority_queue<Candidate, std::vector<Candidate>, decltype(cmp)>
339       candidate_priority_queue(cmp);
340   for (int i = 0; i < num_boxes; ++i) {
341     if (class_scores_data[i] > score_threshold) {
342       candidate_priority_queue.emplace(Candidate({i, class_scores_data[i]}));
343     }
344   }
345 
346   std::vector<int> selected;
347   std::vector<float> selected_boxes;
348   Candidate next_candidate;
349 
350   // Move class_boxes_data to a tensor
351   Eigen::array<Eigen::DenseIndex, 2> boxesShape = {num_boxes, 4};
352   typename TTypes<float, 2>::ConstTensor boxes_data_t(class_boxes_data.data(),
353                                                       boxesShape);
354   float iou;
355   while (selected.size() < size_per_class &&
356          !candidate_priority_queue.empty()) {
357     next_candidate = candidate_priority_queue.top();
358     candidate_priority_queue.pop();
359     // Overlapping boxes are likely to have similar scores,
360     // therefore we iterate through the previously selected boxes backwards
361     // in order to see if `next_candidate` should be suppressed.
362     bool should_select = true;
363     for (int j = selected.size() - 1; j >= 0; --j) {
364       iou = IOU<float>(boxes_data_t, next_candidate.box_index, selected[j]);
365       if (iou > iou_threshold) {
366         should_select = false;
367         break;
368       }
369     }
370 
371     if (should_select) {
372       // Add the selected box to the result candidate. Sorted by score
373       int id = next_candidate.box_index;
374       result_candidate_vec[selected.size() + size_per_class * class_idx] = {
375           next_candidate.box_index,
376           next_candidate.score,
377           class_idx,
378           {boxes_data_t(id, 0), boxes_data_t(id, 1), boxes_data_t(id, 2),
379            boxes_data_t(id, 3)}};
380       selected.push_back(next_candidate.box_index);
381     }
382   }
383 }
384 
SelectResultPerBatch(std::vector<float> & nmsed_boxes,std::vector<float> & nmsed_scores,std::vector<float> & nmsed_classes,std::vector<ResultCandidate> & result_candidate_vec,std::vector<int> & final_valid_detections,const int batch_idx,int total_size_per_batch,bool pad_per_class,int max_size_per_batch,bool clip_boxes,int per_batch_size)385 void SelectResultPerBatch(std::vector<float>& nmsed_boxes,
386                           std::vector<float>& nmsed_scores,
387                           std::vector<float>& nmsed_classes,
388                           std::vector<ResultCandidate>& result_candidate_vec,
389                           std::vector<int>& final_valid_detections,
390                           const int batch_idx, int total_size_per_batch,
391                           bool pad_per_class, int max_size_per_batch,
392                           bool clip_boxes, int per_batch_size) {
393   auto rc_cmp = [](const ResultCandidate rc_i, const ResultCandidate rc_j) {
394     return rc_i.score > rc_j.score;
395   };
396   std::sort(result_candidate_vec.begin(), result_candidate_vec.end(), rc_cmp);
397 
398   int max_detections = 0;
399   int result_candidate_size =
400       std::count_if(result_candidate_vec.begin(), result_candidate_vec.end(),
401                     [](ResultCandidate rc) { return rc.box_index > -1; });
402   // If pad_per_class is false, we always pad to max_total_size
403   if (!pad_per_class) {
404     max_detections = std::min(result_candidate_size, total_size_per_batch);
405   } else {
406     max_detections = std::min(per_batch_size, result_candidate_size);
407   }
408 
409   final_valid_detections[batch_idx] = max_detections;
410 
411   int curr_total_size = max_detections;
412   int result_idx = 0;
413   // Pick the top max_detections values
414   while (curr_total_size > 0 && result_idx < result_candidate_vec.size()) {
415     ResultCandidate next_candidate = result_candidate_vec[result_idx++];
416     // Add to final output vectors
417     if (clip_boxes) {
418       const float box_min = 0.0;
419       const float box_max = 1.0;
420       nmsed_boxes.push_back(
421           std::max(std::min(next_candidate.box_coord[0], box_max), box_min));
422       nmsed_boxes.push_back(
423           std::max(std::min(next_candidate.box_coord[1], box_max), box_min));
424       nmsed_boxes.push_back(
425           std::max(std::min(next_candidate.box_coord[2], box_max), box_min));
426       nmsed_boxes.push_back(
427           std::max(std::min(next_candidate.box_coord[3], box_max), box_min));
428     } else {
429       nmsed_boxes.push_back(next_candidate.box_coord[0]);
430       nmsed_boxes.push_back(next_candidate.box_coord[1]);
431       nmsed_boxes.push_back(next_candidate.box_coord[2]);
432       nmsed_boxes.push_back(next_candidate.box_coord[3]);
433     }
434     nmsed_scores.push_back(next_candidate.score);
435     nmsed_classes.push_back(next_candidate.class_idx);
436     curr_total_size--;
437   }
438 
439   nmsed_boxes.resize(per_batch_size * 4, 0);
440   nmsed_scores.resize(per_batch_size, 0);
441   nmsed_classes.resize(per_batch_size, 0);
442 }
443 
BatchedNonMaxSuppressionOp(OpKernelContext * context,const Tensor & inp_boxes,const Tensor & inp_scores,int num_boxes,const int max_size_per_class,const int total_size_per_batch,const float score_threshold,const float iou_threshold,bool pad_per_class=false,bool clip_boxes=true)444 void BatchedNonMaxSuppressionOp(
445     OpKernelContext* context, const Tensor& inp_boxes, const Tensor& inp_scores,
446     int num_boxes, const int max_size_per_class, const int total_size_per_batch,
447     const float score_threshold, const float iou_threshold,
448     bool pad_per_class = false, bool clip_boxes = true) {
449   const int num_batches = inp_boxes.dim_size(0);
450   int num_classes = inp_scores.dim_size(2);
451   int q = inp_boxes.dim_size(2);
452 
453   const float* scores_data =
454       const_cast<float*>(inp_scores.flat<float>().data());
455   const float* boxes_data = const_cast<float*>(inp_boxes.flat<float>().data());
456 
457   int boxes_per_batch = num_boxes * q * 4;
458   int scores_per_batch = num_boxes * num_classes;
459   const int size_per_class = std::min(max_size_per_class, num_boxes);
460   std::vector<std::vector<ResultCandidate>> result_candidate_vec(
461       num_batches,
462       std::vector<ResultCandidate>(size_per_class * num_classes,
463                                    {-1, -1.0, -1, {0.0, 0.0, 0.0, 0.0}}));
464 
465   // [num_batches, per_batch_size * 4]
466   std::vector<std::vector<float>> nmsed_boxes(num_batches);
467   // [num_batches, per_batch_size]
468   std::vector<std::vector<float>> nmsed_scores(num_batches);
469   // [num_batches, per_batch_size]
470   std::vector<std::vector<float>> nmsed_classes(num_batches);
471   // [num_batches]
472   std::vector<int> final_valid_detections(num_batches);
473 
474   auto shard_nms = [&](int begin, int end) {
475     for (int idx = begin; idx < end; ++idx) {
476       int batch_idx = idx / num_classes;
477       int class_idx = idx % num_classes;
478       DoNMSPerClass(batch_idx, class_idx,
479                     boxes_data + boxes_per_batch * batch_idx,
480                     scores_data + scores_per_batch * batch_idx, num_boxes, q,
481                     num_classes, size_per_class, score_threshold, iou_threshold,
482                     result_candidate_vec[batch_idx]);
483     }
484   };
485 
486   int length = num_batches * num_classes;
487   // Input data boxes_data, scores_data
488   int input_bytes = num_boxes * 10 * sizeof(float);
489   int output_bytes = num_boxes * 10 * sizeof(float);
490   int compute_cycles = Eigen::TensorOpCost::AddCost<int>() * num_boxes * 14 +
491                        Eigen::TensorOpCost::MulCost<int>() * num_boxes * 9 +
492                        Eigen::TensorOpCost::MulCost<float>() * num_boxes * 9 +
493                        Eigen::TensorOpCost::AddCost<float>() * num_boxes * 8;
494   // The cost here is not the actual number of cycles, but rather a set of
495   // hand-tuned numbers that seem to work best.
496   const Eigen::TensorOpCost cost(input_bytes, output_bytes, compute_cycles);
497   const CPUDevice& d = context->eigen_device<CPUDevice>();
498   d.parallelFor(length, cost, shard_nms);
499 
500   int per_batch_size = total_size_per_batch;
501   if (pad_per_class) {
502     per_batch_size =
503         std::min(total_size_per_batch, max_size_per_class * num_classes);
504   }
505 
506   Tensor* valid_detections_t = nullptr;
507   TensorShape valid_detections_shape({num_batches});
508   OP_REQUIRES_OK(context, context->allocate_output(3, valid_detections_shape,
509                                                    &valid_detections_t));
510   auto valid_detections_flat = valid_detections_t->template flat<int>();
511 
512   auto shard_result = [&](int begin, int end) {
513     for (int batch_idx = begin; batch_idx < end; ++batch_idx) {
514       SelectResultPerBatch(
515           nmsed_boxes[batch_idx], nmsed_scores[batch_idx],
516           nmsed_classes[batch_idx], result_candidate_vec[batch_idx],
517           final_valid_detections, batch_idx, total_size_per_batch,
518           pad_per_class, max_size_per_class * num_classes, clip_boxes,
519           per_batch_size);
520       valid_detections_flat(batch_idx) = final_valid_detections[batch_idx];
521     }
522   };
523   length = num_batches;
524   // Input data boxes_data, scores_data
525   input_bytes =
526       num_boxes * 10 * sizeof(float) + per_batch_size * 6 * sizeof(float);
527   output_bytes =
528       num_boxes * 5 * sizeof(float) + per_batch_size * 6 * sizeof(float);
529   compute_cycles = Eigen::TensorOpCost::AddCost<int>() * num_boxes * 5 +
530                    Eigen::TensorOpCost::AddCost<float>() * num_boxes * 5;
531   // The cost here is not the actual number of cycles, but rather a set of
532   // hand-tuned numbers that seem to work best.
533   const Eigen::TensorOpCost cost_result(input_bytes, output_bytes,
534                                         compute_cycles);
535   d.parallelFor(length, cost_result, shard_result);
536 
537   Tensor* nmsed_boxes_t = nullptr;
538   TensorShape boxes_shape({num_batches, per_batch_size, 4});
539   OP_REQUIRES_OK(context,
540                  context->allocate_output(0, boxes_shape, &nmsed_boxes_t));
541   auto nmsed_boxes_flat = nmsed_boxes_t->template flat<float>();
542 
543   Tensor* nmsed_scores_t = nullptr;
544   TensorShape scores_shape({num_batches, per_batch_size});
545   OP_REQUIRES_OK(context,
546                  context->allocate_output(1, scores_shape, &nmsed_scores_t));
547   auto nmsed_scores_flat = nmsed_scores_t->template flat<float>();
548 
549   Tensor* nmsed_classes_t = nullptr;
550   OP_REQUIRES_OK(context,
551                  context->allocate_output(2, scores_shape, &nmsed_classes_t));
552   auto nmsed_classes_flat = nmsed_classes_t->template flat<float>();
553 
554   auto shard_copy_result = [&](int begin, int end) {
555     for (int idx = begin; idx < end; ++idx) {
556       int batch_idx = idx / per_batch_size;
557       int j = idx % per_batch_size;
558       nmsed_scores_flat(idx) = nmsed_scores[batch_idx][j];
559       nmsed_classes_flat(idx) = nmsed_classes[batch_idx][j];
560       for (int k = 0; k < 4; ++k) {
561         nmsed_boxes_flat(idx * 4 + k) = nmsed_boxes[batch_idx][j * 4 + k];
562       }
563     }
564   };
565   length = num_batches * per_batch_size;
566   // Input data boxes_data, scores_data
567   input_bytes = 6 * sizeof(float);
568   output_bytes = 6 * sizeof(float);
569   compute_cycles = Eigen::TensorOpCost::AddCost<int>() * 2 +
570                    Eigen::TensorOpCost::MulCost<int>() * 2 +
571                    Eigen::TensorOpCost::DivCost<float>() * 2;
572   const Eigen::TensorOpCost cost_copy_result(input_bytes, output_bytes,
573                                              compute_cycles);
574   d.parallelFor(length, cost_copy_result, shard_copy_result);
575 }
576 
577 }  // namespace
578 
579 template <typename Device>
580 class NonMaxSuppressionOp : public OpKernel {
581  public:
NonMaxSuppressionOp(OpKernelConstruction * context)582   explicit NonMaxSuppressionOp(OpKernelConstruction* context)
583       : OpKernel(context) {
584     OP_REQUIRES_OK(context, context->GetAttr("iou_threshold", &iou_threshold_));
585   }
586 
Compute(OpKernelContext * context)587   void Compute(OpKernelContext* context) override {
588     // boxes: [num_boxes, 4]
589     const Tensor& boxes = context->input(0);
590     // scores: [num_boxes]
591     const Tensor& scores = context->input(1);
592     // max_output_size: scalar
593     const Tensor& max_output_size = context->input(2);
594     OP_REQUIRES(
595         context, TensorShapeUtils::IsScalar(max_output_size.shape()),
596         errors::InvalidArgument("max_output_size must be 0-D, got shape ",
597                                 max_output_size.shape().DebugString()));
598 
599     OP_REQUIRES(context, iou_threshold_ >= 0 && iou_threshold_ <= 1,
600                 errors::InvalidArgument("iou_threshold must be in [0, 1]"));
601     int num_boxes = 0;
602     ParseAndCheckBoxSizes(context, boxes, &num_boxes);
603     CheckScoreSizes(context, num_boxes, scores);
604     if (!context->status().ok()) {
605       return;
606     }
607     auto similarity_fn = CreateIOUSimilarityFn<float>(boxes);
608 
609     const float score_threshold_val = std::numeric_limits<float>::lowest();
610     const float dummy_soft_nms_sigma = static_cast<float>(0.0);
611     DoNonMaxSuppressionOp<float>(context, scores, num_boxes, max_output_size,
612                                  iou_threshold_, score_threshold_val,
613                                  dummy_soft_nms_sigma, similarity_fn);
614   }
615 
616  private:
617   float iou_threshold_;
618 };
619 
620 template <typename Device, typename T>
621 class NonMaxSuppressionV2Op : public OpKernel {
622  public:
NonMaxSuppressionV2Op(OpKernelConstruction * context)623   explicit NonMaxSuppressionV2Op(OpKernelConstruction* context)
624       : OpKernel(context) {}
625 
Compute(OpKernelContext * context)626   void Compute(OpKernelContext* context) override {
627     // boxes: [num_boxes, 4]
628     const Tensor& boxes = context->input(0);
629     // scores: [num_boxes]
630     const Tensor& scores = context->input(1);
631     // max_output_size: scalar
632     const Tensor& max_output_size = context->input(2);
633     OP_REQUIRES(
634         context, TensorShapeUtils::IsScalar(max_output_size.shape()),
635         errors::InvalidArgument("max_output_size must be 0-D, got shape ",
636                                 max_output_size.shape().DebugString()));
637     // iou_threshold: scalar
638     const Tensor& iou_threshold = context->input(3);
639     OP_REQUIRES(context, TensorShapeUtils::IsScalar(iou_threshold.shape()),
640                 errors::InvalidArgument("iou_threshold must be 0-D, got shape ",
641                                         iou_threshold.shape().DebugString()));
642     const T iou_threshold_val = iou_threshold.scalar<T>()();
643 
644     OP_REQUIRES(context,
645                 iou_threshold_val >= static_cast<T>(0.0) &&
646                     iou_threshold_val <= static_cast<T>(1.0),
647                 errors::InvalidArgument("iou_threshold must be in [0, 1]"));
648     int num_boxes = 0;
649     ParseAndCheckBoxSizes(context, boxes, &num_boxes);
650     CheckScoreSizes(context, num_boxes, scores);
651     if (!context->status().ok()) {
652       return;
653     }
654     auto similarity_fn = CreateIOUSimilarityFn<T>(boxes);
655 
656     const T score_threshold_val = std::numeric_limits<T>::lowest();
657     const T dummy_soft_nms_sigma = static_cast<T>(0.0);
658     DoNonMaxSuppressionOp<T>(context, scores, num_boxes, max_output_size,
659                              iou_threshold_val, score_threshold_val,
660                              dummy_soft_nms_sigma, similarity_fn);
661   }
662 };
663 
664 template <typename Device, typename T>
665 class NonMaxSuppressionV3Op : public OpKernel {
666  public:
NonMaxSuppressionV3Op(OpKernelConstruction * context)667   explicit NonMaxSuppressionV3Op(OpKernelConstruction* context)
668       : OpKernel(context) {}
669 
Compute(OpKernelContext * context)670   void Compute(OpKernelContext* context) override {
671     // boxes: [num_boxes, 4]
672     const Tensor& boxes = context->input(0);
673     // scores: [num_boxes]
674     const Tensor& scores = context->input(1);
675     // max_output_size: scalar
676     const Tensor& max_output_size = context->input(2);
677     OP_REQUIRES(
678         context, TensorShapeUtils::IsScalar(max_output_size.shape()),
679         errors::InvalidArgument("max_output_size must be 0-D, got shape ",
680                                 max_output_size.shape().DebugString(),
681                                 " (Shape must be rank 0 but is ", "rank ",
682                                 max_output_size.dims(), ")"));
683     // iou_threshold: scalar
684     const Tensor& iou_threshold = context->input(3);
685     OP_REQUIRES(context, TensorShapeUtils::IsScalar(iou_threshold.shape()),
686                 errors::InvalidArgument("iou_threshold must be 0-D, got shape ",
687                                         iou_threshold.shape().DebugString(),
688                                         " (Shape must be rank 0 but is rank ",
689                                         iou_threshold.dims(), ")"));
690     const T iou_threshold_val = iou_threshold.scalar<T>()();
691     OP_REQUIRES(context,
692                 iou_threshold_val >= static_cast<T>(0.0) &&
693                     iou_threshold_val <= static_cast<T>(1.0),
694                 errors::InvalidArgument("iou_threshold must be in [0, 1]"));
695     // score_threshold: scalar
696     const Tensor& score_threshold = context->input(4);
697     OP_REQUIRES(
698         context, TensorShapeUtils::IsScalar(score_threshold.shape()),
699         errors::InvalidArgument("score_threshold must be 0-D, got shape ",
700                                 score_threshold.shape().DebugString()));
701     const T score_threshold_val = score_threshold.scalar<T>()();
702 
703     int num_boxes = 0;
704     ParseAndCheckBoxSizes(context, boxes, &num_boxes);
705     CheckScoreSizes(context, num_boxes, scores);
706     if (!context->status().ok()) {
707       return;
708     }
709 
710     auto similarity_fn = CreateIOUSimilarityFn<T>(boxes);
711 
712     const T dummy_soft_nms_sigma = static_cast<T>(0.0);
713     DoNonMaxSuppressionOp<T>(context, scores, num_boxes, max_output_size,
714                              iou_threshold_val, score_threshold_val,
715                              dummy_soft_nms_sigma, similarity_fn);
716   }
717 };
718 
719 template <typename Device, typename T>
720 class NonMaxSuppressionV4Op : public OpKernel {
721  public:
NonMaxSuppressionV4Op(OpKernelConstruction * context)722   explicit NonMaxSuppressionV4Op(OpKernelConstruction* context)
723       : OpKernel(context) {
724     OP_REQUIRES_OK(context, context->GetAttr("pad_to_max_output_size",
725                                              &pad_to_max_output_size_));
726   }
727 
Compute(OpKernelContext * context)728   void Compute(OpKernelContext* context) override {
729     // boxes: [num_boxes, 4]
730     const Tensor& boxes = context->input(0);
731     // scores: [num_boxes]
732     const Tensor& scores = context->input(1);
733     // max_output_size: scalar
734     const Tensor& max_output_size = context->input(2);
735     OP_REQUIRES(
736         context, TensorShapeUtils::IsScalar(max_output_size.shape()),
737         errors::InvalidArgument("max_output_size must be 0-D, got shape ",
738                                 max_output_size.shape().DebugString()));
739     // iou_threshold: scalar
740     const Tensor& iou_threshold = context->input(3);
741     OP_REQUIRES(context, TensorShapeUtils::IsScalar(iou_threshold.shape()),
742                 errors::InvalidArgument("iou_threshold must be 0-D, got shape ",
743                                         iou_threshold.shape().DebugString()));
744     const T iou_threshold_val = iou_threshold.scalar<T>()();
745     OP_REQUIRES(context,
746                 iou_threshold_val >= static_cast<T>(0.0) &&
747                     iou_threshold_val <= static_cast<T>(1.0),
748                 errors::InvalidArgument("iou_threshold must be in [0, 1]"));
749     // score_threshold: scalar
750     const Tensor& score_threshold = context->input(4);
751     OP_REQUIRES(
752         context, TensorShapeUtils::IsScalar(score_threshold.shape()),
753         errors::InvalidArgument("score_threshold must be 0-D, got shape ",
754                                 score_threshold.shape().DebugString()));
755     const T score_threshold_val = score_threshold.scalar<T>()();
756 
757     int num_boxes = 0;
758     ParseAndCheckBoxSizes(context, boxes, &num_boxes);
759     CheckScoreSizes(context, num_boxes, scores);
760     if (!context->status().ok()) {
761       return;
762     }
763 
764     auto similarity_fn = CreateIOUSimilarityFn<T>(boxes);
765     int num_valid_outputs;
766 
767     bool return_scores_tensor_ = false;
768     const T dummy_soft_nms_sigma = static_cast<T>(0.0);
769     DoNonMaxSuppressionOp<T>(
770         context, scores, num_boxes, max_output_size, iou_threshold_val,
771         score_threshold_val, dummy_soft_nms_sigma, similarity_fn,
772         return_scores_tensor_, pad_to_max_output_size_, &num_valid_outputs);
773     if (!context->status().ok()) {
774       return;
775     }
776 
777     // Allocate scalar output tensor for number of indices computed.
778     Tensor* num_outputs_t = nullptr;
779     OP_REQUIRES_OK(context, context->allocate_output(
780                                 1, tensorflow::TensorShape{}, &num_outputs_t));
781     num_outputs_t->scalar<int32>().setConstant(num_valid_outputs);
782   }
783 
784  private:
785   bool pad_to_max_output_size_;
786 };
787 
788 template <typename Device, typename T>
789 class NonMaxSuppressionV5Op : public OpKernel {
790  public:
NonMaxSuppressionV5Op(OpKernelConstruction * context)791   explicit NonMaxSuppressionV5Op(OpKernelConstruction* context)
792       : OpKernel(context) {
793     OP_REQUIRES_OK(context, context->GetAttr("pad_to_max_output_size",
794                                              &pad_to_max_output_size_));
795   }
796 
Compute(OpKernelContext * context)797   void Compute(OpKernelContext* context) override {
798     // boxes: [num_boxes, 4]
799     const Tensor& boxes = context->input(0);
800     // scores: [num_boxes]
801     const Tensor& scores = context->input(1);
802     // max_output_size: scalar
803     const Tensor& max_output_size = context->input(2);
804     OP_REQUIRES(
805         context, TensorShapeUtils::IsScalar(max_output_size.shape()),
806         errors::InvalidArgument("max_output_size must be 0-D, got shape ",
807                                 max_output_size.shape().DebugString()));
808     // iou_threshold: scalar
809     const Tensor& iou_threshold = context->input(3);
810     OP_REQUIRES(context, TensorShapeUtils::IsScalar(iou_threshold.shape()),
811                 errors::InvalidArgument("iou_threshold must be 0-D, got shape ",
812                                         iou_threshold.shape().DebugString()));
813     const T iou_threshold_val = iou_threshold.scalar<T>()();
814     OP_REQUIRES(context,
815                 iou_threshold_val >= static_cast<T>(0.0) &&
816                     iou_threshold_val <= static_cast<T>(1.0),
817                 errors::InvalidArgument("iou_threshold must be in [0, 1]"));
818     // score_threshold: scalar
819     const Tensor& score_threshold = context->input(4);
820     OP_REQUIRES(
821         context, TensorShapeUtils::IsScalar(score_threshold.shape()),
822         errors::InvalidArgument("score_threshold must be 0-D, got shape ",
823                                 score_threshold.shape().DebugString()));
824     const T score_threshold_val = score_threshold.scalar<T>()();
825 
826     // soft_nms_sigma: scalar
827     const Tensor& soft_nms_sigma = context->input(5);
828     OP_REQUIRES(
829         context, TensorShapeUtils::IsScalar(soft_nms_sigma.shape()),
830         errors::InvalidArgument("soft_nms_sigma must be 0-D, got shape ",
831                                 soft_nms_sigma.shape().DebugString()));
832     const T soft_nms_sigma_val = soft_nms_sigma.scalar<T>()();
833     OP_REQUIRES(context, soft_nms_sigma_val >= static_cast<T>(0.0),
834                 errors::InvalidArgument("soft_nms_sigma_val must be >= 0"));
835 
836     int num_boxes = 0;
837     ParseAndCheckBoxSizes(context, boxes, &num_boxes);
838     CheckScoreSizes(context, num_boxes, scores);
839     if (!context->status().ok()) {
840       return;
841     }
842 
843     auto similarity_fn = CreateIOUSimilarityFn<T>(boxes);
844     int num_valid_outputs;
845 
846     // For NonMaxSuppressionV5Op, we always return a second output holding
847     // corresponding scores, so `return_scores_tensor` should never be false.
848     const bool return_scores_tensor_ = true;
849     DoNonMaxSuppressionOp<T>(
850         context, scores, num_boxes, max_output_size, iou_threshold_val,
851         score_threshold_val, soft_nms_sigma_val, similarity_fn,
852         return_scores_tensor_, pad_to_max_output_size_, &num_valid_outputs);
853     if (!context->status().ok()) {
854       return;
855     }
856 
857     // Allocate scalar output tensor for number of indices computed.
858     Tensor* num_outputs_t = nullptr;
859     OP_REQUIRES_OK(context, context->allocate_output(
860                                 2, tensorflow::TensorShape{}, &num_outputs_t));
861     num_outputs_t->scalar<int32>().setConstant(num_valid_outputs);
862   }
863 
864  private:
865   bool pad_to_max_output_size_;
866 };
867 
868 template <typename Device>
869 class NonMaxSuppressionWithOverlapsOp : public OpKernel {
870  public:
NonMaxSuppressionWithOverlapsOp(OpKernelConstruction * context)871   explicit NonMaxSuppressionWithOverlapsOp(OpKernelConstruction* context)
872       : OpKernel(context) {}
873 
Compute(OpKernelContext * context)874   void Compute(OpKernelContext* context) override {
875     // overlaps: [num_boxes, num_boxes]
876     const Tensor& overlaps = context->input(0);
877     // scores: [num_boxes]
878     const Tensor& scores = context->input(1);
879     // max_output_size: scalar
880     const Tensor& max_output_size = context->input(2);
881     OP_REQUIRES(
882         context, TensorShapeUtils::IsScalar(max_output_size.shape()),
883         errors::InvalidArgument("max_output_size must be 0-D, got shape ",
884                                 max_output_size.shape().DebugString()));
885     // overlap_threshold: scalar
886     const Tensor& overlap_threshold = context->input(3);
887     OP_REQUIRES(
888         context, TensorShapeUtils::IsScalar(overlap_threshold.shape()),
889         errors::InvalidArgument("overlap_threshold must be 0-D, got shape ",
890                                 overlap_threshold.shape().DebugString()));
891     const float overlap_threshold_val = overlap_threshold.scalar<float>()();
892 
893     // score_threshold: scalar
894     const Tensor& score_threshold = context->input(4);
895     OP_REQUIRES(
896         context, TensorShapeUtils::IsScalar(score_threshold.shape()),
897         errors::InvalidArgument("score_threshold must be 0-D, got shape ",
898                                 score_threshold.shape().DebugString()));
899     const float score_threshold_val = score_threshold.scalar<float>()();
900 
901     int num_boxes = 0;
902     ParseAndCheckOverlapSizes(context, overlaps, &num_boxes);
903     CheckScoreSizes(context, num_boxes, scores);
904     if (!context->status().ok()) {
905       return;
906     }
907     auto similarity_fn = CreateOverlapSimilarityFn<float>(overlaps);
908 
909     const float dummy_soft_nms_sigma = static_cast<float>(0.0);
910     DoNonMaxSuppressionOp<float>(context, scores, num_boxes, max_output_size,
911                                  overlap_threshold_val, score_threshold_val,
912                                  dummy_soft_nms_sigma, similarity_fn);
913   }
914 };
915 
916 template <typename Device>
917 class CombinedNonMaxSuppressionOp : public OpKernel {
918  public:
CombinedNonMaxSuppressionOp(OpKernelConstruction * context)919   explicit CombinedNonMaxSuppressionOp(OpKernelConstruction* context)
920       : OpKernel(context) {
921     OP_REQUIRES_OK(context, context->GetAttr("pad_per_class", &pad_per_class_));
922     OP_REQUIRES_OK(context, context->GetAttr("clip_boxes", &clip_boxes_));
923   }
924 
Compute(OpKernelContext * context)925   void Compute(OpKernelContext* context) override {
926     // boxes: [batch_size, num_anchors, q, 4]
927     const Tensor& boxes = context->input(0);
928     // scores: [batch_size, num_anchors, num_classes]
929     const Tensor& scores = context->input(1);
930     OP_REQUIRES(
931         context, (boxes.dim_size(0) == scores.dim_size(0)),
932         errors::InvalidArgument("boxes and scores must have same batch size"));
933 
934     // max_output_size: scalar
935     const Tensor& max_output_size = context->input(2);
936     OP_REQUIRES(
937         context, TensorShapeUtils::IsScalar(max_output_size.shape()),
938         errors::InvalidArgument("max_size_per_class must be 0-D, got shape ",
939                                 max_output_size.shape().DebugString()));
940     const int max_size_per_class = max_output_size.scalar<int>()();
941     OP_REQUIRES(context, max_size_per_class > 0,
942                 errors::InvalidArgument("max_size_per_class must be positive"));
943     // max_total_size: scalar
944     const Tensor& max_total_size = context->input(3);
945     OP_REQUIRES(
946         context, TensorShapeUtils::IsScalar(max_total_size.shape()),
947         errors::InvalidArgument("max_total_size must be 0-D, got shape ",
948                                 max_total_size.shape().DebugString()));
949     const int max_total_size_per_batch = max_total_size.scalar<int>()();
950     OP_REQUIRES(context, max_total_size_per_batch > 0,
951                 errors::InvalidArgument("max_total_size must be > 0"));
952     // Throw warning when `max_total_size` is too large as it may cause OOM.
953     if (max_total_size_per_batch > pow(10, 6)) {
954       LOG(WARNING) << "Detected a large value for `max_total_size`. This may "
955                    << "cause OOM error. (max_total_size: "
956                    << max_total_size.scalar<int>()() << ")";
957     }
958     // iou_threshold: scalar
959     const Tensor& iou_threshold = context->input(4);
960     OP_REQUIRES(context, TensorShapeUtils::IsScalar(iou_threshold.shape()),
961                 errors::InvalidArgument("iou_threshold must be 0-D, got shape ",
962                                         iou_threshold.shape().DebugString()));
963     const float iou_threshold_val = iou_threshold.scalar<float>()();
964 
965     // score_threshold: scalar
966     const Tensor& score_threshold = context->input(5);
967     OP_REQUIRES(
968         context, TensorShapeUtils::IsScalar(score_threshold.shape()),
969         errors::InvalidArgument("score_threshold must be 0-D, got shape ",
970                                 score_threshold.shape().DebugString()));
971     const float score_threshold_val = score_threshold.scalar<float>()();
972 
973     OP_REQUIRES(context, iou_threshold_val >= 0 && iou_threshold_val <= 1,
974                 errors::InvalidArgument("iou_threshold must be in [0, 1]"));
975     int num_boxes = 0;
976     const int num_classes = scores.dim_size(2);
977     ParseAndCheckCombinedNMSBoxSizes(context, boxes, &num_boxes, num_classes);
978     CheckCombinedNMSScoreSizes(context, num_boxes, scores);
979 
980     if (!context->status().ok()) {
981       return;
982     }
983     BatchedNonMaxSuppressionOp(context, boxes, scores, num_boxes,
984                                max_size_per_class, max_total_size_per_batch,
985                                score_threshold_val, iou_threshold_val,
986                                pad_per_class_, clip_boxes_);
987   }
988 
989  private:
990   bool pad_per_class_;
991   bool clip_boxes_;
992 };
993 
994 REGISTER_KERNEL_BUILDER(Name("NonMaxSuppression").Device(DEVICE_CPU),
995                         NonMaxSuppressionOp<CPUDevice>);
996 
997 REGISTER_KERNEL_BUILDER(
998     Name("NonMaxSuppressionV2").TypeConstraint<float>("T").Device(DEVICE_CPU),
999     NonMaxSuppressionV2Op<CPUDevice, float>);
1000 REGISTER_KERNEL_BUILDER(Name("NonMaxSuppressionV2")
1001                             .TypeConstraint<Eigen::half>("T")
1002                             .Device(DEVICE_CPU),
1003                         NonMaxSuppressionV2Op<CPUDevice, Eigen::half>);
1004 
1005 REGISTER_KERNEL_BUILDER(
1006     Name("NonMaxSuppressionV3").TypeConstraint<float>("T").Device(DEVICE_CPU),
1007     NonMaxSuppressionV3Op<CPUDevice, float>);
1008 REGISTER_KERNEL_BUILDER(Name("NonMaxSuppressionV3")
1009                             .TypeConstraint<Eigen::half>("T")
1010                             .Device(DEVICE_CPU),
1011                         NonMaxSuppressionV3Op<CPUDevice, Eigen::half>);
1012 
1013 REGISTER_KERNEL_BUILDER(
1014     Name("NonMaxSuppressionV4").TypeConstraint<float>("T").Device(DEVICE_CPU),
1015     NonMaxSuppressionV4Op<CPUDevice, float>);
1016 REGISTER_KERNEL_BUILDER(Name("NonMaxSuppressionV4")
1017                             .TypeConstraint<Eigen::half>("T")
1018                             .Device(DEVICE_CPU),
1019                         NonMaxSuppressionV4Op<CPUDevice, Eigen::half>);
1020 
1021 REGISTER_KERNEL_BUILDER(
1022     Name("NonMaxSuppressionV5").TypeConstraint<float>("T").Device(DEVICE_CPU),
1023     NonMaxSuppressionV5Op<CPUDevice, float>);
1024 REGISTER_KERNEL_BUILDER(Name("NonMaxSuppressionV5")
1025                             .TypeConstraint<Eigen::half>("T")
1026                             .Device(DEVICE_CPU),
1027                         NonMaxSuppressionV5Op<CPUDevice, Eigen::half>);
1028 
1029 REGISTER_KERNEL_BUILDER(
1030     Name("NonMaxSuppressionWithOverlaps").Device(DEVICE_CPU),
1031     NonMaxSuppressionWithOverlapsOp<CPUDevice>);
1032 
1033 REGISTER_KERNEL_BUILDER(Name("CombinedNonMaxSuppression").Device(DEVICE_CPU),
1034                         CombinedNonMaxSuppressionOp<CPUDevice>);
1035 
1036 }  // namespace tensorflow
1037