1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 // See docs in ../ops/image_ops.cc
17 
18 #define EIGEN_USE_THREADS
19 
20 #include "tensorflow/core/kernels/image/crop_and_resize_op.h"
21 
22 #include <functional>
23 #include <string>
24 
25 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
26 #include "tensorflow/core/framework/bounds_check.h"
27 #include "tensorflow/core/framework/register_types.h"
28 #include "tensorflow/core/framework/tensor.h"
29 #include "tensorflow/core/framework/tensor_reference.h"
30 #include "tensorflow/core/framework/tensor_shape.h"
31 #include "tensorflow/core/framework/types.h"
32 #include "tensorflow/core/lib/core/errors.h"
33 #include "tensorflow/core/lib/core/status.h"
34 #include "tensorflow/core/platform/logging.h"
35 #include "tensorflow/core/platform/types.h"
36 #include "tensorflow/core/util/determinism.h"
37 #include "tensorflow/core/util/work_sharder.h"
38 
39 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
40 #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
41 #include "tensorflow/core/platform/stream_executor.h"
42 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
43 
44 #if GOOGLE_CUDA
45 #include "tensorflow/stream_executor/cuda/cuda_activation.h"
46 using stream_executor::cuda::ScopedActivateExecutorContext;
47 #elif TENSORFLOW_USE_ROCM
48 #include "tensorflow/core/platform/rocm.h"
49 using stream_executor::rocm::ScopedActivateExecutorContext;
50 #endif
51 
52 namespace tensorflow {
53 namespace {
54 
55 typedef Eigen::ThreadPoolDevice CPUDevice;
56 typedef Eigen::GpuDevice GPUDevice;
57 using Callback = std::function<void()>;
58 
ParseAndCheckBoxSizes(const Tensor & boxes,const Tensor & box_index,int * num_boxes)59 static inline Status ParseAndCheckBoxSizes(const Tensor& boxes,
60                                            const Tensor& box_index,
61                                            int* num_boxes) {
62   if (boxes.NumElements() == 0 && box_index.NumElements() == 0) {
63     *num_boxes = 0;
64     return Status::OK();
65   }
66   // The shape of 'boxes' is [num_boxes, 4].
67   if (boxes.dims() != 2) {
68     return errors::InvalidArgument("boxes must be 2-D",
69                                    boxes.shape().DebugString());
70   }
71   *num_boxes = boxes.dim_size(0);
72   if (boxes.dim_size(1) != 4) {
73     return errors::InvalidArgument("boxes must have 4 columns");
74   }
75   // The shape of 'box_index' is [num_boxes].
76   if (box_index.dims() != 1) {
77     return errors::InvalidArgument("box_index must be 1-D",
78                                    box_index.shape().DebugString());
79   }
80   if (box_index.dim_size(0) != *num_boxes) {
81     return errors::InvalidArgument("box_index has incompatible shape");
82   }
83   return Status::OK();
84 }
85 
86 // Conditionally calls the compute callback if all values in box_index are in
87 // [0, batch_size) then calls done.
88 template <typename Device>
89 inline void RunIfBoxIndexIsValid(
90     OpKernelContext* context, typename TTypes<int32, 1>::ConstTensor box_index,
91     int batch_size, const Callback& compute, const Callback& done);
92 
93 // Specialization of CheckValidBoxIndex for a CPUDevice.
94 template <>
RunIfBoxIndexIsValid(OpKernelContext * context,typename TTypes<int32,1>::ConstTensor box_index,int batch_size,const Callback & compute,const Callback & done)95 inline void RunIfBoxIndexIsValid<CPUDevice>(
96     OpKernelContext* context, typename TTypes<int32, 1>::ConstTensor box_index,
97     int batch_size, const Callback& compute, const Callback& done) {
98   const int num_boxes = box_index.dimension(0);
99   for (int b = 0; b < num_boxes; ++b) {
100     OP_REQUIRES_ASYNC(
101         context, FastBoundsCheck(box_index(b), batch_size),
102         errors::OutOfRange("box_index has values outside [0, batch_size)"),
103         done);
104   }
105   if (compute) {
106     compute();
107   }
108   if (done) {
109     done();
110   }
111 }
112 
113 }  // namespace
114 
115 template <typename Device, typename T>
116 class CropAndResizeOp : public AsyncOpKernel {
117  public:
CropAndResizeOp(OpKernelConstruction * context)118   explicit CropAndResizeOp(OpKernelConstruction* context)
119       : AsyncOpKernel(context) {
120     OP_REQUIRES_OK(context, context->GetAttr("method", &method_));
121     OP_REQUIRES(context, method_ == "bilinear" || method_ == "nearest",
122                 errors::InvalidArgument(
123                     "method must be 'bilinear' or 'nearest'", method_));
124     OP_REQUIRES_OK(context, context->GetAttr("extrapolation_value",
125                                              &extrapolation_value_));
126   }
127 
ComputeAsync(OpKernelContext * context,DoneCallback done)128   void ComputeAsync(OpKernelContext* context, DoneCallback done) override {
129     // The shape of 'image' is [batch_size, image_height, image_width,
130     // channels].
131     const Tensor& image = context->input(0);
132     // The shape of 'boxes' is [num_boxes, 4].
133     const Tensor& boxes = context->input(1);
134     // The shape of 'box_index' is [num_boxes].
135     const Tensor& box_index = context->input(2);
136     // The shape of 'crop_size' is [2].
137     const Tensor& crop_size = context->input(3);
138 
139     // Validate inputs dimensions.
140     OP_REQUIRES_ASYNC(context, image.dims() == 4,
141                       errors::InvalidArgument("input image must be 4-D",
142                                               image.shape().DebugString()),
143                       done);
144     const int batch_size = image.dim_size(0);
145     const int image_height = image.dim_size(1);
146     const int image_width = image.dim_size(2);
147     const int depth = image.dim_size(3);
148     OP_REQUIRES_ASYNC(
149         context, image_height > 0 && image_width > 0,
150         errors::InvalidArgument("image dimensions must be positive"), done);
151     int num_boxes = 0;
152     OP_REQUIRES_OK_ASYNC(
153         context, ParseAndCheckBoxSizes(boxes, box_index, &num_boxes), done);
154 
155     OP_REQUIRES_ASYNC(context, crop_size.dims() == 1,
156                       errors::InvalidArgument("crop_size must be 1-D",
157                                               crop_size.shape().DebugString()),
158                       done);
159     OP_REQUIRES_ASYNC(
160         context, crop_size.dim_size(0) == 2,
161         errors::InvalidArgument("crop_size must have two elements",
162                                 crop_size.shape().DebugString()),
163         done);
164 
165     // Copy and validate crop sizes.
166     auto crop_size_vec = crop_size.vec<int32>();
167     const int crop_height = internal::SubtleMustCopy(crop_size_vec(0));
168     const int crop_width = internal::SubtleMustCopy(crop_size_vec(1));
169     OP_REQUIRES_ASYNC(
170         context, crop_height > 0 && crop_width > 0,
171         errors::InvalidArgument("crop dimensions must be positive"), done);
172 
173     // Allocate output tensor.
174     Tensor* output = nullptr;
175     OP_REQUIRES_OK_ASYNC(
176         context,
177         context->allocate_output(
178             0, TensorShape({num_boxes, crop_height, crop_width, depth}),
179             &output),
180         done);
181 
182     auto compute_callback = [this, context, output]() {
183       const Tensor& image = context->input(0);
184       const Tensor& boxes = context->input(1);
185       const Tensor& box_index = context->input(2);
186       const bool status = functor::CropAndResize<Device, T>()(
187           context, image.tensor<T, 4>(), boxes.tensor<float, 2>(),
188           box_index.tensor<int32, 1>(), method_, extrapolation_value_,
189           output->tensor<float, 4>());
190 
191       if (!status) {
192         context->SetStatus(
193             errors::Internal("Failed to launch CropAndResizeKernel."));
194       }
195     };
196 
197     RunIfBoxIndexIsValid<Device>(context, box_index.tensor<int32, 1>(),
198                                  batch_size, std::move(compute_callback),
199                                  std::move(done));
200   }
201 
202  private:
203   float extrapolation_value_;
204   string method_;
205 };
206 
207 // Partial specialization of CropAndResize functor for a CPUDevice.
208 namespace functor {
209 template <typename T>
210 struct CropAndResize<CPUDevice, T> {
operator ()tensorflow::functor::CropAndResize211   bool operator()(OpKernelContext* context,
212                   typename TTypes<T, 4>::ConstTensor image,
213                   typename TTypes<float, 2>::ConstTensor boxes,
214                   typename TTypes<int32, 1>::ConstTensor box_index,
215                   const string& method_name, float extrapolation_value,
216                   typename TTypes<float, 4>::Tensor crops) {
217     const int batch_size = image.dimension(0);
218     const int image_height = image.dimension(1);
219     const int image_width = image.dimension(2);
220 
221     const int num_boxes = crops.dimension(0);
222     const int crop_height = crops.dimension(1);
223     const int crop_width = crops.dimension(2);
224     const int depth = crops.dimension(3);
225 
226     // Since `functor::CropAndResize` operates on float, we first validate
227     // that we don't overflow (since overflow causes undefined behavior which
228     // could result in segfault in this scenario).
229     const Eigen::Tensor<bool, 0, Eigen::RowMajor> only_finite_elements =
230         boxes.isfinite().all();
231     if (!only_finite_elements()) {
232       context->SetStatus(errors::InvalidArgument(
233           "Boxes contains at least one element that is not finite"));
234       return false;
235     }
236 
237     // Sharding across boxes.
238     auto CropAndResizePerBox = [&](int64_t start_box, int64_t limit_box) {
239       for (int b = start_box; b < limit_box; ++b) {
240         const float y1 = boxes(b, 0);
241         const float x1 = boxes(b, 1);
242         const float y2 = boxes(b, 2);
243         const float x2 = boxes(b, 3);
244 
245         const int32_t b_in = box_index(b);
246         if (!FastBoundsCheck(b_in, batch_size)) {
247           continue;
248         }
249 
250         const float height_scale =
251             (crop_height > 1)
252                 ? (y2 - y1) * (image_height - 1) / (crop_height - 1)
253                 : 0;
254         const float width_scale =
255             (crop_width > 1) ? (x2 - x1) * (image_width - 1) / (crop_width - 1)
256                              : 0;
257 
258         for (int y = 0; y < crop_height; ++y) {
259           const float in_y = (crop_height > 1)
260                                  ? y1 * (image_height - 1) + y * height_scale
261                                  : 0.5 * (y1 + y2) * (image_height - 1);
262           if (in_y < 0 || in_y > image_height - 1) {
263             for (int x = 0; x < crop_width; ++x) {
264               for (int d = 0; d < depth; ++d) {
265                 crops(b, y, x, d) = extrapolation_value;
266               }
267             }
268             continue;
269           }
270           if (method_name == "bilinear") {
271             const int top_y_index = floorf(in_y);
272             const int bottom_y_index = ceilf(in_y);
273             const float y_lerp = in_y - top_y_index;
274 
275             for (int x = 0; x < crop_width; ++x) {
276               const float in_x = (crop_width > 1)
277                                      ? x1 * (image_width - 1) + x * width_scale
278                                      : 0.5 * (x1 + x2) * (image_width - 1);
279               if (in_x < 0 || in_x > image_width - 1) {
280                 for (int d = 0; d < depth; ++d) {
281                   crops(b, y, x, d) = extrapolation_value;
282                 }
283                 continue;
284               }
285               const int left_x_index = floorf(in_x);
286               const int right_x_index = ceilf(in_x);
287               const float x_lerp = in_x - left_x_index;
288 
289               for (int d = 0; d < depth; ++d) {
290                 const float top_left(static_cast<float>(
291                     image(b_in, top_y_index, left_x_index, d)));
292                 const float top_right(static_cast<float>(
293                     image(b_in, top_y_index, right_x_index, d)));
294                 const float bottom_left(static_cast<float>(
295                     image(b_in, bottom_y_index, left_x_index, d)));
296                 const float bottom_right(static_cast<float>(
297                     image(b_in, bottom_y_index, right_x_index, d)));
298                 const float top = top_left + (top_right - top_left) * x_lerp;
299                 const float bottom =
300                     bottom_left + (bottom_right - bottom_left) * x_lerp;
301                 crops(b, y, x, d) = top + (bottom - top) * y_lerp;
302               }
303             }
304           } else {  // method == "nearest"
305             for (int x = 0; x < crop_width; ++x) {
306               const float in_x = (crop_width > 1)
307                                      ? x1 * (image_width - 1) + x * width_scale
308                                      : 0.5 * (x1 + x2) * (image_width - 1);
309               if (in_x < 0 || in_x > image_width - 1) {
310                 for (int d = 0; d < depth; ++d) {
311                   crops(b, y, x, d) = extrapolation_value;
312                 }
313                 continue;
314               }
315               const int closest_x_index = roundf(in_x);
316               const int closest_y_index = roundf(in_y);
317               for (int d = 0; d < depth; ++d) {
318                 crops(b, y, x, d) = static_cast<float>(
319                     image(b_in, closest_y_index, closest_x_index, d));
320               }
321             }
322           }
323         }
324       }
325     };
326 
327     // A rough estimation of the cost for each cropped box.
328     double cost_per_pixel =
329         depth * (Eigen::TensorOpCost::AddCost<float>() * 6 +
330                  Eigen::TensorOpCost::MulCost<float>() * 3 +
331                  Eigen::TensorOpCost::CastCost<T, float>() * 4) +
332         (Eigen::TensorOpCost::AddCost<float>() * 2 +
333          Eigen::TensorOpCost::AddCost<float>() * 3);
334     if (method_name == "nearest") {
335       cost_per_pixel = depth * Eigen::TensorOpCost::CastCost<T, float>() +
336                        Eigen::TensorOpCost::AddCost<float>() * 4 +
337                        Eigen::TensorOpCost::MulCost<float>() * 4;
338     }
339     const double cost_per_box = crop_height * crop_width * cost_per_pixel;
340 
341     const DeviceBase::CpuWorkerThreads& worker_threads =
342         *(context->device()->tensorflow_cpu_worker_threads());
343     Shard(worker_threads.num_threads, worker_threads.workers, num_boxes,
344           cost_per_box, CropAndResizePerBox);
345 
346     return true;
347   }
348 };
349 
350 }  // namespace functor
351 
352 template <typename Device, typename T>
353 class CropAndResizeGradImageOp : public AsyncOpKernel {
354  public:
CropAndResizeGradImageOp(OpKernelConstruction * context)355   explicit CropAndResizeGradImageOp(OpKernelConstruction* context)
356       : AsyncOpKernel(context) {
357     OP_REQUIRES_OK(context, context->GetAttr("method", &method_));
358     OP_REQUIRES(context, method_ == "bilinear" || method_ == "nearest",
359                 errors::InvalidArgument(
360                     "method must be 'bilinear' or 'nearest'", method_));
361   }
362 
ComputeAsync(OpKernelContext * context,DoneCallback done)363   void ComputeAsync(OpKernelContext* context, DoneCallback done) override {
364     // The shape of 'grads' is [num_boxes, crop_height, crop_width, depth].
365     const Tensor& grads = context->input(0);
366     // The shape of 'boxes' is [num_boxes, 4].
367     const Tensor& boxes = context->input(1);
368     // The shape of 'box_index' is [num_boxes].
369     const Tensor& box_index = context->input(2);
370     // The shape of 'image_size' is [4].
371     const Tensor& image_size = context->input(3);
372 
373     // Validate input shapes.
374     OP_REQUIRES_ASYNC(context, grads.dims() == 4,
375                       errors::InvalidArgument("grads image must be 4-D",
376                                               grads.shape().DebugString()),
377                       done);
378     const int crop_height = grads.dim_size(1);
379     const int crop_width = grads.dim_size(2);
380     OP_REQUIRES_ASYNC(
381         context, crop_height > 0 && crop_width > 0,
382         errors::InvalidArgument("grads dimensions must be positive"), done);
383     int num_boxes = 0;
384     OP_REQUIRES_OK_ASYNC(
385         context, ParseAndCheckBoxSizes(boxes, box_index, &num_boxes), done);
386     OP_REQUIRES_ASYNC(
387         context, grads.dim_size(0) == num_boxes,
388         errors::InvalidArgument("boxes and grads have incompatible shape"),
389         done);
390 
391     OP_REQUIRES_ASYNC(context, image_size.dims() == 1,
392                       errors::InvalidArgument("image_size must be 1-D",
393                                               image_size.shape().DebugString()),
394                       done);
395     OP_REQUIRES_ASYNC(context, image_size.dim_size(0) == 4,
396                       errors::InvalidArgument("image_size must have 4 elements",
397                                               image_size.shape().DebugString()),
398                       done);
399     auto image_size_vec = image_size.vec<int32>();
400     const int batch_size = internal::SubtleMustCopy(image_size_vec(0));
401     const int image_height = internal::SubtleMustCopy(image_size_vec(1));
402     const int image_width = internal::SubtleMustCopy(image_size_vec(2));
403     const int depth = internal::SubtleMustCopy(image_size_vec(3));
404     OP_REQUIRES_ASYNC(
405         context, image_height > 0 && image_width > 0,
406         errors::InvalidArgument("image dimensions must be positive"), done);
407     OP_REQUIRES_ASYNC(
408         context, grads.dim_size(3) == depth,
409         errors::InvalidArgument("image_size and grads are incompatible"), done);
410 
411     if (std::is_same<Device, GPUDevice>::value) {
412       OP_REQUIRES_ASYNC(
413           context, !OpDeterminismRequired(),
414           errors::Unimplemented(
415               "Deterministic GPU implementation of CropAndResizeBackpropImage"
416               " not available."),
417           done);
418     }
419 
420     // Allocate output tensor.
421     Tensor* output = nullptr;
422     OP_REQUIRES_OK_ASYNC(
423         context,
424         context->allocate_output(
425             0, TensorShape({batch_size, image_height, image_width, depth}),
426             &output),
427         done);
428 
429     auto compute_callback = [this, context, output]() {
430       const Tensor& grads = context->input(0);
431       const Tensor& boxes = context->input(1);
432       const Tensor& box_index = context->input(2);
433       const bool status = functor::CropAndResizeBackpropImage<Device, T>()(
434           context, grads.tensor<float, 4>(), boxes.tensor<float, 2>(),
435           box_index.tensor<int32, 1>(), output->tensor<T, 4>(), method_);
436 
437       if (!status) {
438         context->SetStatus(errors::Internal(
439             "Failed to launch CropAndResizeBackpropImage kernel."));
440       }
441     };
442 
443     RunIfBoxIndexIsValid<Device>(context, box_index.tensor<int32, 1>(),
444                                  batch_size, std::move(compute_callback),
445                                  std::move(done));
446   }
447 
448  private:
449   string method_;
450 };
451 
452 // Partial specialization of CropAndResizeBackpropImage functor for a CPUDevice.
453 namespace functor {
454 template <typename T>
455 struct CropAndResizeBackpropImage<CPUDevice, T> {
operator ()tensorflow::functor::CropAndResizeBackpropImage456   bool operator()(const OpKernelContext* context,
457                   typename TTypes<float, 4>::ConstTensor grads,
458                   typename TTypes<float, 2>::ConstTensor boxes,
459                   typename TTypes<int32, 1>::ConstTensor box_index,
460                   typename TTypes<T, 4>::Tensor grads_image,
461                   const string& method_name) {
462     const int batch_size = grads_image.dimension(0);
463     const int image_height = grads_image.dimension(1);
464     const int image_width = grads_image.dimension(2);
465 
466     const int num_boxes = grads.dimension(0);
467     const int crop_height = grads.dimension(1);
468     const int crop_width = grads.dimension(2);
469     const int depth = grads.dimension(3);
470 
471     grads_image.setZero();
472 
473     auto CropAndResizeBackImgPerBox = [&](int64_t start_box,
474                                           int64_t limit_box) {
475       for (int b = start_box; b < limit_box; ++b) {
476         const float y1 = boxes(b, 0);
477         const float x1 = boxes(b, 1);
478         const float y2 = boxes(b, 2);
479         const float x2 = boxes(b, 3);
480 
481         const int32_t b_in = box_index(b);
482         if (!FastBoundsCheck(b_in, batch_size)) {
483           continue;
484         }
485 
486         const float height_scale =
487             (crop_height > 1)
488                 ? (y2 - y1) * (image_height - 1) / (crop_height - 1)
489                 : 0;
490         const float width_scale =
491             (crop_width > 1) ? (x2 - x1) * (image_width - 1) / (crop_width - 1)
492                              : 0;
493 
494         for (int y = 0; y < crop_height; ++y) {
495           const float in_y = (crop_height > 1)
496                                  ? y1 * (image_height - 1) + y * height_scale
497                                  : 0.5 * (y1 + y2) * (image_height - 1);
498           if (in_y < 0 || in_y > image_height - 1) {
499             continue;
500           }
501           const int top_y_index = floorf(in_y);
502           const int bottom_y_index = ceilf(in_y);
503           const float y_lerp = in_y - top_y_index;
504 
505           for (int x = 0; x < crop_width; ++x) {
506             const float in_x = (crop_width > 1)
507                                    ? x1 * (image_width - 1) + x * width_scale
508                                    : 0.5 * (x1 + x2) * (image_width - 1);
509             if (in_x < 0 || in_x > image_width - 1) {
510               continue;
511             }
512 
513             if (method_name == "bilinear") {
514               const int left_x_index = floorf(in_x);
515               const int right_x_index = ceilf(in_x);
516               const float x_lerp = in_x - left_x_index;
517 
518               for (int d = 0; d < depth; ++d) {
519                 const float dtop = (1 - y_lerp) * grads(b, y, x, d);
520                 grads_image(b_in, top_y_index, left_x_index, d) +=
521                     static_cast<T>((1 - x_lerp) * dtop);
522                 grads_image(b_in, top_y_index, right_x_index, d) +=
523                     static_cast<T>(x_lerp * dtop);
524                 const float dbottom = y_lerp * grads(b, y, x, d);
525                 grads_image(b_in, bottom_y_index, left_x_index, d) +=
526                     static_cast<T>((1 - x_lerp) * dbottom);
527                 grads_image(b_in, bottom_y_index, right_x_index, d) +=
528                     static_cast<T>(x_lerp * dbottom);
529               }
530             } else {  // method_name == "nearest"
531               for (int d = 0; d < depth; ++d) {
532                 int closest_x_index = roundf(in_x);
533                 int closest_y_index = roundf(in_y);
534                 grads_image(b_in, closest_y_index, closest_x_index, d) +=
535                     static_cast<T>(grads(b, y, x, d));
536               }
537             }
538           }
539         }
540       }
541     };
542 
543     // A rough estimation of the cost for each cropped box.
544     // Including calculation cost in the depth loop and pixel loop.
545     const double cost_per_pixel =
546         (method_name == "bilinear"
547              ? depth * (Eigen::TensorOpCost::AddCost<float>() * 7 +
548                         Eigen::TensorOpCost::MulCost<float>() * 6 +
549                         Eigen::TensorOpCost::CastCost<T, float>() * 4) +
550                    Eigen::TensorOpCost::AddCost<float>() * 4
551              : depth * (Eigen::TensorOpCost::AddCost<float>() +
552                         Eigen::TensorOpCost::CastCost<T, float>()) +
553                    Eigen::TensorOpCost::AddCost<float>() * 3);
554 
555     const double cost_per_box = crop_height * crop_width * cost_per_pixel;
556 
557     const DeviceBase::CpuWorkerThreads& worker_threads =
558         *(context->device()->tensorflow_cpu_worker_threads());
559 
560     // Sharding introduces nondeterminism when the gradients associated with
561     // more than two crops backprop into the same element in the source image.
562     int max_threads = OpDeterminismRequired() ? 1 : worker_threads.num_threads;
563 
564     Shard(max_threads, worker_threads.workers, num_boxes, cost_per_box,
565           CropAndResizeBackImgPerBox);
566 
567     return true;
568   }
569 };
570 
571 }  // namespace functor
572 
573 template <typename Device, typename T>
574 class CropAndResizeGradBoxesOp : public AsyncOpKernel {
575  public:
CropAndResizeGradBoxesOp(OpKernelConstruction * context)576   explicit CropAndResizeGradBoxesOp(OpKernelConstruction* context)
577       : AsyncOpKernel(context) {
578     string method;
579     OP_REQUIRES_OK(context, context->GetAttr("method", &method));
580     OP_REQUIRES(context, method == "bilinear",
581                 errors::InvalidArgument("method must be 'bilinear'", method));
582   }
583 
ComputeAsync(OpKernelContext * context,DoneCallback done)584   void ComputeAsync(OpKernelContext* context, DoneCallback done) override {
585     // The shape of 'grads' is [num_boxes, crop_height, crop_width, depth].
586     const Tensor& grads = context->input(0);
587     // The shape of 'boxes' is [num_boxes, 4].
588     const Tensor& boxes = context->input(2);
589     // The shape of 'box_index' is [num_boxes].
590     const Tensor& box_index = context->input(3);
591     // The shape of 'image' is [batch_size, image_height, image_width, depth].
592     const Tensor& image = context->input(1);
593 
594     // Validate input shapes.
595     OP_REQUIRES_ASYNC(context, grads.dims() == 4,
596                       errors::InvalidArgument("grads image must be 4-D",
597                                               grads.shape().DebugString()),
598                       done);
599     const int crop_height = grads.dim_size(1);
600     const int crop_width = grads.dim_size(2);
601     const int depth = grads.dim_size(3);
602     OP_REQUIRES_ASYNC(
603         context, crop_height > 0 && crop_width > 0,
604         errors::InvalidArgument("grads dimensions must be positive"), done);
605 
606     OP_REQUIRES_ASYNC(context, image.dims() == 4,
607                       errors::InvalidArgument("input image must be 4-D",
608                                               image.shape().DebugString()),
609                       done);
610     const int batch_size = image.dim_size(0);
611     const int image_height = image.dim_size(1);
612     const int image_width = image.dim_size(2);
613     OP_REQUIRES_ASYNC(
614         context, image_height > 0 && image_width > 0,
615         errors::InvalidArgument("image dimensions must be positive"), done);
616     OP_REQUIRES_ASYNC(context, image.dim_size(3) == depth,
617                       errors::InvalidArgument("image, grads depth differ"),
618                       done);
619 
620     int num_boxes = 0;
621     OP_REQUIRES_OK_ASYNC(
622         context, ParseAndCheckBoxSizes(boxes, box_index, &num_boxes), done);
623 
624     OP_REQUIRES_ASYNC(
625         context, grads.dim_size(0) == num_boxes,
626         errors::InvalidArgument("boxes and grads have incompatible shape"),
627         done);
628 
629     if (std::is_same<Device, GPUDevice>::value) {
630       OP_REQUIRES_ASYNC(
631           context, !OpDeterminismRequired(),
632           errors::Unimplemented(
633               "Deterministic GPU implementation of CropAndResizeBackpropBoxes"
634               " not available."),
635           done);
636     }
637 
638     // Allocate output tensor.
639     Tensor* output = nullptr;
640     OP_REQUIRES_OK_ASYNC(
641         context,
642         context->allocate_output(0, TensorShape({num_boxes, 4}), &output),
643         done);
644 
645     auto compute_callback = [context, output]() {
646       const Tensor& grads = context->input(0);
647       const Tensor& image = context->input(1);
648       const Tensor& boxes = context->input(2);
649       const Tensor& box_index = context->input(3);
650       const bool status = functor::CropAndResizeBackpropBoxes<Device, T>()(
651           context->eigen_device<Device>(), grads.tensor<float, 4>(),
652           image.tensor<T, 4>(), boxes.tensor<float, 2>(),
653           box_index.tensor<int32, 1>(), output->tensor<float, 2>());
654       if (!status) {
655         context->SetStatus(errors::Internal(
656             "Failed to launch CropAndResizeBackpropBoxes kernel."));
657       }
658     };
659 
660     RunIfBoxIndexIsValid<Device>(context, box_index.tensor<int32, 1>(),
661                                  batch_size, std::move(compute_callback),
662                                  std::move(done));
663   }
664 };
665 
666 // Partial specialization of CropAndResizeBackpropBoxes functor for a CPUDevice.
667 namespace functor {
668 template <typename T>
669 struct CropAndResizeBackpropBoxes<CPUDevice, T> {
operator ()tensorflow::functor::CropAndResizeBackpropBoxes670   bool operator()(const CPUDevice& d,
671                   typename TTypes<float, 4>::ConstTensor grads,
672                   typename TTypes<T, 4>::ConstTensor image,
673                   typename TTypes<float, 2>::ConstTensor boxes,
674                   typename TTypes<int32, 1>::ConstTensor box_index,
675                   typename TTypes<float, 2>::Tensor grads_boxes) {
676     const int batch_size = image.dimension(0);
677     const int image_height = image.dimension(1);
678     const int image_width = image.dimension(2);
679 
680     const int num_boxes = grads.dimension(0);
681     const int crop_height = grads.dimension(1);
682     const int crop_width = grads.dimension(2);
683     const int depth = grads.dimension(3);
684 
685     grads_boxes.setZero();
686 
687     for (int b = 0; b < num_boxes; ++b) {
688       const float y1 = boxes(b, 0);
689       const float x1 = boxes(b, 1);
690       const float y2 = boxes(b, 2);
691       const float x2 = boxes(b, 3);
692 
693       const int32_t b_in = box_index(b);
694       if (!FastBoundsCheck(b_in, batch_size)) {
695         continue;
696       }
697 
698       const float height_ratio =
699           (crop_height > 1)
700               ? static_cast<float>(image_height - 1) / (crop_height - 1)
701               : 0;
702       const float width_ratio =
703           (crop_width > 1)
704               ? static_cast<float>(image_width - 1) / (crop_width - 1)
705               : 0;
706 
707       const float height_scale =
708           (crop_height > 1) ? (y2 - y1) * height_ratio : 0;
709       const float width_scale = (crop_width > 1) ? (x2 - x1) * width_ratio : 0;
710 
711       for (int y = 0; y < crop_height; ++y) {
712         const float in_y = (crop_height > 1)
713                                ? y1 * (image_height - 1) + y * height_scale
714                                : 0.5 * (y1 + y2) * (image_height - 1);
715         if (in_y < 0 || in_y > image_height - 1) {
716           continue;
717         }
718         const int top_y_index = floorf(in_y);
719         const int bottom_y_index = ceilf(in_y);
720         const float y_lerp = in_y - top_y_index;
721 
722         for (int x = 0; x < crop_width; ++x) {
723           const float in_x = (crop_width > 1)
724                                  ? x1 * (image_width - 1) + x * width_scale
725                                  : 0.5 * (x1 + x2) * (image_width - 1);
726           if (in_x < 0 || in_x > image_width - 1) {
727             continue;
728           }
729           const int left_x_index = floorf(in_x);
730           const int right_x_index = ceilf(in_x);
731           const float x_lerp = in_x - left_x_index;
732 
733           for (int d = 0; d < depth; ++d) {
734             const float top_left(
735                 static_cast<float>(image(b_in, top_y_index, left_x_index, d)));
736             const float top_right(
737                 static_cast<float>(image(b_in, top_y_index, right_x_index, d)));
738             const float bottom_left(static_cast<float>(
739                 image(b_in, bottom_y_index, left_x_index, d)));
740             const float bottom_right(static_cast<float>(
741                 image(b_in, bottom_y_index, right_x_index, d)));
742             // Compute the image gradient.
743             float image_grad_y = (1 - x_lerp) * (bottom_left - top_left) +
744                                  x_lerp * (bottom_right - top_right);
745             float image_grad_x = (1 - y_lerp) * (top_right - top_left) +
746                                  y_lerp * (bottom_right - bottom_left);
747             // Modulate the image gradient with the incoming gradient.
748             const float top_grad = grads(b, y, x, d);
749             image_grad_y *= top_grad;
750             image_grad_x *= top_grad;
751             // dy1, dy2
752             if (crop_height > 1) {
753               grads_boxes(b, 0) +=
754                   image_grad_y * (image_height - 1 - y * height_ratio);
755               grads_boxes(b, 2) += image_grad_y * (y * height_ratio);
756             } else {
757               grads_boxes(b, 0) += image_grad_y * 0.5 * (image_height - 1);
758               grads_boxes(b, 2) += image_grad_y * 0.5 * (image_height - 1);
759             }
760             // dx1, dx2
761             if (crop_width > 1) {
762               grads_boxes(b, 1) +=
763                   image_grad_x * (image_width - 1 - x * width_ratio);
764               grads_boxes(b, 3) += image_grad_x * (x * width_ratio);
765             } else {
766               grads_boxes(b, 1) += image_grad_x * 0.5 * (image_width - 1);
767               grads_boxes(b, 3) += image_grad_x * 0.5 * (image_width - 1);
768             }
769           }
770         }
771       }
772     }
773     return true;
774   }
775 };
776 
777 }  // namespace functor
778 
779 #define REGISTER_KERNEL(T)                                \
780   REGISTER_KERNEL_BUILDER(Name("CropAndResize")           \
781                               .Device(DEVICE_CPU)         \
782                               .TypeConstraint<T>("T")     \
783                               .HostMemory("crop_size"),   \
784                           CropAndResizeOp<CPUDevice, T>); \
785                                                           \
786   REGISTER_KERNEL_BUILDER(Name("CropAndResizeGradBoxes")  \
787                               .Device(DEVICE_CPU)         \
788                               .TypeConstraint<T>("T"),    \
789                           CropAndResizeGradBoxesOp<CPUDevice, T>);
790 
791 TF_CALL_REAL_NUMBER_TYPES(REGISTER_KERNEL);
792 
793 #undef REGISTER_KERNEL
794 
795 #define REGISTER_KERNEL(T)                               \
796   REGISTER_KERNEL_BUILDER(Name("CropAndResizeGradImage") \
797                               .Device(DEVICE_CPU)        \
798                               .TypeConstraint<T>("T")    \
799                               .HostMemory("image_size"), \
800                           CropAndResizeGradImageOp<CPUDevice, T>);
801 
802 TF_CALL_half(REGISTER_KERNEL);
803 TF_CALL_float(REGISTER_KERNEL);
804 TF_CALL_double(REGISTER_KERNEL);
805 
806 #undef REGISTER_KERNEL
807 
808 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
809 
810 // Forward declaration of the CheckValidBoxIndexHelper specialization for GPU.
811 namespace functor {
812 template <>
813 void CheckValidBoxIndexHelper<GPUDevice>::operator()(
814     const GPUDevice& d, typename TTypes<int32, 1>::ConstTensor box_index,
815     int batch_size, typename TTypes<bool, 0>::Tensor isvalid);
816 extern template struct CheckValidBoxIndexHelper<GPUDevice>;
817 }  // namespace functor
818 
819 namespace {
820 
821 // Specialization of CheckValidBoxIndex for a GPUDevice.
822 template <>
RunIfBoxIndexIsValid(OpKernelContext * context,typename TTypes<int32,1>::ConstTensor box_index,int batch_size,const Callback & compute,const Callback & done)823 inline void RunIfBoxIndexIsValid<GPUDevice>(
824     OpKernelContext* context, typename TTypes<int32, 1>::ConstTensor box_index,
825     int batch_size, const Callback& compute, const Callback& done) {
826   const int num_boxes = box_index.dimension(0);
827   if (num_boxes == 0) {
828     compute();
829     done();
830     return;
831   }
832 
833   Tensor isvalid_dev_tensor;
834   OP_REQUIRES_OK_ASYNC(
835       context,
836       context->allocate_temp(DataTypeToEnum<bool>::value, TensorShape({}),
837                              &isvalid_dev_tensor),
838       done);
839   typename TTypes<bool, 0>::Tensor isvalid_dev =
840       isvalid_dev_tensor.tensor<bool, 0>();
841 
842   // Run the actual box check on the device.
843   functor::CheckValidBoxIndexHelper<GPUDevice>()(
844       context->eigen_device<GPUDevice>(), box_index, batch_size, isvalid_dev);
845 
846   // Copy the result back to the host.
847   auto* stream = context->op_device_context()->stream();
848   OP_REQUIRES_ASYNC(context, stream,
849                     errors::Internal("No GPU stream available."), done);
850   Tensor isvalid_host_tensor;
851   // Use pinned host memory on the host to avoid unnecessary
852   // synchronization.
853   AllocatorAttributes alloc_attr;
854   alloc_attr.set_on_host(true);
855   alloc_attr.set_gpu_compatible(true);
856   OP_REQUIRES_OK_ASYNC(
857       context,
858       context->allocate_temp(DataTypeToEnum<bool>::value, TensorShape({}),
859                              &isvalid_host_tensor, alloc_attr),
860       done);
861   se::DeviceMemoryBase wrapped(isvalid_dev.data(), sizeof(bool));
862   const bool status =
863       stream
864           ->ThenMemcpy(
865               isvalid_host_tensor.scalar<bool>().data() /* destination */,
866               wrapped /* source */, sizeof(bool))
867           .ok();
868   OP_REQUIRES_ASYNC(
869       context, status,
870       errors::Internal("Failed to launch copy of isvalid from device to host."),
871       done);
872 
873   // We capture both temporary tensors to prevent them from being deallocated
874   // when ComputeAsync returns and before the closure runs.
875   TensorReference isvalid_dev_ref(isvalid_dev_tensor);
876   auto wrapped_callback = [context, isvalid_host_tensor, isvalid_dev_ref,
877                            compute, done]() {
878     auto stream = context->op_device_context()->stream();
879     ScopedActivateExecutorContext scoped_activation{stream->parent()};
880     const bool isvalid = isvalid_host_tensor.scalar<bool>()();
881     isvalid_dev_ref.Unref();
882     OP_REQUIRES_ASYNC(
883         context, isvalid,
884         errors::OutOfRange("box_index has values outside [0, batch_size)"),
885         done);
886     compute();
887     done();
888   };
889 
890   context->device()->tensorflow_gpu_device_info()->event_mgr->ThenExecute(
891       stream, wrapped_callback);
892 }
893 
894 }  // namespace
895 
896 #define REGISTER_KERNEL(T)                                         \
897   REGISTER_KERNEL_BUILDER(Name("CropAndResize")                    \
898                               .Device(DEVICE_GPU)                  \
899                               .TypeConstraint<T>("T")              \
900                               .HostMemory("crop_size"),            \
901                           CropAndResizeOp<GPUDevice, T>);          \
902                                                                    \
903   REGISTER_KERNEL_BUILDER(Name("CropAndResizeGradImage")           \
904                               .Device(DEVICE_GPU)                  \
905                               .TypeConstraint<T>("T")              \
906                               .HostMemory("image_size"),           \
907                           CropAndResizeGradImageOp<GPUDevice, T>); \
908                                                                    \
909   REGISTER_KERNEL_BUILDER(Name("CropAndResizeGradBoxes")           \
910                               .Device(DEVICE_GPU)                  \
911                               .TypeConstraint<T>("T"),             \
912                           CropAndResizeGradBoxesOp<GPUDevice, T>);
913 
914 TF_CALL_GPU_NUMBER_TYPES(REGISTER_KERNEL);
915 
916 #undef REGISTER_KERNEL
917 
918 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
919 
920 }  // namespace tensorflow
921