/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// See docs in ../ops/array_ops.cc
#define EIGEN_USE_THREADS

#include "tensorflow/core/kernels/reverse_op.h"
#include <memory>
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/bounds_check.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/type_traits.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/util/work_sharder.h"

namespace tensorflow {

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
#ifdef TENSORFLOW_USE_SYCL
typedef Eigen::SyclDevice SYCLDevice;
#endif  // TENSORFLOW_USE_SYCL

namespace {

// Reverses rows (the middle dimension) of a three-dimensional tensor.
// NUM_CHANNELS can be <= 0 to compute it dynamically from <input>;
// otherwise, it must equal input.dim_size(2) and is used as a compile-time
// constant.
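// For example (shapes chosen only for illustration): with input of shape
// [outer, middle, inner] = [2, 4, 3], output row j of each outer slice is
// filled from input row (middle - 1 - j), one memcpy of
// inner * sizeof(T) bytes per row.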
template <typename T, int NUM_CHANNELS>
void ReverseRows(OpKernelContext* context, const Tensor& input,
                 Tensor* result) {
  auto work = [&input, result](int64 start, int64 end) {
    const int64 inner_size =
        NUM_CHANNELS > 0 ? NUM_CHANNELS : input.dim_size(2);
    const int64 middle_size = input.dim_size(1);
    const int64 row_size = inner_size * middle_size;
    DCHECK_EQ(input.dim_size(2), inner_size);

    const T* in_ptr = input.bit_casted_tensor<T, 3>().data();
    T* out_ptr = result->bit_casted_tensor<T, 3>().data();

    in_ptr += start * row_size;
    out_ptr += start * row_size;

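    // For each outer slice, advance out_ptr one full slice ahead, then walk
    // it backwards one inner row at a time while in_ptr walks forwards;
    // this writes the middle dimension in reverse order.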
    for (int outer_dim = start; outer_dim < end; ++outer_dim) {
      out_ptr += row_size;
      int remaining = middle_size;
      while (remaining > 0) {
        out_ptr -= inner_size;
        memcpy(out_ptr, in_ptr, inner_size * sizeof(T));
        in_ptr += inner_size;
        --remaining;
      }

      out_ptr += row_size;
    }
  };

  // Shard across outer dimension.
  const int64 N = input.dim_size(0);
  const int64 cost_per_unit = input.NumElements() / N;
  auto worker_threads = context->device()->tensorflow_cpu_worker_threads();
  Shard(worker_threads->num_threads, worker_threads->workers, N, cost_per_unit,
        std::move(work));
}

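// Element types whose reversal is a pure byte move: values of these types
// can be bit-cast to an unsigned (or complex) type of the same width and
// copied with memcpy.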
template <typename T>
struct data_type_can_memcpy {
  static constexpr bool value =
      std::is_same<T, uint8>::value || std::is_same<T, int8>::value ||
      std::is_same<T, bool>::value || std::is_same<T, uint16>::value ||
      std::is_same<T, int16>::value || std::is_same<T, Eigen::half>::value ||
      std::is_same<T, int32>::value || std::is_same<T, float>::value ||
      std::is_same<T, int64>::value || std::is_same<T, double>::value ||
      std::is_same<T, complex64>::value || std::is_same<T, complex128>::value;
};

template <typename T, int NUM_CHANNELS>
typename std::enable_if<data_type_can_memcpy<T>::value>::type
DoHandleReverseCase(OpKernelContext* context, const Tensor& input,
                    Tensor* result) {
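  // Only the element width matters from here on: dispatch on sizeof(T) so a
  // single ReverseRows instantiation serves every memcpy-able dtype of that
  // width.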
  if (sizeof(T) == 1) {
    static_assert(sizeof(uint8) == 1, "uint8 must be 1 byte.");
    ReverseRows<uint8, NUM_CHANNELS>(context, input, result);
  } else if (sizeof(T) == 2) {
    static_assert(sizeof(uint16) == 2, "uint16 must be 2 bytes");
    ReverseRows<uint16, NUM_CHANNELS>(context, input, result);
  } else if (sizeof(T) == 4) {
    static_assert(sizeof(uint32) == 4, "uint32 must be 4 bytes");
    ReverseRows<uint32, NUM_CHANNELS>(context, input, result);
  } else if (sizeof(T) == 8) {
    static_assert(sizeof(uint64) == 8, "uint64 must be 8 bytes");
    ReverseRows<uint64, NUM_CHANNELS>(context, input, result);
  } else if (sizeof(T) == 16) {
    static_assert(sizeof(complex128) == 16, "complex128 must be 16 bytes");
    ReverseRows<complex128, NUM_CHANNELS>(context, input, result);
  } else {
    // Note: errors::InvalidArgument concatenates its arguments, so the
    // message is built from parts rather than printf-style placeholders.
    context->CtxFailure(errors::InvalidArgument(
        DataTypeString(input.dtype()), " has unexpected size of ", sizeof(T),
        " bytes"));
  }
}

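// This overload is a deliberate no-op: callers take the memcpy fast path
// only when data_type_can_memcpy<T>::value is true, but both overloads must
// still compile for every instantiated T.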
template <typename T, int NUM_CHANNELS>
typename std::enable_if<!data_type_can_memcpy<T>::value>::type
DoHandleReverseCase(OpKernelContext* context, const Tensor& input,
                    Tensor* result) {}

}  // namespace

template <typename Device, typename T, int NDIMS>
void HandleReverseCase(OpKernelContext* context,
                       typename TTypes<bool, 1>::ConstTensor dims,
                       Tensor* result) {
  const Tensor& input = context->input(0);

  // Use optimized reverse if possible.
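  // The fast path applies when exactly the middle dimension of a rank-3
  // CPU tensor of a memcpy-able type is reversed; NUM_CHANNELS == 3
  // specializes the common HWC image case.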
  if (NDIMS == 3 && std::is_same<Device, CPUDevice>::value &&
      data_type_can_memcpy<T>::value && (!dims(0) && dims(1) && !dims(2))) {
    if (input.dim_size(2) == 3) {
      DoHandleReverseCase<T, 3>(context, input, result);
    } else {
      DoHandleReverseCase<T, -1>(context, input, result);
    }
    return;
  }
  typename Eigen::array<bool, NDIMS> axes_di;
  for (int i = 0; i < NDIMS; i++) {
    axes_di[i] = dims(i);
  }
  functor::Reverse<Device, T, NDIMS>()(context->eigen_device<Device>(),
                                       input.tensor<T, NDIMS>(), axes_di,
                                       result->tensor<T, NDIMS>());
}

template <typename Device, typename T>
class ReverseOp : public OpKernel {
 public:
  explicit ReverseOp(OpKernelConstruction* context) : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
    const Tensor& input = context->input(0);
    const Tensor& dims = context->input(1);

    if (TensorShapeUtils::IsScalar(input.shape())) {
      context->set_output(0, input);
    } else {
      const int input_dims = input.dims();
      OP_REQUIRES(context, TensorShapeUtils::IsVector(dims.shape()),
                  errors::InvalidArgument("'dims' must be 1-dimensional, not ",
                                          dims.dims()));

      OP_REQUIRES(
          context, input_dims == dims.dim_size(0),
          errors::InvalidArgument(
              "'dims' must have the same number of values as 'input' has "
              "dimensions. 'input' has ",
              input_dims, ", 'dims' has ", dims.dim_size(0), " values"));
      OP_REQUIRES(context, input_dims <= 8,
                  errors::Unimplemented(
                      "reverse is not implemented for tensors of rank > 8."));

      Tensor* output = nullptr;
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, input.shape(), &output));

#define HANDLE_REVERSE(NDIMS)                                               \
  case NDIMS:                                                               \
    HandleReverseCase<Device, T, NDIMS>(context, dims.vec<bool>(), output); \
    return;

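      // Map the runtime rank onto a compile-time NDIMS so the reverse runs
      // over a fixed-rank Eigen tensor.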
      switch (input_dims) {
        HANDLE_REVERSE(0);
        HANDLE_REVERSE(1);
        HANDLE_REVERSE(2);
        HANDLE_REVERSE(3);
        HANDLE_REVERSE(4);
        HANDLE_REVERSE(5);
        HANDLE_REVERSE(6);
        HANDLE_REVERSE(7);
        HANDLE_REVERSE(8);
      }
#undef HANDLE_REVERSE
    }
  }
};

template <typename Device, typename T, int NDIMS>
void HandleReverseV2Case(OpKernelContext* context,
                         const gtl::ArraySlice<bool>& axes, Tensor* result) {
  const Tensor& input = context->input(0);

  // Use optimized reverse if possible.
  if (NDIMS == 3 && std::is_same<Device, CPUDevice>::value &&
      data_type_can_memcpy<T>::value && (!axes[0] && axes[1] && !axes[2])) {
    if (input.dim_size(2) == 3) {
      DoHandleReverseCase<T, 3>(context, input, result);
    } else {
      DoHandleReverseCase<T, -1>(context, input, result);
    }
    return;
  }

  typename Eigen::array<bool, NDIMS> axes_di;
  for (int i = 0; i < NDIMS; i++) {
    axes_di[i] = axes[i];
  }
  functor::Reverse<Device, T, NDIMS>()(context->eigen_device<Device>(),
                                       input.tensor<T, NDIMS>(), axes_di,
                                       result->tensor<T, NDIMS>());
}

template <typename Device, typename T, typename Tidx>
class ReverseV2Op : public OpKernel {
 public:
  explicit ReverseV2Op(OpKernelConstruction* context) : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
    const Tensor& input = context->input(0);
    const Tensor& sparse_dims = context->input(1);

    if (TensorShapeUtils::IsScalar(input.shape())) {
      context->set_output(0, input);
    } else {
      const int input_dims = input.dims();
      const TensorShape& sparse_dims_shape = sparse_dims.shape();
      const auto& axes_sparse_flat = sparse_dims.flat<Tidx>();

      OP_REQUIRES(context, TensorShapeUtils::IsVector(sparse_dims_shape),
                  errors::InvalidArgument("'dims' must be 1-dimensional, not ",
                                          sparse_dims.dims()));
      gtl::InlinedVector<bool, 8> axes_dense(input_dims, false);
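      // Canonicalize each (possibly negative) axis into [0, input_dims) and
      // reject duplicates; axes_dense[i] ends up true iff dimension i is to
      // be reversed.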
      for (int dummy = 0; dummy < axes_sparse_flat.size(); dummy++) {
        Tidx axis = internal::SubtleMustCopy<Tidx>(axes_sparse_flat(dummy));
        Tidx canonical_axis = axis < 0 ? input_dims + axis : axis;
        OP_REQUIRES(context, canonical_axis >= 0 && canonical_axis < input_dims,
                    errors::InvalidArgument("'axis'[", dummy, "] = ", axis,
                                            " is out of valid range [", 0, ", ",
                                            input_dims - 1, "]"));
        OP_REQUIRES(context, !axes_dense[canonical_axis],
                    errors::InvalidArgument("axis ", canonical_axis,
                                            " specified more than once."));
        axes_dense[canonical_axis] = true;
      }

      OP_REQUIRES(context, input_dims <= 8,
                  errors::Unimplemented(
                      "reverse is not implemented for tensors of rank > 8."));

      Tensor* output = nullptr;
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, input.shape(), &output));

      // TODO(cwhipkey): we can do dimension folding to reduce, e.g., a reverse
      // of a single dimension to the dims=3 or dims=2 case, regardless of the
      // number of dimensions in the tensor. This would let some ops use faster
      // lower-dimension code (and use optimized versions).

#define HANDLE_REVERSE(NDIMS)                                           \
  case NDIMS:                                                           \
    HandleReverseV2Case<Device, T, NDIMS>(context, axes_dense, output); \
    return;

      switch (input_dims) {
        HANDLE_REVERSE(0);
        HANDLE_REVERSE(1);
        HANDLE_REVERSE(2);
        HANDLE_REVERSE(3);
        HANDLE_REVERSE(4);
        HANDLE_REVERSE(5);
        HANDLE_REVERSE(6);
        HANDLE_REVERSE(7);
        HANDLE_REVERSE(8);
      }
#undef HANDLE_REVERSE
    }
  }
};

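// Registration of the CPU implementations.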
#define REGISTER_KERNELS(T)                                  \
  REGISTER_KERNEL_BUILDER(Name("Reverse")                    \
                              .Device(DEVICE_CPU)            \
                              .TypeConstraint<T>("T")        \
                              .HostMemory("dims"),           \
                          ReverseOp<CPUDevice, T>)           \
  REGISTER_KERNEL_BUILDER(Name("ReverseV2")                  \
                              .Device(DEVICE_CPU)            \
                              .TypeConstraint<T>("T")        \
                              .TypeConstraint<int32>("Tidx") \
                              .HostMemory("axis"),           \
                          ReverseV2Op<CPUDevice, T, int32>)  \
  REGISTER_KERNEL_BUILDER(Name("ReverseV2")                  \
                              .Device(DEVICE_CPU)            \
                              .TypeConstraint<T>("T")        \
                              .TypeConstraint<int64>("Tidx") \
                              .HostMemory("axis"),           \
                          ReverseV2Op<CPUDevice, T, int64>)
TF_CALL_POD_TYPES(REGISTER_KERNELS);
TF_CALL_string(REGISTER_KERNELS);
#undef REGISTER_KERNELS

#if GOOGLE_CUDA

// Forward declarations of the function specializations for GPU (to prevent
// building the GPU versions here; they are built when compiling the
// _gpu.cu.cc file).
namespace functor {
#define DECLARE_GPU_SPEC_DIM(T, DIM)                                  \
  template <>                                                         \
  void Reverse<GPUDevice, T, DIM>::operator()(                        \
      const GPUDevice& d, typename TTypes<T, DIM>::ConstTensor input, \
      const Eigen::array<bool, DIM>& reverse_dims,                    \
      typename TTypes<T, DIM>::Tensor output);                        \
  extern template struct Reverse<GPUDevice, T, DIM>;
#define DECLARE_GPU_SPEC(T)  \
  DECLARE_GPU_SPEC_DIM(T, 0) \
  DECLARE_GPU_SPEC_DIM(T, 1) \
  DECLARE_GPU_SPEC_DIM(T, 2) \
  DECLARE_GPU_SPEC_DIM(T, 3) \
  DECLARE_GPU_SPEC_DIM(T, 4) \
  DECLARE_GPU_SPEC_DIM(T, 5) \
  DECLARE_GPU_SPEC_DIM(T, 6) \
  DECLARE_GPU_SPEC_DIM(T, 7) \
  DECLARE_GPU_SPEC_DIM(T, 8)

TF_CALL_uint8(DECLARE_GPU_SPEC);
TF_CALL_int8(DECLARE_GPU_SPEC);
TF_CALL_bool(DECLARE_GPU_SPEC);
TF_CALL_half(DECLARE_GPU_SPEC);
TF_CALL_float(DECLARE_GPU_SPEC);
TF_CALL_double(DECLARE_GPU_SPEC);
TF_CALL_complex64(DECLARE_GPU_SPEC);
TF_CALL_complex128(DECLARE_GPU_SPEC);
#undef DECLARE_GPU_SPEC
#undef DECLARE_GPU_SPEC_DIM
}  // namespace functor

// Registration of the GPU implementations.
#define REGISTER_GPU_KERNELS(T)                              \
  REGISTER_KERNEL_BUILDER(Name("Reverse")                    \
                              .Device(DEVICE_GPU)            \
                              .TypeConstraint<T>("T")        \
                              .HostMemory("dims"),           \
                          ReverseOp<GPUDevice, T>)           \
  REGISTER_KERNEL_BUILDER(Name("ReverseV2")                  \
                              .Device(DEVICE_GPU)            \
                              .TypeConstraint<T>("T")        \
                              .TypeConstraint<int32>("Tidx") \
                              .HostMemory("axis"),           \
                          ReverseV2Op<GPUDevice, T, int32>)  \
  REGISTER_KERNEL_BUILDER(Name("ReverseV2")                  \
                              .Device(DEVICE_GPU)            \
                              .TypeConstraint<T>("T")        \
                              .TypeConstraint<int64>("Tidx") \
                              .HostMemory("axis"),           \
                          ReverseV2Op<GPUDevice, T, int64>)
TF_CALL_uint8(REGISTER_GPU_KERNELS);
TF_CALL_int8(REGISTER_GPU_KERNELS);
TF_CALL_bool(REGISTER_GPU_KERNELS);
TF_CALL_half(REGISTER_GPU_KERNELS);
TF_CALL_float(REGISTER_GPU_KERNELS);
TF_CALL_double(REGISTER_GPU_KERNELS);
TF_CALL_complex64(REGISTER_GPU_KERNELS);
TF_CALL_complex128(REGISTER_GPU_KERNELS);
#undef REGISTER_GPU_KERNELS

// A special GPU kernel for int32.
// TODO(b/25387198): Also enable int32 in device memory. This kernel
// registration requires all int32 inputs and outputs to be in host memory.
REGISTER_KERNEL_BUILDER(Name("Reverse")
                            .Device(DEVICE_GPU)
                            .TypeConstraint<int32>("T")
                            .HostMemory("tensor")
                            .HostMemory("dims")
                            .HostMemory("output"),
                        ReverseOp<CPUDevice, int32>);
REGISTER_KERNEL_BUILDER(Name("ReverseV2")
                            .Device(DEVICE_GPU)
                            .TypeConstraint<int32>("T")
                            .TypeConstraint<int32>("Tidx")
                            .HostMemory("tensor")
                            .HostMemory("axis")
                            .HostMemory("output"),
                        ReverseV2Op<CPUDevice, int32, int32>);
REGISTER_KERNEL_BUILDER(Name("ReverseV2")
                            .Device(DEVICE_GPU)
                            .TypeConstraint<int32>("T")
                            .TypeConstraint<int64>("Tidx")
                            .HostMemory("tensor")
                            .HostMemory("axis")
                            .HostMemory("output"),
                        ReverseV2Op<CPUDevice, int32, int64>);
#endif  // GOOGLE_CUDA

#ifdef TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNELS(T)                             \
  REGISTER_KERNEL_BUILDER(Name("Reverse")                    \
                              .Device(DEVICE_SYCL)           \
                              .TypeConstraint<T>("T")        \
                              .HostMemory("dims"),           \
                          ReverseOp<SYCLDevice, T>)          \
  REGISTER_KERNEL_BUILDER(Name("ReverseV2")                  \
                              .Device(DEVICE_SYCL)           \
                              .TypeConstraint<T>("T")        \
                              .TypeConstraint<int32>("Tidx") \
                              .HostMemory("axis"),           \
                          ReverseV2Op<SYCLDevice, T, int32>) \
  REGISTER_KERNEL_BUILDER(Name("ReverseV2")                  \
                              .Device(DEVICE_SYCL)           \
                              .TypeConstraint<T>("T")        \
                              .TypeConstraint<int64>("Tidx") \
                              .HostMemory("axis"),           \
                          ReverseV2Op<SYCLDevice, T, int64>)
TF_CALL_uint8(REGISTER_SYCL_KERNELS);
TF_CALL_int8(REGISTER_SYCL_KERNELS);
TF_CALL_float(REGISTER_SYCL_KERNELS);
TF_CALL_double(REGISTER_SYCL_KERNELS);
#undef REGISTER_SYCL_KERNELS

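// As with the GPU build, int32 reversal runs on the host: these
// registrations keep all int32 inputs and outputs in host memory.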
REGISTER_KERNEL_BUILDER(Name("Reverse")
                            .Device(DEVICE_SYCL)
                            .TypeConstraint<int32>("T")
                            .HostMemory("tensor")
                            .HostMemory("dims")
                            .HostMemory("output"),
                        ReverseOp<CPUDevice, int32>);
REGISTER_KERNEL_BUILDER(Name("ReverseV2")
                            .Device(DEVICE_SYCL)
                            .TypeConstraint<int32>("T")
                            .TypeConstraint<int32>("Tidx")
                            .HostMemory("tensor")
                            .HostMemory("axis")
                            .HostMemory("output"),
                        ReverseV2Op<CPUDevice, int32, int32>);
REGISTER_KERNEL_BUILDER(Name("ReverseV2")
                            .Device(DEVICE_SYCL)
                            .TypeConstraint<int32>("T")
                            .TypeConstraint<int64>("Tidx")
                            .HostMemory("tensor")
                            .HostMemory("axis")
                            .HostMemory("output"),
                        ReverseV2Op<CPUDevice, int32, int64>);
#endif  // TENSORFLOW_USE_SYCL
}  // namespace tensorflow