/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// See docs in ../ops/array_ops.cc
#define EIGEN_USE_THREADS

#include "tensorflow/core/kernels/reverse_op.h"
#include <memory>
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/bounds_check.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/type_traits.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/util/work_sharder.h"

namespace tensorflow {

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;

namespace {

// Reverse rows (middle dimension) of a three dimensional tensor.
// NUM_CHANNELS can be <= 0 to compute it dynamically from <input>
// Otherwise, it must equal input.dim_size(2) and is used as a compile-time
// constant.
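// For example, with input shape [outer, middle, inner] = [1, 3, 2] and rows
// r0, r1, r2 along the middle dimension, the output holds r2, r1, r0; the
// elements inside each row keep their order, so every row is copied with a
// single memcpy of inner_size * sizeof(T) bytes.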
template <typename T, int NUM_CHANNELS>
void ReverseRows(OpKernelContext* context, const Tensor& input,
                 Tensor* result) {
  auto work = [&input, result](int64 start, int64 end) {
    const int64 inner_size =
        NUM_CHANNELS > 0 ? NUM_CHANNELS : input.dim_size(2);
    const int64 middle_size = input.dim_size(1);
    const int64 row_size = inner_size * middle_size;
    DCHECK_EQ(input.dim_size(2), inner_size);

    const T* in_ptr = input.bit_casted_tensor<T, 3>().data();
    T* out_ptr = result->bit_casted_tensor<T, 3>().data();

    in_ptr += start * row_size;
    out_ptr += start * row_size;

    for (int outer_dim = start; outer_dim < end; ++outer_dim) {
      out_ptr += row_size;
      int remaining = middle_size;
      while (remaining > 0) {
        out_ptr -= inner_size;
        memcpy(out_ptr, in_ptr, inner_size * sizeof(T));
        in_ptr += inner_size;
        --remaining;
      }

      out_ptr += row_size;
    }
  };

  // Shard across outer dimension.
  const int64 N = input.dim_size(0);
  const int64 cost_per_unit = input.NumElements() / N;
  auto worker_threads = context->device()->tensorflow_cpu_worker_threads();
  Shard(worker_threads->num_threads, worker_threads->workers, N, cost_per_unit,
        std::move(work));
}

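// Types whose elements can be copied byte-for-byte with memcpy (fixed-width
// scalars with no internal pointers); the fast path above relies on this.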
template <typename T>
struct data_type_can_memcpy {
  static constexpr bool value =
      std::is_same<T, uint8>::value || std::is_same<T, int8>::value ||
      std::is_same<T, bool>::value || std::is_same<T, uint16>::value ||
      std::is_same<T, int16>::value || std::is_same<T, Eigen::half>::value ||
      std::is_same<T, int32>::value || std::is_same<T, float>::value ||
      std::is_same<T, int64>::value || std::is_same<T, double>::value ||
      std::is_same<T, complex64>::value || std::is_same<T, complex128>::value;
};

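// Dispatches the memcpy-based reverse on sizeof(T) rather than on T itself, so
// all element types of the same width share one ReverseRows instantiation
// (uint8/uint16/uint32/uint64/complex128 stand in for 1/2/4/8/16-byte
// elements).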
template <typename T, int NUM_CHANNELS>
typename std::enable_if<data_type_can_memcpy<T>::value>::type
DoHandleReverseCase(OpKernelContext* context, const Tensor& input,
                    Tensor* result) {
  if (sizeof(T) == 1) {
    static_assert(sizeof(uint8) == 1, "uint8 must be 1 byte.");
    ReverseRows<uint8, NUM_CHANNELS>(context, input, result);
  } else if (sizeof(T) == 2) {
    static_assert(sizeof(uint16) == 2, "uint16 must be 2 bytes");
    ReverseRows<uint16, NUM_CHANNELS>(context, input, result);
  } else if (sizeof(T) == 4) {
    static_assert(sizeof(uint32) == 4, "uint32 must be 4 bytes");
    ReverseRows<uint32, NUM_CHANNELS>(context, input, result);
  } else if (sizeof(T) == 8) {
    static_assert(sizeof(uint64) == 8, "uint64 must be 8 bytes");
    ReverseRows<uint64, NUM_CHANNELS>(context, input, result);
  } else if (sizeof(T) == 16) {
    static_assert(sizeof(complex128) == 16, "complex128 must be 16 bytes");
    ReverseRows<complex128, NUM_CHANNELS>(context, input, result);
  } else {
    context->CtxFailure(errors::InvalidArgument(DataTypeString(input.dtype()),
                                                " has unexpected size of ",
                                                sizeof(T), " bytes"));
  }
}

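// No-op overload for types that cannot be reversed with memcpy. It exists only
// so the call sites below compile for every T; it is never reached at runtime
// because callers first check data_type_can_memcpy<T>::value.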
template <typename T, int NUM_CHANNELS>
typename std::enable_if<!data_type_can_memcpy<T>::value>::type
DoHandleReverseCase(OpKernelContext* context, const Tensor& input,
                    Tensor* result) {}

}  // namespace

template <typename Device, typename T, int NDIMS>
void HandleReverseCase(OpKernelContext* context,
                       typename TTypes<bool, 1>::ConstTensor dims,
                       Tensor* result) {
  const Tensor& input = context->input(0);

  // Use optimized reverse if possible.
  if (NDIMS == 3 && std::is_same<Device, CPUDevice>::value &&
      data_type_can_memcpy<T>::value && (!dims(0) && dims(1) && !dims(2))) {
    if (input.dim_size(2) == 3) {
      DoHandleReverseCase<T, 3>(context, input, result);
    } else {
      DoHandleReverseCase<T, -1>(context, input, result);
    }
    return;
  }
  typename Eigen::array<bool, NDIMS> axes_di;
  for (int i = 0; i < NDIMS; i++) {
    axes_di[i] = dims(i);
  }
  functor::Reverse<Device, T, NDIMS>()(context->eigen_device<Device>(),
                                       input.tensor<T, NDIMS>(), axes_di,
                                       result->tensor<T, NDIMS>());
}

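// Reverse: "dims" is a bool vector with one entry per input dimension; every
// dimension whose entry is true is reversed.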
template <typename Device, typename T>
class ReverseOp : public OpKernel {
 public:
  explicit ReverseOp(OpKernelConstruction* context) : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
    const Tensor& input = context->input(0);
    const Tensor& dims = context->input(1);

    if (TensorShapeUtils::IsScalar(input.shape())) {
      context->set_output(0, input);
    } else {
      const int input_dims = input.dims();
      OP_REQUIRES(context, TensorShapeUtils::IsVector(dims.shape()),
                  errors::InvalidArgument("'dims' must be 1-dimensional, not ",
                                          dims.dims()));

      OP_REQUIRES(
          context, input_dims == dims.dim_size(0),
          errors::InvalidArgument(
              "'dims' must have the same number of values as 'input' has "
              "dimensions. 'input' has ",
              input_dims, ", 'dims' has ", dims.dim_size(0), " values"));
      OP_REQUIRES(context, input_dims <= 8,
                  errors::Unimplemented(
                      "reverse is not implemented for tensors of rank > 8."));

      Tensor* output = nullptr;
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, input.shape(), &output));

#define HANDLE_REVERSE(NDIMS)                                               \
  case NDIMS:                                                               \
    HandleReverseCase<Device, T, NDIMS>(context, dims.vec<bool>(), output); \
    return;

      switch (input_dims) {
        HANDLE_REVERSE(0);
        HANDLE_REVERSE(1);
        HANDLE_REVERSE(2);
        HANDLE_REVERSE(3);
        HANDLE_REVERSE(4);
        HANDLE_REVERSE(5);
        HANDLE_REVERSE(6);
        HANDLE_REVERSE(7);
        HANDLE_REVERSE(8);
      }
#undef HANDLE_REVERSE
    }
  }
};

template <typename Device, typename T, int NDIMS>
void HandleReverseV2Case(OpKernelContext* context,
                         const gtl::ArraySlice<bool>& axes, Tensor* result) {
  const Tensor& input = context->input(0);

  // Use optimized reverse if possible.
  if (NDIMS == 3 && std::is_same<Device, CPUDevice>::value &&
      data_type_can_memcpy<T>::value && (!axes[0] && axes[1] && !axes[2])) {
    if (input.dim_size(2) == 3) {
      DoHandleReverseCase<T, 3>(context, input, result);
    } else {
      DoHandleReverseCase<T, -1>(context, input, result);
    }
    return;
  }

  typename Eigen::array<bool, NDIMS> axes_di;
  for (int i = 0; i < NDIMS; i++) {
    axes_di[i] = axes[i];
  }
  functor::Reverse<Device, T, NDIMS>()(context->eigen_device<Device>(),
                                       input.tensor<T, NDIMS>(), axes_di,
                                       result->tensor<T, NDIMS>());
}

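// ReverseV2: instead of the per-dimension bool mask used by Reverse, "axis" is
// a vector of dimension indices (possibly negative) to reverse. Compute()
// canonicalizes the indices into a dense bool mask and then dispatches on the
// input rank.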
template <typename Device, typename T, typename Tidx>
class ReverseV2Op : public OpKernel {
 public:
  explicit ReverseV2Op(OpKernelConstruction* context) : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
    const Tensor& input = context->input(0);
    const Tensor& sparse_dims = context->input(1);

    if (TensorShapeUtils::IsScalar(input.shape()) || input.NumElements() == 0) {
      context->set_output(0, input);
    } else {
      const int input_dims = input.dims();
      const TensorShape& sparse_dims_shape = sparse_dims.shape();
      const auto& axes_sparse_flat = sparse_dims.flat<Tidx>();

      OP_REQUIRES(context, TensorShapeUtils::IsVector(sparse_dims_shape),
                  errors::InvalidArgument("'dims' must be 1-dimensional, not ",
                                          sparse_dims.dims()));
      gtl::InlinedVector<bool, 8> axes_dense(input_dims, false);
      for (int dummy = 0; dummy < axes_sparse_flat.size(); dummy++) {
        Tidx axis = internal::SubtleMustCopy<Tidx>(axes_sparse_flat(dummy));
        Tidx canonical_axis = axis < 0 ? input_dims + axis : axis;
        OP_REQUIRES(context, canonical_axis >= 0 && canonical_axis < input_dims,
                    errors::InvalidArgument("'axis'[", dummy, "] = ", axis,
                                            " is out of valid range [", 0, ", ",
                                            input_dims - 1, "]"));
        OP_REQUIRES(context, !axes_dense[canonical_axis],
                    errors::InvalidArgument("axis ", canonical_axis,
                                            " specified more than once."));
        axes_dense[canonical_axis] = true;
      }

      OP_REQUIRES(context, input_dims <= 8,
                  errors::Unimplemented(
                      "reverse is not implemented for tensors of rank > 8."));

      Tensor* output = nullptr;
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, input.shape(), &output));

      // TODO(cwhipkey): we can do dimension folding to reduce, e.g., a reverse
      // of a single dimension to the dims=3 or dims=2 case, regardless of the
      // number of dimensions in the tensor. This would let some ops use faster
      // lower-dimension code (and use optimized versions).
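      // For example, a reverse of only axis 1 of a [2, 3, 4, 5] tensor could
      // then be handled as a reverse of the middle dimension of a [2, 3, 20]
      // view, which hits the memcpy fast path above.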

#define HANDLE_REVERSE(NDIMS)                                           \
  case NDIMS:                                                           \
    HandleReverseV2Case<Device, T, NDIMS>(context, axes_dense, output); \
    return;

      switch (input_dims) {
        HANDLE_REVERSE(0);
        HANDLE_REVERSE(1);
        HANDLE_REVERSE(2);
        HANDLE_REVERSE(3);
        HANDLE_REVERSE(4);
        HANDLE_REVERSE(5);
        HANDLE_REVERSE(6);
        HANDLE_REVERSE(7);
        HANDLE_REVERSE(8);
      }
#undef HANDLE_REVERSE
    }
  }
};

#define REGISTER_KERNELS(T)                                  \
  REGISTER_KERNEL_BUILDER(Name("Reverse")                    \
                              .Device(DEVICE_CPU)            \
                              .TypeConstraint<T>("T")        \
                              .HostMemory("dims"),           \
                          ReverseOp<CPUDevice, T>)           \
  REGISTER_KERNEL_BUILDER(Name("ReverseV2")                  \
                              .Device(DEVICE_CPU)            \
                              .TypeConstraint<T>("T")        \
                              .TypeConstraint<int32>("Tidx") \
                              .HostMemory("axis"),           \
                          ReverseV2Op<CPUDevice, T, int32>)  \
  REGISTER_KERNEL_BUILDER(Name("ReverseV2")                  \
                              .Device(DEVICE_CPU)            \
                              .TypeConstraint<T>("T")        \
                              .TypeConstraint<int64>("Tidx") \
                              .HostMemory("axis"),           \
                          ReverseV2Op<CPUDevice, T, int64>)
TF_CALL_POD_TYPES(REGISTER_KERNELS);
TF_CALL_tstring(REGISTER_KERNELS);
#undef REGISTER_KERNELS

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

// Forward declarations of the function specializations for GPU (to prevent
// building the GPU versions here, they will be built compiling _gpu.cu.cc).
namespace functor {
#define DECLARE_GPU_SPEC_DIM(T, DIM)                                  \
  template <>                                                         \
  void Reverse<GPUDevice, T, DIM>::operator()(                        \
      const GPUDevice& d, typename TTypes<T, DIM>::ConstTensor input, \
      const Eigen::array<bool, DIM>& reverse_dims,                    \
      typename TTypes<T, DIM>::Tensor output);                        \
  extern template struct Reverse<GPUDevice, T, DIM>;
#define DECLARE_GPU_SPEC(T)  \
  DECLARE_GPU_SPEC_DIM(T, 0) \
  DECLARE_GPU_SPEC_DIM(T, 1) \
  DECLARE_GPU_SPEC_DIM(T, 2) \
  DECLARE_GPU_SPEC_DIM(T, 3) \
  DECLARE_GPU_SPEC_DIM(T, 4) \
  DECLARE_GPU_SPEC_DIM(T, 5) \
  DECLARE_GPU_SPEC_DIM(T, 6) \
  DECLARE_GPU_SPEC_DIM(T, 7) \
  DECLARE_GPU_SPEC_DIM(T, 8)

TF_CALL_uint8(DECLARE_GPU_SPEC);
TF_CALL_int8(DECLARE_GPU_SPEC);
TF_CALL_GPU_ALL_TYPES(DECLARE_GPU_SPEC);
#undef DECLARE_GPU_SPEC
#undef DECLARE_GPU_SPEC_DIM
}  // namespace functor

// Registration of the GPU implementations.
#define REGISTER_GPU_KERNELS(T)                              \
  REGISTER_KERNEL_BUILDER(Name("Reverse")                    \
                              .Device(DEVICE_GPU)            \
                              .TypeConstraint<T>("T")        \
                              .HostMemory("dims"),           \
                          ReverseOp<GPUDevice, T>)           \
  REGISTER_KERNEL_BUILDER(Name("ReverseV2")                  \
                              .Device(DEVICE_GPU)            \
                              .TypeConstraint<T>("T")        \
                              .TypeConstraint<int32>("Tidx") \
                              .HostMemory("axis"),           \
                          ReverseV2Op<GPUDevice, T, int32>)  \
  REGISTER_KERNEL_BUILDER(Name("ReverseV2")                  \
                              .Device(DEVICE_GPU)            \
                              .TypeConstraint<T>("T")        \
                              .TypeConstraint<int64>("Tidx") \
                              .HostMemory("axis"),           \
                          ReverseV2Op<GPUDevice, T, int64>)
TF_CALL_uint8(REGISTER_GPU_KERNELS);
TF_CALL_int8(REGISTER_GPU_KERNELS);
TF_CALL_GPU_ALL_TYPES(REGISTER_GPU_KERNELS);
#undef REGISTER_GPU_KERNELS

// A special GPU kernel for int32.
// TODO(b/25387198): Also enable int32 in device memory. This kernel
// registration requires all int32 inputs and outputs to be in host memory.
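// Because everything stays in host memory, these registrations reuse the CPU
// implementation (ReverseOp<CPUDevice, int32> / ReverseV2Op<CPUDevice, ...>)
// even though they are registered for DEVICE_GPU.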
REGISTER_KERNEL_BUILDER(Name("Reverse")
                            .Device(DEVICE_GPU)
                            .TypeConstraint<int32>("T")
                            .HostMemory("tensor")
                            .HostMemory("dims")
                            .HostMemory("output"),
                        ReverseOp<CPUDevice, int32>);
REGISTER_KERNEL_BUILDER(Name("ReverseV2")
                            .Device(DEVICE_GPU)
                            .TypeConstraint<int32>("T")
                            .TypeConstraint<int32>("Tidx")
                            .HostMemory("tensor")
                            .HostMemory("axis")
                            .HostMemory("output"),
                        ReverseV2Op<CPUDevice, int32, int32>);
REGISTER_KERNEL_BUILDER(Name("ReverseV2")
                            .Device(DEVICE_GPU)
                            .TypeConstraint<int32>("T")
                            .TypeConstraint<int64>("Tidx")
                            .HostMemory("tensor")
                            .HostMemory("axis")
                            .HostMemory("output"),
                        ReverseV2Op<CPUDevice, int32, int64>);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

}  // namespace tensorflow