/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// See docs in ../ops/array_ops.cc
#define EIGEN_USE_THREADS

#include "tensorflow/core/kernels/reverse_op.h"
#include <memory>
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/bounds_check.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/type_traits.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/util/work_sharder.h"

namespace tensorflow {

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;

namespace {

// Reverse rows (middle dimension) of a three dimensional tensor.
// NUM_CHANNELS can be <= 0 to compute it dynamically from <input>.
// Otherwise, it must equal input.dim_size(2) and is used as a compile-time
// constant.
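// For example, for an input of shape [N, 4, C], the length-C inner rows
// [r0, r1, r2, r3] of every outer slice come out as [r3, r2, r1, r0]; the
// inner (channel) dimension itself is not reversed.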
template <typename T, int NUM_CHANNELS>
void ReverseRows(OpKernelContext* context, const Tensor& input,
                 Tensor* result) {
  auto work = [&input, result](int64_t start, int64_t end) {
    const int64_t inner_size =
        NUM_CHANNELS > 0 ? NUM_CHANNELS : input.dim_size(2);
    const int64_t middle_size = input.dim_size(1);
    const int64_t row_size = inner_size * middle_size;
    DCHECK_EQ(input.dim_size(2), inner_size);

    const T* in_ptr = input.bit_casted_tensor<T, 3>().data();
    T* out_ptr = result->bit_casted_tensor<T, 3>().data();

    in_ptr += start * row_size;
    out_ptr += start * row_size;

    for (int outer_dim = start; outer_dim < end; ++outer_dim) {
      out_ptr += row_size;
      int remaining = middle_size;
      while (remaining > 0) {
        out_ptr -= inner_size;
        memcpy(out_ptr, in_ptr, inner_size * sizeof(T));
        in_ptr += inner_size;
        --remaining;
      }

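      // The while loop has walked out_ptr back to the start of this outer
      // slice; advance it to the start of the next slice.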
      out_ptr += row_size;
    }
  };

  // Shard across outer dimension.
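  // cost_per_unit is the element count of one outer slice, which Shard uses
  // to decide how many slices to hand to each worker thread.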
  const int64_t N = input.dim_size(0);
  const int64_t cost_per_unit = input.NumElements() / N;
  auto worker_threads = context->device()->tensorflow_cpu_worker_threads();
  Shard(worker_threads->num_threads, worker_threads->workers, N, cost_per_unit,
        std::move(work));
}

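// Trait marking POD types whose bytes can be moved with memcpy, so the
// reversal can operate on raw bit patterns instead of typed elements.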
template <typename T>
struct data_type_can_memcpy {
  static constexpr bool value =
      std::is_same<T, uint8>::value || std::is_same<T, int8>::value ||
      std::is_same<T, bool>::value || std::is_same<T, uint16>::value ||
      std::is_same<T, int16>::value || std::is_same<T, Eigen::half>::value ||
      std::is_same<T, int32>::value || std::is_same<T, float>::value ||
      std::is_same<T, int64_t>::value || std::is_same<T, double>::value ||
      std::is_same<T, complex64>::value || std::is_same<T, complex128>::value;
};

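// Dispatches ReverseRows on sizeof(T), bit-casting each memcpy-able type to
// an unsigned integer (or complex128) of the same width, so one
// instantiation per element size covers all such types.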
template <typename T, int NUM_CHANNELS>
typename std::enable_if<data_type_can_memcpy<T>::value>::type
DoHandleReverseCase(OpKernelContext* context, const Tensor& input,
                    Tensor* result) {
  if (sizeof(T) == 1) {
    static_assert(sizeof(uint8) == 1, "uint8 must be 1 byte.");
    ReverseRows<uint8, NUM_CHANNELS>(context, input, result);
  } else if (sizeof(T) == 2) {
    static_assert(sizeof(uint16) == 2, "uint16 must be 2 bytes");
    ReverseRows<uint16, NUM_CHANNELS>(context, input, result);
  } else if (sizeof(T) == 4) {
    static_assert(sizeof(uint32) == 4, "uint32 must be 4 bytes");
    ReverseRows<uint32, NUM_CHANNELS>(context, input, result);
  } else if (sizeof(T) == 8) {
    static_assert(sizeof(uint64) == 8, "uint64 must be 8 bytes");
    ReverseRows<uint64, NUM_CHANNELS>(context, input, result);
  } else if (sizeof(T) == 16) {
    static_assert(sizeof(complex128) == 16, "complex128 must be 16 bytes");
    ReverseRows<complex128, NUM_CHANNELS>(context, input, result);
  } else {
    context->CtxFailure(errors::InvalidArgument(DataTypeString(input.dtype()),
                                                " has unexpected size of ",
                                                sizeof(T), " bytes"));
  }
}

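// Overload for types that cannot be memcpy'd. Callers check
// data_type_can_memcpy<T>::value before taking the optimized path, so this
// empty body exists only to keep template instantiation well-formed and is
// not expected to run.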
template <typename T, int NUM_CHANNELS>
typename std::enable_if<!data_type_can_memcpy<T>::value>::type
DoHandleReverseCase(OpKernelContext* context, const Tensor& input,
                    Tensor* result) {}

}  // namespace

template <typename Device, typename T, int NDIMS>
void HandleReverseCase(OpKernelContext* context,
                       typename TTypes<bool, 1>::ConstTensor dims,
                       Tensor* result) {
  const Tensor& input = context->input(0);

  // Use optimized reverse if possible.
  if (NDIMS == 3 && std::is_same<Device, CPUDevice>::value &&
      data_type_can_memcpy<T>::value && (!dims(0) && dims(1) && !dims(2))) {
    if (input.dim_size(2) == 3) {
      DoHandleReverseCase<T, 3>(context, input, result);
    } else {
      DoHandleReverseCase<T, -1>(context, input, result);
    }
    return;
  }
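  // General case: delegate to the Eigen-based Reverse functor, which flips
  // every axis whose flag is set.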
  typename Eigen::array<bool, NDIMS> axes_di;
  for (int i = 0; i < NDIMS; i++) {
    axes_di[i] = dims(i);
  }
  functor::Reverse<Device, T, NDIMS>()(context->eigen_device<Device>(),
                                       input.tensor<T, NDIMS>(), axes_di,
                                       result->tensor<T, NDIMS>());
}

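// Kernel for the original Reverse op, whose second input is a bool vector
// with one flag per dimension. For example, reversing axis 1 of
// [[1, 2, 3], [4, 5, 6]] yields [[3, 2, 1], [6, 5, 4]].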
template <typename Device, typename T>
class ReverseOp : public OpKernel {
 public:
  explicit ReverseOp(OpKernelConstruction* context) : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
    const Tensor& input = context->input(0);
    // For non-scalar inputs, require a non-empty first dimension.
    if (input.dims() > 0) {
      OP_REQUIRES(
          context, input.dim_size(0) != 0,
          errors::InvalidArgument("Invalid input first dimension. Found 0."));
    }
    const Tensor& dims = context->input(1);

    if (TensorShapeUtils::IsScalar(input.shape())) {
      context->set_output(0, input);
    } else {
      const int input_dims = input.dims();
      OP_REQUIRES(context, TensorShapeUtils::IsVector(dims.shape()),
                  errors::InvalidArgument("'dims' must be 1-dimensional, not ",
                                          dims.dims()));

      OP_REQUIRES(
          context, input_dims == dims.dim_size(0),
          errors::InvalidArgument(
              "'dims' must have the same number of values as 'input' has "
              "dimensions. 'input' has ",
              input_dims, ", 'dims' has ", dims.dim_size(0), " values"));
      OP_REQUIRES(context, input_dims <= 8,
                  errors::Unimplemented(
                      "reverse is not implemented for tensors of rank > 8."));

      Tensor* output = nullptr;
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, input.shape(), &output));

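// Expands to a switch case that routes the runtime rank to a compile-time
// NDIMS instantiation of HandleReverseCase.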
#define HANDLE_REVERSE(NDIMS)                                               \
  case NDIMS:                                                               \
    HandleReverseCase<Device, T, NDIMS>(context, dims.vec<bool>(), output); \
    return;

      switch (input_dims) {
        HANDLE_REVERSE(0);
        HANDLE_REVERSE(1);
        HANDLE_REVERSE(2);
        HANDLE_REVERSE(3);
        HANDLE_REVERSE(4);
        HANDLE_REVERSE(5);
        HANDLE_REVERSE(6);
        HANDLE_REVERSE(7);
        HANDLE_REVERSE(8);
      }
#undef HANDLE_REVERSE
    }
  }
};

template <typename Device, typename T, int NDIMS>
void HandleReverseV2Case(OpKernelContext* context,
                         const gtl::ArraySlice<bool> axes, Tensor* result) {
  const Tensor& input = context->input(0);

  // Use optimized reverse if possible.
  if (NDIMS == 3 && std::is_same<Device, CPUDevice>::value &&
      data_type_can_memcpy<T>::value && (!axes[0] && axes[1] && !axes[2])) {
    if (input.dim_size(2) == 3) {
      DoHandleReverseCase<T, 3>(context, input, result);
    } else {
      DoHandleReverseCase<T, -1>(context, input, result);
    }
    return;
  }

  typename Eigen::array<bool, NDIMS> axes_di;
  for (int i = 0; i < NDIMS; i++) {
    axes_di[i] = axes[i];
  }
  functor::Reverse<Device, T, NDIMS>()(context->eigen_device<Device>(),
                                       input.tensor<T, NDIMS>(), axes_di,
                                       result->tensor<T, NDIMS>());
}

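// Kernel for ReverseV2, whose second input is a vector of axis indices
// (negative values count from the end) rather than a per-dimension bool mask.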
template <typename Device, typename T, typename Tidx>
class ReverseV2Op : public OpKernel {
 public:
  explicit ReverseV2Op(OpKernelConstruction* context) : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
    const Tensor& input = context->input(0);
    const Tensor& sparse_dims = context->input(1);

    if (TensorShapeUtils::IsScalar(input.shape()) || input.NumElements() == 0) {
      context->set_output(0, input);
    } else {
      const int input_dims = input.dims();
      const TensorShape& sparse_dims_shape = sparse_dims.shape();
      const auto& axes_sparse_flat = sparse_dims.flat<Tidx>();

      OP_REQUIRES(context, TensorShapeUtils::IsVector(sparse_dims_shape),
                  errors::InvalidArgument("'dims' must be 1-dimensional, not ",
                                          sparse_dims.dims()));
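      // Expand the sparse axis list into a dense per-dimension bool mask,
      // canonicalizing negative indices and rejecting out-of-range or
      // duplicate axes.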
      gtl::InlinedVector<bool, 8> axes_dense(input_dims, false);
      for (int dummy = 0; dummy < axes_sparse_flat.size(); dummy++) {
        Tidx axis = internal::SubtleMustCopy<Tidx>(axes_sparse_flat(dummy));
        Tidx canonical_axis = axis < 0 ? input_dims + axis : axis;
        OP_REQUIRES(context, canonical_axis >= 0 && canonical_axis < input_dims,
                    errors::InvalidArgument("'axis'[", dummy, "] = ", axis,
                                            " is out of valid range [", 0, ", ",
                                            input_dims - 1, "]"));
        OP_REQUIRES(context, !axes_dense[canonical_axis],
                    errors::InvalidArgument("axis ", canonical_axis,
                                            " specified more than once."));
        axes_dense[canonical_axis] = true;
      }

      OP_REQUIRES(context, input_dims <= 8,
                  errors::Unimplemented(
                      "reverse is not implemented for tensors of rank > 8."));

      Tensor* output = nullptr;
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, input.shape(), &output));

      // TODO(cwhipkey): we can do dimension folding to reduce, e.g., a reverse
      // of a single dimension to the dims=3 or dims=2 case, regardless of the
      // number of dimensions in the tensor. This would let some ops use faster
      // lower-dimension code (and use optimized versions).

#define HANDLE_REVERSE(NDIMS)                                           \
  case NDIMS:                                                           \
    HandleReverseV2Case<Device, T, NDIMS>(context, axes_dense, output); \
    return;

      switch (input_dims) {
        HANDLE_REVERSE(0);
        HANDLE_REVERSE(1);
        HANDLE_REVERSE(2);
        HANDLE_REVERSE(3);
        HANDLE_REVERSE(4);
        HANDLE_REVERSE(5);
        HANDLE_REVERSE(6);
        HANDLE_REVERSE(7);
        HANDLE_REVERSE(8);
      }
#undef HANDLE_REVERSE
    }
  }
};

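// CPU registrations. "dims"/"axis" are pinned to host memory because the
// kernels read them on the host to decide which dimensions to flip.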
#define REGISTER_KERNELS(T)                                    \
  REGISTER_KERNEL_BUILDER(Name("Reverse")                      \
                              .Device(DEVICE_CPU)              \
                              .TypeConstraint<T>("T")          \
                              .HostMemory("dims"),             \
                          ReverseOp<CPUDevice, T>)             \
  REGISTER_KERNEL_BUILDER(Name("ReverseV2")                    \
                              .Device(DEVICE_CPU)              \
                              .TypeConstraint<T>("T")          \
                              .TypeConstraint<int32>("Tidx")   \
                              .HostMemory("axis"),             \
                          ReverseV2Op<CPUDevice, T, int32>)    \
  REGISTER_KERNEL_BUILDER(Name("ReverseV2")                    \
                              .Device(DEVICE_CPU)              \
                              .TypeConstraint<T>("T")          \
                              .TypeConstraint<int64_t>("Tidx") \
                              .HostMemory("axis"),             \
                          ReverseV2Op<CPUDevice, T, int64>)
TF_CALL_POD_TYPES(REGISTER_KERNELS);
TF_CALL_tstring(REGISTER_KERNELS);
#undef REGISTER_KERNELS

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

// Forward declarations of the function specializations for GPU (to prevent
// building the GPU versions here; they will be built when compiling
// _gpu.cu.cc).
namespace functor {
#define DECLARE_GPU_SPEC_DIM(T, DIM)                                  \
  template <>                                                         \
  void Reverse<GPUDevice, T, DIM>::operator()(                        \
      const GPUDevice& d, typename TTypes<T, DIM>::ConstTensor input, \
      const Eigen::array<bool, DIM>& reverse_dims,                    \
      typename TTypes<T, DIM>::Tensor output);                        \
  extern template struct Reverse<GPUDevice, T, DIM>;
#define DECLARE_GPU_SPEC(T)  \
  DECLARE_GPU_SPEC_DIM(T, 0) \
  DECLARE_GPU_SPEC_DIM(T, 1) \
  DECLARE_GPU_SPEC_DIM(T, 2) \
  DECLARE_GPU_SPEC_DIM(T, 3) \
  DECLARE_GPU_SPEC_DIM(T, 4) \
  DECLARE_GPU_SPEC_DIM(T, 5) \
  DECLARE_GPU_SPEC_DIM(T, 6) \
  DECLARE_GPU_SPEC_DIM(T, 7) \
  DECLARE_GPU_SPEC_DIM(T, 8)

TF_CALL_uint8(DECLARE_GPU_SPEC);
TF_CALL_int8(DECLARE_GPU_SPEC);
TF_CALL_GPU_ALL_TYPES(DECLARE_GPU_SPEC);
#undef DECLARE_GPU_SPEC
#undef DECLARE_GPU_SPEC_DIM
}  // namespace functor

// Registration of the GPU implementations.
#define REGISTER_GPU_KERNELS(T)                                \
  REGISTER_KERNEL_BUILDER(Name("Reverse")                      \
                              .Device(DEVICE_GPU)              \
                              .TypeConstraint<T>("T")          \
                              .HostMemory("dims"),             \
                          ReverseOp<GPUDevice, T>)             \
  REGISTER_KERNEL_BUILDER(Name("ReverseV2")                    \
                              .Device(DEVICE_GPU)              \
                              .TypeConstraint<T>("T")          \
                              .TypeConstraint<int32>("Tidx")   \
                              .HostMemory("axis"),             \
                          ReverseV2Op<GPUDevice, T, int32>)    \
  REGISTER_KERNEL_BUILDER(Name("ReverseV2")                    \
                              .Device(DEVICE_GPU)              \
                              .TypeConstraint<T>("T")          \
                              .TypeConstraint<int64_t>("Tidx") \
                              .HostMemory("axis"),             \
                          ReverseV2Op<GPUDevice, T, int64>)
TF_CALL_uint8(REGISTER_GPU_KERNELS);
TF_CALL_int8(REGISTER_GPU_KERNELS);
TF_CALL_GPU_ALL_TYPES(REGISTER_GPU_KERNELS);
#undef REGISTER_GPU_KERNELS

// A special GPU kernel for int32.
// TODO(b/25387198): Also enable int32 in device memory. This kernel
// registration requires all int32 inputs and outputs to be in host memory.
REGISTER_KERNEL_BUILDER(Name("Reverse")
                            .Device(DEVICE_GPU)
                            .TypeConstraint<int32>("T")
                            .HostMemory("tensor")
                            .HostMemory("dims")
                            .HostMemory("output"),
                        ReverseOp<CPUDevice, int32>);
REGISTER_KERNEL_BUILDER(Name("ReverseV2")
                            .Device(DEVICE_GPU)
                            .TypeConstraint<int32>("T")
                            .TypeConstraint<int32>("Tidx")
                            .HostMemory("tensor")
                            .HostMemory("axis")
                            .HostMemory("output"),
                        ReverseV2Op<CPUDevice, int32, int32>);
REGISTER_KERNEL_BUILDER(Name("ReverseV2")
                            .Device(DEVICE_GPU)
                            .TypeConstraint<int32>("T")
                            .TypeConstraint<int64_t>("Tidx")
                            .HostMemory("tensor")
                            .HostMemory("axis")
                            .HostMemory("output"),
                        ReverseV2Op<CPUDevice, int32, int64>);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

}  // namespace tensorflow