/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// See docs in ../ops/array_ops.cc
#define EIGEN_USE_THREADS

#include "tensorflow/core/kernels/reverse_op.h"
#include <memory>
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/bounds_check.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/type_traits.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/util/work_sharder.h"

namespace tensorflow {

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;

namespace {

// Reverse rows (middle dimension) of a three dimensional tensor.
// NUM_CHANNELS can be <= 0 to compute it dynamically from <input>
// Otherwise, it must equal input.dim_size(2) and is used as a compile-time
// constant.
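//
// Illustrative example (not from the original comment): for an input of
// shape [1, 3, 2] stored contiguously as
//   a0 a1 | b0 b1 | c0 c1
// reversing the middle dimension produces
//   c0 c1 | b0 b1 | a0 a1
// i.e. each row of inner_size contiguous elements is memcpy'd intact while
// the rows themselves are written out in reverse order.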
template <typename T, int NUM_CHANNELS>
void ReverseRows(OpKernelContext* context, const Tensor& input,
                 Tensor* result) {
  auto work = [&input, result](int64 start, int64 end) {
    const int64 inner_size =
        NUM_CHANNELS > 0 ? NUM_CHANNELS : input.dim_size(2);
    const int64 middle_size = input.dim_size(1);
    const int64 row_size = inner_size * middle_size;
    DCHECK_EQ(input.dim_size(2), inner_size);

    const T* in_ptr = input.bit_casted_tensor<T, 3>().data();
    T* out_ptr = result->bit_casted_tensor<T, 3>().data();

    in_ptr += start * row_size;
    out_ptr += start * row_size;

    for (int outer_dim = start; outer_dim < end; ++outer_dim) {
      out_ptr += row_size;
      int remaining = middle_size;
      while (remaining > 0) {
        out_ptr -= inner_size;
        memcpy(out_ptr, in_ptr, inner_size * sizeof(T));
        in_ptr += inner_size;
        --remaining;
      }

      out_ptr += row_size;
    }
  };

  // Shard across outer dimension.
  const int64 N = input.dim_size(0);
  const int64 cost_per_unit = input.NumElements() / N;
  auto worker_threads = context->device()->tensorflow_cpu_worker_threads();
  Shard(worker_threads->num_threads, worker_threads->workers, N, cost_per_unit,
        std::move(work));
}

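// Trait identifying element types whose values are plain fixed-width bytes,
// so a reversed copy can be produced with memcpy on the raw storage rather
// than element-wise assignment. Types not listed here (e.g. strings) never
// take the memcpy fast path.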
template <typename T>
struct data_type_can_memcpy {
  static constexpr bool value =
      std::is_same<T, uint8>::value || std::is_same<T, int8>::value ||
      std::is_same<T, bool>::value || std::is_same<T, uint16>::value ||
      std::is_same<T, int16>::value || std::is_same<T, Eigen::half>::value ||
      std::is_same<T, int32>::value || std::is_same<T, float>::value ||
      std::is_same<T, int64>::value || std::is_same<T, double>::value ||
      std::is_same<T, complex64>::value || std::is_same<T, complex128>::value;
};

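// Dispatches on sizeof(T) to a stand-in type of the same width (unsigned
// integers, or complex128 for 16 bytes), so ReverseRows is instantiated once
// per element size instead of once per logical dtype. The bytes are identical
// either way, since the data is only memcpy'd and never interpreted.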
template <typename T, int NUM_CHANNELS>
typename std::enable_if<data_type_can_memcpy<T>::value>::type
DoHandleReverseCase(OpKernelContext* context, const Tensor& input,
                    Tensor* result) {
  if (sizeof(T) == 1) {
    static_assert(sizeof(uint8) == 1, "uint8 must be 1 byte.");
    ReverseRows<uint8, NUM_CHANNELS>(context, input, result);
  } else if (sizeof(T) == 2) {
    static_assert(sizeof(uint16) == 2, "uint16 must be 2 bytes");
    ReverseRows<uint16, NUM_CHANNELS>(context, input, result);
  } else if (sizeof(T) == 4) {
    static_assert(sizeof(uint32) == 4, "uint32 must be 4 bytes");
    ReverseRows<uint32, NUM_CHANNELS>(context, input, result);
  } else if (sizeof(T) == 8) {
    static_assert(sizeof(uint64) == 8, "uint64 must be 8 bytes");
    ReverseRows<uint64, NUM_CHANNELS>(context, input, result);
  } else if (sizeof(T) == 16) {
    static_assert(sizeof(complex128) == 16, "complex128 must be 16 bytes");
    ReverseRows<complex128, NUM_CHANNELS>(context, input, result);
  } else {
    context->CtxFailure(errors::InvalidArgument(DataTypeString(input.dtype()),
                                                " has unexpected size of ",
                                                sizeof(T), " bytes"));
  }
}

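// Types that cannot be memcpy'd never reach the fast path (callers check
// data_type_can_memcpy<T>::value before dispatching here), so this overload
// exists only to keep the template instantiation well-formed and is
// intentionally a no-op.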
template <typename T, int NUM_CHANNELS>
typename std::enable_if<!data_type_can_memcpy<T>::value>::type
DoHandleReverseCase(OpKernelContext* context, const Tensor& input,
                    Tensor* result) {}

}  // namespace

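// Reverses `input` along the dimensions flagged in `dims`, writing the result
// into `result`. A rank-3 CPU tensor whose middle dimension alone is reversed
// (e.g. a left/right flip of an HWC image) takes the memcpy fast path above,
// with a NUM_CHANNELS == 3 specialization for the common 3-channel case;
// everything else falls through to the generic Eigen functor.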
template <typename Device, typename T, int NDIMS>
void HandleReverseCase(OpKernelContext* context,
                       typename TTypes<bool, 1>::ConstTensor dims,
                       Tensor* result) {
  const Tensor& input = context->input(0);

  // Use optimized reverse if possible.
  if (NDIMS == 3 && std::is_same<Device, CPUDevice>::value &&
      data_type_can_memcpy<T>::value && (!dims(0) && dims(1) && !dims(2))) {
    if (input.dim_size(2) == 3) {
      DoHandleReverseCase<T, 3>(context, input, result);
    } else {
      DoHandleReverseCase<T, -1>(context, input, result);
    }
    return;
  }
  typename Eigen::array<bool, NDIMS> axes_di;
  for (int i = 0; i < NDIMS; i++) {
    axes_di[i] = dims(i);
  }
  functor::Reverse<Device, T, NDIMS>()(context->eigen_device<Device>(),
                                       input.tensor<T, NDIMS>(), axes_di,
                                       result->tensor<T, NDIMS>());
}

template <typename Device, typename T>
class ReverseOp : public OpKernel {
 public:
  explicit ReverseOp(OpKernelConstruction* context) : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
    const Tensor& input = context->input(0);
    const Tensor& dims = context->input(1);

    if (TensorShapeUtils::IsScalar(input.shape())) {
      context->set_output(0, input);
    } else {
      const int input_dims = input.dims();
      OP_REQUIRES(context, TensorShapeUtils::IsVector(dims.shape()),
                  errors::InvalidArgument("'dims' must be 1-dimensional, not ",
                                          dims.dims()));

      OP_REQUIRES(
          context, input_dims == dims.dim_size(0),
          errors::InvalidArgument(
              "'dims' must have the same number of values as 'input' has "
              "dimensions. 'input' has ",
              input_dims, ", 'dims' has ", dims.dim_size(0), " values"));
      OP_REQUIRES(context, input_dims <= 8,
                  errors::Unimplemented(
                      "reverse is not implemented for tensors of rank > 8."));

      Tensor* output = nullptr;
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, input.shape(), &output));

#define HANDLE_REVERSE(NDIMS)                                               \
  case NDIMS:                                                               \
    HandleReverseCase<Device, T, NDIMS>(context, dims.vec<bool>(), output); \
    return;

      switch (input_dims) {
        HANDLE_REVERSE(0);
        HANDLE_REVERSE(1);
        HANDLE_REVERSE(2);
        HANDLE_REVERSE(3);
        HANDLE_REVERSE(4);
        HANDLE_REVERSE(5);
        HANDLE_REVERSE(6);
        HANDLE_REVERSE(7);
        HANDLE_REVERSE(8);
      }
#undef HANDLE_REVERSE
    }
  }
};

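// ReverseV2 counterpart of HandleReverseCase: identical logic, but the axes
// arrive as a dense bool slice built from the sparse `axis` input rather than
// as a bool tensor.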
template <typename Device, typename T, int NDIMS>
void HandleReverseV2Case(OpKernelContext* context,
                         const gtl::ArraySlice<bool>& axes, Tensor* result) {
  const Tensor& input = context->input(0);

  // Use optimized reverse if possible.
  if (NDIMS == 3 && std::is_same<Device, CPUDevice>::value &&
      data_type_can_memcpy<T>::value && (!axes[0] && axes[1] && !axes[2])) {
    if (input.dim_size(2) == 3) {
      DoHandleReverseCase<T, 3>(context, input, result);
    } else {
      DoHandleReverseCase<T, -1>(context, input, result);
    }
    return;
  }

  typename Eigen::array<bool, NDIMS> axes_di;
  for (int i = 0; i < NDIMS; i++) {
    axes_di[i] = axes[i];
  }
  functor::Reverse<Device, T, NDIMS>()(context->eigen_device<Device>(),
                                       input.tensor<T, NDIMS>(), axes_di,
                                       result->tensor<T, NDIMS>());
}

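// ReverseV2 takes the axes to flip as a list of (possibly negative) dimension
// indices rather than a bool mask. Compute() canonicalizes each index into
// [0, input_dims), rejects out-of-range and duplicate axes, and expands the
// list into the dense bool vector expected by HandleReverseV2Case.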
template <typename Device, typename T, typename Tidx>
class ReverseV2Op : public OpKernel {
 public:
  explicit ReverseV2Op(OpKernelConstruction* context) : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
    const Tensor& input = context->input(0);
    const Tensor& sparse_dims = context->input(1);

    if (TensorShapeUtils::IsScalar(input.shape()) || input.NumElements() == 0) {
      context->set_output(0, input);
    } else {
      const int input_dims = input.dims();
      const TensorShape& sparse_dims_shape = sparse_dims.shape();
      const auto& axes_sparse_flat = sparse_dims.flat<Tidx>();

      OP_REQUIRES(context, TensorShapeUtils::IsVector(sparse_dims_shape),
                  errors::InvalidArgument("'dims' must be 1-dimensional, not ",
                                          sparse_dims.dims()));
      gtl::InlinedVector<bool, 8> axes_dense(input_dims, false);
      for (int dummy = 0; dummy < axes_sparse_flat.size(); dummy++) {
        Tidx axis = internal::SubtleMustCopy<Tidx>(axes_sparse_flat(dummy));
        Tidx canonical_axis = axis < 0 ? input_dims + axis : axis;
        OP_REQUIRES(context, canonical_axis >= 0 && canonical_axis < input_dims,
                    errors::InvalidArgument("'axis'[", dummy, "] = ", axis,
                                            " is out of valid range [", 0, ", ",
                                            input_dims - 1, "]"));
        OP_REQUIRES(context, !axes_dense[canonical_axis],
                    errors::InvalidArgument("axis ", canonical_axis,
                                            " specified more than once."));
        axes_dense[canonical_axis] = true;
      }

      OP_REQUIRES(context, input_dims <= 8,
                  errors::Unimplemented(
                      "reverse is not implemented for tensors of rank > 8."));

      Tensor* output = nullptr;
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, input.shape(), &output));

      // TODO(cwhipkey): we can do dimension folding to reduce, e.g., a reverse
      // of a single dimension to the dims=3 or dims=2 case, regardless of the
      // number of dimensions in the tensor. This would let some ops use faster
      // lower-dimension code (and use optimized versions).

#define HANDLE_REVERSE(NDIMS)                                           \
  case NDIMS:                                                           \
    HandleReverseV2Case<Device, T, NDIMS>(context, axes_dense, output); \
    return;

      switch (input_dims) {
        HANDLE_REVERSE(0);
        HANDLE_REVERSE(1);
        HANDLE_REVERSE(2);
        HANDLE_REVERSE(3);
        HANDLE_REVERSE(4);
        HANDLE_REVERSE(5);
        HANDLE_REVERSE(6);
        HANDLE_REVERSE(7);
        HANDLE_REVERSE(8);
      }
#undef HANDLE_REVERSE
    }
  }
};

#define REGISTER_KERNELS(T)                                  \
  REGISTER_KERNEL_BUILDER(Name("Reverse")                    \
                              .Device(DEVICE_CPU)            \
                              .TypeConstraint<T>("T")        \
                              .HostMemory("dims"),           \
                          ReverseOp<CPUDevice, T>)           \
  REGISTER_KERNEL_BUILDER(Name("ReverseV2")                  \
                              .Device(DEVICE_CPU)            \
                              .TypeConstraint<T>("T")        \
                              .TypeConstraint<int32>("Tidx") \
                              .HostMemory("axis"),           \
                          ReverseV2Op<CPUDevice, T, int32>)  \
  REGISTER_KERNEL_BUILDER(Name("ReverseV2")                  \
                              .Device(DEVICE_CPU)            \
                              .TypeConstraint<T>("T")        \
                              .TypeConstraint<int64>("Tidx") \
                              .HostMemory("axis"),           \
                          ReverseV2Op<CPUDevice, T, int64>)
TF_CALL_POD_TYPES(REGISTER_KERNELS);
TF_CALL_tstring(REGISTER_KERNELS);
#undef REGISTER_KERNELS
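// Rough usage sketch (an assumption about the public Python API, not part of
// this file):
//   x = tf.constant([[1, 2, 3], [4, 5, 6]])
//   tf.reverse(x, axis=[1])  # => [[3, 2, 1], [6, 5, 4]]
// tf.reverse lowers to the ReverseV2 op, so on CPU this dispatches to one of
// the ReverseV2Op<CPUDevice, T, Tidx> kernels registered above, with the
// `axis` input pinned to host memory.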

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

// Forward declarations of the function specializations for GPU (to prevent
// building the GPU versions here, they will be built compiling _gpu.cu.cc).
namespace functor {
#define DECLARE_GPU_SPEC_DIM(T, DIM)                                  \
  template <>                                                         \
  void Reverse<GPUDevice, T, DIM>::operator()(                        \
      const GPUDevice& d, typename TTypes<T, DIM>::ConstTensor input, \
      const Eigen::array<bool, DIM>& reverse_dims,                    \
      typename TTypes<T, DIM>::Tensor output);                        \
  extern template struct Reverse<GPUDevice, T, DIM>;
#define DECLARE_GPU_SPEC(T)  \
  DECLARE_GPU_SPEC_DIM(T, 0) \
  DECLARE_GPU_SPEC_DIM(T, 1) \
  DECLARE_GPU_SPEC_DIM(T, 2) \
  DECLARE_GPU_SPEC_DIM(T, 3) \
  DECLARE_GPU_SPEC_DIM(T, 4) \
  DECLARE_GPU_SPEC_DIM(T, 5) \
  DECLARE_GPU_SPEC_DIM(T, 6) \
  DECLARE_GPU_SPEC_DIM(T, 7) \
  DECLARE_GPU_SPEC_DIM(T, 8)

TF_CALL_uint8(DECLARE_GPU_SPEC);
TF_CALL_int8(DECLARE_GPU_SPEC);
TF_CALL_GPU_ALL_TYPES(DECLARE_GPU_SPEC);
#undef DECLARE_GPU_SPEC
#undef DECLARE_GPU_SPEC_DIM
}  // namespace functor

// Registration of the GPU implementations.
#define REGISTER_GPU_KERNELS(T)                              \
  REGISTER_KERNEL_BUILDER(Name("Reverse")                    \
                              .Device(DEVICE_GPU)            \
                              .TypeConstraint<T>("T")        \
                              .HostMemory("dims"),           \
                          ReverseOp<GPUDevice, T>)           \
  REGISTER_KERNEL_BUILDER(Name("ReverseV2")                  \
                              .Device(DEVICE_GPU)            \
                              .TypeConstraint<T>("T")        \
                              .TypeConstraint<int32>("Tidx") \
                              .HostMemory("axis"),           \
                          ReverseV2Op<GPUDevice, T, int32>)  \
  REGISTER_KERNEL_BUILDER(Name("ReverseV2")                  \
                              .Device(DEVICE_GPU)            \
                              .TypeConstraint<T>("T")        \
                              .TypeConstraint<int64>("Tidx") \
                              .HostMemory("axis"),           \
                          ReverseV2Op<GPUDevice, T, int64>)
TF_CALL_uint8(REGISTER_GPU_KERNELS);
TF_CALL_int8(REGISTER_GPU_KERNELS);
TF_CALL_GPU_ALL_TYPES(REGISTER_GPU_KERNELS);
#undef REGISTER_GPU_KERNELS

// A special GPU kernel for int32.
// TODO(b/25387198): Also enable int32 in device memory. This kernel
// registration requires all int32 inputs and outputs to be in host memory.
REGISTER_KERNEL_BUILDER(Name("Reverse")
                            .Device(DEVICE_GPU)
                            .TypeConstraint<int32>("T")
                            .HostMemory("tensor")
                            .HostMemory("dims")
                            .HostMemory("output"),
                        ReverseOp<CPUDevice, int32>);
REGISTER_KERNEL_BUILDER(Name("ReverseV2")
                            .Device(DEVICE_GPU)
                            .TypeConstraint<int32>("T")
                            .TypeConstraint<int32>("Tidx")
                            .HostMemory("tensor")
                            .HostMemory("axis")
                            .HostMemory("output"),
                        ReverseV2Op<CPUDevice, int32, int32>);
REGISTER_KERNEL_BUILDER(Name("ReverseV2")
                            .Device(DEVICE_GPU)
                            .TypeConstraint<int32>("T")
                            .TypeConstraint<int64>("Tidx")
                            .HostMemory("tensor")
                            .HostMemory("axis")
                            .HostMemory("output"),
                        ReverseV2Op<CPUDevice, int32, int64>);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

}  // namespace tensorflow