/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// See docs in ../ops/array_ops.cc.

#define EIGEN_USE_THREADS

#ifdef GOOGLE_CUDA
#define EIGEN_USE_GPU
#endif  // GOOGLE_CUDA

#include <vector>

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"

#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/framework/type_index.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/types.h"

namespace tensorflow {

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
#ifdef TENSORFLOW_USE_SYCL
typedef Eigen::SyclDevice SYCLDevice;
#endif  // TENSORFLOW_USE_SYCL

// Forward declarations of functors that will be defined in tile_ops_impl.h
namespace functor {
template <typename Device, typename T, typename Tmultiple>
struct Tile {
  void operator()(const Device& d, Tensor* out, const Tensor& in,
                  const gtl::ArraySlice<Tmultiple> broadcast_array) const;
};

template <typename Device, typename T, int NDIM>
struct TileGrad {
  void operator()(const Device& d, typename TTypes<T, NDIM>::Tensor out,
                  typename TTypes<T, NDIM>::ConstTensor in,
                  const Eigen::DSizes<Eigen::DenseIndex, NDIM>& indices,
                  const Eigen::DSizes<Eigen::DenseIndex, NDIM>& sizes,
                  bool first) const;
};

template <typename Device, typename T>
struct TileGrad<Device, T, 0> {
  void operator()(const Device& d, typename TTypes<T, 0>::Tensor out,
                  typename TTypes<T, 0>::ConstTensor in,
                  const Eigen::DSizes<Eigen::DenseIndex, 0>&,
                  const Eigen::DSizes<Eigen::DenseIndex, 0>&, bool first) const;
};

template <typename Device, typename T, int NDIM, int REDUCEDNDIM>
struct ReduceAndReshape {
  void operator()(
      const Device& d, typename TTypes<T, NDIM>::Tensor out,
      typename TTypes<T, NDIM>::ConstTensor in,
      const Eigen::DSizes<Eigen::DenseIndex, REDUCEDNDIM>& reduce_dim,
      const Eigen::DSizes<Eigen::DenseIndex, NDIM>& reshape_dim) const;
};
}  // namespace functor

// --------------------------------------------------------------------------
template <typename Device, typename Tmultiples>
class TileOp : public OpKernel {
 public:
  explicit TileOp(OpKernelConstruction* context) : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
    const Tensor& input = context->input(0);
    const Tensor& multiples = context->input(1);

    OP_REQUIRES(
        context, IsLegacyVector(multiples.shape()),
        errors::InvalidArgument("Expected multiples to be 1-D, but got shape ",
                                multiples.shape().DebugString()));
    OP_REQUIRES(context, input.dims() == multiples.NumElements(),
                errors::InvalidArgument(
                    "Expected multiples argument to be a vector of length ",
                    input.dims(), " but got length ", multiples.dim_size(0)));
    const int input_dims = input.dims();

    // Eigen doesn't support scalars on the GPU, so handle 0-D specially.
    if (input_dims == 0) {
      context->set_output(0, input);
      return;
    }

    const gtl::ArraySlice<Tmultiples> multiples_array(
        multiples.flat<Tmultiples>().data(), input_dims);
    TensorShape output_shape;
    for (int i = 0; i < input_dims; ++i) {
      OP_REQUIRES(
          context, multiples_array[i] >= 0,
          errors::InvalidArgument("Expected multiples[", i, "] >= 0, but got ",
                                  multiples_array[i]));
      output_shape.AddDim(input.dim_size(i) * multiples_array[i]);
    }
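    // For example, an input of shape [2, 3] tiled with multiples [2, 2]
    // produces an output of shape [4, 6]. If every multiple is 1, the output
    // shape equals the input shape and the input is simply forwarded below.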
    if (output_shape == input.shape()) {
      context->set_output(0, input);
      return;
    }
    Tensor* result = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &result));

    // If there's no output, there's nothing to do.
    if (output_shape.num_elements() == 0) return;

#define HANDLE_TYPE(DT)                               \
  if (context->input(0).dtype() == DT) {              \
    HandleCase<DT>(context, multiples_array, result); \
    return;                                           \
  }

#define HANDLE_TYPE_NAME(T) HANDLE_TYPE(DataTypeToEnum<T>::value)

    // Invoke the handlers through TF_CALL_* so that per-platform type
    // filtering applies.
    TF_CALL_bool(HANDLE_TYPE_NAME);
    TF_CALL_bfloat16(HANDLE_TYPE_NAME);
    TF_CALL_float(HANDLE_TYPE_NAME);
    TF_CALL_double(HANDLE_TYPE_NAME);
    TF_CALL_uint8(HANDLE_TYPE_NAME);
    TF_CALL_int32(HANDLE_TYPE_NAME);
    TF_CALL_int16(HANDLE_TYPE_NAME);
    TF_CALL_int64(HANDLE_TYPE_NAME);
    TF_CALL_half(HANDLE_TYPE_NAME);
    TF_CALL_string(HANDLE_TYPE_NAME);  // when DEVICE=CPUDevice.
    TF_CALL_complex64(HANDLE_TYPE_NAME);
    TF_CALL_complex128(HANDLE_TYPE_NAME);

#undef HANDLE_TYPE_NAME
#undef HANDLE_TYPE

    OP_REQUIRES(
        context, false,
        errors::Unimplemented(
            "TileOp : The input data type is not supported, DataType : ",
            DataTypeString(context->input(0).dtype()),
            ", Dimension : ", input_dims));
  }

 private:
  template <DataType DT>
  void HandleCaseImpl(OpKernelContext* context,
                      const gtl::ArraySlice<Tmultiples>& multiples_array,
                      Tensor* result) {
    typedef typename EnumToDataType<DT>::Type T;
    functor::Tile<Device, T, Tmultiples>()(context->eigen_device<Device>(),
                                           result, context->input(0),
                                           multiples_array);
  }

  template <DataType DT>
  void HandleCase(OpKernelContext* context,
                  const gtl::ArraySlice<Tmultiples>& multiples_array,
                  Tensor* result);

  TF_DISALLOW_COPY_AND_ASSIGN(TileOp);
};

template <typename Device, typename Tmultiples>
template <DataType DT>
inline void TileOp<Device, Tmultiples>::HandleCase(
    OpKernelContext* context,
    const gtl::ArraySlice<Tmultiples>& multiples_array, Tensor* result) {
  // TODO(vrv): print out the device name if useful. Currently disabled to
  // avoid having to use RTTI.
  LOG(FATAL) << "TileOp: Invalid combination of Device, DT: "
             // << typeid(Device).name() << ", "
             << DataTypeString(DT);
}

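// The explicit specializations generated below forward HandleCase to
// HandleCaseImpl for every supported (device, dtype) pair; any combination
// without a specialization falls through to the LOG(FATAL) default above.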
#define HANDLE_CASE(device, dtype, Tmultiples)                              \
  template <>                                                               \
  template <>                                                               \
  void TileOp<device, Tmultiples>::HandleCase<dtype>(                       \
      OpKernelContext * context,                                            \
      const gtl::ArraySlice<Tmultiples>& multiples_array, Tensor* result) { \
    HandleCaseImpl<dtype>(context, multiples_array, result);                \
  }

#define HANDLE_TYPE_NAME_CPU(T)                            \
  HANDLE_CASE(CPUDevice, DataTypeToEnum<T>::value, int32); \
  HANDLE_CASE(CPUDevice, DataTypeToEnum<T>::value, int64);

#define HANDLE_TYPE_NAME_GPU(T)                            \
  HANDLE_CASE(GPUDevice, DataTypeToEnum<T>::value, int32); \
  HANDLE_CASE(GPUDevice, DataTypeToEnum<T>::value, int64);

#ifdef TENSORFLOW_USE_SYCL
#define HANDLE_TYPE_NAME_SYCL(T)                            \
  HANDLE_CASE(SYCLDevice, DataTypeToEnum<T>::value, int32); \
  HANDLE_CASE(SYCLDevice, DataTypeToEnum<T>::value, int64);
#endif  // TENSORFLOW_USE_SYCL

TF_CALL_bool(HANDLE_TYPE_NAME_CPU);
TF_CALL_float(HANDLE_TYPE_NAME_CPU);
TF_CALL_bfloat16(HANDLE_TYPE_NAME_CPU);
TF_CALL_double(HANDLE_TYPE_NAME_CPU);
TF_CALL_uint8(HANDLE_TYPE_NAME_CPU);
TF_CALL_int32(HANDLE_TYPE_NAME_CPU);
TF_CALL_int16(HANDLE_TYPE_NAME_CPU);
TF_CALL_int64(HANDLE_TYPE_NAME_CPU);
TF_CALL_half(HANDLE_TYPE_NAME_CPU);
TF_CALL_complex64(HANDLE_TYPE_NAME_CPU);
TF_CALL_complex128(HANDLE_TYPE_NAME_CPU);
TF_CALL_string(HANDLE_TYPE_NAME_CPU);

#if GOOGLE_CUDA
TF_CALL_bool(HANDLE_TYPE_NAME_GPU);
TF_CALL_float(HANDLE_TYPE_NAME_GPU);
TF_CALL_double(HANDLE_TYPE_NAME_GPU);
TF_CALL_int16(HANDLE_TYPE_NAME_GPU);
TF_CALL_int32(HANDLE_TYPE_NAME_GPU);
TF_CALL_int64(HANDLE_TYPE_NAME_GPU);
TF_CALL_half(HANDLE_TYPE_NAME_GPU);
TF_CALL_complex64(HANDLE_TYPE_NAME_GPU);
TF_CALL_complex128(HANDLE_TYPE_NAME_GPU);
#endif  // GOOGLE_CUDA

#ifdef TENSORFLOW_USE_SYCL
TF_CALL_float(HANDLE_TYPE_NAME_SYCL);
TF_CALL_double(HANDLE_TYPE_NAME_SYCL);
TF_CALL_int16(HANDLE_TYPE_NAME_SYCL);
TF_CALL_int32(HANDLE_TYPE_NAME_SYCL);
TF_CALL_int64(HANDLE_TYPE_NAME_SYCL);
#endif  // TENSORFLOW_USE_SYCL

#undef HANDLE_TYPE_NAME_CPU
#undef HANDLE_TYPE_NAME_GPU
#ifdef TENSORFLOW_USE_SYCL
#undef HANDLE_TYPE_NAME_SYCL
#endif  // TENSORFLOW_USE_SYCL
#undef HANDLE_CASE
// --------------------------------------------------------------------------
template <typename Device, typename Tmultiples>
class TileGradientOp : public OpKernel {
 public:
  explicit TileGradientOp(OpKernelConstruction* context) : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
    const Tensor& input = context->input(0);
    const Tensor& multiples = context->input(1);
    OP_REQUIRES(
        context, IsLegacyVector(multiples.shape()),
        errors::InvalidArgument("Expected multiples to be 1-D, but got shape ",
                                multiples.shape().DebugString()));
    OP_REQUIRES(context, input.dims() == multiples.NumElements(),
                errors::InvalidArgument(
                    "Expected multiples argument to be a vector of length ",
                    input.dims(), " but got length ", multiples.dim_size(0)));

    const int input_dims = input.dims();

    // Eigen doesn't support scalars on the GPU, so handle 0-D specially.
    if (input_dims == 0) {
      context->set_output(0, input);
      return;
    }

    const gtl::ArraySlice<Tmultiples> multiples_array(
        multiples.flat<Tmultiples>().data(), input_dims);
    TensorShape output_shape;
    std::vector<Tmultiples> input_dim_size_vec;
    for (int i = 0; i < input_dims; ++i) {
      OP_REQUIRES(
          context, multiples_array[i] > 0,
          errors::InvalidArgument("Expected multiples[", i, "] > 0, but got ",
                                  multiples_array[i]));
      OP_REQUIRES(context, input.dim_size(i) % multiples_array[i] == 0,
                  errors::InvalidArgument("Expected input_dim[", i,
                                          "] to be divisible by multiples[", i,
                                          "], but ", input.dim_size(i), " % ",
                                          multiples_array[i], " != 0"));
      output_shape.AddDim(input.dim_size(i) / multiples_array[i]);
      input_dim_size_vec.push_back(input.dim_size(i));
    }
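    // For example, if the forward Tile produced a [4, 6] tensor from a [2, 3]
    // input with multiples [2, 2], the incoming gradient here has shape [4, 6]
    // and the gradient emitted by this op has shape [2, 3].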
    if (output_shape == input.shape()) {
      context->set_output(0, input);
      return;
    }
    Tensor* result = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &result));

#define HANDLE_DIM(DT, NDIM)                                           \
  if (context->input(0).dtype() == DT && input_dims == NDIM) {         \
    HandleCase<DT, NDIM>(context, input_dim_size_vec, multiples_array, \
                         result);                                      \
    return;                                                            \
  }

#define HANDLE_TYPE(T) \
  HANDLE_DIM(T, 1)     \
  HANDLE_DIM(T, 2)     \
  HANDLE_DIM(T, 3)     \
  HANDLE_DIM(T, 4)     \
  HANDLE_DIM(T, 5)     \
  HANDLE_DIM(T, 6)     \
  HANDLE_DIM(T, 7)

#define HANDLE_TYPE_NAME(T) HANDLE_TYPE(DataTypeToEnum<T>::value)

    TF_CALL_float(HANDLE_TYPE_NAME);
    TF_CALL_double(HANDLE_TYPE_NAME);
    TF_CALL_int32(HANDLE_TYPE_NAME);
    TF_CALL_int16(HANDLE_TYPE_NAME);
    TF_CALL_int64(HANDLE_TYPE_NAME);
    TF_CALL_half(HANDLE_TYPE_NAME);
    TF_CALL_bfloat16(HANDLE_TYPE_NAME);
    TF_CALL_complex64(HANDLE_TYPE_NAME);
    TF_CALL_complex128(HANDLE_TYPE_NAME);

#undef HANDLE_TYPE_NAME
#undef HANDLE_TYPE
#undef HANDLE_DIM

    OP_REQUIRES(context, false,
                errors::Unimplemented("TileGradientOp : The input data type or "
                                      "dimension is not supported, DataType : ",
                                      DataTypeString(context->input(0).dtype()),
                                      ", Dimension : ", input_dims));
  }

 private:
  template <DataType DT, int NDIM>
  void HandleCase(OpKernelContext* context,
                  const std::vector<Tmultiples>& input_dims,
                  const gtl::ArraySlice<Tmultiples>& multiples_array,
                  Tensor* result);

  template <DataType DT, int NDIM>
  void HandleCaseImpl(OpKernelContext* context,
                      const std::vector<Tmultiples>& input_dims,
                      const gtl::ArraySlice<Tmultiples>& multiples_array,
                      Tensor* result) {
    typedef typename EnumToDataType<DT>::Type T;

    bool reduction_only = true;
    std::vector<Tmultiples> reduction_dims;

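    // A dimension can be handled by a plain reduction when its original
    // (pre-tile) size was 1, i.e. multiples_array[i] == input_dims[i]. If any
    // dimension had original size > 1 and was tiled more than once, the
    // general slice-accumulation path below is required.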
    for (int i = 0; i < NDIM; ++i) {
      if (input_dims[i] > multiples_array[i] && multiples_array[i] > 1) {
        reduction_only = false;
        break;
      } else {
        if (multiples_array[i] == input_dims[i]) {
          reduction_dims.push_back(i);
        }
      }
    }

    if (reduction_only) {
#define HANDLE_DIM(D)                                            \
  if (reduction_dims.size() == (D)) {                            \
    HandleReduce<T, NDIM, (D)>(context, reduction_dims, result); \
    return;                                                      \
  }
      // NOTE(keveman): Handling the most common case here.
      // Adding more cases here would require more templating and code
      // explosion. For instance, HANDLE_DIM(2) wouldn't make sense for NDIM=1.
      HANDLE_DIM(1);

      // Fall through to the unoptimized version.
#undef HANDLE_DIM
    }

    Eigen::DSizes<Eigen::DenseIndex, NDIM> indices;
    Eigen::DSizes<Eigen::DenseIndex, NDIM> sizes;

    // Accumulate slices along the dimensions into the output. The number of
    // slices along dimension 'i' is simply the multiple along dimension 'i'
    // passed to the original Tile op.
    for (int i = 0; i < NDIM; ++i) {
      sizes[i] = input_dims[i] / multiples_array[i];
      indices[i] = 0;
    }

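    // For example, with input_dims = [4, 6] and multiples = [2, 2], each slice
    // has sizes = [2, 3] and the loop below visits the slice origins (0, 0),
    // (2, 0), (0, 3) and (2, 3), summing each slice into the output.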
    bool first = true;
    while (true) {
      functor::TileGrad<Device, T, NDIM>()(
          context->eigen_device<Device>(), result->tensor<T, NDIM>(),
          context->input(0).tensor<T, NDIM>(), indices, sizes, first);
      first = false;
      // Increment the begin indices.
      int i = 0;
      while (i < NDIM && indices[i] / sizes[i] == multiples_array[i] - 1) {
        indices[i] = 0;
        ++i;
      }
      // We are finished if we have iterated to the maximum along all
      // dimensions.
      if (i == NDIM) {
        break;
      }
      indices[i] += sizes[i];
    }
  }

  template <typename T, int NDIM, int REDUCENDIM>
  void HandleReduce(OpKernelContext* context,
                    const std::vector<Tmultiples>& reduce_dim_in,
                    Tensor* result) {
    static_assert(NDIM >= REDUCENDIM, "Too many reduced dimensions");
    Eigen::DSizes<Eigen::DenseIndex, REDUCENDIM> reduce_dim;
    Eigen::DSizes<Eigen::DenseIndex, NDIM> reshape_dim;

    for (int i = 0; i < REDUCENDIM; ++i) {
      reduce_dim[i] = reduce_dim_in[i];
    }

    for (int i = 0; i < NDIM; ++i) {
      reshape_dim[i] = result->dim_size(i);
    }

    functor::ReduceAndReshape<Device, T, NDIM, REDUCENDIM>()(
        context->eigen_device<Device>(), result->tensor<T, NDIM>(),
        context->input(0).tensor<T, NDIM>(), reduce_dim, reshape_dim);
  }

  TF_DISALLOW_COPY_AND_ASSIGN(TileGradientOp);
};

template <typename Device, typename Tmultiples>
template <DataType DT, int NDIM>
inline void TileGradientOp<Device, Tmultiples>::HandleCase(
    OpKernelContext* context, const std::vector<Tmultiples>& input_dims,
    const gtl::ArraySlice<Tmultiples>& multiples_array, Tensor* result) {
  LOG(FATAL) << "TileGradientOp: Invalid combination of Device, DT and NDIM: "
             << MakeTypeIndex<Device>().name() << ", " << DataTypeString(DT)
             << ", " << NDIM;
}

#define HANDLE_CASE(device, T, dtype, Tmultiples, ndim)                        \
  template <>                                                                  \
  template <>                                                                  \
  void TileGradientOp<device, Tmultiples>::HandleCase<dtype, ndim>(            \
      OpKernelContext * context, const std::vector<Tmultiples>& input_dims,    \
      const gtl::ArraySlice<Tmultiples>& multiples_array, Tensor* result) {    \
    HandleCaseImpl<dtype, ndim>(context, input_dims, multiples_array, result); \
  }

// 0-D handled specially above
#define HANDLE_CASE_DIM(device, T, dtype)  \
  HANDLE_CASE(device, T, dtype, int32, 1); \
  HANDLE_CASE(device, T, dtype, int32, 2); \
  HANDLE_CASE(device, T, dtype, int32, 3); \
  HANDLE_CASE(device, T, dtype, int32, 4); \
  HANDLE_CASE(device, T, dtype, int32, 5); \
  HANDLE_CASE(device, T, dtype, int32, 6); \
  HANDLE_CASE(device, T, dtype, int32, 7); \
  HANDLE_CASE(device, T, dtype, int64, 1); \
  HANDLE_CASE(device, T, dtype, int64, 2); \
  HANDLE_CASE(device, T, dtype, int64, 3); \
  HANDLE_CASE(device, T, dtype, int64, 4); \
  HANDLE_CASE(device, T, dtype, int64, 5); \
  HANDLE_CASE(device, T, dtype, int64, 6); \
  HANDLE_CASE(device, T, dtype, int64, 7);

#define HANDLE_TYPE_NAME_CPU(T) \
  HANDLE_CASE_DIM(CPUDevice, T, DataTypeToEnum<T>::value);

#define HANDLE_TYPE_NAME_GPU(T) \
  HANDLE_CASE_DIM(GPUDevice, T, DataTypeToEnum<T>::value);

TF_CALL_float(HANDLE_TYPE_NAME_CPU);
TF_CALL_double(HANDLE_TYPE_NAME_CPU);
TF_CALL_int16(HANDLE_TYPE_NAME_CPU);
TF_CALL_int32(HANDLE_TYPE_NAME_CPU);
TF_CALL_int64(HANDLE_TYPE_NAME_CPU);
TF_CALL_half(HANDLE_TYPE_NAME_CPU);
TF_CALL_complex64(HANDLE_TYPE_NAME_CPU);
TF_CALL_complex128(HANDLE_TYPE_NAME_CPU);

#if GOOGLE_CUDA
TF_CALL_float(HANDLE_TYPE_NAME_GPU);
TF_CALL_double(HANDLE_TYPE_NAME_GPU);
TF_CALL_int16(HANDLE_TYPE_NAME_GPU);
TF_CALL_int32(HANDLE_TYPE_NAME_GPU);
TF_CALL_int64(HANDLE_TYPE_NAME_GPU);
TF_CALL_half(HANDLE_TYPE_NAME_GPU);
TF_CALL_complex64(HANDLE_TYPE_NAME_GPU);
TF_CALL_complex128(HANDLE_TYPE_NAME_GPU);
#endif  // GOOGLE_CUDA

#if TENSORFLOW_USE_SYCL
#define HANDLE_TYPE_NAME_SYCL(T) \
  HANDLE_CASE_DIM(SYCLDevice, T, DataTypeToEnum<T>::value);

TF_CALL_float(HANDLE_TYPE_NAME_SYCL);
TF_CALL_double(HANDLE_TYPE_NAME_SYCL);
TF_CALL_int16(HANDLE_TYPE_NAME_SYCL);
TF_CALL_int32(HANDLE_TYPE_NAME_SYCL);
TF_CALL_int64(HANDLE_TYPE_NAME_SYCL);
#undef HANDLE_TYPE_NAME_SYCL
#endif  // TENSORFLOW_USE_SYCL

#undef HANDLE_TYPE_NAME_CPU
#undef HANDLE_TYPE_NAME_GPU
#undef HANDLE_CASE_DIM
#undef HANDLE_CASE

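// Kernel registrations. The "multiples" input is constrained to host memory
// on every device so its values can be read when computing the output shape.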
REGISTER_KERNEL_BUILDER(Name("Tile")
                            .Device(DEVICE_CPU)
                            .HostMemory("multiples")
                            .TypeConstraint<int32>("Tmultiples"),
                        TileOp<CPUDevice, int32>);
REGISTER_KERNEL_BUILDER(Name("Tile")
                            .Device(DEVICE_CPU)
                            .HostMemory("multiples")
                            .TypeConstraint<int64>("Tmultiples"),
                        TileOp<CPUDevice, int64>);
REGISTER_KERNEL_BUILDER(Name("TileGrad")
                            .Device(DEVICE_CPU)
                            .HostMemory("multiples")
                            .TypeConstraint<int32>("Tmultiples"),
                        TileGradientOp<CPUDevice, int32>);
REGISTER_KERNEL_BUILDER(Name("TileGrad")
                            .Device(DEVICE_CPU)
                            .HostMemory("multiples")
                            .TypeConstraint<int64>("Tmultiples"),
                        TileGradientOp<CPUDevice, int64>);

#if GOOGLE_CUDA
#define REGISTER_GPU_TILE(type)                                    \
  REGISTER_KERNEL_BUILDER(Name("Tile")                             \
                              .Device(DEVICE_GPU)                  \
                              .TypeConstraint<type>("T")           \
                              .TypeConstraint<int32>("Tmultiples") \
                              .HostMemory("multiples"),            \
                          TileOp<GPUDevice, int32>);               \
  REGISTER_KERNEL_BUILDER(Name("Tile")                             \
                              .Device(DEVICE_GPU)                  \
                              .TypeConstraint<type>("T")           \
                              .TypeConstraint<int64>("Tmultiples") \
                              .HostMemory("multiples"),            \
                          TileOp<GPUDevice, int64>);

#define REGISTER_GPU_TILE_GRAD(type)                               \
  REGISTER_KERNEL_BUILDER(Name("TileGrad")                         \
                              .Device(DEVICE_GPU)                  \
                              .TypeConstraint<type>("T")           \
                              .TypeConstraint<int32>("Tmultiples") \
                              .HostMemory("multiples"),            \
                          TileGradientOp<GPUDevice, int32>);       \
  REGISTER_KERNEL_BUILDER(Name("TileGrad")                         \
                              .Device(DEVICE_GPU)                  \
                              .TypeConstraint<type>("T")           \
                              .TypeConstraint<int64>("Tmultiples") \
                              .HostMemory("multiples"),            \
                          TileGradientOp<GPUDevice, int64>);

#define REGISTER_GPU(type) \
  REGISTER_GPU_TILE(type); \
  REGISTER_GPU_TILE_GRAD(type);

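// bool is registered for Tile only; TileGradientOp has no kernel for bool.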
TF_CALL_bool(REGISTER_GPU_TILE);
TF_CALL_float(REGISTER_GPU);
TF_CALL_double(REGISTER_GPU);
TF_CALL_half(REGISTER_GPU);
TF_CALL_int16(REGISTER_GPU);
TF_CALL_int32(REGISTER_GPU);
TF_CALL_int64(REGISTER_GPU);
TF_CALL_complex64(REGISTER_GPU);
TF_CALL_complex128(REGISTER_GPU);

#undef REGISTER_GPU_TILE
#undef REGISTER_GPU_TILE_GRAD
#undef REGISTER_GPU
#endif  // GOOGLE_CUDA

#ifdef TENSORFLOW_USE_SYCL
#define REGISTER_SYCL(type)                                        \
  REGISTER_KERNEL_BUILDER(Name("Tile")                             \
                              .Device(DEVICE_SYCL)                 \
                              .TypeConstraint<type>("T")           \
                              .TypeConstraint<int32>("Tmultiples") \
                              .HostMemory("multiples"),            \
                          TileOp<SYCLDevice, int32>);              \
  REGISTER_KERNEL_BUILDER(Name("Tile")                             \
                              .Device(DEVICE_SYCL)                 \
                              .TypeConstraint<type>("T")           \
                              .TypeConstraint<int64>("Tmultiples") \
                              .HostMemory("multiples"),            \
                          TileOp<SYCLDevice, int64>);              \
  REGISTER_KERNEL_BUILDER(Name("TileGrad")                         \
                              .Device(DEVICE_SYCL)                 \
                              .TypeConstraint<type>("T")           \
                              .TypeConstraint<int32>("Tmultiples") \
                              .HostMemory("multiples"),            \
                          TileGradientOp<SYCLDevice, int32>);      \
  REGISTER_KERNEL_BUILDER(Name("TileGrad")                         \
                              .Device(DEVICE_SYCL)                 \
                              .TypeConstraint<type>("T")           \
                              .TypeConstraint<int64>("Tmultiples") \
                              .HostMemory("multiples"),            \
                          TileGradientOp<SYCLDevice, int64>);

TF_CALL_float(REGISTER_SYCL);
TF_CALL_double(REGISTER_SYCL);

#undef REGISTER_SYCL
#endif  // TENSORFLOW_USE_SYCL

}  // namespace tensorflow