/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// See docs in ../ops/array_ops.cc.

#define EIGEN_USE_THREADS

#ifdef GOOGLE_CUDA
#define EIGEN_USE_GPU
#endif  // GOOGLE_CUDA

#include <vector>

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"

#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/framework/type_index.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/types.h"

namespace tensorflow {

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
#ifdef TENSORFLOW_USE_SYCL
typedef Eigen::SyclDevice SYCLDevice;
#endif  // TENSORFLOW_USE_SYCL

// Forward declarations of functors that will be defined in tile_ops_impl.h
namespace functor {
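// Tile replicates `in` along each dimension i of the output
// broadcast_array[i] times and writes the result to `out`.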
template <typename Device, typename T, typename Tmultiple>
struct Tile {
  void operator()(const Device& d, Tensor* out, const Tensor& in,
                  const gtl::ArraySlice<Tmultiple> broadcast_array) const;
};

template <typename Device, typename T, int NDIM>
struct TileGrad {
  void operator()(const Device& d, typename TTypes<T, NDIM>::Tensor out,
                  typename TTypes<T, NDIM>::ConstTensor in,
                  const Eigen::DSizes<Eigen::DenseIndex, NDIM>& indices,
                  const Eigen::DSizes<Eigen::DenseIndex, NDIM>& sizes,
                  bool first) const;
};

template <typename Device, typename T>
struct TileGrad<Device, T, 0> {
  void operator()(const Device& d, typename TTypes<T, 0>::Tensor out,
                  typename TTypes<T, 0>::ConstTensor in,
                  const Eigen::DSizes<Eigen::DenseIndex, 0>&,
                  const Eigen::DSizes<Eigen::DenseIndex, 0>&, bool first) const;
};

template <typename Device, typename T, int NDIM, int REDUCEDNDIM>
struct ReduceAndReshape {
  void operator()(
      const Device& d, typename TTypes<T, NDIM>::Tensor out,
      typename TTypes<T, NDIM>::ConstTensor in,
      const Eigen::DSizes<Eigen::DenseIndex, REDUCEDNDIM>& reduce_dim,
      const Eigen::DSizes<Eigen::DenseIndex, NDIM>& reshape_dim) const;
};
}  // namespace functor

// --------------------------------------------------------------------------
template <typename Device, typename Tmultiples>
class TileOp : public OpKernel {
 public:
  explicit TileOp(OpKernelConstruction* context) : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
    const Tensor& input = context->input(0);
    const Tensor& multiples = context->input(1);

    OP_REQUIRES(
        context, IsLegacyVector(multiples.shape()),
        errors::InvalidArgument("Expected multiples to be 1-D, but got shape ",
                                multiples.shape().DebugString()));
    OP_REQUIRES(context, input.dims() == multiples.NumElements(),
                errors::InvalidArgument(
                    "Expected multiples argument to be a vector of length ",
                    input.dims(), " but got length ", multiples.dim_size(0)));
    const int input_dims = input.dims();

    // Eigen doesn't support scalars on the GPU, so handle 0-D specially.
    if (input_dims == 0) {
      context->set_output(0, input);
      return;
    }

    const gtl::ArraySlice<Tmultiples> multiples_array(
        multiples.flat<Tmultiples>().data(), input_dims);
    TensorShape output_shape;
    for (int i = 0; i < input_dims; ++i) {
      OP_REQUIRES(
          context, multiples_array[i] >= 0,
          errors::InvalidArgument("Expected multiples[", i, "] >= 0, but got ",
                                  multiples_array[i]));
      output_shape.AddDim(input.dim_size(i) * multiples_array[i]);
    }
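    // For example, an input of shape [2, 3] tiled with multiples [2, 1]
    // produces an output of shape [4, 3].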
    if (output_shape == input.shape()) {
      context->set_output(0, input);
      return;
    }
    Tensor* result = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &result));

    // If there's no output, there's nothing to do.
    if (output_shape.num_elements() == 0) return;

#define HANDLE_TYPE(DT)                               \
  if (context->input(0).dtype() == DT) {              \
    HandleCase<DT>(context, multiples_array, result); \
    return;                                           \
  }

#define HANDLE_TYPE_NAME(T) HANDLE_TYPE(DataTypeToEnum<T>::value)

    // Invoke the HANDLE_TYPE_NAME macro via TF_CALL_* so that per-platform
    // type filtering applies.
    TF_CALL_bool(HANDLE_TYPE_NAME);
    TF_CALL_bfloat16(HANDLE_TYPE_NAME);
    TF_CALL_float(HANDLE_TYPE_NAME);
    TF_CALL_double(HANDLE_TYPE_NAME);
    TF_CALL_uint8(HANDLE_TYPE_NAME);
    TF_CALL_int32(HANDLE_TYPE_NAME);
    TF_CALL_int16(HANDLE_TYPE_NAME);
    TF_CALL_int64(HANDLE_TYPE_NAME);
    TF_CALL_half(HANDLE_TYPE_NAME);
    TF_CALL_string(HANDLE_TYPE_NAME);  // when DEVICE=CPUDevice.
    TF_CALL_complex64(HANDLE_TYPE_NAME);
    TF_CALL_complex128(HANDLE_TYPE_NAME);

#undef HANDLE_TYPE_NAME
#undef HANDLE_TYPE

    OP_REQUIRES(
        context, false,
        errors::Unimplemented(
            "TileOp : The input data type is not supported, DataType : ",
            DataTypeString(context->input(0).dtype()),
            ", Dimension : ", input_dims));
  }

 private:
  template <DataType DT>
  void HandleCaseImpl(OpKernelContext* context,
                      const gtl::ArraySlice<Tmultiples>& multiples_array,
                      Tensor* result) {
    typedef typename EnumToDataType<DT>::Type T;
    functor::Tile<Device, T, Tmultiples>()(context->eigen_device<Device>(),
                                           result, context->input(0),
                                           multiples_array);
  }

  template <DataType DT>
  void HandleCase(OpKernelContext* context,
                  const gtl::ArraySlice<Tmultiples>& multiples_array,
                  Tensor* result);

  TF_DISALLOW_COPY_AND_ASSIGN(TileOp);
};

template <typename Device, typename Tmultiples>
template <DataType DT>
inline void TileOp<Device, Tmultiples>::HandleCase(
    OpKernelContext* context,
    const gtl::ArraySlice<Tmultiples>& multiples_array, Tensor* result) {
  // TODO(vrv): print out the device name if useful. Currently disabled to
  // avoid having to use RTTI.
  LOG(FATAL) << "TileOp: Invalid combination of Device, DT: "
             // << typeid(Device).name() << ", "
             << DataTypeString(DT);
}

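// HANDLE_CASE explicitly specializes HandleCase to forward to HandleCaseImpl
// for each supported (device, dtype, Tmultiples) combination; combinations
// without a specialization fall through to the LOG(FATAL) definition above.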
#define HANDLE_CASE(device, dtype, Tmultiples)                              \
  template <>                                                               \
  template <>                                                               \
  void TileOp<device, Tmultiples>::HandleCase<dtype>(                       \
      OpKernelContext * context,                                            \
      const gtl::ArraySlice<Tmultiples>& multiples_array, Tensor* result) { \
    HandleCaseImpl<dtype>(context, multiples_array, result);                \
  }

#define HANDLE_TYPE_NAME_CPU(T)                            \
  HANDLE_CASE(CPUDevice, DataTypeToEnum<T>::value, int32); \
  HANDLE_CASE(CPUDevice, DataTypeToEnum<T>::value, int64);

#define HANDLE_TYPE_NAME_GPU(T)                            \
  HANDLE_CASE(GPUDevice, DataTypeToEnum<T>::value, int32); \
  HANDLE_CASE(GPUDevice, DataTypeToEnum<T>::value, int64);

#ifdef TENSORFLOW_USE_SYCL
#define HANDLE_TYPE_NAME_SYCL(T)                            \
  HANDLE_CASE(SYCLDevice, DataTypeToEnum<T>::value, int32); \
  HANDLE_CASE(SYCLDevice, DataTypeToEnum<T>::value, int64);
#endif  // TENSORFLOW_USE_SYCL

TF_CALL_bool(HANDLE_TYPE_NAME_CPU);
TF_CALL_float(HANDLE_TYPE_NAME_CPU);
TF_CALL_bfloat16(HANDLE_TYPE_NAME_CPU);
TF_CALL_double(HANDLE_TYPE_NAME_CPU);
TF_CALL_uint8(HANDLE_TYPE_NAME_CPU);
TF_CALL_int32(HANDLE_TYPE_NAME_CPU);
TF_CALL_int16(HANDLE_TYPE_NAME_CPU);
TF_CALL_int64(HANDLE_TYPE_NAME_CPU);
TF_CALL_half(HANDLE_TYPE_NAME_CPU);
TF_CALL_complex64(HANDLE_TYPE_NAME_CPU);
TF_CALL_complex128(HANDLE_TYPE_NAME_CPU);
TF_CALL_string(HANDLE_TYPE_NAME_CPU);

#if GOOGLE_CUDA
TF_CALL_bool(HANDLE_TYPE_NAME_GPU);
TF_CALL_float(HANDLE_TYPE_NAME_GPU);
TF_CALL_double(HANDLE_TYPE_NAME_GPU);
TF_CALL_int16(HANDLE_TYPE_NAME_GPU);
TF_CALL_int32(HANDLE_TYPE_NAME_GPU);
TF_CALL_int64(HANDLE_TYPE_NAME_GPU);
TF_CALL_half(HANDLE_TYPE_NAME_GPU);
TF_CALL_complex64(HANDLE_TYPE_NAME_GPU);
TF_CALL_complex128(HANDLE_TYPE_NAME_GPU);
#endif  // GOOGLE_CUDA

#ifdef TENSORFLOW_USE_SYCL
TF_CALL_float(HANDLE_TYPE_NAME_SYCL);
TF_CALL_double(HANDLE_TYPE_NAME_SYCL);
TF_CALL_int16(HANDLE_TYPE_NAME_SYCL);
TF_CALL_int32(HANDLE_TYPE_NAME_SYCL);
TF_CALL_int64(HANDLE_TYPE_NAME_SYCL);
#endif  // TENSORFLOW_USE_SYCL

#undef HANDLE_TYPE_NAME_CPU
#undef HANDLE_TYPE_NAME_GPU
#ifdef TENSORFLOW_USE_SYCL
#undef HANDLE_TYPE_NAME_SYCL
#endif  // TENSORFLOW_USE_SYCL
#undef HANDLE_CASE

// --------------------------------------------------------------------------
template <typename Device, typename Tmultiples>
class TileGradientOp : public OpKernel {
 public:
  explicit TileGradientOp(OpKernelConstruction* context) : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
    const Tensor& input = context->input(0);
    const Tensor& multiples = context->input(1);
    OP_REQUIRES(
        context, IsLegacyVector(multiples.shape()),
        errors::InvalidArgument("Expected multiples to be 1-D, but got shape ",
                                multiples.shape().DebugString()));
    OP_REQUIRES(context, input.dims() == multiples.NumElements(),
                errors::InvalidArgument(
                    "Expected multiples argument to be a vector of length ",
                    input.dims(), " but got length ", multiples.dim_size(0)));

    const int input_dims = input.dims();

    // Eigen doesn't support scalars on the GPU, so handle 0-D specially.
    if (input_dims == 0) {
      context->set_output(0, input);
      return;
    }

    const gtl::ArraySlice<Tmultiples> multiples_array(
        multiples.flat<Tmultiples>().data(), input_dims);
    TensorShape output_shape;
    std::vector<Tmultiples> input_dim_size_vec;
    for (int i = 0; i < input_dims; ++i) {
      OP_REQUIRES(
          context, multiples_array[i] > 0,
          errors::InvalidArgument("Expected multiples[", i, "] > 0, but got ",
                                  multiples_array[i]));
      OP_REQUIRES(context, input.dim_size(i) % multiples_array[i] == 0,
                  errors::InvalidArgument("Expected input_dim[", i,
                                          "] to be divisible by multiples[", i,
                                          "], but ", input.dim_size(i), " % ",
                                          multiples_array[i], " != 0"));
      output_shape.AddDim(input.dim_size(i) / multiples_array[i]);
      input_dim_size_vec.push_back(input.dim_size(i));
    }
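    // The gradient input here has the tiled shape, so dividing by the
    // multiples recovers the original input shape: e.g. a gradient of shape
    // [4, 3] with multiples [2, 1] yields an output of shape [2, 3].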
    if (output_shape == input.shape()) {
      context->set_output(0, input);
      return;
    }
    Tensor* result = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &result));

#define HANDLE_DIM(DT, NDIM)                                           \
  if (context->input(0).dtype() == DT && input_dims == NDIM) {         \
    HandleCase<DT, NDIM>(context, input_dim_size_vec, multiples_array, \
                         result);                                      \
    return;                                                            \
  }

#define HANDLE_TYPE(T) \
  HANDLE_DIM(T, 1)     \
  HANDLE_DIM(T, 2)     \
  HANDLE_DIM(T, 3)     \
  HANDLE_DIM(T, 4)     \
  HANDLE_DIM(T, 5)     \
  HANDLE_DIM(T, 6)     \
  HANDLE_DIM(T, 7)

#define HANDLE_TYPE_NAME(T) HANDLE_TYPE(DataTypeToEnum<T>::value)

    TF_CALL_float(HANDLE_TYPE_NAME);
    TF_CALL_double(HANDLE_TYPE_NAME);
    TF_CALL_int32(HANDLE_TYPE_NAME);
    TF_CALL_int16(HANDLE_TYPE_NAME);
    TF_CALL_int64(HANDLE_TYPE_NAME);
    TF_CALL_half(HANDLE_TYPE_NAME);
    TF_CALL_bfloat16(HANDLE_TYPE_NAME);
    TF_CALL_complex64(HANDLE_TYPE_NAME);
    TF_CALL_complex128(HANDLE_TYPE_NAME);

#undef HANDLE_TYPE_NAME
#undef HANDLE_TYPE
#undef HANDLE_DIM

    OP_REQUIRES(context, false,
                errors::Unimplemented("TileGradientOp : The input data type or "
                                      "dimension is not supported, DataType : ",
                                      DataTypeString(context->input(0).dtype()),
                                      ", Dimension : ", input_dims));
  }

 private:
  template <DataType DT, int NDIM>
  void HandleCase(OpKernelContext* context,
                  const std::vector<Tmultiples>& input_dims,
                  const gtl::ArraySlice<Tmultiples>& multiples_array,
                  Tensor* result);

  template <DataType DT, int NDIM>
  void HandleCaseImpl(OpKernelContext* context,
                      const std::vector<Tmultiples>& input_dims,
                      const gtl::ArraySlice<Tmultiples>& multiples_array,
                      Tensor* result) {
    typedef typename EnumToDataType<DT>::Type T;

    bool reduction_only = true;
    std::vector<Tmultiples> reduction_dims;

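    // A dimension can be handled by a pure reduction when either
    // multiples[i] == 1 (the dimension was not tiled) or
    // multiples[i] == input_dims[i] (the original dimension was 1, so the
    // tile was a pure broadcast). Otherwise, fall back to slice accumulation.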
    for (int i = 0; i < NDIM; ++i) {
      if (input_dims[i] > multiples_array[i] && multiples_array[i] > 1) {
        reduction_only = false;
        break;
      } else {
        if (multiples_array[i] == input_dims[i]) {
          reduction_dims.push_back(i);
        }
      }
    }

    if (reduction_only) {
#define HANDLE_DIM(D)                                            \
  if (reduction_dims.size() == (D)) {                            \
    HandleReduce<T, NDIM, (D)>(context, reduction_dims, result); \
    return;                                                      \
  }
      // NOTE(keveman): Handling the most common case here.
      // Adding more cases here would require more templating and code
      // explosion. For instance, HANDLE_DIM(2) wouldn't make sense for NDIM=1.
      HANDLE_DIM(1);

// Fall through to the unoptimized version.
#undef HANDLE_DIM
    }

    Eigen::DSizes<Eigen::DenseIndex, NDIM> indices;
    Eigen::DSizes<Eigen::DenseIndex, NDIM> sizes;

    // Accumulate slices along the dimensions into the output. The number of
    // slices along dimension 'i' is simply the multiple along dimension 'i'
    // passed to the original Tile op.
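    // For example, for a gradient of shape [4, 3] produced with multiples
    // [2, 1], `sizes` becomes {2, 3} and the loop below visits the slices at
    // offsets {0, 0} and {2, 0}, accumulating each into the output.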
    for (int i = 0; i < NDIM; ++i) {
      sizes[i] = input_dims[i] / multiples_array[i];
      indices[i] = 0;
    }

    bool first = true;
    while (true) {
      functor::TileGrad<Device, T, NDIM>()(
          context->eigen_device<Device>(), result->tensor<T, NDIM>(),
          context->input(0).tensor<T, NDIM>(), indices, sizes, first);
      first = false;
      // Increment the begin indices.
      int i = 0;
      while (i < NDIM && indices[i] / sizes[i] == multiples_array[i] - 1) {
        indices[i] = 0;
        ++i;
      }
      // We are finished if we have iterated to the maximum along all
      // dimensions.
      if (i == NDIM) {
        break;
      }
      indices[i] += sizes[i];
    }
  }

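  // HandleReduce sums the gradient over the dimensions that were pure
  // broadcasts and reshapes the result back to the original input shape.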
  template <typename T, int NDIM, int REDUCENDIM>
  void HandleReduce(OpKernelContext* context,
                    const std::vector<Tmultiples>& reduce_dim_in,
                    Tensor* result) {
    static_assert(NDIM >= REDUCENDIM, "Too many reduced dimensions");
    Eigen::DSizes<Eigen::DenseIndex, REDUCENDIM> reduce_dim;
    Eigen::DSizes<Eigen::DenseIndex, NDIM> reshape_dim;

    for (int i = 0; i < REDUCENDIM; ++i) {
      reduce_dim[i] = reduce_dim_in[i];
    }

    for (int i = 0; i < NDIM; ++i) {
      reshape_dim[i] = result->dim_size(i);
    }

    functor::ReduceAndReshape<Device, T, NDIM, REDUCENDIM>()(
        context->eigen_device<Device>(), result->tensor<T, NDIM>(),
        context->input(0).tensor<T, NDIM>(), reduce_dim, reshape_dim);
  }

  TF_DISALLOW_COPY_AND_ASSIGN(TileGradientOp);
};

template <typename Device, typename Tmultiples>
template <DataType DT, int NDIM>
inline void TileGradientOp<Device, Tmultiples>::HandleCase(
    OpKernelContext* context, const std::vector<Tmultiples>& input_dims,
    const gtl::ArraySlice<Tmultiples>& multiples_array, Tensor* result) {
  LOG(FATAL) << "TileGradientOp: Invalid combination of Device, DT and NDIM: "
             << MakeTypeIndex<Device>().name() << ", " << DataTypeString(DT)
             << ", " << NDIM;
}

#define HANDLE_CASE(device, T, dtype, Tmultiples, ndim)                        \
  template <>                                                                  \
  template <>                                                                  \
  void TileGradientOp<device, Tmultiples>::HandleCase<dtype, ndim>(            \
      OpKernelContext * context, const std::vector<Tmultiples>& input_dims,    \
      const gtl::ArraySlice<Tmultiples>& multiples_array, Tensor* result) {    \
    HandleCaseImpl<dtype, ndim>(context, input_dims, multiples_array, result); \
  }

// 0-D handled specially above
#define HANDLE_CASE_DIM(device, T, dtype)  \
  HANDLE_CASE(device, T, dtype, int32, 1); \
  HANDLE_CASE(device, T, dtype, int32, 2); \
  HANDLE_CASE(device, T, dtype, int32, 3); \
  HANDLE_CASE(device, T, dtype, int32, 4); \
  HANDLE_CASE(device, T, dtype, int32, 5); \
  HANDLE_CASE(device, T, dtype, int32, 6); \
  HANDLE_CASE(device, T, dtype, int32, 7); \
  HANDLE_CASE(device, T, dtype, int64, 1); \
  HANDLE_CASE(device, T, dtype, int64, 2); \
  HANDLE_CASE(device, T, dtype, int64, 3); \
  HANDLE_CASE(device, T, dtype, int64, 4); \
  HANDLE_CASE(device, T, dtype, int64, 5); \
  HANDLE_CASE(device, T, dtype, int64, 6); \
  HANDLE_CASE(device, T, dtype, int64, 7);

#define HANDLE_TYPE_NAME_CPU(T) \
  HANDLE_CASE_DIM(CPUDevice, T, DataTypeToEnum<T>::value);

#define HANDLE_TYPE_NAME_GPU(T) \
  HANDLE_CASE_DIM(GPUDevice, T, DataTypeToEnum<T>::value);

TF_CALL_float(HANDLE_TYPE_NAME_CPU);
TF_CALL_double(HANDLE_TYPE_NAME_CPU);
TF_CALL_int16(HANDLE_TYPE_NAME_CPU);
TF_CALL_int32(HANDLE_TYPE_NAME_CPU);
TF_CALL_int64(HANDLE_TYPE_NAME_CPU);
TF_CALL_half(HANDLE_TYPE_NAME_CPU);
TF_CALL_complex64(HANDLE_TYPE_NAME_CPU);
TF_CALL_complex128(HANDLE_TYPE_NAME_CPU);

#if GOOGLE_CUDA
TF_CALL_float(HANDLE_TYPE_NAME_GPU);
TF_CALL_double(HANDLE_TYPE_NAME_GPU);
TF_CALL_int16(HANDLE_TYPE_NAME_GPU);
TF_CALL_int32(HANDLE_TYPE_NAME_GPU);
TF_CALL_int64(HANDLE_TYPE_NAME_GPU);
TF_CALL_half(HANDLE_TYPE_NAME_GPU);
TF_CALL_complex64(HANDLE_TYPE_NAME_GPU);
TF_CALL_complex128(HANDLE_TYPE_NAME_GPU);
#endif  // GOOGLE_CUDA

#if TENSORFLOW_USE_SYCL
#define HANDLE_TYPE_NAME_SYCL(T) \
  HANDLE_CASE_DIM(SYCLDevice, T, DataTypeToEnum<T>::value);

TF_CALL_float(HANDLE_TYPE_NAME_SYCL);
TF_CALL_double(HANDLE_TYPE_NAME_SYCL);
TF_CALL_int16(HANDLE_TYPE_NAME_SYCL);
TF_CALL_int32(HANDLE_TYPE_NAME_SYCL);
TF_CALL_int64(HANDLE_TYPE_NAME_SYCL);
#undef HANDLE_TYPE_NAME_SYCL
#endif  // TENSORFLOW_USE_SYCL

#undef HANDLE_TYPE_NAME_CPU
#undef HANDLE_TYPE_NAME_GPU
#undef HANDLE_CASE_DIM
#undef HANDLE_CASE

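// `multiples` is pinned to host memory on every device below because the
// kernel must read its values on the CPU to compute the output shape.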
REGISTER_KERNEL_BUILDER(Name("Tile")
                            .Device(DEVICE_CPU)
                            .HostMemory("multiples")
                            .TypeConstraint<int32>("Tmultiples"),
                        TileOp<CPUDevice, int32>);
REGISTER_KERNEL_BUILDER(Name("Tile")
                            .Device(DEVICE_CPU)
                            .HostMemory("multiples")
                            .TypeConstraint<int64>("Tmultiples"),
                        TileOp<CPUDevice, int64>);
REGISTER_KERNEL_BUILDER(Name("TileGrad")
                            .Device(DEVICE_CPU)
                            .HostMemory("multiples")
                            .TypeConstraint<int32>("Tmultiples"),
                        TileGradientOp<CPUDevice, int32>);
REGISTER_KERNEL_BUILDER(Name("TileGrad")
                            .Device(DEVICE_CPU)
                            .HostMemory("multiples")
                            .TypeConstraint<int64>("Tmultiples"),
                        TileGradientOp<CPUDevice, int64>);

#if GOOGLE_CUDA
#define REGISTER_GPU_TILE(type)                                    \
  REGISTER_KERNEL_BUILDER(Name("Tile")                             \
                              .Device(DEVICE_GPU)                  \
                              .TypeConstraint<type>("T")           \
                              .TypeConstraint<int32>("Tmultiples") \
                              .HostMemory("multiples"),            \
                          TileOp<GPUDevice, int32>);               \
  REGISTER_KERNEL_BUILDER(Name("Tile")                             \
                              .Device(DEVICE_GPU)                  \
                              .TypeConstraint<type>("T")           \
                              .TypeConstraint<int64>("Tmultiples") \
                              .HostMemory("multiples"),            \
                          TileOp<GPUDevice, int64>);

#define REGISTER_GPU_TILE_GRAD(type)                               \
  REGISTER_KERNEL_BUILDER(Name("TileGrad")                         \
                              .Device(DEVICE_GPU)                  \
                              .TypeConstraint<type>("T")           \
                              .TypeConstraint<int32>("Tmultiples") \
                              .HostMemory("multiples"),            \
                          TileGradientOp<GPUDevice, int32>);       \
  REGISTER_KERNEL_BUILDER(Name("TileGrad")                         \
                              .Device(DEVICE_GPU)                  \
                              .TypeConstraint<type>("T")           \
                              .TypeConstraint<int64>("Tmultiples") \
                              .HostMemory("multiples"),            \
                          TileGradientOp<GPUDevice, int64>);

#define REGISTER_GPU(type) \
  REGISTER_GPU_TILE(type); \
  REGISTER_GPU_TILE_GRAD(type);

TF_CALL_bool(REGISTER_GPU_TILE);
TF_CALL_float(REGISTER_GPU);
TF_CALL_double(REGISTER_GPU);
TF_CALL_half(REGISTER_GPU);
TF_CALL_int16(REGISTER_GPU);
TF_CALL_int32(REGISTER_GPU);
TF_CALL_int64(REGISTER_GPU);
TF_CALL_complex64(REGISTER_GPU);
TF_CALL_complex128(REGISTER_GPU);

#undef REGISTER_GPU_TILE
#undef REGISTER_GPU_TILE_GRAD
#undef REGISTER_GPU
#endif  // GOOGLE_CUDA

#ifdef TENSORFLOW_USE_SYCL
#define REGISTER_SYCL(type)                                        \
  REGISTER_KERNEL_BUILDER(Name("Tile")                             \
                              .Device(DEVICE_SYCL)                 \
                              .TypeConstraint<type>("T")           \
                              .TypeConstraint<int32>("Tmultiples") \
                              .HostMemory("multiples"),            \
                          TileOp<SYCLDevice, int32>);              \
  REGISTER_KERNEL_BUILDER(Name("Tile")                             \
                              .Device(DEVICE_SYCL)                 \
                              .TypeConstraint<type>("T")           \
                              .TypeConstraint<int64>("Tmultiples") \
                              .HostMemory("multiples"),            \
                          TileOp<SYCLDevice, int64>);              \
  REGISTER_KERNEL_BUILDER(Name("TileGrad")                         \
                              .Device(DEVICE_SYCL)                 \
                              .TypeConstraint<type>("T")           \
                              .TypeConstraint<int32>("Tmultiples") \
                              .HostMemory("multiples"),            \
                          TileGradientOp<SYCLDevice, int32>);      \
  REGISTER_KERNEL_BUILDER(Name("TileGrad")                         \
                              .Device(DEVICE_SYCL)                 \
                              .TypeConstraint<type>("T")           \
                              .TypeConstraint<int64>("Tmultiples") \
                              .HostMemory("multiples"),            \
                          TileGradientOp<SYCLDevice, int64>);

TF_CALL_float(REGISTER_SYCL);
TF_CALL_double(REGISTER_SYCL);

#undef REGISTER_SYCL
#endif  // TENSORFLOW_USE_SYCL

}  // namespace tensorflow