• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_CORE_KERNELS_TILE_FUNCTOR_GPU_H_
17 #define TENSORFLOW_CORE_KERNELS_TILE_FUNCTOR_GPU_H_
18 
19 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
20 
21 #define EIGEN_USE_GPU
22 
23 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
24 #include "tensorflow/core/framework/register_types.h"
25 #include "tensorflow/core/kernels/ops_util.h"
26 #include "tensorflow/core/kernels/tile_functor.h"
27 #include "tensorflow/core/util/gpu_kernel_helper.h"
28 
29 namespace tensorflow {
30 namespace internal {
31 
32 template <typename T>
TileKernel(int nthreads,const T * __restrict__ src,const int32 * __restrict__ buf,const int32 ndims,T * __restrict__ dst)33 __global__ void TileKernel(int nthreads, const T* __restrict__ src,
34                            const int32* __restrict__ buf, const int32 ndims,
35                            T* __restrict__ dst) {
36   const int32* in_strides = buf;
37   const int32* out_strides = buf + ndims;
38   const int32* in_dim_sizes = buf + ndims * 2;
39   GPU_1D_KERNEL_LOOP(o_idx, nthreads) {
40     int32 i_idx = 0;
41     int32 t = o_idx;
42     for (int i = 0; i < ndims; ++i) {
43       i_idx += t / out_strides[i] % in_dim_sizes[i] * in_strides[i];
44       t %= out_strides[i];
45     }
46     dst[o_idx] = ldg(src + i_idx);
47   }
48 }
49 
50 template <typename T>
TileSimple(const Eigen::GpuDevice & d,Tensor * out,const Tensor & in)51 void TileSimple(const Eigen::GpuDevice& d, Tensor* out, const Tensor& in) {
52   // Ensures we can use 32-bit index.
53   const int64 in_nelem = in.NumElements();
54   CHECK_LT(in_nelem, kint32max) << "Tensor too large to transpose on GPU";
55   const int64 out_nelem = out->NumElements();
56   CHECK_LT(out_nelem, kint32max) << "Tensor too large to transpose on GPU";
57   // Pack strides and input dimension sizes into one buffer.
58   const int32 ndims = in.dims();
59   gtl::InlinedVector<int32, 24> host_buf(ndims * 3);
60   gtl::InlinedVector<int32, 8> in_strides = ComputeStride<int32>(in.shape());
61   gtl::InlinedVector<int32, 8> out_strides = ComputeStride<int32>(out->shape());
62   for (int i = 0; i < ndims; ++i) {
63     host_buf[i] = in_strides[i];
64     host_buf[ndims + i] = out_strides[i];
65     host_buf[ndims * 2 + i] = in.dim_size(i);
66   }
67   // Copies the input strides, output strides and input dimension sizes to the
68   // device.
69   auto num_bytes = sizeof(int32) * host_buf.size();
70   auto dev_buf = d.allocate(num_bytes);
71   // NOTE: host_buf is not allocated by GpuHostAllocator, and
72   // therefore we are doing a sync copy effectively.
73   d.memcpyHostToDevice(dev_buf, host_buf.data(), num_bytes);
74   // Launch kernel to q[...] = p[...].
75   const T* p = in.flat<T>().data();
76   T* q = out->flat<T>().data();
77   GpuLaunchConfig cfg = GetGpuLaunchConfig(out_nelem, d);
78   TF_CHECK_OK(
79       GpuLaunchKernel(TileKernel<T>, cfg.block_count, cfg.thread_per_block, 0,
80                       d.stream(), cfg.virtual_thread_count, p,
81                       reinterpret_cast<const int32*>(dev_buf), ndims, q));
82   // Safe to deallocate immediately after the kernel launch.
83   d.deallocate(dev_buf);
84 }
85 
86 }  // end namespace internal
87 }  // namespace tensorflow
88 
89 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
90 
91 #endif  // TENSORFLOW_CORE_KERNELS_TILE_FUNCTOR_GPU_H_
92