/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#define EIGEN_USE_GPU

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/kernels/transpose_functor.h"
#include "tensorflow/core/util/gpu_kernel_helper.h"

// TODO(yangzihao): Remove the dependency of conv_2d.h once we move all
// GPU util functions and transpose kernels into separate files.
#include "tensorflow/core/kernels/conv_2d.h"

typedef Eigen::GpuDevice GPUDevice;

namespace tensorflow {
namespace internal {

template <typename T, bool conjugate>
__global__ void TransposeKernel(int nthreads, const T* __restrict__ src,
                                const int32* __restrict__ buf,
                                const int32 ndims, T* __restrict__ dst) {
  const int32* in_strides = buf;
  const int32* out_strides = buf + ndims;
  const int32* perm = buf + ndims * 2;
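  // Each thread handles one or more output positions: decompose the output
  // linear index o_idx into per-dimension coordinates using the output
  // strides, then recombine those coordinates with the permuted input strides
  // to locate the corresponding input element. ldg() loads through the
  // read-only data cache where the platform supports it.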
  GPU_1D_KERNEL_LOOP(o_idx, nthreads) {
    int32 i_idx = 0;
    int32 t = o_idx;
    for (int32 i = 0; i < ndims; ++i) {
      const int32 ratio = t / out_strides[i];
      t -= ratio * out_strides[i];
      i_idx += ratio * in_strides[perm[i]];
    }
    if (conjugate) {
      dst[o_idx] = Eigen::numext::conj(ldg(src + i_idx));
    } else {
      dst[o_idx] = ldg(src + i_idx);
    }
  }
}

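// Generic fallback transpose: one output element per GPU thread, with the full
// index arithmetic done per element. Works for any rank and permutation, but
// its reads from the source tensor are generally strided, so the tiled kernels
// below are preferred whenever they apply.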
template <typename T, bool conjugate>
void TransposeSimple(const GPUDevice& d, const Tensor& in,
                     const gtl::ArraySlice<int32> perm, Tensor* out) {
  // Ensures we can use 32-bit index.
  const int64 nelem = in.NumElements();
  CHECK_LT(nelem, kint32max) << "Tensor too large to transpose on GPU";
  // Pack strides and permutation into one buffer.
  const int32 ndims = in.dims();
  gtl::InlinedVector<int32, 24> host_buf(ndims * 3);
  gtl::InlinedVector<int32, 8> in_strides = ComputeStride<int32>(in.shape());
  gtl::InlinedVector<int32, 8> out_strides = ComputeStride<int32>(out->shape());
  // Dimension permutation.
  for (int i = 0; i < ndims; ++i) {
    host_buf[i] = in_strides[i];
    host_buf[ndims + i] = out_strides[i];
    host_buf[ndims * 2 + i] = perm[i];
  }
  // Copies the input strides, output strides and permutation to the device.
  auto num_bytes = sizeof(int32) * host_buf.size();
  auto dev_buf = d.allocate(num_bytes);
  // NOTE: host_buf is not allocated by GpuHostAllocator, and
  // therefore we are doing a sync copy effectively.
  d.memcpyHostToDevice(dev_buf, host_buf.data(), num_bytes);
  // Launch kernel to q[...] = p[...].
  const T* p = reinterpret_cast<const T*>(in.tensor_data().data());
  T* q = reinterpret_cast<T*>(const_cast<char*>((out->tensor_data().data())));
  GpuLaunchConfig cfg = GetGpuLaunchConfig(nelem, d);
  TF_CHECK_OK(GpuLaunchKernel(
      TransposeKernel<T, conjugate>, cfg.block_count, cfg.thread_per_block, 0,
      d.stream(), cfg.virtual_thread_count, p,
      reinterpret_cast<const int32*>(dev_buf), ndims, q));
  // Safe to deallocate immediately after the kernel launch.
  d.deallocate(dev_buf);
}

// TransposeUsingTile tries to reduce the rank of the input tensor to 2 or 3
// and then calls specialized kernels to swap either dimensions 1 and 2 or
// dimensions 0 and 2. It returns true if the operation succeeded, false
// otherwise.
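//
// For example, a transpose of shape [2, 3, 4, 5] with perm {0, 1, 3, 2} is
// equivalent to a 3D transpose of shape [6, 4, 5] with perm {0, 2, 1}, because
// dimensions 0 and 1 stay adjacent and in order; ReduceTransposeDimensions
// performs this kind of coalescing before dispatch.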
template <typename T, bool conjugate = false>
struct TransposeUsingTile {
  static bool run(const Eigen::GpuDevice& d, const Tensor& in,
                  const gtl::ArraySlice<int32> perm, Tensor* out) {
    // First try to reduce the dimensions of the input tensor.
    TransposePermsVec new_perm;
    TransposeDimsVec new_dims;
    ReduceTransposeDimensions(in.shape(), perm, &new_perm, &new_dims);

    // Only use special GPU kernel when dimension is 2 or 3.
    int dims = new_dims.size();
    if (dims < 2 || dims > 3) return false;
    auto in_data = reinterpret_cast<const T*>(in.tensor_data().data());
    auto out_data =
        reinterpret_cast<T*>(const_cast<char*>(out->tensor_data().data()));
    switch (dims) {
      case 2:
        if (new_perm[0] == 1 && new_perm[1] == 0) {
          // Add the first dimension size as 1.
          new_dims.insert(new_dims.begin(), 1);
          tensorflow::functor::SwapDimension1And2InTensor3<GPUDevice, T,
                                                           conjugate>()(
              d, in_data, new_dims, out_data);
          return true;
        }
        break;
      case 3:
        if (new_perm == TransposePermsVec({0, 2, 1})) {
          tensorflow::functor::SwapDimension1And2InTensor3<GPUDevice, T,
                                                           conjugate>()(
              d, in_data, new_dims, out_data);
          return true;
        } else if (new_perm == TransposePermsVec({2, 1, 0})) {
          tensorflow::functor::SwapDimension0And2InTensor3<GPUDevice, T,
                                                           conjugate>()(
              d, in_data, new_dims, out_data);
          return true;
        } else {
          // do not handle other 3D permutations
          return false;
        }
        break;
      default:
        return false;
    }
    return false;
  }
};

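// For complex types, the non-conjugating transpose is pure data movement, so
// it is dispatched on a plain element type of the same size (uint64 for
// complex64, float4 for complex128). The conjugating variants keep a
// two-component element type (float2 / double2) so the kernels can apply the
// conjugation element-wise.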
template <bool conjugate>
struct TransposeUsingTile<complex64, conjugate> {
  static bool run(const Eigen::GpuDevice& d, const Tensor& in,
                  const gtl::ArraySlice<int32> perm, Tensor* out) {
    if (!conjugate) {
      return TransposeUsingTile<uint64>::run(d, in, perm, out);
    } else {
      return TransposeUsingTile<float2, true>::run(d, in, perm, out);
    }
  }
};

template <bool conjugate>
struct TransposeUsingTile<complex128, conjugate> {
  static bool run(const Eigen::GpuDevice& d, const Tensor& in,
                  const gtl::ArraySlice<int32> perm, Tensor* out) {
    if (!conjugate) {
      return TransposeUsingTile<float4>::run(d, in, perm, out);
    } else {
      return TransposeUsingTile<double2, true>::run(d, in, perm, out);
    }
  }
};

}  // namespace internal

// Transpose kernel specialized for GPU Device.
#define HANDLE_DIM(DIM)                                                       \
  case DIM:                                                                   \
    internal::TransposeUsingEigen<GPUDevice, T, DIM>(d, in, perm, conjugate,  \
                                                     out);                    \
    break

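// Dispatch order: try the specialized 2D/3D swap kernels first
// (TransposeUsingTile); if they do not apply, use an Eigen shuffle for ranks 2
// through 8, and fall back to the generic TransposeSimple kernel otherwise.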
template <typename T, bool conjugate>
struct Transpose<GPUDevice, T, conjugate> {
  static void run(const GPUDevice& d, const Tensor& in,
                  const gtl::ArraySlice<int32> perm, Tensor* out) {
    if (in.dims() < 2) return;
    if (internal::TransposeUsingTile<T, conjugate>::run(d, in, perm, out)) {
      return;
    }

    switch (in.dims()) {
      HANDLE_DIM(2);
      HANDLE_DIM(3);
      HANDLE_DIM(4);
      HANDLE_DIM(5);
      HANDLE_DIM(6);
      HANDLE_DIM(7);
      HANDLE_DIM(8);
      default:
        internal::TransposeSimple<T, conjugate>(d, in, perm, out);
        break;
    }
  }
};

#undef HANDLE_DIM

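// tstring elements are variable-length, host-side objects, so there is no GPU
// transpose kernel for them; DT_STRING tensors must be transposed on the CPU.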
template <bool conjugate>
struct Transpose<GPUDevice, tstring, conjugate> {
  static void run(const GPUDevice& d, const Tensor& in,
                  const gtl::ArraySlice<int32> perm, Tensor* out) {
    LOG(FATAL) << "Transpose of DT_STRING tensor not supported on GPU.";
  }
};

// Explicit instantiation.
template struct Transpose<GPUDevice, tstring, false>;

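// GPU specializations of the transpose entry points declared in
// transpose_functor.h. Each forwards to the shared implementation with the
// appropriate conjugate flag.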
template <>
Status DoTranspose(const GPUDevice& device, const Tensor& in,
                   const gtl::ArraySlice<int32> perm, Tensor* out) {
  return internal::DoTransposeImpl(device, in, perm, /*conjugate=*/false, out);
}
template <>
Status DoConjugateTranspose(const GPUDevice& device, const Tensor& in,
                            const gtl::ArraySlice<int32> perm, Tensor* out) {
  return internal::DoTransposeImpl(device, in, perm, /*conjugate=*/true, out);
}
template <>
Status DoMatrixTranspose(const GPUDevice& device, const Tensor& in,
                         Tensor* out) {
  return internal::DoMatrixTransposeImpl(device, in, /*conjugate=*/false, out);
}
template <>
Status DoConjugateMatrixTranspose(const GPUDevice& device, const Tensor& in,
                                  Tensor* out) {
  return internal::DoMatrixTransposeImpl(device, in, /*conjugate=*/true, out);
}

}  // namespace tensorflow
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM