/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#if GOOGLE_CUDA

#define EIGEN_USE_GPU

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/kernels/transpose_functor.h"
#include "tensorflow/core/util/cuda_kernel_helper.h"

// TODO(yangzihao): Remove the dependency of conv_2d.h once we move all
// GPU util functions and transpose kernels into separate files.
#include "tensorflow/core/kernels/conv_2d.h"

typedef Eigen::GpuDevice GPUDevice;

namespace tensorflow {
namespace internal {

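// TransposeKernel: for each flat output index, recover the per-dimension
// coordinates by dividing by the output strides, then recombine those
// coordinates with the permuted input strides to locate the source element.
// 'buf' packs [in_strides, out_strides, perm], each of length 'ndims'.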
template <typename T, bool conjugate>
__global__ void TransposeKernel(int nthreads, const T* src, const int32* buf,
                                const int32 ndims, T* dst) {
  const int32* in_strides = buf;
  const int32* out_strides = buf + ndims;
  const int32* perm = buf + ndims * 2;
  CUDA_1D_KERNEL_LOOP(o_idx, nthreads) {
    int32 i_idx = 0;
    int32 t = o_idx;
    for (int32 i = 0; i < ndims; ++i) {
      const int32 ratio = t / out_strides[i];
      t -= ratio * out_strides[i];
      i_idx += ratio * in_strides[perm[i]];
    }
    if (conjugate) {
      dst[o_idx] = Eigen::numext::conj(ldg(src + i_idx));
    } else {
      dst[o_idx] = ldg(src + i_idx);
    }
  }
}

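// TransposeSimple: generic fallback. Packs the input strides, output strides,
// and permutation into a single device buffer and launches TransposeKernel
// with one virtual thread per output element.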
template <typename T, bool conjugate>
void TransposeSimple(const GPUDevice& d, const Tensor& in,
                     const gtl::ArraySlice<int32> perm, Tensor* out) {
  // Ensures we can use 32-bit index.
  const int64 nelem = in.NumElements();
  CHECK_LT(nelem, kint32max) << "Tensor too large to transpose on GPU";
  // Pack strides and permutation into one buffer.
  const int32 ndims = in.dims();
  gtl::InlinedVector<int32, 24> host_buf(ndims * 3);
  gtl::InlinedVector<int32, 8> in_strides = ComputeStride<int32>(in.shape());
  gtl::InlinedVector<int32, 8> out_strides = ComputeStride<int32>(out->shape());
  // Dimension permutation.
  for (int i = 0; i < ndims; ++i) {
    host_buf[i] = in_strides[i];
    host_buf[ndims + i] = out_strides[i];
    host_buf[ndims * 2 + i] = perm[i];
  }
  // Copies the input strides, output strides and permutation to the device.
  // host_buf holds 3 * ndims int32 values; copy exactly that many bytes.
  auto num_bytes = sizeof(int32) * host_buf.size();
  auto dev_buf = d.allocate(num_bytes);
  // NOTE: host_buf is not allocated by CudaHostAllocator, and
  // therefore we are doing a sync copy effectively.
  d.memcpyHostToDevice(dev_buf, host_buf.data(), num_bytes);
  // Launch kernel to q[...] = p[...].
  const T* p = reinterpret_cast<const T*>(in.tensor_data().data());
  T* q = reinterpret_cast<T*>(const_cast<char*>((out->tensor_data().data())));
  CudaLaunchConfig cfg = GetCudaLaunchConfig(nelem, d);
  TF_CHECK_OK(CudaLaunchKernel(
      TransposeKernel<T, conjugate>, cfg.block_count, cfg.thread_per_block, 0,
      d.stream(), cfg.virtual_thread_count, p,
      reinterpret_cast<const int32*>(dev_buf), ndims, q));
  // Safe to deallocate immediately after the kernel launch.
  d.deallocate(dev_buf);
}

// TransposeUsingTile tries to reduce the dimensionality of the input tensor to
// 3 and then calls special kernels to swap either dimensions 1 and 2 or
// dimensions 0 and 2. It returns true if the operation succeeded, false
// otherwise.
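// For example, transposing a [2, 3, 4, 5] tensor with perm {0, 1, 3, 2}
// reduces to a [6, 4, 5] tensor with perm {0, 2, 1}, since dimensions that
// stay adjacent under the permutation are coalesced.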
template <typename T, bool conjugate = false>
struct TransposeUsingTile {
  static bool run(const Eigen::GpuDevice& d, const Tensor& in,
                  const gtl::ArraySlice<int32> perm, Tensor* out) {
    // First try to reduce the dimensions of the input tensor.
    TransposePermsVec new_perm;
    TransposeDimsVec new_dims;
    ReduceTransposeDimensions(in.shape(), perm, &new_perm, &new_dims);

    // Only use special GPU kernel when dimension is 2 or 3.
    int dims = new_dims.size();
    if (dims < 2 || dims > 3) return false;
    auto in_data = reinterpret_cast<const T*>(in.tensor_data().data());
    auto out_data =
        reinterpret_cast<T*>(const_cast<char*>(out->tensor_data().data()));
    switch (dims) {
      case 2:
        if (new_perm[0] == 1 && new_perm[1] == 0) {
          // Add the first dimension size as 1.
          new_dims.insert(new_dims.begin(), 1);
          tensorflow::functor::SwapDimension1And2InTensor3<GPUDevice, T,
                                                           conjugate>()(
              d, in_data, new_dims, out_data);
          return true;
        }
        break;
      case 3:
        if (new_perm == TransposePermsVec({0, 2, 1})) {
          tensorflow::functor::SwapDimension1And2InTensor3<GPUDevice, T,
                                                           conjugate>()(
              d, in_data, new_dims, out_data);
          return true;
        } else if (new_perm == TransposePermsVec({2, 1, 0})) {
          tensorflow::functor::SwapDimension0And2InTensor3<GPUDevice, T,
                                                           conjugate>()(
              d, in_data, new_dims, out_data);
          return true;
        } else {
          // Do not handle other 3D permutations.
          return false;
        }
        break;
      default:
        return false;
    }
    return false;
  }
};

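// For complex types, a transpose without conjugation is pure data movement, so
// the tiled kernels can run on a bit-equivalent plain type (uint64 for
// complex64, float4 for complex128). With conjugation, the two-component
// float2/double2 representation is used so that negating the imaginary part
// can still be applied element-wise.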
template <bool conjugate>
struct TransposeUsingTile<complex64, conjugate> {
  static bool run(const Eigen::GpuDevice& d, const Tensor& in,
                  const gtl::ArraySlice<int32> perm, Tensor* out) {
    if (!conjugate) {
      return TransposeUsingTile<uint64>::run(d, in, perm, out);
    } else {
      return TransposeUsingTile<float2, true>::run(d, in, perm, out);
    }
  }
};

template <bool conjugate>
struct TransposeUsingTile<complex128, conjugate> {
  static bool run(const Eigen::GpuDevice& d, const Tensor& in,
                  const gtl::ArraySlice<int32> perm, Tensor* out) {
    if (!conjugate) {
      return TransposeUsingTile<float4>::run(d, in, perm, out);
    } else {
      return TransposeUsingTile<double2, true>::run(d, in, perm, out);
    }
  }
};

}  // namespace internal

// Transpose kernel specialized for GPU Device.
#define HANDLE_DIM(DIM)                                                       \
  case DIM:                                                                   \
    internal::TransposeUsingEigen<GPUDevice, T, DIM>(d, in, perm, conjugate,  \
                                                     out);                    \
    break

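// Transpose<GPUDevice, T>: first try the tiled swap kernels; if they do not
// apply, fall back to Eigen's shuffle for tensors of rank 2 through 8, and
// finally to the generic TransposeSimple kernel.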
template <typename T, bool conjugate>
struct Transpose<GPUDevice, T, conjugate> {
  static void run(const GPUDevice& d, const Tensor& in,
                  const gtl::ArraySlice<int32> perm, Tensor* out) {
    if (in.dims() < 2) return;
    if (internal::TransposeUsingTile<T, conjugate>::run(d, in, perm, out)) {
      return;
    }

    switch (in.dims()) {
      HANDLE_DIM(2);
      HANDLE_DIM(3);
      HANDLE_DIM(4);
      HANDLE_DIM(5);
      HANDLE_DIM(6);
      HANDLE_DIM(7);
      HANDLE_DIM(8);
      default:
        internal::TransposeSimple<T, conjugate>(d, in, perm, out);
        break;
    }
  }
};

#undef HANDLE_DIM

template <bool conjugate>
struct Transpose<GPUDevice, string, conjugate> {
  static void run(const GPUDevice& d, const Tensor& in,
                  const gtl::ArraySlice<int32> perm, Tensor* out) {
    LOG(FATAL) << "Transpose of DT_STRING tensor not supported on GPU.";
  }
};

// Explicit instantiation.
template struct Transpose<GPUDevice, string, false>;

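// GPU specializations of the transpose entry points declared in
// transpose_functor.h.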
template <>
Status DoTranspose(const GPUDevice& device, const Tensor& in,
                   const gtl::ArraySlice<int32> perm, Tensor* out) {
  return internal::DoTransposeImpl(device, in, perm, /*conjugate=*/false, out);
}
template <>
Status DoConjugateTranspose(const GPUDevice& device, const Tensor& in,
                            const gtl::ArraySlice<int32> perm, Tensor* out) {
  return internal::DoTransposeImpl(device, in, perm, /*conjugate=*/true, out);
}
template <>
Status DoMatrixTranspose(const GPUDevice& device, const Tensor& in,
                         Tensor* out) {
  return internal::DoMatrixTransposeImpl(device, in, /*conjugate=*/false, out);
}
template <>
Status DoConjugateMatrixTranspose(const GPUDevice& device, const Tensor& in,
                                  Tensor* out) {
  return internal::DoMatrixTransposeImpl(device, in, /*conjugate=*/true, out);
}

}  // namespace tensorflow
#endif  // GOOGLE_CUDA