/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#if GOOGLE_CUDA

#define EIGEN_USE_GPU

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/kernels/scatter_nd_op.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/cuda_kernel_helper.h"

namespace tensorflow {

typedef Eigen::GpuDevice GPUDevice;

namespace {

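// LeftUpdate<T, Op> applies a single element update `*out <Op>= val`; the
// specializations below supply the concrete semantics for each UpdateOp.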
template <typename T, scatter_nd_op::UpdateOp Op>
struct LeftUpdate {
  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void operator()(T* out, const T& val);
};

template <typename T>
struct LeftUpdate<T, scatter_nd_op::UpdateOp::ASSIGN> {
  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void operator()(T* out, const T& val) {
    *out = val;
  }
};

template <typename T>
struct LeftUpdate<T, scatter_nd_op::UpdateOp::ADD> {
  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void operator()(T* out, const T& val) {
    CudaAtomicAdd(out, val);
  }
};

template <typename T>
struct LeftUpdate<T, scatter_nd_op::UpdateOp::SUB> {
  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void operator()(T* out, const T& val) {
    CudaAtomicSub(out, val);
  }
};

// Specializations for std::complex, updating real and imaginary part
// individually. Even though this is not an atomic op anymore, it is safe
// because there is only one type of op per kernel.
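// std::complex<T> is layout-compatible with T[2] (real part first, imaginary
// part second), which is what makes the reinterpret_cast below well-defined.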
template <typename T>
struct LeftUpdate<std::complex<T>, scatter_nd_op::UpdateOp::ADD> {
  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void operator()(
      std::complex<T>* out, const std::complex<T>& val) {
    T* ptr = reinterpret_cast<T*>(out);
    CudaAtomicAdd(ptr, val.real());
    CudaAtomicAdd(ptr + 1, val.imag());
  }
};

template <typename T>
struct LeftUpdate<std::complex<T>, scatter_nd_op::UpdateOp::SUB> {
  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void operator()(
      std::complex<T>* out, const std::complex<T>& val) {
    LeftUpdate<std::complex<T>, scatter_nd_op::UpdateOp::ADD>()(out, -val);
  }
};

}  // namespace

template <typename T, typename Index, scatter_nd_op::UpdateOp op, int IXDIM>
__global__ void ScatterNdOpKernel(
    const Index* indices, const T* updates, T* out,
    const Eigen::array<Eigen::DenseIndex, IXDIM> output_shape_prefix,
    const Eigen::array<int64, IXDIM> batch_strides, const int64 num_indices,
    const Index slice_size) {
  auto update = LeftUpdate<T, op>();

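  // Each loop iteration handles one row of `indices`: rebuild the flat offset
  // of the target slice from its IXDIM-dimensional index, then apply `update`
  // element-wise across the slice. For example, with IXDIM == 2,
  // output_shape_prefix = {4, 5}, batch_strides = {5, 1} and slice_size == 3,
  // the index tuple (2, 3) addresses offset (2 * 5 + 3) * 3 == 39 in `out`.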
  CUDA_1D_KERNEL_LOOP(index, num_indices) {
    Index i = 0;
    bool out_of_bounds = false;
#pragma unroll
    for (int dim = 0; dim < IXDIM; ++dim) {
      int offset = (IXDIM * index + dim);
      const Index ix_d = internal::SubtleMustCopy(ldg(indices + offset));
      out_of_bounds |= !FastBoundsCheck(ix_d, output_shape_prefix[dim]);
      i += ix_d * batch_strides[dim] * slice_size;
    }
    if (!out_of_bounds) {
#pragma unroll
      for (int si = 0; si < slice_size; si++) {
        update(out + i + si, ldg(updates + (index * slice_size + si)));
      }
    }
  }
}

namespace functor {

// Functor used by ScatterNdOp to do the computations.
template <typename T, typename Index, scatter_nd_op::UpdateOp op, int IXDIM>
struct ScatterNdFunctor<GPUDevice, T, Index, op, IXDIM> {
  Index operator()(
      const GPUDevice& d, const Index slice_size,
      const Eigen::array<Eigen::DenseIndex, IXDIM> output_shape_prefix,
      typename TTypes<T, 2>::Tensor Tparams,
      typename TTypes<Index, 2>::ConstTensor Tindices,
      typename TTypes<T, 2>::ConstTensor Tupdates,
      typename TTypes<T, 2>::Tensor Toutput) {
    // TODO(ebrevdo): The performance of this for small indices (large
    // slices) is poor. Write a kernel whose splitting is
    // independent of the slice size. Same for CPU. See the
    // gather_nd kernel for an example.

    const Eigen::DenseIndex batch_size = Tindices.dimension(0);

    // Index batch_strides[IXDIM];
    Eigen::array<int64, IXDIM> batch_strides;
    for (int dim = IXDIM - 1; dim >= 0; --dim) {
      if (dim == IXDIM - 1) {
        batch_strides[dim] = 1;
      } else {
        batch_strides[dim] =
            batch_strides[dim + 1] * output_shape_prefix[dim + 1];
      }
    }
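    // For example, output_shape_prefix = {4, 5, 6} (IXDIM == 3) yields
    // batch_strides = {30, 6, 1}: a step of 1 along the last index dimension
    // advances the flat output offset by one slice.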

    CudaLaunchConfig config = GetCudaLaunchConfig(Toutput.size(), d);

    TF_CHECK_OK(CudaLaunchKernel(ScatterNdOpKernel<T, Index, op, IXDIM>,
                                 config.block_count, config.thread_per_block, 0,
                                 d.stream(), Tindices.data(), Tupdates.data(),
                                 Toutput.data(), output_shape_prefix,
                                 batch_strides, batch_size, slice_size));

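    // Out-of-bounds indices are silently skipped inside the kernel rather than
    // reported, so the GPU functor never returns a bad-index position; -1
    // means "no error detected" to the caller.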
    return -1;
  }
};

}  // namespace functor

#define DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, IXDIM) \
  template struct functor::ScatterNdFunctor<GPUDevice, T, Index, op, IXDIM>;

#define DECLARE_GPU_SPECS_INDEX_OP(T, Index, op)     \
  DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, 1); \
  DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, 2); \
  DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, 3); \
  DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, 4); \
  DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, 5); \
  DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, 6); \
  DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, 7);

#define DECLARE_GPU_SPECS_INDEX(T, Index)                                \
  DECLARE_GPU_SPECS_INDEX_OP(T, Index, scatter_nd_op::UpdateOp::ASSIGN); \
  DECLARE_GPU_SPECS_INDEX_OP(T, Index, scatter_nd_op::UpdateOp::ADD);    \
  DECLARE_GPU_SPECS_INDEX_OP(T, Index, scatter_nd_op::UpdateOp::SUB)

#define DECLARE_GPU_SPECS(T)         \
  DECLARE_GPU_SPECS_INDEX(T, int32); \
  DECLARE_GPU_SPECS_INDEX(T, int64)

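// Each DECLARE_GPU_SPECS(T) call below expands to explicit instantiations of
// ScatterNdFunctor for both index types (int32, int64), all three update ops
// (ASSIGN, ADD, SUB) and IXDIM 1 through 7: 42 instantiations per value type.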
TF_CALL_int32(DECLARE_GPU_SPECS);
TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS);
TF_CALL_complex64(DECLARE_GPU_SPECS);
TF_CALL_complex128(DECLARE_GPU_SPECS);

#undef DECLARE_GPU_SPECS
#undef DECLARE_GPU_SPECS_INDEX
#undef DECLARE_GPU_SPECS_INDEX_OP

}  // namespace tensorflow

#endif  // GOOGLE_CUDA