/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#if GOOGLE_CUDA

#define EIGEN_USE_GPU

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/kernels/scatter_nd_op.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/cuda_kernel_helper.h"

namespace tensorflow {

typedef Eigen::GpuDevice GPUDevice;

namespace {

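// LeftUpdate applies a single scatter update to one output element; it is
// specialized below for each supported UpdateOp (ASSIGN, ADD, SUB).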
template <typename T, scatter_nd_op::UpdateOp Op>
struct LeftUpdate {
  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void operator()(T* out, const T& val);
};

template <typename T>
struct LeftUpdate<T, scatter_nd_op::UpdateOp::ASSIGN> {
  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void operator()(T* out, const T& val) {
    *out = val;
  }
};

template <typename T>
struct LeftUpdate<T, scatter_nd_op::UpdateOp::ADD> {
  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void operator()(T* out, const T& val) {
    CudaAtomicAdd(out, val);
  }
};

template <typename T>
struct LeftUpdate<T, scatter_nd_op::UpdateOp::SUB> {
  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void operator()(T* out, const T& val) {
    CudaAtomicSub(out, val);
  }
};

// Specializations for std::complex, updating real and imaginary part
// individually. Even though this is not an atomic op anymore, it is safe
// because there is only one type of op per kernel.
template <typename T>
struct LeftUpdate<std::complex<T>, scatter_nd_op::UpdateOp::ADD> {
  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void operator()(
      std::complex<T>* out, const std::complex<T>& val) {
    // std::complex<T> is layout-compatible with T[2]: the real part lives at
    // ptr[0] and the imaginary part at ptr[1].
    T* ptr = reinterpret_cast<T*>(out);
    CudaAtomicAdd(ptr, val.real());
    CudaAtomicAdd(ptr + 1, val.imag());
  }
};

template <typename T>
struct LeftUpdate<std::complex<T>, scatter_nd_op::UpdateOp::SUB> {
  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void operator()(
      std::complex<T>* out, const std::complex<T>& val) {
    LeftUpdate<std::complex<T>, scatter_nd_op::UpdateOp::ADD>()(out, -val);
  }
};

}  // namespace

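// Scatters (or accumulates) `num_indices` slices of length `slice_size` from
// `updates` into `out`. Each row of `indices` holds IXDIM coordinates into
// the leading dimensions of the output; rows with any out-of-bounds
// coordinate are skipped.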
template <typename T, typename Index, scatter_nd_op::UpdateOp op, int IXDIM>
__global__ void ScatterNdOpKernel(
    const Index* indices, const T* updates, T* out,
    const Eigen::array<Eigen::DenseIndex, IXDIM> output_shape_prefix,
    const Eigen::array<int64, IXDIM> batch_strides, const int64 num_indices,
    const Index slice_size) {
  auto update = LeftUpdate<T, op>();

  CUDA_1D_KERNEL_LOOP(index, num_indices) {
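    // Decode the IXDIM coordinates of index row `index` into a flat element
    // offset `i` into the output, bounds-checking each coordinate.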
    Index i = 0;
    bool out_of_bounds = false;
#pragma unroll
    for (int dim = 0; dim < IXDIM; ++dim) {
      int offset = (IXDIM * index + dim);
      const Index ix_d = internal::SubtleMustCopy(ldg(indices + offset));
      out_of_bounds |= !FastBoundsCheck(ix_d, output_shape_prefix[dim]);
      i += ix_d * batch_strides[dim] * slice_size;
    }
    if (!out_of_bounds) {
#pragma unroll
      for (int si = 0; si < slice_size; si++) {
        update(out + i + si, ldg(updates + (index * slice_size + si)));
      }
    }
  }
}

namespace functor {

// Functor used by the ScatterNd ops to do the computations.
template <typename T, typename Index, scatter_nd_op::UpdateOp op, int IXDIM>
struct ScatterNdFunctor<GPUDevice, T, Index, op, IXDIM> {
  Index operator()(
      const GPUDevice& d, const Index slice_size,
      const Eigen::array<Eigen::DenseIndex, IXDIM> output_shape_prefix,
      typename TTypes<T, 2>::Tensor Tparams,
      typename TTypes<Index, 2>::ConstTensor Tindices,
      typename TTypes<T, 2>::ConstTensor Tupdates,
      typename TTypes<T, 2>::Tensor Toutput) {
    // TODO(ebrevdo): The performance of this for small indices (large
    // slices) is poor.  Write a kernel whose splitting is
    // independent of the slice size.  Same for CPU.  See the
    // gather_nd kernel for an example.

    const Eigen::DenseIndex batch_size = Tindices.dimension(0);

    // Compute the row-major strides of the IXDIM leading output dimensions.
    Eigen::array<int64, IXDIM> batch_strides;
    for (int dim = IXDIM - 1; dim >= 0; --dim) {
      if (dim == IXDIM - 1) {
        batch_strides[dim] = 1;
      } else {
        batch_strides[dim] =
            batch_strides[dim + 1] * output_shape_prefix[dim + 1];
      }
    }

    CudaLaunchConfig config = GetCudaLaunchConfig(Toutput.size(), d);

    TF_CHECK_OK(CudaLaunchKernel(ScatterNdOpKernel<T, Index, op, IXDIM>,
                                 config.block_count, config.thread_per_block, 0,
                                 d.stream(), Tindices.data(), Tupdates.data(),
                                 Toutput.data(), output_shape_prefix,
                                 batch_strides, batch_size, slice_size));

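    // The kernel silently drops out-of-bounds indices, so the GPU path does
    // not report the position of a bad index; -1 indicates no error here.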
    return -1;
  }
};

}  // namespace functor

#define DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, IXDIM) \
  template struct functor::ScatterNdFunctor<GPUDevice, T, Index, op, IXDIM>;

#define DECLARE_GPU_SPECS_INDEX_OP(T, Index, op)     \
  DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, 1); \
  DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, 2); \
  DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, 3); \
  DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, 4); \
  DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, 5); \
  DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, 6); \
  DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, 7);

#define DECLARE_GPU_SPECS_INDEX(T, Index)                                \
  DECLARE_GPU_SPECS_INDEX_OP(T, Index, scatter_nd_op::UpdateOp::ASSIGN); \
  DECLARE_GPU_SPECS_INDEX_OP(T, Index, scatter_nd_op::UpdateOp::ADD);    \
  DECLARE_GPU_SPECS_INDEX_OP(T, Index, scatter_nd_op::UpdateOp::SUB)

#define DECLARE_GPU_SPECS(T)         \
  DECLARE_GPU_SPECS_INDEX(T, int32); \
  DECLARE_GPU_SPECS_INDEX(T, int64)

TF_CALL_int32(DECLARE_GPU_SPECS);
TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS);
TF_CALL_complex64(DECLARE_GPU_SPECS);
TF_CALL_complex128(DECLARE_GPU_SPECS);
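
// For illustration: each TF_CALL_*(DECLARE_GPU_SPECS) invocation above expands
// into explicit instantiations of ScatterNdFunctor for every UpdateOp and
// every IXDIM in 1..7, e.g. for float keyed by int32 indices:
//   template struct functor::ScatterNdFunctor<
//       GPUDevice, float, int32, scatter_nd_op::UpdateOp::ADD, 1>;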

#undef DECLARE_GPU_SPECS
#undef DECLARE_GPU_SPECS_INDEX
#undef DECLARE_GPU_SPECS_INDEX_OP

}  // namespace tensorflow

#endif  // GOOGLE_CUDA