/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#define EIGEN_USE_THREADS

#include "tensorflow/core/kernels/sparse_fill_empty_rows_op.h"

#include <algorithm>
#include <numeric>
#include <unordered_map>
#include <utility>
#include <vector>

#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_util.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/lib/gtl/inlined_vector.h"
#include "tensorflow/core/util/sparse/sparse_tensor.h"

namespace tensorflow {

using CPUDevice = Eigen::ThreadPoolDevice;
using GPUDevice = Eigen::GpuDevice;

namespace functor {

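// CPU implementation of SparseFillEmptyRows. For every row of the dense shape
// that has no entries in the input SparseTensor, one entry with index
// [row, 0, ..., 0] and value `default_value` is inserted, and the output
// entries are grouped in row order. For example, with dense_shape = [4, 2],
// indices = [[0, 0], [2, 1]], values = [a, b], and default_value = d, the
// output indices are [[0, 0], [1, 0], [2, 1], [3, 0]], the output values are
// [a, d, b, d], empty_row_indicator is [false, true, false, true], and
// reverse_index_map is [0, 2].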
template <typename T, typename Tindex>
struct SparseFillEmptyRows<CPUDevice, T, Tindex> {
  Status operator()(OpKernelContext* context, const Tensor& default_value_t,
                    const Tensor& indices_t, const Tensor& values_t,
                    const Tensor& dense_shape_t,
                    typename AsyncOpKernel::DoneCallback done) {
    (void)done;  // Unused (only used in GPU implementation)
    const int kOutputIndicesOutput = 0;
    const int kOutputValuesOutput = 1;
    const int kEmptyRowIndicatorOutput = 2;
    const int kReverseIndexMapOutput = 3;

    const T& default_value = default_value_t.scalar<T>()();
    const auto indices = indices_t.matrix<Tindex>();
    const auto values = values_t.vec<T>();
    const auto dense_shape = dense_shape_t.vec<Tindex>();

    const Tindex N = indices_t.shape().dim_size(0);
    const Tindex dense_rows = dense_shape(0);

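    // The empty_row_indicator and reverse_index_map outputs are optional:
    // allocate them only if some consumer of this op actually uses them.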
    bool* empty_row_indicator = nullptr;
    if (context->output_required(kEmptyRowIndicatorOutput)) {
      Tensor* empty_row_indicator_t = nullptr;
      TF_RETURN_IF_ERROR(context->allocate_output(kEmptyRowIndicatorOutput,
                                                  TensorShape({dense_rows}),
                                                  &empty_row_indicator_t));
      empty_row_indicator = empty_row_indicator_t->vec<bool>().data();
    }
    Tindex* reverse_index_map = nullptr;
    if (context->output_required(kReverseIndexMapOutput)) {
      Tensor* reverse_index_map_t = nullptr;
      TF_RETURN_IF_ERROR(context->allocate_output(
          kReverseIndexMapOutput, TensorShape({N}), &reverse_index_map_t));
      reverse_index_map = reverse_index_map_t->vec<Tindex>().data();
    }

    int rank = indices_t.shape().dim_size(1);

    if (dense_rows == 0) {
      if (N != 0) {
        return errors::InvalidArgument(
            "Received SparseTensor with dense_shape[0] = 0 but "
            "indices.shape[0] = ",
            N);
      }
      Tensor* output_indices_t;
      TensorShape output_indices_shape({0, rank});
      TF_RETURN_IF_ERROR(context->allocate_output(
          kOutputIndicesOutput, output_indices_shape, &output_indices_t));
      Tensor* output_values_t;
      TF_RETURN_IF_ERROR(context->allocate_output(
          kOutputValuesOutput, TensorShape({0}), &output_values_t));

      // Exit early, nothing more to do.
      return Status::OK();
    }

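    // First pass: count the number of entries in each dense row (stored
    // temporarily in csr_offset), validate the row indices, and record
    // whether the input entries are already ordered by row.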
    bool rows_are_ordered = true;
    Tindex last_indices_row = 0;
    std::vector<Tindex> csr_offset(dense_rows, 0);
    for (int i = 0; i < N; ++i) {
      const Tindex row = indices(i, 0);
      if (row < 0 || row >= dense_rows) {
104 return errors::InvalidArgument("indices(", i, ", 0) is invalid: ", row,
105 " >= ", dense_rows);
      }
      ++csr_offset[row];
      rows_are_ordered = rows_are_ordered & (row >= last_indices_row);
      last_indices_row = row;
    }
    bool all_rows_full = true;
    for (int row = 0; row < dense_rows; ++row) {
      // csr_offset here describes the number of elements in this dense row
      bool row_empty = (csr_offset[row] == 0);
      if (empty_row_indicator) {
        empty_row_indicator[row] = row_empty;
      }
      all_rows_full = all_rows_full & !row_empty;
      // In filled version, each row has at least one element.
      csr_offset[row] = std::max(csr_offset[row], Tindex{1});
      // Update csr_offset to represent the number of elements up to and
      // including dense_row + 1:
      //  csr_offset(0) == #{elements of row 0}
      //  csr_offset(1) == #{elements of row 1} + #{elements of row 0}
      //  ..
      //  csr_offset(i) == starting index for elements in row i + 1.
      if (row > 0) {
        csr_offset[row] += csr_offset[row - 1];
      }
    }

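    // Fast path: if every row already has at least one entry and the input
    // entries are ordered by row, the input is already "filled"; forward the
    // input tensors to the outputs unchanged.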
    if (all_rows_full && rows_are_ordered) {
      context->set_output(kOutputIndicesOutput, indices_t);
      context->set_output(kOutputValuesOutput, values_t);
      if (reverse_index_map) {
        for (Tindex i = 0; i < N; ++i) {
          reverse_index_map[i] = i;
        }
      }
    } else {
      Tensor* output_indices_t;
      const Tindex N_full = csr_offset[dense_rows - 1];
      TensorShape output_indices_shape({N_full, rank});
      TF_RETURN_IF_ERROR(context->allocate_output(
          kOutputIndicesOutput, output_indices_shape, &output_indices_t));
      auto output_indices = output_indices_t->matrix<Tindex>();

      Tensor* output_values_t;
      TF_RETURN_IF_ERROR(context->allocate_output(
          kOutputValuesOutput, TensorShape({N_full}), &output_values_t));
      auto output_values = output_values_t->vec<T>();

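      // filled_count[row] counts how many input entries have already been
      // written to output row `row`; together with csr_offset it determines
      // each input entry's position in the output.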
      std::vector<Tindex> filled_count(dense_rows, 0);

      // Fill in values for rows that are not missing
      for (Tindex i = 0; i < N; ++i) {
        const Tindex row = indices(i, 0);
        Tindex& offset = filled_count[row];
        const Tindex output_i = ((row == 0) ? 0 : csr_offset[row - 1]) + offset;
        offset++;  // Increment the filled count for this row.
        std::copy_n(&indices(i, 0), rank, &output_indices(output_i, 0));
        output_values(output_i) = values(i);
        // We'll need this reverse index map to backprop correctly.
        if (reverse_index_map) {
          reverse_index_map[i] = output_i;
        }
      }

      // Fill in values for rows that are missing
      for (Tindex row = 0; row < dense_rows; ++row) {
        const Tindex row_count = filled_count[row];
        if (row_count == 0) {  // We haven't filled this row
          const Tindex starting_index = (row == 0) ? 0 : csr_offset[row - 1];
          // Set the row index of the fill-in entry; the remaining index
          // columns are zeroed out below.
          output_indices(starting_index, 0) = row;
          for (Tindex col = 1; col < rank; ++col) {
            output_indices(starting_index, col) = 0;
          }
          output_values(starting_index) = default_value;
        }
      }
    }

    return Status::OK();
  }
};

}  // namespace functor

namespace {

template <typename Device, typename T, typename Tindex>
void SparseFillEmptyRowsOpImpl(OpKernelContext* context,
                               AsyncOpKernel::DoneCallback done = nullptr) {
  // Note that setting this empty lambda as the default parameter value
  // directly can cause strange compiler/linker errors, so we do it like this
  // instead.
  if (!done) {
    done = [] {};
  }

  const int kIndicesInput = 0;
  const int kValuesInput = 1;
  const int kDenseShapeInput = 2;
  const int kDefaultValueInput = 3;

  const Tensor& indices_t = context->input(kIndicesInput);
  const Tensor& values_t = context->input(kValuesInput);
  const Tensor& dense_shape_t = context->input(kDenseShapeInput);
  const Tensor& default_value_t = context->input(kDefaultValueInput);

  OP_REQUIRES_ASYNC(
      context, TensorShapeUtils::IsVector(dense_shape_t.shape()),
      errors::InvalidArgument("dense_shape must be a vector, saw: ",
                              dense_shape_t.shape().DebugString()),
      done);
  OP_REQUIRES_ASYNC(context, TensorShapeUtils::IsMatrix(indices_t.shape()),
                    errors::InvalidArgument("indices must be a matrix, saw: ",
                                            indices_t.shape().DebugString()),
                    done);
  OP_REQUIRES_ASYNC(context, TensorShapeUtils::IsVector(values_t.shape()),
                    errors::InvalidArgument("values must be a vector, saw: ",
                                            values_t.shape().DebugString()),
                    done);
  OP_REQUIRES_ASYNC(
      context, TensorShapeUtils::IsScalar(default_value_t.shape()),
      errors::InvalidArgument("default_value must be a scalar, saw: ",
                              default_value_t.shape().DebugString()),
      done);
  // TODO(ebrevdo): Add shape checks between values and indices.
  // Also add a check that the dense rank is > 0.
  OP_REQUIRES_ASYNC(context, dense_shape_t.NumElements() != 0,
                    errors::InvalidArgument("Dense shape cannot be empty."),
                    done);

  using FunctorType = functor::SparseFillEmptyRows<Device, T, Tindex>;
  OP_REQUIRES_OK_ASYNC(context,
                       FunctorType()(context, default_value_t, indices_t,
                                     values_t, dense_shape_t, done),
                       done);
}

}  // namespace

template <typename Device, typename T, typename Tindex>
class SparseFillEmptyRowsOp : public OpKernel {
 public:
  explicit SparseFillEmptyRowsOp(OpKernelConstruction* context)
      : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
    SparseFillEmptyRowsOpImpl<Device, T, Tindex>(context);
  }
};

#define REGISTER_KERNELS(D, T, Tindex)                   \
  REGISTER_KERNEL_BUILDER(Name("SparseFillEmptyRows")    \
                              .Device(DEVICE_##D)        \
                              .HostMemory("dense_shape") \
                              .TypeConstraint<T>("T"),   \
                          SparseFillEmptyRowsOp<D##Device, T, Tindex>)

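// The SparseFillEmptyRows op uses int64 sparse indices, so kernels are only
// instantiated for Tindex = int64.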
#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T, int64)
TF_CALL_ALL_TYPES(REGISTER_CPU_KERNELS);
#undef REGISTER_CPU_KERNELS

#undef REGISTER_KERNELS

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

// The GPU implementation is async because it requires waiting for a
// host->device memcpy before the output is allocated (similar to
// SegmentSumGPUOp).
template <typename T, typename Tindex>
class SparseFillEmptyRowsGPUOp : public AsyncOpKernel {
 public:
  explicit SparseFillEmptyRowsGPUOp(OpKernelConstruction* context)
      : AsyncOpKernel(context) {}

  void ComputeAsync(OpKernelContext* context, DoneCallback done) override {
    SparseFillEmptyRowsOpImpl<GPUDevice, T, Tindex>(context, done);
  }
};

#define REGISTER_KERNELS(T, Tindex)                      \
  REGISTER_KERNEL_BUILDER(Name("SparseFillEmptyRows")    \
                              .Device(DEVICE_GPU)        \
                              .HostMemory("dense_shape") \
                              .TypeConstraint<T>("T"),   \
                          SparseFillEmptyRowsGPUOp<T, Tindex>)

// Forward declarations of the functor specializations for GPU.
namespace functor {
#define DECLARE_GPU_SPEC(T, Tindex)                                           \
  template <>                                                                 \
  Status SparseFillEmptyRows<GPUDevice, T, Tindex>::operator()(               \
      OpKernelContext* context, const Tensor& default_value_t,                \
      const Tensor& indices_t, const Tensor& values_t,                        \
      const Tensor& dense_shape_t, typename AsyncOpKernel::DoneCallback done); \
  extern template struct SparseFillEmptyRows<GPUDevice, T, Tindex>;
#define DECLARE_GPU_SPEC_INT64(T) DECLARE_GPU_SPEC(T, int64)
TF_CALL_POD_TYPES(DECLARE_GPU_SPEC_INT64)
#undef DECLARE_GPU_SPEC_INT64
#undef DECLARE_GPU_SPEC
}  // namespace functor

#define REGISTER_KERNELS_TINDEX(T) REGISTER_KERNELS(T, int64)
TF_CALL_POD_TYPES(REGISTER_KERNELS_TINDEX)
#undef REGISTER_KERNELS_TINDEX

#undef REGISTER_KERNELS

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

namespace functor {

template <typename T, typename Tindex>
struct SparseFillEmptyRowsGrad<CPUDevice, T, Tindex> {
  Status operator()(OpKernelContext* context,
                    typename TTypes<Tindex>::ConstVec reverse_index_map,
                    typename TTypes<T>::ConstVec grad_values,
                    typename TTypes<T>::Vec d_values,
                    typename TTypes<T>::Scalar d_default_value) {
    const CPUDevice& device = context->eigen_device<CPUDevice>();
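    // N is the number of values in the forward-pass input; N_full is the
    // number of values in the forward-pass output (N plus one default-filled
    // entry for each empty row).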
    const Tindex N = reverse_index_map.dimension(0);
    const Tindex N_full = grad_values.dimension(0);

    T& d_default_value_scalar = d_default_value();
    d_default_value_scalar = T();

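    // visited(j) records whether output slot j of the forward pass received
    // its gradient from some input element; unvisited slots correspond to
    // default-filled entries, whose gradients are accumulated into
    // d_default_value below.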
    Tensor visited_t;
    TF_RETURN_IF_ERROR(
        context->allocate_temp(DT_BOOL, TensorShape({N_full}), &visited_t));
    auto visited = visited_t.vec<bool>();
    visited.device(device) = visited.constant(false);

    for (int i = 0; i < N; ++i) {
      // Locate the index of the output of the forward prop associated
      // with this location in the input of the forward prop. Copy
      // the gradient into it. Mark it as visited.
      int64_t reverse_index = reverse_index_map(i);
      if (reverse_index < 0 || reverse_index >= N_full) {
        return errors::InvalidArgument(
            "Elements in reverse index must be in [0, ", N_full, ") but got ",
            reverse_index);
      }
      d_values(i) = grad_values(reverse_index);
      visited(reverse_index) = true;
    }
    for (int j = 0; j < N_full; ++j) {
      // The default value gradient gets the accumulated remainder of
      // the backprop values (since the default value was used to fill
      // in these slots in the forward calculation).
      if (!visited(j)) {
        d_default_value_scalar += grad_values(j);
      }
    }
    return Status::OK();
  }
};

}  // namespace functor

template <typename Device, typename T, typename Tindex>
class SparseFillEmptyRowsGradOp : public OpKernel {
 public:
  explicit SparseFillEmptyRowsGradOp(OpKernelConstruction* context)
      : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
    const Tensor* reverse_index_map_t;
    const Tensor* grad_values_t;
    OP_REQUIRES_OK(context,
                   context->input("reverse_index_map", &reverse_index_map_t));
    OP_REQUIRES_OK(context, context->input("grad_values", &grad_values_t));

    OP_REQUIRES(
        context, TensorShapeUtils::IsVector(reverse_index_map_t->shape()),
        errors::InvalidArgument("reverse_index_map must be a vector, saw: ",
                                reverse_index_map_t->shape().DebugString()));
    OP_REQUIRES(context, TensorShapeUtils::IsVector(grad_values_t->shape()),
                errors::InvalidArgument("grad_values must be a vector, saw: ",
                                        grad_values_t->shape().DebugString()));

    const auto reverse_index_map = reverse_index_map_t->vec<Tindex>();
    const auto grad_values = grad_values_t->vec<T>();

    const Tindex N = reverse_index_map_t->shape().dim_size(0);

    Tensor* d_values_t;
    OP_REQUIRES_OK(context, context->allocate_output(
                                "d_values", TensorShape({N}), &d_values_t));
    auto d_values = d_values_t->vec<T>();
    Tensor* d_default_value_t;
    OP_REQUIRES_OK(context,
                   context->allocate_output("d_default_value", TensorShape({}),
                                            &d_default_value_t));
    auto d_default_value = d_default_value_t->scalar<T>();

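    // The functor scatters grad_values back to d_values via reverse_index_map
    // and accumulates the gradient of the default-filled slots into
    // d_default_value.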
    OP_REQUIRES_OK(context,
                   functor::SparseFillEmptyRowsGrad<Device, T, Tindex>()(
                       context, reverse_index_map, grad_values, d_values,
                       d_default_value));
  }
};

#define REGISTER_KERNELS(D, T, Tindex)                      \
  REGISTER_KERNEL_BUILDER(Name("SparseFillEmptyRowsGrad")   \
                              .Device(DEVICE_##D)           \
                              .TypeConstraint<T>("T"),      \
                          SparseFillEmptyRowsGradOp<D##Device, T, Tindex>)

#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T, int64)
TF_CALL_NUMBER_TYPES(REGISTER_CPU_KERNELS);
#undef REGISTER_CPU_KERNELS

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

// Forward declarations of the functor specializations for GPU.
namespace functor {
#define DECLARE_GPU_SPEC(T, Tindex)                                  \
  template <>                                                        \
  Status SparseFillEmptyRowsGrad<GPUDevice, T, Tindex>::operator()(  \
      OpKernelContext* context,                                      \
      typename TTypes<Tindex>::ConstVec reverse_index_map,           \
      typename TTypes<T>::ConstVec grad_values,                      \
      typename TTypes<T>::Vec d_values,                              \
      typename TTypes<T>::Scalar d_default_value);                   \
  extern template struct SparseFillEmptyRowsGrad<GPUDevice, T, Tindex>;
#define DECLARE_GPU_SPEC_INT64(T) DECLARE_GPU_SPEC(T, int64)
TF_CALL_REAL_NUMBER_TYPES(DECLARE_GPU_SPEC_INT64);
#undef DECLARE_GPU_SPEC_INT64
#undef DECLARE_GPU_SPEC
}  // namespace functor

#define REGISTER_GPU_KERNELS(T) REGISTER_KERNELS(GPU, T, int64)
TF_CALL_REAL_NUMBER_TYPES(REGISTER_GPU_KERNELS);
#undef REGISTER_GPU_KERNELS

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#undef REGISTER_KERNELS
}  // namespace tensorflow