1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 // See docs in ../ops/data_flow_ops.cc.
17
18 #define EIGEN_USE_THREADS
19
20 #include <limits>
21 #include <vector>
22 // TODO(b/31496047): Fix non-standard include order.
23 #include <numeric> // clang-format off
24
25 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
26 #include "tensorflow/core/framework/bounds_check.h"
27 #include "tensorflow/core/framework/op_kernel.h"
28 #include "tensorflow/core/framework/register_types.h"
29 #include "tensorflow/core/framework/resource_mgr.h"
30 #include "tensorflow/core/framework/tensor.h"
31 #include "tensorflow/core/framework/tensor_shape.h"
32 #include "tensorflow/core/framework/types.h"
33 #include "tensorflow/core/kernels/concat_lib.h"
34 #include "tensorflow/core/kernels/split_lib.h"
35 #include "tensorflow/core/kernels/tensor_array.h"
36 #include "tensorflow/core/lib/core/errors.h"
37 #include "tensorflow/core/lib/core/refcount.h"
38 #include "tensorflow/core/lib/strings/strcat.h"
39 #include "tensorflow/core/platform/dynamic_annotations.h"
40 #include "tensorflow/core/platform/logging.h"
41 #include "tensorflow/core/platform/thread_annotations.h"
42 #include "tensorflow/core/platform/types.h"
43 #include "tensorflow/core/util/ptr_util.h"
44
45 typedef Eigen::ThreadPoolDevice CPUDevice;
46 #if GOOGLE_CUDA
47 typedef Eigen::GpuDevice GPUDevice;
48 #endif // GOOGLE_CUDA
49
50 // clang-format on
51
52 namespace tensorflow {
53
54 Status GetHandle(OpKernelContext* ctx, string* container, string* ta_handle) {
55 {
56 Tensor tensor;
57 // Assuming that handle is the input at index 0.
58 if (IsRefType(ctx->input_dtype(0))) {
59 tensor = ctx->mutable_input(0, false);
60 } else {
61 tensor = ctx->input(0);
62 }
63 if (tensor.NumElements() != 2) {
64 return errors::InvalidArgument(
65 "Tensor array handle must be 2-element vector, but had shape: ",
66 tensor.shape().DebugString());
67 }
68 auto h = tensor.flat<string>();
69 *container = h(0);
70 *ta_handle = h(1);
71 }
72 return Status::OK();
73 }
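// Note on the handle format decoded by GetHandle() above: a TensorArray
// handle is a host-resident DT_STRING vector with exactly two elements, the
// resource container name followed by the per-array name. A minimal sketch,
// assuming ctx->input(0) holds a handle laid out like this (the name
// "example_ta_0" is made up for illustration):
//
//   Tensor handle(DT_STRING, TensorShape({2}));
//   handle.flat<string>()(0) = "_tensor_arrays";  // container
//   handle.flat<string>()(1) = "example_ta_0";    // tensor array name
//   string container, ta_handle;
//   TF_RETURN_IF_ERROR(GetHandle(ctx, &container, &ta_handle));
//   // container == "_tensor_arrays", ta_handle == "example_ta_0"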
74
75 Status GetTensorArray(OpKernelContext* ctx, TensorArray** tensor_array) {
76 string container;
77 string ta_handle;
78 if (ctx->input_dtype(0) != DT_RESOURCE) {
79 TF_RETURN_IF_ERROR(GetHandle(ctx, &container, &ta_handle));
80 ResourceMgr* rm = ctx->resource_manager();
81 if (rm == nullptr) return errors::Internal("No resource manager.");
82 TF_RETURN_IF_ERROR(rm->Lookup(ctx->step_container()->name(),
83 container + ta_handle, tensor_array));
84 return Status::OK();
85 } else {
86 return LookupResource(ctx, HandleFromInput(ctx, 0), tensor_array);
87 }
88 }
89
90 Status SetupFlowControlInputs(OpKernelContext* ctx, bool set_output) {
91 const Tensor* flow_control;
92 TF_RETURN_IF_ERROR(ctx->input("flow_in", &flow_control));
93 if (set_output) {
94 TF_RETURN_IF_ERROR(ctx->set_output("flow_out", *flow_control));
95 }
96 return Status::OK();
97 }
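// The "flow" value forwarded above is a scalar float whose contents are
// irrelevant; it only threads a data dependency through TensorArray ops so
// that, for example, a read is ordered after the write it depends on.
// Mutating kernels forward the flow, read-only kernels just consume it.
// A sketch of the two call patterns used by the kernels below:
//
//   OP_REQUIRES_OK(ctx, SetupFlowControlInputs(ctx, /*set_output=*/true));   // write/scatter/split
//   OP_REQUIRES_OK(ctx, SetupFlowControlInputs(ctx, /*set_output=*/false));  // read/gather/concat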
98
99 // CREATION *******************************************************************
100
101 // Abstract base class for shared behavior between TensorArrayOp and
102 // TensorArrayGradOp.
103 class TensorArrayCreationOp : public OpKernel {
104 public:
105   explicit TensorArrayCreationOp(OpKernelConstruction* context)
106 : OpKernel(context), device_type_(context->device_type()) {}
107
108   void Compute(OpKernelContext* ctx) override {
109 Tensor tensor_array_output_handle;
110
111 AllocatorAttributes alloc_attr;
112 alloc_attr.set_on_host(true);
113 OP_REQUIRES_OK(ctx, ctx->allocate_temp(
114 tensorflow::DT_STRING, tensorflow::TensorShape({2}),
115 &tensor_array_output_handle, alloc_attr));
116 // Store the handle in a per-step container of the RM.
117 ResourceMgr* rm = ctx->resource_manager();
118 OP_REQUIRES(ctx, rm != nullptr, errors::Internal("No resource manager."));
119
120 TensorArray* output_tensor_array;
121 OP_REQUIRES_OK(ctx, CreateTensorArray(ctx, rm, &tensor_array_output_handle,
122 &output_tensor_array));
123 if (IsRefType(ctx->expected_output_dtype(0))) {
124 ctx->set_output_ref(0, output_tensor_array->mu(),
125 output_tensor_array->handle());
126 } else if (ctx->expected_output_dtype(0) == DT_STRING) {
127 ctx->set_output(0, *output_tensor_array->handle());
128 } else {
129 Tensor* handle;
130 OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &handle));
131 handle->flat<ResourceHandle>()(0) =
132 output_tensor_array->resource_handle(ctx);
133 }
134 if (ctx->num_outputs() == 2) {
135 // Create the flow output.
136 Tensor* flow;
137 OP_REQUIRES_OK(ctx, ctx->allocate_output(1, TensorShape({}), &flow));
138 if (device_type_ == DEVICE_CPU) {
139         // Value doesn't matter, but this makes msan not complain about
140 // copying an uninitialized value. To do this on GPU would require
141 // a kernel launch or a host->device memcpy, so we avoid that.
142 flow->flat<float>()(0) = 0;
143 }
144 }
145 }
146
147 protected:
148 virtual Status CreateTensorArray(OpKernelContext* ctx, ResourceMgr* rm,
149 Tensor* tensor_array_output_handle,
150 TensorArray** output_tensor_array) = 0;
151
152 private:
153 const DeviceType device_type_;
154 };
155
156 // A per-run local tensor array. The tensor array uses a "per-step" resource
157 // manager which ensures correct garbage collection on error or
158 // successful completion.
159 class TensorArrayOp : public TensorArrayCreationOp {
160 public:
161   explicit TensorArrayOp(OpKernelConstruction* context)
162 : TensorArrayCreationOp(context) {
163 OP_REQUIRES_OK(context, context->GetAttr("dtype", &dtype_));
164 OP_REQUIRES_OK(context, context->GetAttr("element_shape", &element_shape_));
165 OP_REQUIRES_OK(context, context->GetAttr("dynamic_size", &dynamic_size_));
166 // The HasAttr check is for backwards compatibility with older op
167 // versions which do not have this attribute.
168 if (context->HasAttr("identical_element_shapes")) {
169 OP_REQUIRES_OK(context, context->GetAttr("identical_element_shapes",
170 &identical_element_shapes_));
171 } else {
172 identical_element_shapes_ = false;
173 }
174 OP_REQUIRES_OK(context,
175 context->GetAttr("clear_after_read", &clear_after_read_));
176 OP_REQUIRES_OK(context,
177 context->GetAttr("tensor_array_name", &tensor_array_name_));
178 if (tensor_array_name_.empty()) tensor_array_name_ = name();
179 }
180
181   Status CreateTensorArray(OpKernelContext* ctx, ResourceMgr* rm,
182 Tensor* tensor_array_output_handle,
183 TensorArray** output_tensor_array) override {
184 const Tensor* tensor_size;
185 TF_RETURN_IF_ERROR(ctx->input("size", &tensor_size));
186
187 if (!TensorShapeUtils::IsScalar(tensor_size->shape())) {
188 return errors::InvalidArgument(
189 "TensorArray size must be scalar, but had shape: ",
190 tensor_size->shape().DebugString());
191 }
192 const int32 size = tensor_size->scalar<int32>()();
193 if (size < 0) {
194 return errors::InvalidArgument("Size should be >= 0.");
195 }
196
197 auto handle = tensor_array_output_handle->flat<string>();
198 string unique_tensor_array_name =
199 strings::StrCat(tensor_array_name_, "_",
200 TensorArray::tensor_array_counter.fetch_add(1));
201 handle(0) = "_tensor_arrays";
202 handle(1) = unique_tensor_array_name;
203
204 auto key = strings::StrCat(handle(0), unique_tensor_array_name);
205
206 TensorArray* tensor_array = new TensorArray(
207 key, dtype_, *tensor_array_output_handle, size, element_shape_,
208 identical_element_shapes_, dynamic_size_,
209 false /* multiple_writes_aggregate */, false /* is_grad */,
210 -1 /* marked_size */, clear_after_read_);
211
212 TF_RETURN_IF_ERROR(
213 rm->Create(ctx->step_container()->name(), key, tensor_array));
214
215 *output_tensor_array = tensor_array;
216
217 return Status::OK();
218 }
219
220 private:
221 DataType dtype_;
222 PartialTensorShape element_shape_;
223 bool identical_element_shapes_;
224 bool dynamic_size_;
225 bool clear_after_read_;
226 string tensor_array_name_; // The name used to create the TensorArray.
227
228 TF_DISALLOW_COPY_AND_ASSIGN(TensorArrayOp);
229 };
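// The resource key used above is simply the container string concatenated
// with the unique array name, and the TensorArray is registered in the
// per-step container so it is cleaned up when the step finishes. A worked
// example of the naming scheme (the counter value 7 is made up):
//
//   // tensor_array_name_ == "ta", tensor_array_counter fetches 7
//   // handle(0) == "_tensor_arrays"
//   // handle(1) == "ta_7"
//   // key      == strings::StrCat(handle(0), "ta_7") == "_tensor_arraysta_7"
//   // rm->Create(ctx->step_container()->name(), key, tensor_array);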
230
231 REGISTER_KERNEL_BUILDER(Name("TensorArray").Device(DEVICE_CPU), TensorArrayOp);
232 REGISTER_KERNEL_BUILDER(Name("TensorArrayV2").Device(DEVICE_CPU),
233 TensorArrayOp);
234 REGISTER_KERNEL_BUILDER(Name("TensorArrayV3").Device(DEVICE_CPU),
235 TensorArrayOp);
236
237 #if GOOGLE_CUDA
238
239 #define REGISTER_GPU(type) \
240 REGISTER_KERNEL_BUILDER(Name("TensorArray") \
241 .Device(DEVICE_GPU) \
242 .TypeConstraint<type>("dtype") \
243 .HostMemory("size") \
244 .HostMemory("handle"), \
245 TensorArrayOp); \
246 REGISTER_KERNEL_BUILDER(Name("TensorArrayV2") \
247 .Device(DEVICE_GPU) \
248 .TypeConstraint<type>("dtype") \
249 .HostMemory("size") \
250 .HostMemory("handle"), \
251 TensorArrayOp); \
252 REGISTER_KERNEL_BUILDER(Name("TensorArrayV3") \
253 .Device(DEVICE_GPU) \
254 .TypeConstraint<type>("dtype") \
255 .HostMemory("size") \
256 .HostMemory("handle"), \
257 TensorArrayOp);
258
259 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
260 TF_CALL_complex64(REGISTER_GPU);
261 TF_CALL_complex128(REGISTER_GPU);
262 TF_CALL_int64(REGISTER_GPU);
263 REGISTER_GPU(bfloat16);
264 #undef REGISTER_GPU
265
266 #endif // GOOGLE_CUDA
267
268 // GRADIENT *******************************************************************
269 // Note that this op may have an optional third input. If present, it represents
270 // a shape value, indicating that the element shape of this gradient array is that
271 // shape value concatenated with the element shape of the original tensor array.
272 // See TensorArrayGradWithShape.
273 class TensorArrayGradOp : public TensorArrayCreationOp {
274 public:
275   explicit TensorArrayGradOp(OpKernelConstruction* context)
276 : TensorArrayCreationOp(context) {
277 OP_REQUIRES_OK(context, context->GetAttr("source", &source_));
278 }
279
280   Status CreateTensorArray(OpKernelContext* ctx, ResourceMgr* rm,
281 Tensor* tensor_array_output_handle,
282 TensorArray** output_tensor_array) override {
283 string container;
284 string tensor_array_name;
285 if (ctx->input_dtype(0) != DT_RESOURCE) {
286 TF_RETURN_IF_ERROR(GetHandle(ctx, &container, &tensor_array_name));
287 if (container != "_tensor_arrays") {
288 return errors::InvalidArgument(
289 "Input container should be '_tensor_arrays', but received '",
290 container, "'");
291 }
292 } else {
293 container = "_tensor_arrays";
294 const auto& resource = ctx->input(0).flat<ResourceHandle>()(0);
295 if (StringPiece(resource.name()).substr(0, container.size()) !=
296 container) {
297 return errors::InvalidArgument("Wrong input container. ",
298 resource.name());
299 }
300 tensor_array_name =
301 string(StringPiece(resource.name()).substr(container.size()));
302 }
303
304 auto output_handle = tensor_array_output_handle->flat<string>();
305 output_handle(0) = "_tensor_array_grads";
306 output_handle(1) = strings::StrCat(tensor_array_name, "@", source_);
307
308 TensorArray* tensor_array;
309 TF_RETURN_IF_ERROR(rm->Lookup(ctx->step_container()->name(),
310 strings::StrCat(container, tensor_array_name),
311 &tensor_array));
312 core::ScopedUnref unref(tensor_array);
313
314 // Once gradients are being calculated, the forward TensorArray
315 // may no longer be resized by new Writes.
316 tensor_array->DisableDynamicSize();
317
318 int32 array_size = 0;
319 int32 marked_size = 0;
320 TF_RETURN_IF_ERROR(tensor_array->Size(&array_size));
321 TF_RETURN_IF_ERROR(tensor_array->MarkedSize(&marked_size));
322
323 if (array_size < 0) {
324 return errors::InvalidArgument("ArraySize should be >= 0.");
325 }
326 if (!tensor_array->GradientsAllowed()) {
327 return errors::InvalidArgument(
328 "Unable to create a gradients TensorArray for ", tensor_array_name,
329 ". Perhaps you used the multiple_writes_aggregate flag on a "
330 "previous write? Gradient calculation is impossible when multiple "
331 "writes are performed to the same index.");
332 }
333 TensorShape shape_to_prepend;
334 auto element_shape = PartialTensorShape();
335 if (ctx->num_inputs() > 2) {
336 TF_RETURN_IF_ERROR(
337 ctx->op_kernel().MakeShape(ctx->input(2), &shape_to_prepend));
338 auto ta_element_shape = tensor_array->ElemShape();
339 if (!ta_element_shape.unknown_rank()) {
340 std::vector<int64> dims;
341 for (auto dim : shape_to_prepend) {
342 dims.push_back(dim.size);
343 }
344 for (auto dim : ta_element_shape) {
345 dims.push_back(dim.size);
346 }
347 TF_RETURN_IF_ERROR(TensorShapeUtils::MakeShape(
348 gtl::ArraySlice<int64>(dims), &element_shape));
349 }
350 } else {
351 element_shape = tensor_array->ElemShape();
352 }
353
354 const auto key = strings::StrCat(output_handle(0), output_handle(1));
355 auto creator = [key, tensor_array, array_size, marked_size, element_shape,
356 shape_to_prepend,
357 tensor_array_output_handle](TensorArray** ret) -> Status {
358 *ret = new TensorArray(
359 key, tensor_array->ElemType(), *tensor_array_output_handle,
360 array_size, element_shape, tensor_array->HasIdenticalElementShapes(),
361 false /* dynamic_size */, true /* multiple_writes_aggregate */,
362 true /* is_grad */, marked_size /* marked_size */,
363           true /* clear_after_read */);
364 return (*ret)->CopyShapesFrom(tensor_array, &shape_to_prepend);
365 };
366
367 Status s = rm->LookupOrCreate<TensorArray>(
368 ctx->step_container()->name(), key, output_tensor_array, creator);
369 (*output_tensor_array)->Unref();
370
371 return s;
372 }
373
374 private:
375 // The gradient source for creating the given
376 // gradient TensorArray. This should be unique to each gradients
377 // call. Typical values look like "gradients", "gradients_1", ...
378 string source_;
379
380 TF_DISALLOW_COPY_AND_ASSIGN(TensorArrayGradOp);
381 };
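// Gradient TensorArrays live in the "_tensor_array_grads" container and are
// keyed by the forward array's name plus "@" plus the gradient source, so
// distinct gradient computations (e.g. "gradients", "gradients_1") each get
// their own gradient array, while repeated requests from the same source
// share one via LookupOrCreate. A worked example of the naming, reusing the
// made-up forward name "ta_7" from the creation example above:
//
//   // forward handle: ("_tensor_arrays", "ta_7"), source_ == "gradients_1"
//   // grad handle   : ("_tensor_array_grads", "ta_7@gradients_1")
//   // grad key      : "_tensor_array_gradsta_7@gradients_1"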
382
383 REGISTER_KERNEL_BUILDER(Name("TensorArrayGrad").Device(DEVICE_CPU),
384 TensorArrayGradOp);
385 REGISTER_KERNEL_BUILDER(Name("TensorArrayGradV2").Device(DEVICE_CPU),
386 TensorArrayGradOp);
387 REGISTER_KERNEL_BUILDER(Name("TensorArrayGradV3").Device(DEVICE_CPU),
388 TensorArrayGradOp);
389 REGISTER_KERNEL_BUILDER(Name("TensorArrayGradWithShape").Device(DEVICE_CPU),
390 TensorArrayGradOp);
391 REGISTER_KERNEL_BUILDER(Name("TensorArrayGrad")
392 .Device(DEVICE_GPU)
393 .HostMemory("handle")
394 .HostMemory("grad_handle"),
395 TensorArrayGradOp);
396 REGISTER_KERNEL_BUILDER(Name("TensorArrayGradV2")
397 .Device(DEVICE_GPU)
398 .HostMemory("handle")
399 .HostMemory("grad_handle"),
400 TensorArrayGradOp);
401 REGISTER_KERNEL_BUILDER(Name("TensorArrayGradV3")
402 .Device(DEVICE_GPU)
403 .HostMemory("handle")
404 .HostMemory("grad_handle"),
405 TensorArrayGradOp);
406 REGISTER_KERNEL_BUILDER(Name("TensorArrayGradWithShape")
407 .Device(DEVICE_GPU)
408 .HostMemory("handle")
409 .HostMemory("shape_to_prepend")
410 .HostMemory("grad_handle"),
411 TensorArrayGradOp);
412
413 // WRITE **********************************************************************
414
415 template <typename Device, typename T>
416 class TensorArrayWriteOp : public OpKernel {
417 public:
418   explicit TensorArrayWriteOp(OpKernelConstruction* context)
419 : OpKernel(context) {}
420
421   void Compute(OpKernelContext* ctx) override {
422 OP_REQUIRES_OK(ctx, SetupFlowControlInputs(ctx, true));
423
424 const Tensor* tensor_index;
425 const Tensor* tensor_value;
426 OP_REQUIRES_OK(ctx, ctx->input("index", &tensor_index));
427 OP_REQUIRES_OK(ctx, ctx->input("value", &tensor_value));
428
429 OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(tensor_index->shape()),
430 errors::InvalidArgument(
431 "TensorArray index must be scalar, but had shape: ",
432 tensor_index->shape().DebugString()));
433
434 TensorArray* tensor_array = nullptr;
435 OP_REQUIRES_OK(ctx, GetTensorArray(ctx, &tensor_array));
436 core::ScopedUnref unref(tensor_array);
437 const int32 index = tensor_index->scalar<int32>()();
438 OP_REQUIRES(
439 ctx, tensor_value->dtype() == tensor_array->ElemType(),
440 errors::InvalidArgument("TensorArray dtype is ",
441 DataTypeString(tensor_array->ElemType()),
442 " but Op is trying to write dtype ",
443 DataTypeString(tensor_value->dtype()), "."));
444 PersistentTensor persistent_tensor(*tensor_value);
445 Status s = tensor_array->WriteOrAggregate<Device, T>(ctx, index,
446 &persistent_tensor);
447 OP_REQUIRES_OK(ctx, s);
448 }
449 };
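// WriteOrAggregate() either stores the value at `index` or, for arrays
// created with multiple_writes_aggregate == true (gradient arrays),
// accumulates it into the value already stored there. A minimal sketch of
// the write path, assuming T = float on the CPU device:
//
//   PersistentTensor pt(*tensor_value);
//   OP_REQUIRES_OK(
//       ctx, tensor_array->WriteOrAggregate<CPUDevice, float>(ctx, index, &pt));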
450
451 #define REGISTER_WRITE(type) \
452 REGISTER_KERNEL_BUILDER( \
453 Name("TensorArrayWrite").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
454 TensorArrayWriteOp<CPUDevice, type>); \
455 REGISTER_KERNEL_BUILDER( \
456 Name("TensorArrayWriteV2").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
457 TensorArrayWriteOp<CPUDevice, type>); \
458 REGISTER_KERNEL_BUILDER( \
459 Name("TensorArrayWriteV3").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
460 TensorArrayWriteOp<CPUDevice, type>);
461
462 TF_CALL_ALL_TYPES(REGISTER_WRITE);
463
464 #undef REGISTER_WRITE
465
466 #if GOOGLE_CUDA
467
468 #define REGISTER_GPU(type) \
469 REGISTER_KERNEL_BUILDER(Name("TensorArrayWrite") \
470 .Device(DEVICE_GPU) \
471 .TypeConstraint<type>("T") \
472 .HostMemory("handle") \
473 .HostMemory("index"), \
474 TensorArrayWriteOp<GPUDevice, type>); \
475 REGISTER_KERNEL_BUILDER(Name("TensorArrayWriteV2") \
476 .Device(DEVICE_GPU) \
477 .TypeConstraint<type>("T") \
478 .HostMemory("handle") \
479 .HostMemory("index"), \
480 TensorArrayWriteOp<GPUDevice, type>); \
481 REGISTER_KERNEL_BUILDER(Name("TensorArrayWriteV3") \
482 .Device(DEVICE_GPU) \
483 .TypeConstraint<type>("T") \
484 .HostMemory("handle") \
485 .HostMemory("index"), \
486 TensorArrayWriteOp<GPUDevice, type>);
487
488 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
489 TF_CALL_complex64(REGISTER_GPU);
490 TF_CALL_complex128(REGISTER_GPU);
491 REGISTER_GPU(bfloat16);
492 #undef REGISTER_GPU
493
494 #endif // GOOGLE_CUDA
495
496 // READ ***********************************************************************
497
498 template <typename Device, typename T>
499 class TensorArrayReadOp : public OpKernel {
500 public:
501   explicit TensorArrayReadOp(OpKernelConstruction* context)
502 : OpKernel(context) {
503 OP_REQUIRES_OK(context, context->GetAttr("dtype", &dtype_));
504 }
505
506   void Compute(OpKernelContext* ctx) override {
507 OP_REQUIRES_OK(ctx, SetupFlowControlInputs(ctx, false));
508
509 const Tensor* tensor_index;
510 OP_REQUIRES_OK(ctx, ctx->input("index", &tensor_index));
511
512 OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(tensor_index->shape()),
513 errors::InvalidArgument(
514 "TensorArray index must be scalar, but had shape: ",
515 tensor_index->shape().DebugString()));
516
517 TensorArray* tensor_array = nullptr;
518 OP_REQUIRES_OK(ctx, GetTensorArray(ctx, &tensor_array));
519 core::ScopedUnref unref(tensor_array);
520
521 const int32 index = tensor_index->scalar<int32>()();
522 OP_REQUIRES(
523 ctx, dtype_ == tensor_array->ElemType(),
524 errors::InvalidArgument(
525 "TensorArray dtype is ", DataTypeString(tensor_array->ElemType()),
526 " but Op requested dtype ", DataTypeString(dtype_), "."));
527 PersistentTensor value;
528 Status s = tensor_array->Read<Device, T>(ctx, index, &value);
529 OP_REQUIRES_OK(ctx, s);
530 ctx->set_output(0, *value.AccessTensor(ctx));
531 }
532
533 private:
534 DataType dtype_;
535 };
536
537 #define REGISTER_READ(type) \
538 REGISTER_KERNEL_BUILDER(Name("TensorArrayRead") \
539 .Device(DEVICE_CPU) \
540 .TypeConstraint<type>("dtype"), \
541 TensorArrayReadOp<CPUDevice, type>); \
542 REGISTER_KERNEL_BUILDER(Name("TensorArrayReadV2") \
543 .Device(DEVICE_CPU) \
544 .TypeConstraint<type>("dtype"), \
545 TensorArrayReadOp<CPUDevice, type>); \
546 REGISTER_KERNEL_BUILDER(Name("TensorArrayReadV3") \
547 .Device(DEVICE_CPU) \
548 .TypeConstraint<type>("dtype"), \
549 TensorArrayReadOp<CPUDevice, type>);
550
551 TF_CALL_ALL_TYPES(REGISTER_READ)
552
553 #undef REGISTER_READ
554
555 #if GOOGLE_CUDA
556
557 #define REGISTER_GPU(type) \
558 REGISTER_KERNEL_BUILDER(Name("TensorArrayRead") \
559 .Device(DEVICE_GPU) \
560 .TypeConstraint<type>("dtype") \
561 .HostMemory("handle") \
562 .HostMemory("index"), \
563 TensorArrayReadOp<GPUDevice, type>); \
564 REGISTER_KERNEL_BUILDER(Name("TensorArrayReadV2") \
565 .Device(DEVICE_GPU) \
566 .TypeConstraint<type>("dtype") \
567 .HostMemory("handle") \
568 .HostMemory("index"), \
569 TensorArrayReadOp<GPUDevice, type>); \
570 REGISTER_KERNEL_BUILDER(Name("TensorArrayReadV3") \
571 .Device(DEVICE_GPU) \
572 .TypeConstraint<type>("dtype") \
573 .HostMemory("handle") \
574 .HostMemory("index"), \
575 TensorArrayReadOp<GPUDevice, type>);
576
577 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
578 TF_CALL_complex64(REGISTER_GPU);
579 TF_CALL_complex128(REGISTER_GPU);
580 TF_CALL_int64(REGISTER_GPU);
581 REGISTER_GPU(bfloat16);
582 #undef REGISTER_GPU
583
584 #endif // GOOGLE_CUDA
585
586 // PACK and GATHER ************************************************************
587
588 // Pack (stack) or gather the elements of a TensorArray into a single tensor with
589 // a new leading dimension. All elements must be defined and have the same shape.
590 template <typename Device, typename T, bool LEGACY_PACK>
591 class TensorArrayPackOrGatherOp : public OpKernel {
592 public:
593 typedef typename TTypes<T, 2>::ConstMatrix ConstMatrix;
594 typedef std::vector<std::unique_ptr<ConstMatrix> > ConstMatrixVector;
595
596   explicit TensorArrayPackOrGatherOp(OpKernelConstruction* context)
597 : OpKernel(context) {
598 OP_REQUIRES_OK(context, context->GetAttr("dtype", &dtype_));
599 OP_REQUIRES_OK(context, context->GetAttr("element_shape", &element_shape_));
600 }
601
602   void Compute(OpKernelContext* ctx) override {
603 OP_REQUIRES_OK(ctx, SetupFlowControlInputs(ctx, false));
604
605 TensorArray* tensor_array = nullptr;
606 OP_REQUIRES_OK(ctx, GetTensorArray(ctx, &tensor_array));
607
608 core::ScopedUnref unref(tensor_array);
609 OP_REQUIRES(
610 ctx, dtype_ == tensor_array->ElemType(),
611 errors::InvalidArgument(
612 "TensorArray dtype is ", DataTypeString(tensor_array->ElemType()),
613 " but Op requested dtype ", DataTypeString(dtype_), "."));
614
615 // Ensure new element shape is compatible with the one stored in the
616 // TensorArray.
617 OP_REQUIRES_OK(ctx, tensor_array->SetElemShape(element_shape_));
618
619 int32 num_indices;
620 std::vector<PersistentTensor> values;
621 std::vector<int32> indices;
622 if (LEGACY_PACK) {
623 OP_REQUIRES_OK(ctx, tensor_array->PackOrConcatSize(&num_indices));
624 indices.resize(num_indices);
625 std::iota(indices.begin(), indices.end(), 0);
626 } else {
627 const Tensor* tensor_indices;
628 OP_REQUIRES_OK(ctx, ctx->input("indices", &tensor_indices));
629 OP_REQUIRES(ctx, TensorShapeUtils::IsVector(tensor_indices->shape()),
630 errors::InvalidArgument(
631 "Expected indices to be a vector, but received shape: ",
632 tensor_indices->shape().DebugString()));
633 const auto indices_t = tensor_indices->vec<int32>();
634 num_indices = tensor_indices->NumElements();
635 indices.resize(num_indices);
636 std::copy(indices_t.data(), indices_t.data() + num_indices,
637 indices.begin());
638 }
639
640 // If there are no elements to return, return a zero-element Tensor with
641 // shape [0] + element_shape_
642 if (num_indices == 0) {
643 OP_REQUIRES(ctx, element_shape_.IsFullyDefined(),
644 errors::Unimplemented(
645 "TensorArray has size zero, but element shape ",
646 element_shape_.DebugString(),
647 " is not fully defined. "
648 "Currently only static shapes are supported when packing "
649 "zero-size TensorArrays."));
650 TensorShape empty_shape;
651 element_shape_.AsTensorShape(&empty_shape);
652 empty_shape.InsertDim(0, 0);
653 Tensor* empty_unused;
654 OP_REQUIRES_OK(ctx, ctx->allocate_output(0, empty_shape, &empty_unused));
655 return;
656 }
657
658 // Read all the PersistentTensors into a vector to keep track of
659 // their memory.
660 Status s = tensor_array->ReadMany<Device, T>(ctx, indices, &values);
661 OP_REQUIRES_OK(ctx, s);
662
663 const Tensor* value_0_t = values[0].AccessTensor(ctx);
664
665 OP_REQUIRES(
666 ctx, element_shape_.IsCompatibleWith(value_0_t->shape()),
667 errors::InvalidArgument("TensorArray was passed element_shape ",
668 element_shape_.DebugString(),
669 " which does not match the Tensor at index 0: ",
670 value_0_t->shape().DebugString()));
671
672 TensorShape output_shape(value_0_t->shape());
673 output_shape.InsertDim(0, num_indices);
674
675 Tensor* output_tensor = nullptr;
676 OP_REQUIRES_OK(ctx, ctx->allocate_output(0, output_shape, &output_tensor));
677
678 // If output_tensor is empty, there is nothing to concatenate so return it.
679 if (output_shape.num_elements() == 0) {
680 return;
681 }
682
683 ConstMatrixVector input_tensors_flat;
684 input_tensors_flat.reserve(num_indices);
685 auto output_flat =
686 output_tensor->shaped<T, 2>({1, output_shape.num_elements()});
687
688 // Insert the first value
689 input_tensors_flat.push_back(MakeUnique<ConstMatrix>(
690 value_0_t->shaped<T, 2>({1, value_0_t->NumElements()})));
691
692 for (int i = 1; i < num_indices; ++i) {
693 const Tensor* value_t = values[i].AccessTensor(ctx);
694 OP_REQUIRES(
695 ctx, value_0_t->shape() == value_t->shape(),
696 errors::InvalidArgument(
697 "TensorArray has inconsistent shapes. Index 0 has shape: ",
698 value_0_t->shape().DebugString(), " but index ", i,
699 " has shape: ", value_t->shape().DebugString()));
700 input_tensors_flat.push_back(MakeUnique<ConstMatrix>(
701 value_t->shaped<T, 2>({1, value_t->NumElements()})));
702 }
703
704 #if GOOGLE_CUDA
705 if (std::is_same<Device, GPUDevice>::value) {
706 ConcatGPU<T>(ctx, input_tensors_flat, output_tensor, &output_flat);
707 return;
708 }
709 #endif // GOOGLE_CUDA
710 ConcatCPU<T>(ctx->device(), input_tensors_flat, &output_flat);
711 }
712
713 private:
714 DataType dtype_;
715 PartialTensorShape element_shape_;
716 };
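// Pack/Gather assembles its output by flattening each element to a [1, N]
// matrix and concatenating along the second dimension into the output viewed
// the same way; the logical output shape is [num_indices] + element shape.
// A worked shape example (sizes made up):
//
//   // 3 gathered elements, each of shape [4, 5]
//   // output_shape       == [3, 4, 5]
//   // each input matrix  == value_t->shaped<T, 2>({1, 20})
//   // output_flat        == output_tensor->shaped<T, 2>({1, 60})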
717
718 #define REGISTER_GATHER_AND_PACK(type) \
719 REGISTER_KERNEL_BUILDER( \
720 Name("TensorArrayPack") \
721 .Device(DEVICE_CPU) \
722 .TypeConstraint<type>("dtype"), \
723 TensorArrayPackOrGatherOp<CPUDevice, type, true /* LEGACY_PACK */>); \
724 REGISTER_KERNEL_BUILDER( \
725 Name("TensorArrayGather") \
726 .Device(DEVICE_CPU) \
727 .TypeConstraint<type>("dtype"), \
728 TensorArrayPackOrGatherOp<CPUDevice, type, false /* LEGACY_PACK */>); \
729 REGISTER_KERNEL_BUILDER( \
730 Name("TensorArrayGatherV2") \
731 .Device(DEVICE_CPU) \
732 .TypeConstraint<type>("dtype"), \
733 TensorArrayPackOrGatherOp<CPUDevice, type, false /* LEGACY_PACK */>); \
734 REGISTER_KERNEL_BUILDER( \
735 Name("TensorArrayGatherV3") \
736 .Device(DEVICE_CPU) \
737 .TypeConstraint<type>("dtype"), \
738 TensorArrayPackOrGatherOp<CPUDevice, type, false /* LEGACY_PACK */>);
739
740 TF_CALL_POD_STRING_TYPES(REGISTER_GATHER_AND_PACK);
741 TF_CALL_variant(REGISTER_GATHER_AND_PACK);
742 REGISTER_GATHER_AND_PACK(quint8);
743 REGISTER_GATHER_AND_PACK(qint8);
744 REGISTER_GATHER_AND_PACK(qint32);
745
746 #undef REGISTER_GATHER_AND_PACK
747
748 #if GOOGLE_CUDA
749
750 #define REGISTER_GPU(type) \
751 REGISTER_KERNEL_BUILDER( \
752 Name("TensorArrayPack") \
753 .Device(DEVICE_GPU) \
754 .TypeConstraint<type>("dtype") \
755 .HostMemory("handle"), \
756 TensorArrayPackOrGatherOp<GPUDevice, type, true /* LEGACY_PACK */>); \
757 REGISTER_KERNEL_BUILDER( \
758 Name("TensorArrayGather") \
759 .Device(DEVICE_GPU) \
760 .TypeConstraint<type>("dtype") \
761 .HostMemory("indices") \
762 .HostMemory("handle"), \
763 TensorArrayPackOrGatherOp<GPUDevice, type, false /* LEGACY_PACK */>); \
764 REGISTER_KERNEL_BUILDER( \
765 Name("TensorArrayGatherV2") \
766 .Device(DEVICE_GPU) \
767 .TypeConstraint<type>("dtype") \
768 .HostMemory("indices") \
769 .HostMemory("handle"), \
770 TensorArrayPackOrGatherOp<GPUDevice, type, false /* LEGACY_PACK */>); \
771 REGISTER_KERNEL_BUILDER( \
772 Name("TensorArrayGatherV3") \
773 .Device(DEVICE_GPU) \
774 .TypeConstraint<type>("dtype") \
775 .HostMemory("indices") \
776 .HostMemory("handle"), \
777 TensorArrayPackOrGatherOp<GPUDevice, type, false /* LEGACY_PACK */>);
778
779 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
780 TF_CALL_complex64(REGISTER_GPU);
781 TF_CALL_complex128(REGISTER_GPU);
782 REGISTER_GPU(bfloat16);
783 #undef REGISTER_GPU
784
785 // A special GPU kernel for int32.
786 // TODO(b/25387198): Also enable int32 in device memory. This kernel
787 // registration requires all int32 inputs and outputs to be in host memory.
788 REGISTER_KERNEL_BUILDER(
789 Name("TensorArrayGather")
790 .Device(DEVICE_GPU)
791 .TypeConstraint<int32>("dtype")
792 .HostMemory("indices")
793 .HostMemory("handle"),
794 TensorArrayPackOrGatherOp<CPUDevice, int32, false /* LEGACY_PACK */>);
795 REGISTER_KERNEL_BUILDER(
796 Name("TensorArrayGatherV2")
797 .Device(DEVICE_GPU)
798 .TypeConstraint<int32>("dtype")
799 .HostMemory("indices")
800 .HostMemory("handle"),
801 TensorArrayPackOrGatherOp<CPUDevice, int32, false /* LEGACY_PACK */>);
802 REGISTER_KERNEL_BUILDER(
803 Name("TensorArrayGatherV3")
804 .Device(DEVICE_GPU)
805 .TypeConstraint<int32>("dtype")
806 .HostMemory("indices")
807 .HostMemory("handle"),
808 TensorArrayPackOrGatherOp<CPUDevice, int32, false /* LEGACY_PACK */>);
809
810 #endif // GOOGLE_CUDA
811
812 // CONCAT *********************************************************************
813
814 // Concatenate the elements in a TensorArray. All elements must be
815 // defined and (excepting the first dimension) have the same shape.
816 template <typename Device, typename T>
817 class TensorArrayConcatOp : public OpKernel {
818 public:
819 typedef typename TTypes<T, 2>::ConstMatrix ConstMatrix;
820 typedef std::vector<std::unique_ptr<ConstMatrix> > ConstMatrixVector;
821
822   explicit TensorArrayConcatOp(OpKernelConstruction* context)
823 : OpKernel(context) {
824 OP_REQUIRES_OK(context, context->GetAttr("dtype", &dtype_));
825 OP_REQUIRES_OK(context, context->GetAttr("element_shape_except0",
826 &element_shape_except0_));
827 }
828
829   void Compute(OpKernelContext* ctx) override {
830 OP_REQUIRES_OK(ctx, SetupFlowControlInputs(ctx, false));
831
832 TensorArray* tensor_array = nullptr;
833 OP_REQUIRES_OK(ctx, GetTensorArray(ctx, &tensor_array));
834 core::ScopedUnref unref(tensor_array);
835 OP_REQUIRES(
836 ctx, dtype_ == tensor_array->ElemType(),
837 errors::InvalidArgument(
838 "TensorArray dtype is ", DataTypeString(tensor_array->ElemType()),
839 " but Op requested dtype ", DataTypeString(dtype_), "."));
840
841 int32 array_size;
842 OP_REQUIRES_OK(ctx, tensor_array->PackOrConcatSize(&array_size));
843
844 // If there are no elements, return a zero-element Tensor with
845 // shape [0] + element_shape_except0_
846 if (array_size == 0) {
847 OP_REQUIRES(
848 ctx, element_shape_except0_.IsFullyDefined(),
849 errors::Unimplemented(
850 "TensorArray has size zero, but element_shape_except0 ",
851 element_shape_except0_.DebugString(),
852 " is not fully defined. "
853 "Currently only static shapes are supported when concatenating "
854 "zero-size TensorArrays."));
855 TensorShape empty_shape;
856 element_shape_except0_.AsTensorShape(&empty_shape);
857 empty_shape.InsertDim(0, 0);
858 Tensor* empty_unused;
859 OP_REQUIRES_OK(ctx, ctx->allocate_output(0, empty_shape, &empty_unused));
860 OP_REQUIRES_OK(ctx, ctx->allocate_output(1, {0}, &empty_unused));
861 return;
862 }
863
864 // Read all the PersistentTensors into a vector to keep track of
865 // their memory.
866 std::vector<PersistentTensor> values;
867 std::vector<int32> indices(array_size);
868 std::iota(indices.begin(), indices.end(), 0);
869 Status s = tensor_array->ReadMany<Device, T>(ctx, indices, &values);
870 OP_REQUIRES_OK(ctx, s);
871
872 std::vector<const Tensor*> value_tensors;
873 value_tensors.resize(values.size());
874
875 Tensor* lengths_tensor = nullptr;
876 OP_REQUIRES_OK(ctx, ctx->allocate_output(
877 1, TensorShape({static_cast<int64>(values.size())}),
878 &lengths_tensor));
879 auto lengths_tensor_t = lengths_tensor->vec<int64>();
880
881 TensorShape output_shape;
882 TensorShape output_shape_except0;
883 for (std::size_t i = 0; i < values.size(); ++i) {
884 value_tensors[i] = values[i].AccessTensor(ctx);
885 TensorShape value_shape_t = value_tensors[i]->shape();
886
887 OP_REQUIRES(
888 ctx, TensorShapeUtils::IsVectorOrHigher(value_shape_t),
889 errors::InvalidArgument(
890 "Concat saw a scalar shape at index ", i,
891 " but requires at least vectors. Did you mean to call pack?"));
892
893 lengths_tensor_t(i) = value_shape_t.dim_size(0);
894
895 TensorShape value_shape_t_except0 = value_shape_t;
896 value_shape_t_except0.RemoveDim(0);
897 if (i == 0) {
898 output_shape = value_shape_t;
899 output_shape_except0 = value_shape_t_except0;
900 OP_REQUIRES(
901 ctx, element_shape_except0_.IsCompatibleWith(output_shape_except0),
902 errors::InvalidArgument(
903 "TensorArray was passed element_shape_except0 ",
904 element_shape_except0_.DebugString(),
905 " but index 0 has (excepting dimension 0) shape: ",
906 value_shape_t_except0.DebugString(), " which does not match."));
907 } else {
908 OP_REQUIRES(ctx, output_shape_except0 == value_shape_t_except0,
909 errors::InvalidArgument(
910 "TensorArray has inconsistent shapes. Index 0 has "
911 "(excepting dimension 0) shape: ",
912 output_shape_except0.DebugString(), " but index ", i,
913 " has (excepting dimension 0) shape: ",
914 value_shape_t_except0.DebugString()));
915         // Accumulate the total length along dimension 0 across elements.
916 output_shape.set_dim(
917 0, output_shape.dim_size(0) + value_shape_t.dim_size(0));
918 }
919 }
920
921 Tensor* output_tensor = nullptr;
922 OP_REQUIRES_OK(ctx, ctx->allocate_output(0, output_shape, &output_tensor));
923 ConstMatrixVector input_tensors_flat;
924 input_tensors_flat.reserve(values.size());
925 for (size_t i = 0; i < values.size(); ++i) {
926 const Tensor* value_t = value_tensors[i];
927 if (value_t->NumElements() > 0) {
928 input_tensors_flat.push_back(MakeUnique<ConstMatrix>(
929 value_t->shaped<T, 2>({1, value_t->NumElements()})));
930 }
931 }
932
933 if (output_shape.num_elements() > 0) {
934 auto output_flat =
935 output_tensor->shaped<T, 2>({1, output_shape.num_elements()});
936 #if GOOGLE_CUDA
937 if (std::is_same<Device, GPUDevice>::value) {
938 ConcatGPU<T>(ctx, input_tensors_flat, output_tensor, &output_flat);
939 return;
940 }
941 #endif // GOOGLE_CUDA
942 ConcatCPU<T>(ctx->device(), input_tensors_flat, &output_flat);
943 }
944 }
945
946 private:
947 DataType dtype_;
948 PartialTensorShape element_shape_except0_;
949 };
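// Concat joins the elements along dimension 0 and also emits a "lengths"
// vector recording each element's dim-0 size, which TensorArraySplit can use
// to invert the operation. A worked shape example (sizes made up):
//
//   // element shapes: [2, 5], [3, 5], [1, 5]
//   // lengths output == [2, 3, 1]
//   // output shape   == [6, 5]   (2 + 3 + 1 along dimension 0)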
950
951 #define REGISTER_CONCAT(type) \
952 REGISTER_KERNEL_BUILDER(Name("TensorArrayConcat") \
953 .Device(DEVICE_CPU) \
954 .TypeConstraint<type>("dtype") \
955 .HostMemory("lengths") \
956 .HostMemory("handle"), \
957 TensorArrayConcatOp<CPUDevice, type>); \
958 REGISTER_KERNEL_BUILDER(Name("TensorArrayConcatV2") \
959 .Device(DEVICE_CPU) \
960 .TypeConstraint<type>("dtype") \
961 .HostMemory("lengths") \
962 .HostMemory("handle"), \
963 TensorArrayConcatOp<CPUDevice, type>) \
964 REGISTER_KERNEL_BUILDER(Name("TensorArrayConcatV3") \
965 .Device(DEVICE_CPU) \
966 .TypeConstraint<type>("dtype") \
967 .HostMemory("lengths") \
968 .HostMemory("handle"), \
969 TensorArrayConcatOp<CPUDevice, type>)
970
971 TF_CALL_POD_STRING_TYPES(REGISTER_CONCAT);
972 REGISTER_CONCAT(quint8);
973 REGISTER_CONCAT(qint8);
974 REGISTER_CONCAT(qint32);
975
976 #undef REGISTER_CONCAT
977
978 #if GOOGLE_CUDA
979
980 #define REGISTER_GPU(type) \
981 REGISTER_KERNEL_BUILDER(Name("TensorArrayConcat") \
982 .Device(DEVICE_GPU) \
983 .TypeConstraint<type>("dtype") \
984 .HostMemory("lengths") \
985 .HostMemory("handle"), \
986 TensorArrayConcatOp<GPUDevice, type>); \
987 REGISTER_KERNEL_BUILDER(Name("TensorArrayConcatV2") \
988 .Device(DEVICE_GPU) \
989 .TypeConstraint<type>("dtype") \
990 .HostMemory("lengths") \
991 .HostMemory("handle"), \
992 TensorArrayConcatOp<GPUDevice, type>) \
993 REGISTER_KERNEL_BUILDER(Name("TensorArrayConcatV3") \
994 .Device(DEVICE_GPU) \
995 .TypeConstraint<type>("dtype") \
996 .HostMemory("lengths") \
997 .HostMemory("handle"), \
998 TensorArrayConcatOp<GPUDevice, type>)
999
1000 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
1001 TF_CALL_complex64(REGISTER_GPU);
1002 TF_CALL_complex128(REGISTER_GPU);
1003 REGISTER_GPU(bfloat16);
1004 #undef REGISTER_GPU
1005
1006 // A special GPU kernel for int32.
1007 // TODO(b/25387198): Also enable int32 in device memory. This kernel
1008 // registration requires all int32 inputs and outputs to be in host memory.
1009 REGISTER_KERNEL_BUILDER(Name("TensorArrayConcat")
1010 .Device(DEVICE_GPU)
1011 .TypeConstraint<int32>("dtype")
1012 .HostMemory("lengths")
1013 .HostMemory("handle"),
1014 TensorArrayConcatOp<CPUDevice, int32>);
1015 REGISTER_KERNEL_BUILDER(Name("TensorArrayConcatV2")
1016 .Device(DEVICE_GPU)
1017 .TypeConstraint<int32>("dtype")
1018 .HostMemory("lengths")
1019 .HostMemory("handle"),
1020 TensorArrayConcatOp<CPUDevice, int32>);
1021 REGISTER_KERNEL_BUILDER(Name("TensorArrayConcatV3")
1022 .Device(DEVICE_GPU)
1023 .TypeConstraint<int32>("dtype")
1024 .HostMemory("lengths")
1025 .HostMemory("handle"),
1026 TensorArrayConcatOp<CPUDevice, int32>);
1027
1028 #endif // GOOGLE_CUDA
1029
1030 // UNPACK and SCATTER *********************************************************
1031
1032 template <typename Device, typename T, bool LEGACY_UNPACK>
1033 class TensorArrayUnpackOrScatterOp : public OpKernel {
1034 public:
1035   explicit TensorArrayUnpackOrScatterOp(OpKernelConstruction* context)
1036 : OpKernel(context) {}
1037
1038   void Compute(OpKernelContext* ctx) override {
1039 OP_REQUIRES_OK(ctx, SetupFlowControlInputs(ctx, true));
1040
1041 TensorArray* tensor_array = nullptr;
1042 OP_REQUIRES_OK(ctx, GetTensorArray(ctx, &tensor_array));
1043 core::ScopedUnref unref(tensor_array);
1044 const Tensor* tensor_value;
1045 OP_REQUIRES_OK(ctx, ctx->input("value", &tensor_value));
1046 TensorShape element_shape(tensor_value->shape());
1047
1048 OP_REQUIRES(ctx,
1049 FastBoundsCheck(element_shape.dim_size(0),
1050 std::numeric_limits<int32>::max()),
1051 errors::InvalidArgument("tensor dim0 too large to unpack"));
1052
1053 OP_REQUIRES(
1054 ctx, tensor_value->dtype() == tensor_array->ElemType(),
1055 errors::InvalidArgument("TensorArray dtype is ",
1056 DataTypeString(tensor_array->ElemType()),
1057 " but Op is trying to write dtype ",
1058 DataTypeString(tensor_value->dtype()), "."));
1059 OP_REQUIRES(ctx, element_shape.dims() > 0,
1060 errors::InvalidArgument("Input value for unpack must be at "
1061 "least a vector but received shape: ",
1062 element_shape.DebugString()));
1063 int32 array_size;
1064 OP_REQUIRES_OK(ctx, tensor_array->Size(&array_size));
1065
1066 int32 max_index;
1067 int32 num_values;
1068 std::vector<int32> write_indices;
1069 if (LEGACY_UNPACK) {
1070 num_values = element_shape.dim_size(0);
1071 max_index = num_values - 1;
1072 write_indices.resize(num_values);
1073 std::iota(write_indices.begin(), write_indices.end(), 0);
1074 } else {
1075 const Tensor* tensor_indices;
1076 OP_REQUIRES_OK(ctx, ctx->input("indices", &tensor_indices));
1077 OP_REQUIRES(ctx, TensorShapeUtils::IsVector(tensor_indices->shape()),
1078 errors::InvalidArgument(
1079 "Expected indices to be a vector, but received shape: ",
1080 tensor_indices->shape().DebugString()));
1081 OP_REQUIRES(ctx,
1082 tensor_indices->NumElements() == element_shape.dim_size(0),
1083 errors::InvalidArgument(
1084 "Expected len(indices) == values.shape[0], but saw: ",
1085 tensor_indices->NumElements(), " vs. ",
1086 element_shape.dim_size(0)));
1087 const auto indices_t = tensor_indices->vec<int32>();
1088 num_values = tensor_indices->NumElements();
1089 max_index = (num_values == 0)
1090 ? -1
1091 : *std::max_element(indices_t.data(),
1092 indices_t.data() + num_values);
1093 write_indices.resize(num_values);
1094 // Copy into write_indices.
1095 std::copy(indices_t.data(), indices_t.data() + num_values,
1096 write_indices.begin());
1097 }
1098
1099 bool dynamic_size = tensor_array->HasDynamicSize();
1100
1101 // If dynamic size, we may have to resize the TensorArray to fit.
1102 if (dynamic_size && array_size < max_index + 1) {
1103 array_size = static_cast<int32>(max_index + 1);
1104 }
1105
1106 if (LEGACY_UNPACK) {
1107 OP_REQUIRES(
1108 ctx, element_shape.dim_size(0) == array_size,
1109 errors::InvalidArgument(
1110 "Input value must have first dimension equal to the array size (",
1111 element_shape.dim_size(0), " vs. ", array_size, ")"));
1112 } else {
1113 OP_REQUIRES(
1114 ctx, max_index < array_size,
1115 errors::InvalidArgument("Max scatter index must be < array size (",
1116 max_index, " vs. ", array_size, ")"));
1117 }
1118 element_shape.RemoveDim(0);
1119
1120 auto tensor_value_t = tensor_value->shaped<T, 3>(
1121 {1, num_values, element_shape.num_elements()});
1122
1123 Eigen::DSizes<Eigen::DenseIndex, 3> indices{0, 0, 0};
1124 Eigen::DSizes<Eigen::DenseIndex, 3> sizes{
1125 1, 1, static_cast<Eigen::DenseIndex>(element_shape.num_elements())};
1126
1127 std::vector<PersistentTensor> write_values;
1128 write_values.reserve(num_values);
1129
1130 for (int i = 0; i < num_values; ++i) {
1131 Tensor* tensor_value_i;
1132 PersistentTensor persistent_tensor;
1133 OP_REQUIRES_OK(
1134 ctx, ctx->allocate_persistent(tensor_array->ElemType(), element_shape,
1135 &persistent_tensor, &tensor_value_i));
1136 auto tensor_value_i_t =
1137 tensor_value_i->shaped<T, 3>({1, 1, element_shape.num_elements()});
1138 indices[1] = i;
1139
1140 if (element_shape.num_elements() > 0) {
1141 functor::Split<Device, T, 3>()(ctx->eigen_device<Device>(),
1142 tensor_value_i_t, tensor_value_t,
1143 indices, sizes);
1144 }
1145
1146 write_values.push_back(persistent_tensor);
1147 }
1148
1149 // Record the pack size of the TensorArray.
1150 if (LEGACY_UNPACK) {
1151 OP_REQUIRES_OK(ctx, tensor_array->SetMarkedSize(array_size));
1152 }
1153
1154 Status s = tensor_array->WriteOrAggregateMany<Device, T>(ctx, write_indices,
1155 &write_values);
1156 OP_REQUIRES_OK(ctx, s);
1157 }
1158 };
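// Unpack/Scatter views the input value as a rank-3 tensor of shape
// {1, num_values, element_shape.num_elements()} and uses functor::Split to
// copy one middle slice per destination index. A sketch of the slice bounds
// used for the i-th element:
//
//   // indices == {0, i, 0}
//   // sizes   == {1, 1, element_shape.num_elements()}
//   // i.e. write_values[i] receives tensor_value[0, i, :]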
1159
1160 #define REGISTER_SCATTER_AND_UNPACK(type) \
1161 REGISTER_KERNEL_BUILDER( \
1162 Name("TensorArrayUnpack").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
1163 TensorArrayUnpackOrScatterOp<CPUDevice, type, \
1164 true /* LEGACY_UNPACK */>); \
1165 REGISTER_KERNEL_BUILDER( \
1166 Name("TensorArrayScatter").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
1167 TensorArrayUnpackOrScatterOp<CPUDevice, type, \
1168 false /* LEGACY_UNPACK */>); \
1169 REGISTER_KERNEL_BUILDER( \
1170 Name("TensorArrayScatterV2") \
1171 .Device(DEVICE_CPU) \
1172 .TypeConstraint<type>("T"), \
1173 TensorArrayUnpackOrScatterOp<CPUDevice, type, \
1174 false /* LEGACY_UNPACK */>); \
1175 REGISTER_KERNEL_BUILDER( \
1176 Name("TensorArrayScatterV3") \
1177 .Device(DEVICE_CPU) \
1178 .TypeConstraint<type>("T"), \
1179 TensorArrayUnpackOrScatterOp<CPUDevice, type, \
1180 false /* LEGACY_UNPACK */>);
1181
1182 TF_CALL_ALL_TYPES(REGISTER_SCATTER_AND_UNPACK);
1183 #undef REGISTER_SCATTER_AND_UNPACK
1184
1185 #if GOOGLE_CUDA
1186
1187 #define REGISTER_GPU(type) \
1188 REGISTER_KERNEL_BUILDER( \
1189 Name("TensorArrayUnpack") \
1190 .Device(DEVICE_GPU) \
1191 .TypeConstraint<type>("T") \
1192 .HostMemory("handle"), \
1193 TensorArrayUnpackOrScatterOp<GPUDevice, type, \
1194 true /* LEGACY_UNPACK */>); \
1195 REGISTER_KERNEL_BUILDER( \
1196 Name("TensorArrayScatter") \
1197 .Device(DEVICE_GPU) \
1198 .TypeConstraint<type>("T") \
1199 .HostMemory("indices") \
1200 .HostMemory("handle"), \
1201 TensorArrayUnpackOrScatterOp<GPUDevice, type, \
1202 false /* LEGACY_UNPACK */>); \
1203 REGISTER_KERNEL_BUILDER( \
1204 Name("TensorArrayScatterV2") \
1205 .Device(DEVICE_GPU) \
1206 .TypeConstraint<type>("T") \
1207 .HostMemory("indices") \
1208 .HostMemory("handle"), \
1209 TensorArrayUnpackOrScatterOp<GPUDevice, type, \
1210 false /* LEGACY_UNPACK */>); \
1211 REGISTER_KERNEL_BUILDER( \
1212 Name("TensorArrayScatterV3") \
1213 .Device(DEVICE_GPU) \
1214 .TypeConstraint<type>("T") \
1215 .HostMemory("indices") \
1216 .HostMemory("handle"), \
1217 TensorArrayUnpackOrScatterOp<GPUDevice, type, \
1218 false /* LEGACY_UNPACK */>);
1219
1220 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
1221 TF_CALL_complex64(REGISTER_GPU);
1222 TF_CALL_complex128(REGISTER_GPU);
1223 TF_CALL_int64(REGISTER_GPU);
1224 #undef REGISTER_GPU
1225
1226 #endif // GOOGLE_CUDA
1227
1228 // SPLIT *********************************************************************
1229
1230 template <typename Device, typename T>
1231 class TensorArraySplitOp : public OpKernel {
1232 public:
1233   explicit TensorArraySplitOp(OpKernelConstruction* context)
1234 : OpKernel(context) {}
1235
1236   void Compute(OpKernelContext* ctx) override {
1237 OP_REQUIRES_OK(ctx, SetupFlowControlInputs(ctx, true));
1238
1239 TensorArray* tensor_array = nullptr;
1240 OP_REQUIRES_OK(ctx, GetTensorArray(ctx, &tensor_array));
1241 core::ScopedUnref unref(tensor_array);
1242 const Tensor* tensor_value;
1243 OP_REQUIRES_OK(ctx, ctx->input("value", &tensor_value));
1244 const Tensor* tensor_lengths;
1245 OP_REQUIRES_OK(ctx, ctx->input("lengths", &tensor_lengths));
1246
1247 OP_REQUIRES(ctx, TensorShapeUtils::IsVector(tensor_lengths->shape()),
1248 errors::InvalidArgument(
1249 "Expected lengths to be a vector, received shape: ",
1250 tensor_lengths->shape().DebugString()));
1251 OP_REQUIRES(ctx,
1252 FastBoundsCheck(tensor_lengths->NumElements(),
1253 std::numeric_limits<int32>::max()),
1254 errors::InvalidArgument(
1255 "Expected lengths to have < max int32 entries"));
1256
1257 int32 num_tensors = static_cast<int32>(tensor_lengths->NumElements());
1258 auto tensor_lengths_t = tensor_lengths->vec<int64>();
1259 std::vector<int64> cumulative_lengths;
1260 cumulative_lengths.reserve(num_tensors);
1261 int64 total_length = 0;
1262 for (int i = 0; i < num_tensors; ++i) {
1263 total_length += tensor_lengths_t(i);
1264 cumulative_lengths.push_back(total_length);
1265 }
1266
1267 OP_REQUIRES(
1268 ctx, TensorShapeUtils::IsVectorOrHigher(tensor_value->shape()),
1269 errors::InvalidArgument(
1270 "Expected value to be at least a vector, but received shape: ",
1271 tensor_value->shape().DebugString()));
1272
1273 OP_REQUIRES(
1274 ctx, total_length == tensor_value->shape().dim_size(0),
1275 errors::InvalidArgument("Expected sum of lengths to be equal to "
1276 "values.shape[0], but sum of lengths is ",
1277 total_length, " and value's shape is: ",
1278 tensor_value->shape().DebugString()));
1279 int64 elements_per_row =
1280 (total_length == 0) ? 0 : (tensor_value->NumElements() / total_length);
1281
1282 int32 array_size;
1283 OP_REQUIRES_OK(ctx, tensor_array->Size(&array_size));
1284 bool dynamic_size = tensor_array->HasDynamicSize();
1285
1286 std::vector<TensorShape> element_shapes(num_tensors, tensor_value->shape());
1287 for (int32 i = 0; i < num_tensors; ++i) {
1288 element_shapes[i].set_dim(0, tensor_lengths_t(i));
1289 }
1290
1291 // If dynamic size, we may have to resize the TensorArray to fit.
1292 if (dynamic_size && array_size < num_tensors) {
1293 array_size = num_tensors;
1294 }
1295
1296 OP_REQUIRES(
1297 ctx, array_size == num_tensors,
1298 errors::InvalidArgument(
1299 "TensorArray's size is not equal to the size of lengths (",
1300 array_size, " vs. ", num_tensors, "), and the TensorArray is not ",
1301 "marked as dynamically resizeable"));
1302
1303 OP_REQUIRES(
1304 ctx, tensor_value->dtype() == tensor_array->ElemType(),
1305 errors::InvalidArgument("TensorArray dtype is ",
1306 DataTypeString(tensor_array->ElemType()),
1307 " but Op is trying to write dtype ",
1308 DataTypeString(tensor_value->dtype()), "."));
1309
1310 auto tensor_value_t =
1311 tensor_value->shaped<T, 3>({1, total_length, elements_per_row});
1312
1313 std::vector<PersistentTensor> write_values;
1314 write_values.reserve(array_size);
1315
1316 for (int i = 0; i < array_size; ++i) {
1317 Tensor* tensor_value_i;
1318 PersistentTensor persistent_tensor;
1319
1320 int64 previous_length = (i == 0) ? 0 : cumulative_lengths[i - 1];
1321 Eigen::DSizes<Eigen::DenseIndex, 3> indices{
1322 0, static_cast<Eigen::DenseIndex>(previous_length), 0};
1323 Eigen::DSizes<Eigen::DenseIndex, 3> sizes{
1324 1, static_cast<Eigen::DenseIndex>(tensor_lengths_t(i)),
1325 static_cast<Eigen::DenseIndex>(elements_per_row)};
1326
1327 OP_REQUIRES_OK(ctx, ctx->allocate_persistent(
1328 tensor_array->ElemType(), element_shapes[i],
1329 &persistent_tensor, &tensor_value_i));
1330
1331 if (tensor_lengths_t(i) > 0) {
1332 auto tensor_value_i_t = tensor_value_i->shaped<T, 3>(
1333 {1, tensor_lengths_t(i), elements_per_row});
1334
1335 functor::Split<Device, T, 3>()(ctx->eigen_device<Device>(),
1336 tensor_value_i_t, tensor_value_t,
1337 indices, sizes);
1338 }
1339
1340 write_values.push_back(persistent_tensor);
1341 }
1342
1343 // Record the concat size of the TensorArray.
1344 OP_REQUIRES_OK(ctx, tensor_array->SetMarkedSize(array_size));
1345
1346 std::vector<int32> indices(array_size);
1347 std::iota(indices.begin(), indices.end(), 0);
1348
1349 Status s = tensor_array->WriteOrAggregateMany<Device, T>(ctx, indices,
1350 &write_values);
1351 OP_REQUIRES_OK(ctx, s);
1352 }
1353 };
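// Split is the inverse of Concat: lengths(i) leading rows of `value` go to
// element i, with cumulative_lengths giving each element's starting row.
// A worked example (sizes made up):
//
//   // value shape == [6, 5], lengths == [2, 3, 1]
//   // cumulative_lengths == {2, 5, 6}
//   // element 0 <- rows [0, 2), element 1 <- rows [2, 5), element 2 <- rows [5, 6)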
1354
1355 #define REGISTER_SPLIT(type) \
1356 REGISTER_KERNEL_BUILDER( \
1357 Name("TensorArraySplit").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
1358 TensorArraySplitOp<CPUDevice, type>); \
1359 REGISTER_KERNEL_BUILDER( \
1360 Name("TensorArraySplitV2").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
1361 TensorArraySplitOp<CPUDevice, type>); \
1362 REGISTER_KERNEL_BUILDER( \
1363 Name("TensorArraySplitV3").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
1364 TensorArraySplitOp<CPUDevice, type>);
1365
1366 TF_CALL_ALL_TYPES(REGISTER_SPLIT);
1367 #undef REGISTER_SPLIT
1368
1369 #if GOOGLE_CUDA
1370
1371 #define REGISTER_GPU(type) \
1372 REGISTER_KERNEL_BUILDER(Name("TensorArraySplit") \
1373 .Device(DEVICE_GPU) \
1374 .TypeConstraint<type>("T") \
1375 .HostMemory("lengths") \
1376 .HostMemory("handle"), \
1377 TensorArraySplitOp<GPUDevice, type>); \
1378 REGISTER_KERNEL_BUILDER(Name("TensorArraySplitV2") \
1379 .Device(DEVICE_GPU) \
1380 .TypeConstraint<type>("T") \
1381 .HostMemory("lengths") \
1382 .HostMemory("handle"), \
1383 TensorArraySplitOp<GPUDevice, type>); \
1384 REGISTER_KERNEL_BUILDER(Name("TensorArraySplitV3") \
1385 .Device(DEVICE_GPU) \
1386 .TypeConstraint<type>("T") \
1387 .HostMemory("lengths") \
1388 .HostMemory("handle"), \
1389 TensorArraySplitOp<GPUDevice, type>);
1390
1391 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
1392 TF_CALL_complex64(REGISTER_GPU);
1393 TF_CALL_complex128(REGISTER_GPU);
1394 #undef REGISTER_GPU
1395
1396 #endif // GOOGLE_CUDA
1397
1398 // SIZE ***********************************************************************
1399
1400 // Get the size of the TensorArray
1401 class TensorArraySizeOp : public OpKernel {
1402 public:
1403   explicit TensorArraySizeOp(OpKernelConstruction* context)
1404 : OpKernel(context) {}
1405
1406   void Compute(OpKernelContext* ctx) override {
1407 TensorArray* tensor_array;
1408 OP_REQUIRES_OK(ctx, GetTensorArray(ctx, &tensor_array));
1409 core::ScopedUnref unref(tensor_array);
1410 Tensor* output = nullptr;
1411 OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &output));
1412 OP_REQUIRES_OK(ctx, tensor_array->Size(&(output->scalar<int32>()())));
1413 }
1414 };
1415
1416 REGISTER_KERNEL_BUILDER(Name("TensorArraySize").Device(DEVICE_CPU),
1417 TensorArraySizeOp);
1418 REGISTER_KERNEL_BUILDER(Name("TensorArraySizeV2").Device(DEVICE_CPU),
1419 TensorArraySizeOp);
1420 REGISTER_KERNEL_BUILDER(Name("TensorArraySizeV3").Device(DEVICE_CPU),
1421 TensorArraySizeOp);
1422
1423 REGISTER_KERNEL_BUILDER(Name("TensorArraySize")
1424 .Device(DEVICE_GPU)
1425 .HostMemory("handle")
1426 .HostMemory("size"),
1427 TensorArraySizeOp);
1428 REGISTER_KERNEL_BUILDER(Name("TensorArraySizeV2")
1429 .Device(DEVICE_GPU)
1430 .HostMemory("handle")
1431 .HostMemory("size"),
1432 TensorArraySizeOp);
1433 REGISTER_KERNEL_BUILDER(Name("TensorArraySizeV3")
1434 .Device(DEVICE_GPU)
1435 .HostMemory("handle")
1436 .HostMemory("size"),
1437 TensorArraySizeOp);
1438
1439 // CLOSE *********************************************************************
1441
1442 // Delete the TensorArray from its resource container. This enables
1443 // the user to close and release the resource in the middle of a step/run.
1444 // TODO(ebrevdo): decide whether closing the grad op should happen
1445 // here or on the python side.
1446 class TensorArrayCloseOp : public OpKernel {
1447 public:
1448   explicit TensorArrayCloseOp(OpKernelConstruction* context)
1449 : OpKernel(context) {}
1450
1451   void Compute(OpKernelContext* ctx) override {
1452 TensorArray* tensor_array;
1453 OP_REQUIRES_OK(ctx, GetTensorArray(ctx, &tensor_array));
1454 core::ScopedUnref unref(tensor_array);
1455 // Instead of deleting this TA from the ResourceManager, we just
1456 // clear it away and mark it as closed. The remaining memory
1457     // consumed is for its mutex and handle Tensor.  This will be
1458 // cleared out at the end of the step anyway, so it's fine to keep
1459 // it around until the end of the step. Further calls to the
1460 // TensorArray will fail because TensorArray checks internally to
1461 // see if it is closed or not.
1462 tensor_array->ClearAndMarkClosed();
1463 }
1464 };
1465
1466 REGISTER_KERNEL_BUILDER(Name("TensorArrayClose").Device(DEVICE_CPU),
1467 TensorArrayCloseOp);
1468 REGISTER_KERNEL_BUILDER(Name("TensorArrayCloseV2").Device(DEVICE_CPU),
1469 TensorArrayCloseOp);
1470 REGISTER_KERNEL_BUILDER(Name("TensorArrayCloseV3").Device(DEVICE_CPU),
1471 TensorArrayCloseOp);
1472
1473 REGISTER_KERNEL_BUILDER(
1474 Name("TensorArrayClose").Device(DEVICE_GPU).HostMemory("handle"),
1475 TensorArrayCloseOp);
1476 REGISTER_KERNEL_BUILDER(
1477 Name("TensorArrayCloseV2").Device(DEVICE_GPU).HostMemory("handle"),
1478 TensorArrayCloseOp);
1479 REGISTER_KERNEL_BUILDER(
1480 Name("TensorArrayCloseV3").Device(DEVICE_GPU).HostMemory("handle"),
1481 TensorArrayCloseOp);
1482
1483 } // namespace tensorflow
1484