/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// See docs in ../ops/data_flow_ops.cc.

#define EIGEN_USE_THREADS

#include <limits>
#include <vector>
// TODO(b/31496047): Fix non-standard include order.
#include <numeric>  // clang-format off

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/bounds_check.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/resource_mgr.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/kernels/concat_lib.h"
#include "tensorflow/core/kernels/split_lib.h"
#include "tensorflow/core/kernels/tensor_array.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/refcount.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/dynamic_annotations.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/thread_annotations.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/ptr_util.h"

typedef Eigen::ThreadPoolDevice CPUDevice;
#if GOOGLE_CUDA
typedef Eigen::GpuDevice GPUDevice;
#endif  // GOOGLE_CUDA

// clang-format on

namespace tensorflow {

Status GetHandle(OpKernelContext* ctx, string* container, string* ta_handle) {
  {
    Tensor tensor;
    // Assuming that handle is the input at index 0.
    if (IsRefType(ctx->input_dtype(0))) {
      tensor = ctx->mutable_input(0, false);
    } else {
      tensor = ctx->input(0);
    }
    if (tensor.NumElements() != 2) {
      return errors::InvalidArgument(
          "Tensor array handle must be 2-element vector, but had shape: ",
          tensor.shape().DebugString());
    }
    auto h = tensor.flat<string>();
    *container = h(0);
    *ta_handle = h(1);
  }
  return Status::OK();
}

Status GetTensorArray(OpKernelContext* ctx, TensorArray** tensor_array) {
  string container;
  string ta_handle;
  if (ctx->input_dtype(0) != DT_RESOURCE) {
    TF_RETURN_IF_ERROR(GetHandle(ctx, &container, &ta_handle));
    ResourceMgr* rm = ctx->resource_manager();
    if (rm == nullptr) return errors::Internal("No resource manager.");
    TF_RETURN_IF_ERROR(rm->Lookup(ctx->step_container()->name(),
                                  container + ta_handle, tensor_array));
    return Status::OK();
  } else {
    return LookupResource(ctx, HandleFromInput(ctx, 0), tensor_array);
  }
}
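
// Note on addressing: when input 0 is a DT_RESOURCE handle, LookupResource
// resolves it directly.  Otherwise the input is a 2-element DT_STRING handle
// {container, name}, and the TensorArray is looked up in the per-step
// ResourceMgr container under the key container + name (e.g. "_tensor_arrays"
// followed by "<tensor_array_name>_<counter>", as written by TensorArrayOp
// below).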

Status SetupFlowControlInputs(OpKernelContext* ctx, bool set_output) {
  const Tensor* flow_control;
  TF_RETURN_IF_ERROR(ctx->input("flow_in", &flow_control));
  if (set_output) {
    TF_RETURN_IF_ERROR(ctx->set_output("flow_out", *flow_control));
  }
  return Status::OK();
}
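
// The scalar float "flow" value carries no data of its own; threading it
// through writes and reads creates the data dependencies that order
// TensorArray operations in the dataflow graph.  Ops that mutate the array
// forward flow_in to flow_out (set_output == true); read-only ops just
// consume flow_in.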

// CREATION *******************************************************************

// Virtual class for shared behavior between TensorArrayOp and
// TensorArrayGradOp.
class TensorArrayCreationOp : public OpKernel {
 public:
  explicit TensorArrayCreationOp(OpKernelConstruction* context)
      : OpKernel(context), device_type_(context->device_type()) {}

  void Compute(OpKernelContext* ctx) override {
    Tensor tensor_array_output_handle;

    AllocatorAttributes alloc_attr;
    alloc_attr.set_on_host(true);
    OP_REQUIRES_OK(ctx, ctx->allocate_temp(
                            tensorflow::DT_STRING, tensorflow::TensorShape({2}),
                            &tensor_array_output_handle, alloc_attr));
    // Store the handle in a per-step container of the RM.
    ResourceMgr* rm = ctx->resource_manager();
    OP_REQUIRES(ctx, rm != nullptr, errors::Internal("No resource manager."));

    TensorArray* output_tensor_array;
    OP_REQUIRES_OK(ctx, CreateTensorArray(ctx, rm, &tensor_array_output_handle,
                                          &output_tensor_array));
    if (IsRefType(ctx->expected_output_dtype(0))) {
      ctx->set_output_ref(0, output_tensor_array->mu(),
                          output_tensor_array->handle());
    } else if (ctx->expected_output_dtype(0) == DT_STRING) {
      ctx->set_output(0, *output_tensor_array->handle());
    } else {
      Tensor* handle;
      OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &handle));
      handle->flat<ResourceHandle>()(0) =
          output_tensor_array->resource_handle(ctx);
    }
    if (ctx->num_outputs() == 2) {
      // Create the flow output.
      Tensor* flow;
      OP_REQUIRES_OK(ctx, ctx->allocate_output(1, TensorShape({}), &flow));
      if (device_type_ == DEVICE_CPU) {
        // Value doesn't matter, but this makes msan not complain about
        // copying an uninitialized value. To do this on GPU would require
        // a kernel launch or a host->device memcpy, so we avoid that.
        flow->flat<float>()(0) = 0;
      }
    }
  }

 protected:
  virtual Status CreateTensorArray(OpKernelContext* ctx, ResourceMgr* rm,
                                   Tensor* tensor_array_output_handle,
                                   TensorArray** output_tensor_array) = 0;

 private:
  const DeviceType device_type_;
};
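
// Compute handles the parts common to all creation ops: allocating the
// 2-element string handle, wiring output 0 as a ref, a string handle, or a
// ResourceHandle depending on the op version, and emitting the optional
// scalar flow output.  Subclasses only implement CreateTensorArray, which
// builds (or looks up) the TensorArray and registers it with the per-step
// ResourceMgr.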

// A per-run local tensor array. The tensor array uses a "per-step" resource
// manager which ensures correct garbage collection on error or successful
// completion.
class TensorArrayOp : public TensorArrayCreationOp {
 public:
  explicit TensorArrayOp(OpKernelConstruction* context)
      : TensorArrayCreationOp(context) {
    OP_REQUIRES_OK(context, context->GetAttr("dtype", &dtype_));
    OP_REQUIRES_OK(context, context->GetAttr("element_shape", &element_shape_));
    OP_REQUIRES_OK(context, context->GetAttr("dynamic_size", &dynamic_size_));
    // The HasAttr check is for backwards compatibility with older op
    // versions which do not have this attribute.
    if (context->HasAttr("identical_element_shapes")) {
      OP_REQUIRES_OK(context, context->GetAttr("identical_element_shapes",
                                               &identical_element_shapes_));
    } else {
      identical_element_shapes_ = false;
    }
    OP_REQUIRES_OK(context,
                   context->GetAttr("clear_after_read", &clear_after_read_));
    OP_REQUIRES_OK(context,
                   context->GetAttr("tensor_array_name", &tensor_array_name_));
    if (tensor_array_name_.empty()) tensor_array_name_ = name();
  }

  Status CreateTensorArray(OpKernelContext* ctx, ResourceMgr* rm,
                           Tensor* tensor_array_output_handle,
                           TensorArray** output_tensor_array) override {
    const Tensor* tensor_size;
    TF_RETURN_IF_ERROR(ctx->input("size", &tensor_size));

    if (!TensorShapeUtils::IsScalar(tensor_size->shape())) {
      return errors::InvalidArgument(
          "TensorArray size must be scalar, but had shape: ",
          tensor_size->shape().DebugString());
    }
    const int32 size = tensor_size->scalar<int32>()();
    if (size < 0) {
      return errors::InvalidArgument("Size should be >= 0.");
    }

    auto handle = tensor_array_output_handle->flat<string>();
    string unique_tensor_array_name =
        strings::StrCat(tensor_array_name_, "_",
                        TensorArray::tensor_array_counter.fetch_add(1));
    handle(0) = "_tensor_arrays";
    handle(1) = unique_tensor_array_name;

    auto key = strings::StrCat(handle(0), unique_tensor_array_name);

    TensorArray* tensor_array = new TensorArray(
        key, dtype_, *tensor_array_output_handle, size, element_shape_,
        identical_element_shapes_, dynamic_size_,
        false /* multiple_writes_aggregate */, false /* is_grad */,
        -1 /* marked_size */, clear_after_read_);

    TF_RETURN_IF_ERROR(
        rm->Create(ctx->step_container()->name(), key, tensor_array));

    *output_tensor_array = tensor_array;

    return Status::OK();
  }

 private:
  DataType dtype_;
  PartialTensorShape element_shape_;
  bool identical_element_shapes_;
  bool dynamic_size_;
  bool clear_after_read_;
  string tensor_array_name_;  // The name used to create the TensorArray.

  TF_DISALLOW_COPY_AND_ASSIGN(TensorArrayOp);
};
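
// The forward TensorArray is registered in the per-step resource container
// under the key "_tensor_arrays" + "<tensor_array_name>_<counter>".  The same
// two strings are surfaced through the op's handle output so that
// GetTensorArray and TensorArrayGradOp can locate the object later.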

REGISTER_KERNEL_BUILDER(Name("TensorArray").Device(DEVICE_CPU), TensorArrayOp);
REGISTER_KERNEL_BUILDER(Name("TensorArrayV2").Device(DEVICE_CPU),
                        TensorArrayOp);
REGISTER_KERNEL_BUILDER(Name("TensorArrayV3").Device(DEVICE_CPU),
                        TensorArrayOp);

#if GOOGLE_CUDA

#define REGISTER_GPU(type)                                   \
  REGISTER_KERNEL_BUILDER(Name("TensorArray")                \
                              .Device(DEVICE_GPU)            \
                              .TypeConstraint<type>("dtype") \
                              .HostMemory("size")            \
                              .HostMemory("handle"),         \
                          TensorArrayOp);                    \
  REGISTER_KERNEL_BUILDER(Name("TensorArrayV2")              \
                              .Device(DEVICE_GPU)            \
                              .TypeConstraint<type>("dtype") \
                              .HostMemory("size")            \
                              .HostMemory("handle"),         \
                          TensorArrayOp);                    \
  REGISTER_KERNEL_BUILDER(Name("TensorArrayV3")              \
                              .Device(DEVICE_GPU)            \
                              .TypeConstraint<type>("dtype") \
                              .HostMemory("size")            \
                              .HostMemory("handle"),         \
                          TensorArrayOp);

TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
TF_CALL_complex64(REGISTER_GPU);
TF_CALL_complex128(REGISTER_GPU);
TF_CALL_int64(REGISTER_GPU);
REGISTER_GPU(bfloat16);
#undef REGISTER_GPU

#endif  // GOOGLE_CUDA

// GRADIENT *******************************************************************
// Note that this op may have an optional third input. If present, it
// represents a shape value. It indicates that the element shape of this
// gradient array is that shape value concatenated with the element shape of
// the original tensor array. See TensorArrayGradWithShape.
class TensorArrayGradOp : public TensorArrayCreationOp {
 public:
  explicit TensorArrayGradOp(OpKernelConstruction* context)
      : TensorArrayCreationOp(context) {
    OP_REQUIRES_OK(context, context->GetAttr("source", &source_));
  }

  Status CreateTensorArray(OpKernelContext* ctx, ResourceMgr* rm,
                           Tensor* tensor_array_output_handle,
                           TensorArray** output_tensor_array) override {
    string container;
    string tensor_array_name;
    if (ctx->input_dtype(0) != DT_RESOURCE) {
      TF_RETURN_IF_ERROR(GetHandle(ctx, &container, &tensor_array_name));
      if (container != "_tensor_arrays") {
        return errors::InvalidArgument(
            "Input container should be '_tensor_arrays',  but received '",
            container, "'");
      }
    } else {
      container = "_tensor_arrays";
      const auto& resource = ctx->input(0).flat<ResourceHandle>()(0);
      if (StringPiece(resource.name()).substr(0, container.size()) !=
          container) {
        return errors::InvalidArgument("Wrong input container. ",
                                       resource.name());
      }
      tensor_array_name =
          string(StringPiece(resource.name()).substr(container.size()));
    }

    auto output_handle = tensor_array_output_handle->flat<string>();
    output_handle(0) = "_tensor_array_grads";
    output_handle(1) = strings::StrCat(tensor_array_name, "@", source_);

    TensorArray* tensor_array;
    TF_RETURN_IF_ERROR(rm->Lookup(ctx->step_container()->name(),
                                  strings::StrCat(container, tensor_array_name),
                                  &tensor_array));
    core::ScopedUnref unref(tensor_array);

    // Once gradients are being calculated, the forward TensorArray
    // may no longer be resized by new Writes.
    tensor_array->DisableDynamicSize();

    int32 array_size = 0;
    int32 marked_size = 0;
    TF_RETURN_IF_ERROR(tensor_array->Size(&array_size));
    TF_RETURN_IF_ERROR(tensor_array->MarkedSize(&marked_size));

    if (array_size < 0) {
      return errors::InvalidArgument("ArraySize should be >= 0.");
    }
    if (!tensor_array->GradientsAllowed()) {
      return errors::InvalidArgument(
          "Unable to create a gradients TensorArray for ", tensor_array_name,
          ".  Perhaps you used the multiple_writes_aggregate flag on a "
          "previous write?  Gradient calculation is impossible when multiple "
          "writes are performed to the same index.");
    }
    TensorShape shape_to_prepend;
    auto element_shape = PartialTensorShape();
    if (ctx->num_inputs() > 2) {
      TF_RETURN_IF_ERROR(
          ctx->op_kernel().MakeShape(ctx->input(2), &shape_to_prepend));
      auto ta_element_shape = tensor_array->ElemShape();
      if (!ta_element_shape.unknown_rank()) {
        std::vector<int64> dims;
        for (auto dim : shape_to_prepend) {
          dims.push_back(dim.size);
        }
        for (auto dim : ta_element_shape) {
          dims.push_back(dim.size);
        }
        TF_RETURN_IF_ERROR(TensorShapeUtils::MakeShape(
            gtl::ArraySlice<int64>(dims), &element_shape));
      }
    } else {
      element_shape = tensor_array->ElemShape();
    }

    const auto key = strings::StrCat(output_handle(0), output_handle(1));
    auto creator = [key, tensor_array, array_size, marked_size, element_shape,
                    shape_to_prepend,
                    tensor_array_output_handle](TensorArray** ret) -> Status {
      *ret = new TensorArray(
          key, tensor_array->ElemType(), *tensor_array_output_handle,
          array_size, element_shape, tensor_array->HasIdenticalElementShapes(),
          false /* dynamic_size */, true /* multiple_writes_aggregate */,
          true /* is_grad */, marked_size /* marked_size */,
          true /* clear_after_read */);
      return (*ret)->CopyShapesFrom(tensor_array, &shape_to_prepend);
    };

    Status s = rm->LookupOrCreate<TensorArray>(
        ctx->step_container()->name(), key, output_tensor_array, creator);
    (*output_tensor_array)->Unref();

    return s;
  }

 private:
  // The gradient source for creating the given
  // gradient TensorArray.  This should be unique to each gradients
  // call.  Typical values look like "gradients", "gradients_1", ...
  string source_;

  TF_DISALLOW_COPY_AND_ASSIGN(TensorArrayGradOp);
};
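
// Gradient arrays live in the container "_tensor_array_grads" under the key
// "<tensor_array_name>@<source>".  Because the "source" attr is unique per
// gradients() call, independent gradient computations each get (or re-use,
// via LookupOrCreate) their own gradient TensorArray for the same forward
// array.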

REGISTER_KERNEL_BUILDER(Name("TensorArrayGrad").Device(DEVICE_CPU),
                        TensorArrayGradOp);
REGISTER_KERNEL_BUILDER(Name("TensorArrayGradV2").Device(DEVICE_CPU),
                        TensorArrayGradOp);
REGISTER_KERNEL_BUILDER(Name("TensorArrayGradV3").Device(DEVICE_CPU),
                        TensorArrayGradOp);
REGISTER_KERNEL_BUILDER(Name("TensorArrayGradWithShape").Device(DEVICE_CPU),
                        TensorArrayGradOp);
REGISTER_KERNEL_BUILDER(Name("TensorArrayGrad")
                            .Device(DEVICE_GPU)
                            .HostMemory("handle")
                            .HostMemory("grad_handle"),
                        TensorArrayGradOp);
REGISTER_KERNEL_BUILDER(Name("TensorArrayGradV2")
                            .Device(DEVICE_GPU)
                            .HostMemory("handle")
                            .HostMemory("grad_handle"),
                        TensorArrayGradOp);
REGISTER_KERNEL_BUILDER(Name("TensorArrayGradV3")
                            .Device(DEVICE_GPU)
                            .HostMemory("handle")
                            .HostMemory("grad_handle"),
                        TensorArrayGradOp);
REGISTER_KERNEL_BUILDER(Name("TensorArrayGradWithShape")
                            .Device(DEVICE_GPU)
                            .HostMemory("handle")
                            .HostMemory("shape_to_prepend")
                            .HostMemory("grad_handle"),
                        TensorArrayGradOp);

// WRITE **********************************************************************

template <typename Device, typename T>
class TensorArrayWriteOp : public OpKernel {
 public:
  explicit TensorArrayWriteOp(OpKernelConstruction* context)
      : OpKernel(context) {}

  void Compute(OpKernelContext* ctx) override {
    OP_REQUIRES_OK(ctx, SetupFlowControlInputs(ctx, true));

    const Tensor* tensor_index;
    const Tensor* tensor_value;
    OP_REQUIRES_OK(ctx, ctx->input("index", &tensor_index));
    OP_REQUIRES_OK(ctx, ctx->input("value", &tensor_value));

    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(tensor_index->shape()),
                errors::InvalidArgument(
                    "TensorArray index must be scalar, but had shape: ",
                    tensor_index->shape().DebugString()));

    TensorArray* tensor_array = nullptr;
    OP_REQUIRES_OK(ctx, GetTensorArray(ctx, &tensor_array));
    core::ScopedUnref unref(tensor_array);
    const int32 index = tensor_index->scalar<int32>()();
    OP_REQUIRES(
        ctx, tensor_value->dtype() == tensor_array->ElemType(),
        errors::InvalidArgument("TensorArray dtype is ",
                                DataTypeString(tensor_array->ElemType()),
                                " but Op is trying to write dtype ",
                                DataTypeString(tensor_value->dtype()), "."));
    PersistentTensor persistent_tensor(*tensor_value);
    Status s = tensor_array->WriteOrAggregate<Device, T>(ctx, index,
                                                         &persistent_tensor);
    OP_REQUIRES_OK(ctx, s);
  }
};

#define REGISTER_WRITE(type)                                                   \
  REGISTER_KERNEL_BUILDER(                                                     \
      Name("TensorArrayWrite").Device(DEVICE_CPU).TypeConstraint<type>("T"),   \
      TensorArrayWriteOp<CPUDevice, type>);                                    \
  REGISTER_KERNEL_BUILDER(                                                     \
      Name("TensorArrayWriteV2").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
      TensorArrayWriteOp<CPUDevice, type>);                                    \
  REGISTER_KERNEL_BUILDER(                                                     \
      Name("TensorArrayWriteV3").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
      TensorArrayWriteOp<CPUDevice, type>);

TF_CALL_ALL_TYPES(REGISTER_WRITE);

#undef REGISTER_WRITE

#if GOOGLE_CUDA

#define REGISTER_GPU(type)                                      \
  REGISTER_KERNEL_BUILDER(Name("TensorArrayWrite")              \
                              .Device(DEVICE_GPU)               \
                              .TypeConstraint<type>("T")        \
                              .HostMemory("handle")             \
                              .HostMemory("index"),             \
                          TensorArrayWriteOp<GPUDevice, type>); \
  REGISTER_KERNEL_BUILDER(Name("TensorArrayWriteV2")            \
                              .Device(DEVICE_GPU)               \
                              .TypeConstraint<type>("T")        \
                              .HostMemory("handle")             \
                              .HostMemory("index"),             \
                          TensorArrayWriteOp<GPUDevice, type>); \
  REGISTER_KERNEL_BUILDER(Name("TensorArrayWriteV3")            \
                              .Device(DEVICE_GPU)               \
                              .TypeConstraint<type>("T")        \
                              .HostMemory("handle")             \
                              .HostMemory("index"),             \
                          TensorArrayWriteOp<GPUDevice, type>);

TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
TF_CALL_complex64(REGISTER_GPU);
TF_CALL_complex128(REGISTER_GPU);
REGISTER_GPU(bfloat16);
#undef REGISTER_GPU

#endif  // GOOGLE_CUDA

// READ ***********************************************************************

template <typename Device, typename T>
class TensorArrayReadOp : public OpKernel {
 public:
  explicit TensorArrayReadOp(OpKernelConstruction* context)
      : OpKernel(context) {
    OP_REQUIRES_OK(context, context->GetAttr("dtype", &dtype_));
  }

  void Compute(OpKernelContext* ctx) override {
    OP_REQUIRES_OK(ctx, SetupFlowControlInputs(ctx, false));

    const Tensor* tensor_index;
    OP_REQUIRES_OK(ctx, ctx->input("index", &tensor_index));

    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(tensor_index->shape()),
                errors::InvalidArgument(
                    "TensorArray index must be scalar, but had shape: ",
                    tensor_index->shape().DebugString()));

    TensorArray* tensor_array = nullptr;
    OP_REQUIRES_OK(ctx, GetTensorArray(ctx, &tensor_array));
    core::ScopedUnref unref(tensor_array);

    const int32 index = tensor_index->scalar<int32>()();
    OP_REQUIRES(
        ctx, dtype_ == tensor_array->ElemType(),
        errors::InvalidArgument(
            "TensorArray dtype is ", DataTypeString(tensor_array->ElemType()),
            " but Op requested dtype ", DataTypeString(dtype_), "."));
    PersistentTensor value;
    Status s = tensor_array->Read<Device, T>(ctx, index, &value);
    OP_REQUIRES_OK(ctx, s);
    ctx->set_output(0, *value.AccessTensor(ctx));
  }

 private:
  DataType dtype_;
};

#define REGISTER_READ(type)                                    \
  REGISTER_KERNEL_BUILDER(Name("TensorArrayRead")              \
                              .Device(DEVICE_CPU)              \
                              .TypeConstraint<type>("dtype"),  \
                          TensorArrayReadOp<CPUDevice, type>); \
  REGISTER_KERNEL_BUILDER(Name("TensorArrayReadV2")            \
                              .Device(DEVICE_CPU)              \
                              .TypeConstraint<type>("dtype"),  \
                          TensorArrayReadOp<CPUDevice, type>); \
  REGISTER_KERNEL_BUILDER(Name("TensorArrayReadV3")            \
                              .Device(DEVICE_CPU)              \
                              .TypeConstraint<type>("dtype"),  \
                          TensorArrayReadOp<CPUDevice, type>);

TF_CALL_ALL_TYPES(REGISTER_READ)

#undef REGISTER_READ

#if GOOGLE_CUDA

#define REGISTER_GPU(type)                                     \
  REGISTER_KERNEL_BUILDER(Name("TensorArrayRead")              \
                              .Device(DEVICE_GPU)              \
                              .TypeConstraint<type>("dtype")   \
                              .HostMemory("handle")            \
                              .HostMemory("index"),            \
                          TensorArrayReadOp<GPUDevice, type>); \
  REGISTER_KERNEL_BUILDER(Name("TensorArrayReadV2")            \
                              .Device(DEVICE_GPU)              \
                              .TypeConstraint<type>("dtype")   \
                              .HostMemory("handle")            \
                              .HostMemory("index"),            \
                          TensorArrayReadOp<GPUDevice, type>); \
  REGISTER_KERNEL_BUILDER(Name("TensorArrayReadV3")            \
                              .Device(DEVICE_GPU)              \
                              .TypeConstraint<type>("dtype")   \
                              .HostMemory("handle")            \
                              .HostMemory("index"),            \
                          TensorArrayReadOp<GPUDevice, type>);

TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
TF_CALL_complex64(REGISTER_GPU);
TF_CALL_complex128(REGISTER_GPU);
TF_CALL_int64(REGISTER_GPU);
REGISTER_GPU(bfloat16);
#undef REGISTER_GPU

#endif  // GOOGLE_CUDA

// PACK and GATHER ************************************************************

// Concatenate the elements in a TensorArray.  All elements must be
// defined and have the same shape.
template <typename Device, typename T, bool LEGACY_PACK>
class TensorArrayPackOrGatherOp : public OpKernel {
 public:
  typedef typename TTypes<T, 2>::ConstMatrix ConstMatrix;
  typedef std::vector<std::unique_ptr<ConstMatrix> > ConstMatrixVector;

  explicit TensorArrayPackOrGatherOp(OpKernelConstruction* context)
      : OpKernel(context) {
    OP_REQUIRES_OK(context, context->GetAttr("dtype", &dtype_));
    OP_REQUIRES_OK(context, context->GetAttr("element_shape", &element_shape_));
  }

  void Compute(OpKernelContext* ctx) override {
    OP_REQUIRES_OK(ctx, SetupFlowControlInputs(ctx, false));

    TensorArray* tensor_array = nullptr;
    OP_REQUIRES_OK(ctx, GetTensorArray(ctx, &tensor_array));

    core::ScopedUnref unref(tensor_array);
    OP_REQUIRES(
        ctx, dtype_ == tensor_array->ElemType(),
        errors::InvalidArgument(
            "TensorArray dtype is ", DataTypeString(tensor_array->ElemType()),
            " but Op requested dtype ", DataTypeString(dtype_), "."));

    // Ensure new element shape is compatible with the one stored in the
    // TensorArray.
    OP_REQUIRES_OK(ctx, tensor_array->SetElemShape(element_shape_));

    int32 num_indices;
    std::vector<PersistentTensor> values;
    std::vector<int32> indices;
    if (LEGACY_PACK) {
      OP_REQUIRES_OK(ctx, tensor_array->PackOrConcatSize(&num_indices));
      indices.resize(num_indices);
      std::iota(indices.begin(), indices.end(), 0);
    } else {
      const Tensor* tensor_indices;
      OP_REQUIRES_OK(ctx, ctx->input("indices", &tensor_indices));
      OP_REQUIRES(ctx, TensorShapeUtils::IsVector(tensor_indices->shape()),
                  errors::InvalidArgument(
                      "Expected indices to be a vector, but received shape: ",
                      tensor_indices->shape().DebugString()));
      const auto indices_t = tensor_indices->vec<int32>();
      num_indices = tensor_indices->NumElements();
      indices.resize(num_indices);
      std::copy(indices_t.data(), indices_t.data() + num_indices,
                indices.begin());
    }

    // If there are no elements to return, return a zero-element Tensor with
    // shape [0] + element_shape_
    if (num_indices == 0) {
      OP_REQUIRES(ctx, element_shape_.IsFullyDefined(),
                  errors::Unimplemented(
                      "TensorArray has size zero, but element shape ",
                      element_shape_.DebugString(),
                      " is not fully defined. "
                      "Currently only static shapes are supported when packing "
                      "zero-size TensorArrays."));
      TensorShape empty_shape;
      element_shape_.AsTensorShape(&empty_shape);
      empty_shape.InsertDim(0, 0);
      Tensor* empty_unused;
      OP_REQUIRES_OK(ctx, ctx->allocate_output(0, empty_shape, &empty_unused));
      return;
    }

    // Read all the PersistentTensors into a vector to keep track of
    // their memory.
    Status s = tensor_array->ReadMany<Device, T>(ctx, indices, &values);
    OP_REQUIRES_OK(ctx, s);

    const Tensor* value_0_t = values[0].AccessTensor(ctx);

    OP_REQUIRES(
        ctx, element_shape_.IsCompatibleWith(value_0_t->shape()),
        errors::InvalidArgument("TensorArray was passed element_shape ",
                                element_shape_.DebugString(),
                                " which does not match the Tensor at index 0: ",
                                value_0_t->shape().DebugString()));

    TensorShape output_shape(value_0_t->shape());
    output_shape.InsertDim(0, num_indices);

    Tensor* output_tensor = nullptr;
    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, output_shape, &output_tensor));

    // If output_tensor is empty, there is nothing to concatenate so return it.
    if (output_shape.num_elements() == 0) {
      return;
    }

    ConstMatrixVector input_tensors_flat;
    input_tensors_flat.reserve(num_indices);
    auto output_flat =
        output_tensor->shaped<T, 2>({1, output_shape.num_elements()});

    // Insert the first value
    input_tensors_flat.push_back(MakeUnique<ConstMatrix>(
        value_0_t->shaped<T, 2>({1, value_0_t->NumElements()})));

    for (int i = 1; i < num_indices; ++i) {
      const Tensor* value_t = values[i].AccessTensor(ctx);
      OP_REQUIRES(
          ctx, value_0_t->shape() == value_t->shape(),
          errors::InvalidArgument(
              "TensorArray has inconsistent shapes.  Index 0 has shape: ",
              value_0_t->shape().DebugString(), " but index ", i,
              " has shape: ", value_t->shape().DebugString()));
      input_tensors_flat.push_back(MakeUnique<ConstMatrix>(
          value_t->shaped<T, 2>({1, value_t->NumElements()})));
    }

#if GOOGLE_CUDA
    if (std::is_same<Device, GPUDevice>::value) {
      ConcatGPU<T>(ctx, input_tensors_flat, output_tensor, &output_flat);
      return;
    }
#endif  // GOOGLE_CUDA
    ConcatCPU<T>(ctx->device(), input_tensors_flat, &output_flat);
  }

 private:
  DataType dtype_;
  PartialTensorShape element_shape_;
};
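
// With LEGACY_PACK the op reads every element 0..N-1 of the array; otherwise
// it reads only the positions named by the "indices" input.  Either way the
// elements (which must all share one shape) are flattened to row vectors and
// concatenated into an output with an extra leading dimension of size
// num_indices, using ConcatCPU or ConcatGPU depending on the device.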

#define REGISTER_GATHER_AND_PACK(type)                                      \
  REGISTER_KERNEL_BUILDER(                                                  \
      Name("TensorArrayPack")                                               \
          .Device(DEVICE_CPU)                                               \
          .TypeConstraint<type>("dtype"),                                   \
      TensorArrayPackOrGatherOp<CPUDevice, type, true /* LEGACY_PACK */>);  \
  REGISTER_KERNEL_BUILDER(                                                  \
      Name("TensorArrayGather")                                             \
          .Device(DEVICE_CPU)                                               \
          .TypeConstraint<type>("dtype"),                                   \
      TensorArrayPackOrGatherOp<CPUDevice, type, false /* LEGACY_PACK */>); \
  REGISTER_KERNEL_BUILDER(                                                  \
      Name("TensorArrayGatherV2")                                           \
          .Device(DEVICE_CPU)                                               \
          .TypeConstraint<type>("dtype"),                                   \
      TensorArrayPackOrGatherOp<CPUDevice, type, false /* LEGACY_PACK */>); \
  REGISTER_KERNEL_BUILDER(                                                  \
      Name("TensorArrayGatherV3")                                           \
          .Device(DEVICE_CPU)                                               \
          .TypeConstraint<type>("dtype"),                                   \
      TensorArrayPackOrGatherOp<CPUDevice, type, false /* LEGACY_PACK */>);

TF_CALL_POD_STRING_TYPES(REGISTER_GATHER_AND_PACK);
TF_CALL_variant(REGISTER_GATHER_AND_PACK);
REGISTER_GATHER_AND_PACK(quint8);
REGISTER_GATHER_AND_PACK(qint8);
REGISTER_GATHER_AND_PACK(qint32);

#undef REGISTER_GATHER_AND_PACK

#if GOOGLE_CUDA

#define REGISTER_GPU(type)                                                  \
  REGISTER_KERNEL_BUILDER(                                                  \
      Name("TensorArrayPack")                                               \
          .Device(DEVICE_GPU)                                               \
          .TypeConstraint<type>("dtype")                                    \
          .HostMemory("handle"),                                            \
      TensorArrayPackOrGatherOp<GPUDevice, type, true /* LEGACY_PACK */>);  \
  REGISTER_KERNEL_BUILDER(                                                  \
      Name("TensorArrayGather")                                             \
          .Device(DEVICE_GPU)                                               \
          .TypeConstraint<type>("dtype")                                    \
          .HostMemory("indices")                                            \
          .HostMemory("handle"),                                            \
      TensorArrayPackOrGatherOp<GPUDevice, type, false /* LEGACY_PACK */>); \
  REGISTER_KERNEL_BUILDER(                                                  \
      Name("TensorArrayGatherV2")                                           \
          .Device(DEVICE_GPU)                                               \
          .TypeConstraint<type>("dtype")                                    \
          .HostMemory("indices")                                            \
          .HostMemory("handle"),                                            \
      TensorArrayPackOrGatherOp<GPUDevice, type, false /* LEGACY_PACK */>); \
  REGISTER_KERNEL_BUILDER(                                                  \
      Name("TensorArrayGatherV3")                                           \
          .Device(DEVICE_GPU)                                               \
          .TypeConstraint<type>("dtype")                                    \
          .HostMemory("indices")                                            \
          .HostMemory("handle"),                                            \
      TensorArrayPackOrGatherOp<GPUDevice, type, false /* LEGACY_PACK */>);

TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
TF_CALL_complex64(REGISTER_GPU);
TF_CALL_complex128(REGISTER_GPU);
REGISTER_GPU(bfloat16);
#undef REGISTER_GPU

// A special GPU kernel for int32.
// TODO(b/25387198): Also enable int32 in device memory. This kernel
// registration requires all int32 inputs and outputs to be in host memory.
REGISTER_KERNEL_BUILDER(
    Name("TensorArrayGather")
        .Device(DEVICE_GPU)
        .TypeConstraint<int32>("dtype")
        .HostMemory("indices")
        .HostMemory("handle"),
    TensorArrayPackOrGatherOp<CPUDevice, int32, false /* LEGACY_PACK */>);
REGISTER_KERNEL_BUILDER(
    Name("TensorArrayGatherV2")
        .Device(DEVICE_GPU)
        .TypeConstraint<int32>("dtype")
        .HostMemory("indices")
        .HostMemory("handle"),
    TensorArrayPackOrGatherOp<CPUDevice, int32, false /* LEGACY_PACK */>);
REGISTER_KERNEL_BUILDER(
    Name("TensorArrayGatherV3")
        .Device(DEVICE_GPU)
        .TypeConstraint<int32>("dtype")
        .HostMemory("indices")
        .HostMemory("handle"),
    TensorArrayPackOrGatherOp<CPUDevice, int32, false /* LEGACY_PACK */>);

#endif  // GOOGLE_CUDA

// CONCAT *********************************************************************

// Concatenate the elements in a TensorArray.  All elements must be
// defined and (excepting the first dimension) have the same shape.
template <typename Device, typename T>
class TensorArrayConcatOp : public OpKernel {
 public:
  typedef typename TTypes<T, 2>::ConstMatrix ConstMatrix;
  typedef std::vector<std::unique_ptr<ConstMatrix> > ConstMatrixVector;

  explicit TensorArrayConcatOp(OpKernelConstruction* context)
      : OpKernel(context) {
    OP_REQUIRES_OK(context, context->GetAttr("dtype", &dtype_));
    OP_REQUIRES_OK(context, context->GetAttr("element_shape_except0",
                                             &element_shape_except0_));
  }

  void Compute(OpKernelContext* ctx) override {
    OP_REQUIRES_OK(ctx, SetupFlowControlInputs(ctx, false));

    TensorArray* tensor_array = nullptr;
    OP_REQUIRES_OK(ctx, GetTensorArray(ctx, &tensor_array));
    core::ScopedUnref unref(tensor_array);
    OP_REQUIRES(
        ctx, dtype_ == tensor_array->ElemType(),
        errors::InvalidArgument(
            "TensorArray dtype is ", DataTypeString(tensor_array->ElemType()),
            " but Op requested dtype ", DataTypeString(dtype_), "."));

    int32 array_size;
    OP_REQUIRES_OK(ctx, tensor_array->PackOrConcatSize(&array_size));

    // If there are no elements, return a zero-element Tensor with
    // shape [0] + element_shape_except0_
    if (array_size == 0) {
      OP_REQUIRES(
          ctx, element_shape_except0_.IsFullyDefined(),
          errors::Unimplemented(
              "TensorArray has size zero, but element_shape_except0 ",
              element_shape_except0_.DebugString(),
              " is not fully defined. "
              "Currently only static shapes are supported when concatenating "
              "zero-size TensorArrays."));
      TensorShape empty_shape;
      element_shape_except0_.AsTensorShape(&empty_shape);
      empty_shape.InsertDim(0, 0);
      Tensor* empty_unused;
      OP_REQUIRES_OK(ctx, ctx->allocate_output(0, empty_shape, &empty_unused));
      OP_REQUIRES_OK(ctx, ctx->allocate_output(1, {0}, &empty_unused));
      return;
    }

    // Read all the PersistentTensors into a vector to keep track of
    // their memory.
    std::vector<PersistentTensor> values;
    std::vector<int32> indices(array_size);
    std::iota(indices.begin(), indices.end(), 0);
    Status s = tensor_array->ReadMany<Device, T>(ctx, indices, &values);
    OP_REQUIRES_OK(ctx, s);

    std::vector<const Tensor*> value_tensors;
    value_tensors.resize(values.size());

    Tensor* lengths_tensor = nullptr;
    OP_REQUIRES_OK(ctx, ctx->allocate_output(
                            1, TensorShape({static_cast<int64>(values.size())}),
                            &lengths_tensor));
    auto lengths_tensor_t = lengths_tensor->vec<int64>();

    TensorShape output_shape;
    TensorShape output_shape_except0;
    for (std::size_t i = 0; i < values.size(); ++i) {
      value_tensors[i] = values[i].AccessTensor(ctx);
      TensorShape value_shape_t = value_tensors[i]->shape();

      OP_REQUIRES(
          ctx, TensorShapeUtils::IsVectorOrHigher(value_shape_t),
          errors::InvalidArgument(
              "Concat saw a scalar shape at index ", i,
              " but requires at least vectors.  Did you mean to call pack?"));

      lengths_tensor_t(i) = value_shape_t.dim_size(0);

      TensorShape value_shape_t_except0 = value_shape_t;
      value_shape_t_except0.RemoveDim(0);
      if (i == 0) {
        output_shape = value_shape_t;
        output_shape_except0 = value_shape_t_except0;
        OP_REQUIRES(
            ctx, element_shape_except0_.IsCompatibleWith(output_shape_except0),
            errors::InvalidArgument(
                "TensorArray was passed element_shape_except0 ",
                element_shape_except0_.DebugString(),
                " but index 0 has (excepting dimension 0) shape: ",
                value_shape_t_except0.DebugString(), " which does not match."));
      } else {
        OP_REQUIRES(ctx, output_shape_except0 == value_shape_t_except0,
                    errors::InvalidArgument(
                        "TensorArray has inconsistent shapes.  Index 0 has "
                        "(excepting dimension 0) shape: ",
                        output_shape_except0.DebugString(), " but index ", i,
                        " has (excepting dimension 0) shape: ",
                        value_shape_t_except0.DebugString()));
        // Store the previous maximum length as the offset for this tensor.
        output_shape.set_dim(
            0, output_shape.dim_size(0) + value_shape_t.dim_size(0));
      }
    }

    Tensor* output_tensor = nullptr;
    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, output_shape, &output_tensor));
    ConstMatrixVector input_tensors_flat;
    input_tensors_flat.reserve(values.size());
    for (size_t i = 0; i < values.size(); ++i) {
      const Tensor* value_t = value_tensors[i];
      if (value_t->NumElements() > 0) {
        input_tensors_flat.push_back(MakeUnique<ConstMatrix>(
            value_t->shaped<T, 2>({1, value_t->NumElements()})));
      }
    }

    if (output_shape.num_elements() > 0) {
      auto output_flat =
          output_tensor->shaped<T, 2>({1, output_shape.num_elements()});
#if GOOGLE_CUDA
      if (std::is_same<Device, GPUDevice>::value) {
        ConcatGPU<T>(ctx, input_tensors_flat, output_tensor, &output_flat);
        return;
      }
#endif  // GOOGLE_CUDA
      ConcatCPU<T>(ctx->device(), input_tensors_flat, &output_flat);
    }
  }

 private:
  DataType dtype_;
  PartialTensorShape element_shape_except0_;
};
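
// Unlike pack/gather, concat joins the elements along their existing first
// dimension (which may differ per element) rather than adding a new one, and
// additionally returns a "lengths" vector recording each element's dim-0 size
// so the result can later be split back apart (see TensorArraySplitOp below).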

#define REGISTER_CONCAT(type)                                    \
  REGISTER_KERNEL_BUILDER(Name("TensorArrayConcat")              \
                              .Device(DEVICE_CPU)                \
                              .TypeConstraint<type>("dtype")     \
                              .HostMemory("lengths")             \
                              .HostMemory("handle"),             \
                          TensorArrayConcatOp<CPUDevice, type>); \
  REGISTER_KERNEL_BUILDER(Name("TensorArrayConcatV2")            \
                              .Device(DEVICE_CPU)                \
                              .TypeConstraint<type>("dtype")     \
                              .HostMemory("lengths")             \
                              .HostMemory("handle"),             \
                          TensorArrayConcatOp<CPUDevice, type>)  \
  REGISTER_KERNEL_BUILDER(Name("TensorArrayConcatV3")            \
                              .Device(DEVICE_CPU)                \
                              .TypeConstraint<type>("dtype")     \
                              .HostMemory("lengths")             \
                              .HostMemory("handle"),             \
                          TensorArrayConcatOp<CPUDevice, type>)

TF_CALL_POD_STRING_TYPES(REGISTER_CONCAT);
REGISTER_CONCAT(quint8);
REGISTER_CONCAT(qint8);
REGISTER_CONCAT(qint32);

#undef REGISTER_CONCAT

#if GOOGLE_CUDA

#define REGISTER_GPU(type)                                       \
  REGISTER_KERNEL_BUILDER(Name("TensorArrayConcat")              \
                              .Device(DEVICE_GPU)                \
                              .TypeConstraint<type>("dtype")     \
                              .HostMemory("lengths")             \
                              .HostMemory("handle"),             \
                          TensorArrayConcatOp<GPUDevice, type>); \
  REGISTER_KERNEL_BUILDER(Name("TensorArrayConcatV2")            \
                              .Device(DEVICE_GPU)                \
                              .TypeConstraint<type>("dtype")     \
                              .HostMemory("lengths")             \
                              .HostMemory("handle"),             \
                          TensorArrayConcatOp<GPUDevice, type>)  \
  REGISTER_KERNEL_BUILDER(Name("TensorArrayConcatV3")            \
                              .Device(DEVICE_GPU)                \
                              .TypeConstraint<type>("dtype")     \
                              .HostMemory("lengths")             \
                              .HostMemory("handle"),             \
                          TensorArrayConcatOp<GPUDevice, type>)

TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
TF_CALL_complex64(REGISTER_GPU);
TF_CALL_complex128(REGISTER_GPU);
REGISTER_GPU(bfloat16);
#undef REGISTER_GPU

// A special GPU kernel for int32.
// TODO(b/25387198): Also enable int32 in device memory. This kernel
// registration requires all int32 inputs and outputs to be in host memory.
REGISTER_KERNEL_BUILDER(Name("TensorArrayConcat")
                            .Device(DEVICE_GPU)
                            .TypeConstraint<int32>("dtype")
                            .HostMemory("lengths")
                            .HostMemory("handle"),
                        TensorArrayConcatOp<CPUDevice, int32>);
REGISTER_KERNEL_BUILDER(Name("TensorArrayConcatV2")
                            .Device(DEVICE_GPU)
                            .TypeConstraint<int32>("dtype")
                            .HostMemory("lengths")
                            .HostMemory("handle"),
                        TensorArrayConcatOp<CPUDevice, int32>);
REGISTER_KERNEL_BUILDER(Name("TensorArrayConcatV3")
                            .Device(DEVICE_GPU)
                            .TypeConstraint<int32>("dtype")
                            .HostMemory("lengths")
                            .HostMemory("handle"),
                        TensorArrayConcatOp<CPUDevice, int32>);

#endif  // GOOGLE_CUDA

// UNPACK and SCATTER *********************************************************

template <typename Device, typename T, bool LEGACY_UNPACK>
class TensorArrayUnpackOrScatterOp : public OpKernel {
 public:
  explicit TensorArrayUnpackOrScatterOp(OpKernelConstruction* context)
      : OpKernel(context) {}

  void Compute(OpKernelContext* ctx) override {
    OP_REQUIRES_OK(ctx, SetupFlowControlInputs(ctx, true));

    TensorArray* tensor_array = nullptr;
    OP_REQUIRES_OK(ctx, GetTensorArray(ctx, &tensor_array));
    core::ScopedUnref unref(tensor_array);
    const Tensor* tensor_value;
    OP_REQUIRES_OK(ctx, ctx->input("value", &tensor_value));
    TensorShape element_shape(tensor_value->shape());

    OP_REQUIRES(ctx,
                FastBoundsCheck(element_shape.dim_size(0),
                                std::numeric_limits<int32>::max()),
                errors::InvalidArgument("tensor dim0 too large to unpack"));

    OP_REQUIRES(
        ctx, tensor_value->dtype() == tensor_array->ElemType(),
        errors::InvalidArgument("TensorArray dtype is ",
                                DataTypeString(tensor_array->ElemType()),
                                " but Op is trying to write dtype ",
                                DataTypeString(tensor_value->dtype()), "."));
    OP_REQUIRES(ctx, element_shape.dims() > 0,
                errors::InvalidArgument("Input value for unpack must be at "
                                        "least a vector but received shape: ",
                                        element_shape.DebugString()));
    int32 array_size;
    OP_REQUIRES_OK(ctx, tensor_array->Size(&array_size));

    int32 max_index;
    int32 num_values;
    std::vector<int32> write_indices;
    if (LEGACY_UNPACK) {
      num_values = element_shape.dim_size(0);
      max_index = num_values - 1;
      write_indices.resize(num_values);
      std::iota(write_indices.begin(), write_indices.end(), 0);
    } else {
      const Tensor* tensor_indices;
      OP_REQUIRES_OK(ctx, ctx->input("indices", &tensor_indices));
      OP_REQUIRES(ctx, TensorShapeUtils::IsVector(tensor_indices->shape()),
                  errors::InvalidArgument(
                      "Expected indices to be a vector, but received shape: ",
                      tensor_indices->shape().DebugString()));
      OP_REQUIRES(ctx,
                  tensor_indices->NumElements() == element_shape.dim_size(0),
                  errors::InvalidArgument(
                      "Expected len(indices) == values.shape[0], but saw: ",
                      tensor_indices->NumElements(), " vs. ",
                      element_shape.dim_size(0)));
      const auto indices_t = tensor_indices->vec<int32>();
      num_values = tensor_indices->NumElements();
      max_index = (num_values == 0)
                      ? -1
                      : *std::max_element(indices_t.data(),
                                          indices_t.data() + num_values);
      write_indices.resize(num_values);
      // Copy into write_indices.
      std::copy(indices_t.data(), indices_t.data() + num_values,
                write_indices.begin());
    }

    bool dynamic_size = tensor_array->HasDynamicSize();

    // If dynamic size, we may have to resize the TensorArray to fit.
    if (dynamic_size && array_size < max_index + 1) {
      array_size = static_cast<int32>(max_index + 1);
    }

    if (LEGACY_UNPACK) {
      OP_REQUIRES(
          ctx, element_shape.dim_size(0) == array_size,
          errors::InvalidArgument(
              "Input value must have first dimension equal to the array size (",
              element_shape.dim_size(0), " vs. ", array_size, ")"));
    } else {
      OP_REQUIRES(
          ctx, max_index < array_size,
          errors::InvalidArgument("Max scatter index must be < array size (",
                                  max_index, " vs. ", array_size, ")"));
    }
    element_shape.RemoveDim(0);

    auto tensor_value_t = tensor_value->shaped<T, 3>(
        {1, num_values, element_shape.num_elements()});

    Eigen::DSizes<Eigen::DenseIndex, 3> indices{0, 0, 0};
    Eigen::DSizes<Eigen::DenseIndex, 3> sizes{
        1, 1, static_cast<Eigen::DenseIndex>(element_shape.num_elements())};

    std::vector<PersistentTensor> write_values;
    write_values.reserve(num_values);

    for (int i = 0; i < num_values; ++i) {
      Tensor* tensor_value_i;
      PersistentTensor persistent_tensor;
      OP_REQUIRES_OK(
          ctx, ctx->allocate_persistent(tensor_array->ElemType(), element_shape,
                                        &persistent_tensor, &tensor_value_i));
      auto tensor_value_i_t =
          tensor_value_i->shaped<T, 3>({1, 1, element_shape.num_elements()});
      indices[1] = i;

      if (element_shape.num_elements() > 0) {
        functor::Split<Device, T, 3>()(ctx->eigen_device<Device>(),
                                       tensor_value_i_t, tensor_value_t,
                                       indices, sizes);
      }

      write_values.push_back(persistent_tensor);
    }

    // Record the pack size of the TensorArray.
    if (LEGACY_UNPACK) {
      OP_REQUIRES_OK(ctx, tensor_array->SetMarkedSize(array_size));
    }

    Status s = tensor_array->WriteOrAggregateMany<Device, T>(ctx, write_indices,
                                                             &write_values);
    OP_REQUIRES_OK(ctx, s);
  }
};
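
// With LEGACY_UNPACK the value's first dimension must equal the array size
// and slice i is written to index i; in scatter mode the "indices" input
// chooses the destination of each slice, and a dynamically sized array is
// grown to fit the largest index.  Slices are carved out with functor::Split
// and then written (or aggregated) in one WriteOrAggregateMany call.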
1159 
#define REGISTER_SCATTER_AND_UNPACK(type)                                      \
  REGISTER_KERNEL_BUILDER(                                                     \
      Name("TensorArrayUnpack").Device(DEVICE_CPU).TypeConstraint<type>("T"),  \
      TensorArrayUnpackOrScatterOp<CPUDevice, type,                            \
                                   true /* LEGACY_UNPACK */>);                 \
  REGISTER_KERNEL_BUILDER(                                                     \
      Name("TensorArrayScatter").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
      TensorArrayUnpackOrScatterOp<CPUDevice, type,                            \
                                   false /* LEGACY_UNPACK */>);                \
  REGISTER_KERNEL_BUILDER(                                                     \
      Name("TensorArrayScatterV2")                                             \
          .Device(DEVICE_CPU)                                                  \
          .TypeConstraint<type>("T"),                                          \
      TensorArrayUnpackOrScatterOp<CPUDevice, type,                            \
                                   false /* LEGACY_UNPACK */>);                \
  REGISTER_KERNEL_BUILDER(                                                     \
      Name("TensorArrayScatterV3")                                             \
          .Device(DEVICE_CPU)                                                  \
          .TypeConstraint<type>("T"),                                          \
      TensorArrayUnpackOrScatterOp<CPUDevice, type,                            \
                                   false /* LEGACY_UNPACK */>);

TF_CALL_ALL_TYPES(REGISTER_SCATTER_AND_UNPACK);
#undef REGISTER_SCATTER_AND_UNPACK

#if GOOGLE_CUDA

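// For the GPU kernels, the handle (and, for the scatter variants, the
// indices) are small tensors that are consumed on the host, so they are
// pinned to host memory.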
#define REGISTER_GPU(type)                                      \
  REGISTER_KERNEL_BUILDER(                                      \
      Name("TensorArrayUnpack")                                 \
          .Device(DEVICE_GPU)                                   \
          .TypeConstraint<type>("T")                            \
          .HostMemory("handle"),                                \
      TensorArrayUnpackOrScatterOp<GPUDevice, type,             \
                                   true /* LEGACY_UNPACK */>);  \
  REGISTER_KERNEL_BUILDER(                                      \
      Name("TensorArrayScatter")                                \
          .Device(DEVICE_GPU)                                   \
          .TypeConstraint<type>("T")                            \
          .HostMemory("indices")                                \
          .HostMemory("handle"),                                \
      TensorArrayUnpackOrScatterOp<GPUDevice, type,             \
                                   false /* LEGACY_UNPACK */>); \
  REGISTER_KERNEL_BUILDER(                                      \
      Name("TensorArrayScatterV2")                              \
          .Device(DEVICE_GPU)                                   \
          .TypeConstraint<type>("T")                            \
          .HostMemory("indices")                                \
          .HostMemory("handle"),                                \
      TensorArrayUnpackOrScatterOp<GPUDevice, type,             \
                                   false /* LEGACY_UNPACK */>); \
  REGISTER_KERNEL_BUILDER(                                      \
      Name("TensorArrayScatterV3")                              \
          .Device(DEVICE_GPU)                                   \
          .TypeConstraint<type>("T")                            \
          .HostMemory("indices")                                \
          .HostMemory("handle"),                                \
      TensorArrayUnpackOrScatterOp<GPUDevice, type,             \
                                   false /* LEGACY_UNPACK */>);

TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
TF_CALL_complex64(REGISTER_GPU);
TF_CALL_complex128(REGISTER_GPU);
TF_CALL_int64(REGISTER_GPU);
#undef REGISTER_GPU

#endif  // GOOGLE_CUDA

// SPLIT *********************************************************************

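// Splits `value` along its first dimension into pieces whose sizes are given
// by `lengths`, writing piece i as element i of the TensorArray.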
template <typename Device, typename T>
class TensorArraySplitOp : public OpKernel {
 public:
  explicit TensorArraySplitOp(OpKernelConstruction* context)
      : OpKernel(context) {}

  void Compute(OpKernelContext* ctx) override {
    OP_REQUIRES_OK(ctx, SetupFlowControlInputs(ctx, true));

    TensorArray* tensor_array = nullptr;
    OP_REQUIRES_OK(ctx, GetTensorArray(ctx, &tensor_array));
    core::ScopedUnref unref(tensor_array);
    const Tensor* tensor_value;
    OP_REQUIRES_OK(ctx, ctx->input("value", &tensor_value));
    const Tensor* tensor_lengths;
    OP_REQUIRES_OK(ctx, ctx->input("lengths", &tensor_lengths));

    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(tensor_lengths->shape()),
                errors::InvalidArgument(
                    "Expected lengths to be a vector, received shape: ",
                    tensor_lengths->shape().DebugString()));
    OP_REQUIRES(ctx,
                FastBoundsCheck(tensor_lengths->NumElements(),
                                std::numeric_limits<int32>::max()),
                errors::InvalidArgument(
                    "Expected lengths to have < max int32 entries"));

    int32 num_tensors = static_cast<int32>(tensor_lengths->NumElements());
    auto tensor_lengths_t = tensor_lengths->vec<int64>();
    std::vector<int64> cumulative_lengths;
    cumulative_lengths.reserve(num_tensors);
    int64 total_length = 0;
    for (int i = 0; i < num_tensors; ++i) {
      total_length += tensor_lengths_t(i);
      cumulative_lengths.push_back(total_length);
    }
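    // cumulative_lengths[i] is the exclusive end row of piece i.  Illustrative
    // example: lengths = {2, 0, 3} yields cumulative_lengths = {2, 2, 5} and
    // total_length = 5.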

    OP_REQUIRES(
        ctx, TensorShapeUtils::IsVectorOrHigher(tensor_value->shape()),
        errors::InvalidArgument(
            "Expected value to be at least a vector, but received shape: ",
            tensor_value->shape().DebugString()));

    OP_REQUIRES(
        ctx, total_length == tensor_value->shape().dim_size(0),
        errors::InvalidArgument("Expected sum of lengths to be equal to "
                                "values.shape[0], but sum of lengths is ",
                                total_length, " and value's shape is: ",
                                tensor_value->shape().DebugString()));
    int64 elements_per_row =
        (total_length == 0) ? 0 : (tensor_value->NumElements() / total_length);
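    // elements_per_row is the flattened size of one row of `value`, i.e. the
    // product of every dimension except the first (0 when the input is empty).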

    int32 array_size;
    OP_REQUIRES_OK(ctx, tensor_array->Size(&array_size));
    bool dynamic_size = tensor_array->HasDynamicSize();

    std::vector<TensorShape> element_shapes(num_tensors, tensor_value->shape());
    for (int32 i = 0; i < num_tensors; ++i) {
      element_shapes[i].set_dim(0, tensor_lengths_t(i));
    }

    // If dynamic size, we may have to resize the TensorArray to fit.
    if (dynamic_size && array_size < num_tensors) {
      array_size = num_tensors;
    }

    OP_REQUIRES(
        ctx, array_size == num_tensors,
        errors::InvalidArgument(
            "TensorArray's size is not equal to the size of lengths (",
            array_size, " vs. ", num_tensors, "), and the TensorArray is not ",
            "marked as dynamically resizeable"));

    OP_REQUIRES(
        ctx, tensor_value->dtype() == tensor_array->ElemType(),
        errors::InvalidArgument("TensorArray dtype is ",
                                DataTypeString(tensor_array->ElemType()),
                                " but Op is trying to write dtype ",
                                DataTypeString(tensor_value->dtype()), "."));

    auto tensor_value_t =
        tensor_value->shaped<T, 3>({1, total_length, elements_per_row});

    std::vector<PersistentTensor> write_values;
    write_values.reserve(array_size);

    for (int i = 0; i < array_size; ++i) {
      Tensor* tensor_value_i;
      PersistentTensor persistent_tensor;

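      // Piece i of the flattened value starts at row cumulative_lengths[i - 1]
      // (0 when i == 0) and spans tensor_lengths_t(i) rows.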
      int64 previous_length = (i == 0) ? 0 : cumulative_lengths[i - 1];
      Eigen::DSizes<Eigen::DenseIndex, 3> indices{
          0, static_cast<Eigen::DenseIndex>(previous_length), 0};
      Eigen::DSizes<Eigen::DenseIndex, 3> sizes{
          1, static_cast<Eigen::DenseIndex>(tensor_lengths_t(i)),
          static_cast<Eigen::DenseIndex>(elements_per_row)};

      OP_REQUIRES_OK(ctx, ctx->allocate_persistent(
                              tensor_array->ElemType(), element_shapes[i],
                              &persistent_tensor, &tensor_value_i));

      if (tensor_lengths_t(i) > 0) {
        auto tensor_value_i_t = tensor_value_i->shaped<T, 3>(
            {1, tensor_lengths_t(i), elements_per_row});

        functor::Split<Device, T, 3>()(ctx->eigen_device<Device>(),
                                       tensor_value_i_t, tensor_value_t,
                                       indices, sizes);
      }

      write_values.push_back(persistent_tensor);
    }

    // Record the concat size of the TensorArray.
    OP_REQUIRES_OK(ctx, tensor_array->SetMarkedSize(array_size));

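    // Unlike scatter, split always writes elements 0..array_size - 1 in order.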
    std::vector<int32> indices(array_size);
    std::iota(indices.begin(), indices.end(), 0);

    Status s = tensor_array->WriteOrAggregateMany<Device, T>(ctx, indices,
                                                             &write_values);
    OP_REQUIRES_OK(ctx, s);
  }
};

#define REGISTER_SPLIT(type)                                                   \
  REGISTER_KERNEL_BUILDER(                                                     \
      Name("TensorArraySplit").Device(DEVICE_CPU).TypeConstraint<type>("T"),   \
      TensorArraySplitOp<CPUDevice, type>);                                    \
  REGISTER_KERNEL_BUILDER(                                                     \
      Name("TensorArraySplitV2").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
      TensorArraySplitOp<CPUDevice, type>);                                    \
  REGISTER_KERNEL_BUILDER(                                                     \
      Name("TensorArraySplitV3").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
      TensorArraySplitOp<CPUDevice, type>);

TF_CALL_ALL_TYPES(REGISTER_SPLIT);
#undef REGISTER_SPLIT

#if GOOGLE_CUDA

#define REGISTER_GPU(type)                                      \
  REGISTER_KERNEL_BUILDER(Name("TensorArraySplit")              \
                              .Device(DEVICE_GPU)               \
                              .TypeConstraint<type>("T")        \
                              .HostMemory("lengths")            \
                              .HostMemory("handle"),            \
                          TensorArraySplitOp<GPUDevice, type>); \
  REGISTER_KERNEL_BUILDER(Name("TensorArraySplitV2")            \
                              .Device(DEVICE_GPU)               \
                              .TypeConstraint<type>("T")        \
                              .HostMemory("lengths")            \
                              .HostMemory("handle"),            \
                          TensorArraySplitOp<GPUDevice, type>); \
  REGISTER_KERNEL_BUILDER(Name("TensorArraySplitV3")            \
                              .Device(DEVICE_GPU)               \
                              .TypeConstraint<type>("T")        \
                              .HostMemory("lengths")            \
                              .HostMemory("handle"),            \
                          TensorArraySplitOp<GPUDevice, type>);

TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
TF_CALL_complex64(REGISTER_GPU);
TF_CALL_complex128(REGISTER_GPU);
#undef REGISTER_GPU

#endif  // GOOGLE_CUDA

// SIZE ***********************************************************************

// Get the size of the TensorArray
class TensorArraySizeOp : public OpKernel {
 public:
  explicit TensorArraySizeOp(OpKernelConstruction* context)
      : OpKernel(context) {}

  void Compute(OpKernelContext* ctx) override {
    TensorArray* tensor_array;
    OP_REQUIRES_OK(ctx, GetTensorArray(ctx, &tensor_array));
    core::ScopedUnref unref(tensor_array);
    Tensor* output = nullptr;
    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &output));
    OP_REQUIRES_OK(ctx, tensor_array->Size(&(output->scalar<int32>()())));
  }
};

REGISTER_KERNEL_BUILDER(Name("TensorArraySize").Device(DEVICE_CPU),
                        TensorArraySizeOp);
REGISTER_KERNEL_BUILDER(Name("TensorArraySizeV2").Device(DEVICE_CPU),
                        TensorArraySizeOp);
REGISTER_KERNEL_BUILDER(Name("TensorArraySizeV3").Device(DEVICE_CPU),
                        TensorArraySizeOp);

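// Both the handle and the scalar size output are consumed on the host, so
// they stay in host memory even for the GPU registrations.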
REGISTER_KERNEL_BUILDER(Name("TensorArraySize")
                            .Device(DEVICE_GPU)
                            .HostMemory("handle")
                            .HostMemory("size"),
                        TensorArraySizeOp);
REGISTER_KERNEL_BUILDER(Name("TensorArraySizeV2")
                            .Device(DEVICE_GPU)
                            .HostMemory("handle")
                            .HostMemory("size"),
                        TensorArraySizeOp);
REGISTER_KERNEL_BUILDER(Name("TensorArraySizeV3")
                            .Device(DEVICE_GPU)
                            .HostMemory("handle")
                            .HostMemory("size"),
                        TensorArraySizeOp);

// CLOSE **********************************************************************

// Delete the TensorArray from its resource container.  This enables
// the user to close and release the resource in the middle of a step/run.
// TODO(ebrevdo): decide whether closing the grad op should happen
// here or on the python side.
class TensorArrayCloseOp : public OpKernel {
 public:
  explicit TensorArrayCloseOp(OpKernelConstruction* context)
      : OpKernel(context) {}

  void Compute(OpKernelContext* ctx) override {
    TensorArray* tensor_array;
    OP_REQUIRES_OK(ctx, GetTensorArray(ctx, &tensor_array));
    core::ScopedUnref unref(tensor_array);
    // Instead of deleting this TA from the ResourceManager, we just
    // clear it out and mark it as closed.  The only memory it keeps
    // holding is for its mutex and handle Tensor, which is cleared at
    // the end of the step anyway, so it's fine to keep the TA around
    // until then.  Further calls on the TensorArray will fail because
    // it internally checks whether it has been closed.
    tensor_array->ClearAndMarkClosed();
  }
};

REGISTER_KERNEL_BUILDER(Name("TensorArrayClose").Device(DEVICE_CPU),
                        TensorArrayCloseOp);
REGISTER_KERNEL_BUILDER(Name("TensorArrayCloseV2").Device(DEVICE_CPU),
                        TensorArrayCloseOp);
REGISTER_KERNEL_BUILDER(Name("TensorArrayCloseV3").Device(DEVICE_CPU),
                        TensorArrayCloseOp);

REGISTER_KERNEL_BUILDER(
    Name("TensorArrayClose").Device(DEVICE_GPU).HostMemory("handle"),
    TensorArrayCloseOp);
REGISTER_KERNEL_BUILDER(
    Name("TensorArrayCloseV2").Device(DEVICE_GPU).HostMemory("handle"),
    TensorArrayCloseOp);
REGISTER_KERNEL_BUILDER(
    Name("TensorArrayCloseV3").Device(DEVICE_GPU).HostMemory("handle"),
    TensorArrayCloseOp);

}  // namespace tensorflow