/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// See docs in ../ops/math_ops.cc.

#define EIGEN_USE_THREADS

#include <numeric>

#include "tensorflow/core/kernels/aggregate_ops.h"
#include "tensorflow/core/kernels/aggregate_ops_cpu.h"

#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/variant.h"
#include "tensorflow/core/framework/variant_encode_decode.h"
#include "tensorflow/core/framework/variant_op_registry.h"
#include "tensorflow/core/lib/gtl/inlined_vector.h"
#include "tensorflow/core/platform/logging.h"

namespace tensorflow {

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
#ifdef TENSORFLOW_USE_SYCL
typedef Eigen::SyclDevice SYCLDevice;
#endif  // TENSORFLOW_USE_SYCL
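
// AddNOp sums an arbitrary number of same-shaped tensors element-wise.
// Strategy (see Compute below): reuse one of the input buffers as the output
// when the runtime allows it, then accumulate the inputs with the Add{2..9}
// and Add8p functors declared in aggregate_ops.h.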
template <typename Device, typename T>
class AddNOp : public OpKernel {
 public:
  explicit AddNOp(OpKernelConstruction* context) : OpKernel(context) {}

  void Compute(OpKernelContext* ctx) override {
    if (!ctx->ValidateInputsAreSameShape(this)) return;

    const Tensor& input0 = ctx->input(0);
    const int num = ctx->num_inputs();

    if (num == 1) {
      ctx->set_output(0, input0);
      return;
    }

    // Try to forward and accumulate the result in one of the input buffers.
    int reused_input = -1;
    gtl::InlinedVector<int, 8> input_indices(num);
    std::iota(input_indices.begin(), input_indices.end(), 0);
    Tensor* output = nullptr;
    for (int input_idx = 0; input_idx < num; ++input_idx) {
      if (ctx->forward_input_to_output_with_shape(input_idx, 0, input0.shape(),
                                                  &output)) {
        reused_input = input_idx;
        break;
      }
    }
    if (reused_input == -1) {
      OP_REQUIRES_OK(ctx, ctx->allocate_output(0, input0.shape(), &output));
    } else if (reused_input > 0) {
      // Move the forwarded buffer to the front so we don't double count
      // anything if there are more than 8 inputs.
      input_indices[0] = reused_input;
      input_indices[reused_input] = 0;
    }
    auto To = output->flat<T>();
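
// I(IDX) reads input number input_indices[IDX] as a flat Eigen tensor. The
// indirection through input_indices ensures that the buffer forwarded to the
// output (swapped to slot 0 above) is only read by the first, non-accumulating
// functor call, so its values are never double counted.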
#define I(IDX) ctx->input(input_indices[IDX]).flat<T>()

#if defined(__ANDROID_TYPES_SLIM__)
    // On Android by default, we only support additions of two arguments, so
    // we can reduce the number of template instantiations.
    OP_REQUIRES(ctx, num == 2,
                errors::InvalidArgument("Only additions of two arguments "
                                        "supported. Num inputs: ",
                                        num));
    functor::Add2Functor<Device, T> functor2;
    functor2(ctx->template eigen_device<Device>(), To, I(0), I(1));
#else
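    // General case: the switch below consumes the first num % 8 inputs with a
    // single Add{2..9} call (remainders 0 and 1 are widened to 8 and 9 so that
    // every call adds at least two inputs), and the trailing loop folds the
    // remaining inputs into the output eight at a time with Add8pFunctor,
    // which accumulates on top of the partial sum already in `To`.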
    static const int kWidth = 8;
    int r = num % kWidth;

    switch (r) {
      case 2: {
        functor::Add2Functor<Device, T> functor2;
        functor2(ctx->template eigen_device<Device>(), To, I(0), I(1));
        break;
      }
      case 3: {
        functor::Add3Functor<Device, T> functor3;
        functor3(ctx->template eigen_device<Device>(), To, I(0), I(1), I(2));
        break;
      }
      case 4: {
        functor::Add4Functor<Device, T> functor4;
        functor4(ctx->template eigen_device<Device>(), To, I(0), I(1), I(2),
                 I(3));
        break;
      }
      case 5: {
        functor::Add5Functor<Device, T> functor5;
        functor5(ctx->template eigen_device<Device>(), To, I(0), I(1), I(2),
                 I(3), I(4));
        break;
      }
      case 6: {
        functor::Add6Functor<Device, T> functor6;
        functor6(ctx->template eigen_device<Device>(), To, I(0), I(1), I(2),
                 I(3), I(4), I(5));
        break;
      }
      case 7: {
        functor::Add7Functor<Device, T> functor7;
        functor7(ctx->template eigen_device<Device>(), To, I(0), I(1), I(2),
                 I(3), I(4), I(5), I(6));
        break;
      }
      case 0: {
        functor::Add8Functor<Device, T> functor8;
        functor8(ctx->template eigen_device<Device>(), To, I(0), I(1), I(2),
                 I(3), I(4), I(5), I(6), I(7));
        r = 8;
        break;
      }
      case 1: {
        functor::Add9Functor<Device, T> functor9;
        functor9(ctx->template eigen_device<Device>(), To, I(0), I(1), I(2),
                 I(3), I(4), I(5), I(6), I(7), I(8));
        r = 9;
        break;
      }
    }

    for (; r < num; r += kWidth) {
      functor::Add8pFunctor<Device, T> functor8p;
      functor8p(ctx->template eigen_device<Device>(), To, I(r), I(r + 1),
                I(r + 2), I(r + 3), I(r + 4), I(r + 5), I(r + 6), I(r + 7));
    }
#endif  // defined(__ANDROID_TYPES_SLIM__)

#undef I
  }
};
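
// Partial specialization for DT_VARIANT inputs. Variants cannot be summed with
// the element-wise functors above, so this kernel requires scalar (rank-0)
// variant inputs and folds them pairwise, one input at a time, through the
// registered ADD_VARIANT_BINARY_OP handlers.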
template <typename Device>
class AddNOp<Device, Variant> : public OpKernel {
 public:
  explicit AddNOp(OpKernelConstruction* context) : OpKernel(context) {}

  void Compute(OpKernelContext* ctx) override {
    if (!ctx->ValidateInputsAreSameShape(this)) return;

    const Tensor& input0 = ctx->input(0);
    const int num = ctx->num_inputs();

    if (num == 1) {
      ctx->set_output(0, input0);
      return;
    }

    for (int i = 0; i < num; ++i) {
      // Step 1: ensure unary variants.
      OP_REQUIRES(
          ctx, ctx->input(i).dims() == 0,
          errors::InvalidArgument(
              "AddN of non-scalar Tensor with dtype=DT_VARIANT is not "
              "supported; inputs[",
              i, "] has shape: ", ctx->input(i).shape().DebugString(), "."));
    }

    // Step 2: attempt to add using
    //   BinaryOpVariants(ADD_VARIANT_BINARY_OP, ...)
    //   For the output create a default-constructed variant object.
    // TODO(ebrevdo): Perform summation in a tree-structure.
    Tensor out(cpu_allocator(), DT_VARIANT, TensorShape({}));
    Variant* v_out = &(out.scalar<Variant>()());
    OP_REQUIRES_OK(
        ctx, BinaryOpVariants<Device>(
                 ctx, ADD_VARIANT_BINARY_OP, ctx->input(0).scalar<Variant>()(),
                 ctx->input(1).scalar<Variant>()(), v_out));
    for (int i = 2; i < num; ++i) {
      const Variant tmp = std::move(*v_out);
      const Variant& inp = ctx->input(i).scalar<Variant>()();
      OP_REQUIRES_OK(ctx, BinaryOpVariants<Device>(ctx, ADD_VARIANT_BINARY_OP,
                                                   inp, tmp, v_out));
    }
    ctx->set_output(0, out);
  }
};

#define REGISTER_ADDN(type, dev)                                   \
  REGISTER_KERNEL_BUILDER(                                         \
      Name("AddN").Device(DEVICE_##dev).TypeConstraint<type>("T"), \
      AddNOp<dev##Device, type>)
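
// For example, REGISTER_ADDN(float, CPU) expands to
//   REGISTER_KERNEL_BUILDER(
//       Name("AddN").Device(DEVICE_CPU).TypeConstraint<float>("T"),
//       AddNOp<CPUDevice, float>)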

#define REGISTER_ADDN_CPU(type) REGISTER_ADDN(type, CPU)

TF_CALL_NUMBER_TYPES(REGISTER_ADDN_CPU);
REGISTER_ADDN_CPU(Variant);

#undef REGISTER_ADDN_CPU

#if GOOGLE_CUDA
#define REGISTER_ADDN_GPU(type) REGISTER_ADDN(type, GPU)
TF_CALL_GPU_NUMBER_TYPES(REGISTER_ADDN_GPU);
TF_CALL_int64(REGISTER_ADDN_GPU);
TF_CALL_complex64(REGISTER_ADDN_GPU);
TF_CALL_complex128(REGISTER_ADDN_GPU);
TF_CALL_variant(REGISTER_ADDN_GPU);
#undef REGISTER_ADDN_GPU

// A special GPU kernel for int32.
// TODO(b/25387198): Also enable int32 in device memory. This kernel
// registration requires all int32 inputs and outputs to be in host memory.
REGISTER_KERNEL_BUILDER(Name("AddN")
                            .Device(DEVICE_GPU)
                            .TypeConstraint<int32>("T")
                            .HostMemory("inputs")
                            .HostMemory("sum"),
                        AddNOp<CPUDevice, int32>);

#endif  // GOOGLE_CUDA

#ifdef TENSORFLOW_USE_SYCL
REGISTER_ADDN(float, SYCL);
REGISTER_ADDN(double, SYCL);

// A special SYCL kernel for int32.
// TODO(b/25387198): Also enable int32 in device memory. This kernel
// registration requires all int32 inputs and outputs to be in host memory.
REGISTER_KERNEL_BUILDER(Name("AddN")
                            .Device(DEVICE_SYCL)
                            .TypeConstraint<int32>("T")
                            .HostMemory("inputs")
                            .HostMemory("sum"),
                        AddNOp<CPUDevice, int32>);
#endif  // TENSORFLOW_USE_SYCL

#undef REGISTER_ADDN

}  // namespace tensorflow