/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// See docs in ../ops/math_ops.cc.

#define EIGEN_USE_THREADS

#include <numeric>

#include "tensorflow/core/kernels/aggregate_ops.h"
#include "tensorflow/core/kernels/aggregate_ops_cpu.h"

#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/variant.h"
#include "tensorflow/core/framework/variant_encode_decode.h"
#include "tensorflow/core/framework/variant_op_registry.h"
#include "tensorflow/core/lib/gtl/inlined_vector.h"
#include "tensorflow/core/platform/logging.h"

namespace tensorflow {

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
#ifdef TENSORFLOW_USE_SYCL
typedef Eigen::SyclDevice SYCLDevice;
#endif  // TENSORFLOW_USE_SYCL

template <typename Device, typename T>
class AddNOp : public OpKernel {
 public:
  explicit AddNOp(OpKernelConstruction* context) : OpKernel(context) {}

  void Compute(OpKernelContext* ctx) override {
    if (!ctx->ValidateInputsAreSameShape(this)) return;

    const Tensor& input0 = ctx->input(0);
    const int num = ctx->num_inputs();

    if (num == 1) {
      ctx->set_output(0, input0);
      return;
    }

    // Try to forward and accumulate the result in one of the input buffers.
    int reused_input = -1;
    gtl::InlinedVector<int, 8> input_indices(num);
    std::iota(input_indices.begin(), input_indices.end(), 0);
    Tensor* output = nullptr;
    for (int input_idx = 0; input_idx < num; ++input_idx) {
      if (ctx->forward_input_to_output_with_shape(input_idx, 0, input0.shape(),
                                                  &output)) {
        reused_input = input_idx;
        break;
      }
    }
    if (reused_input == -1) {
      OP_REQUIRES_OK(ctx, ctx->allocate_output(0, input0.shape(), &output));
    } else if (reused_input > 0) {
      // Move the forwarded buffer to the front so we don't double count
      // anything if there are more than 8 inputs.
      input_indices[0] = reused_input;
      input_indices[reused_input] = 0;
    }
    auto To = output->flat<T>();

#define I(IDX) ctx->input(input_indices[IDX]).flat<T>()
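    // A worked example of the generic (non-Android) dispatch below, given
    // kWidth == 8: the switch sums the first num % kWidth inputs into To with
    // a single AddKFunctor (K becomes 8 or 9 when the remainder is 0 or 1,
    // since num >= 2 here and there is no one-input functor), and the loop
    // then accumulates the remaining inputs onto To in chunks of eight via
    // Add8pFunctor. E.g. for num == 19: Add3 covers inputs 0..2, then Add8p
    // covers 3..10 and 11..18.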

#if defined(__ANDROID_TYPES_SLIM__)
    // On Android by default, we only support additions of two arguments, so
    // we can reduce the number of template instantiations.
    OP_REQUIRES(ctx, num == 2,
                errors::InvalidArgument("Only additions of two arguments "
                                        "supported. Num inputs: ",
                                        num));
    functor::Add2Functor<Device, T> functor2;
    functor2(ctx->template eigen_device<Device>(), To, I(0), I(1));
#else
    static const int kWidth = 8;
    int r = num % kWidth;

    switch (r) {
      case 2: {
        functor::Add2Functor<Device, T> functor2;
        functor2(ctx->template eigen_device<Device>(), To, I(0), I(1));
        break;
      }
      case 3: {
        functor::Add3Functor<Device, T> functor3;
        functor3(ctx->template eigen_device<Device>(), To, I(0), I(1), I(2));
        break;
      }
      case 4: {
        functor::Add4Functor<Device, T> functor4;
        functor4(ctx->template eigen_device<Device>(), To, I(0), I(1), I(2),
                 I(3));
        break;
      }
      case 5: {
        functor::Add5Functor<Device, T> functor5;
        functor5(ctx->template eigen_device<Device>(), To, I(0), I(1), I(2),
                 I(3), I(4));
        break;
      }
      case 6: {
        functor::Add6Functor<Device, T> functor6;
        functor6(ctx->template eigen_device<Device>(), To, I(0), I(1), I(2),
                 I(3), I(4), I(5));
        break;
      }
      case 7: {
        functor::Add7Functor<Device, T> functor7;
        functor7(ctx->template eigen_device<Device>(), To, I(0), I(1), I(2),
                 I(3), I(4), I(5), I(6));
        break;
      }
      case 0: {
        functor::Add8Functor<Device, T> functor8;
        functor8(ctx->template eigen_device<Device>(), To, I(0), I(1), I(2),
                 I(3), I(4), I(5), I(6), I(7));
        r = 8;
        break;
      }
      case 1: {
        functor::Add9Functor<Device, T> functor9;
        functor9(ctx->template eigen_device<Device>(), To, I(0), I(1), I(2),
                 I(3), I(4), I(5), I(6), I(7), I(8));
        r = 9;
        break;
      }
    }

    for (; r < num; r += kWidth) {
      functor::Add8pFunctor<Device, T> functor8p;
      functor8p(ctx->template eigen_device<Device>(), To, I(r), I(r + 1),
                I(r + 2), I(r + 3), I(r + 4), I(r + 5), I(r + 6), I(r + 7));
    }
#endif  // defined(__ANDROID_TYPES_SLIM__)

#undef I
  }
};

template <typename Device>
class AddNOp<Device, Variant> : public OpKernel {
 public:
  explicit AddNOp(OpKernelConstruction* context) : OpKernel(context) {}

  void Compute(OpKernelContext* ctx) override {
    if (!ctx->ValidateInputsAreSameShape(this)) return;

    const Tensor& input0 = ctx->input(0);
    const int num = ctx->num_inputs();

    if (num == 1) {
      ctx->set_output(0, input0);
      return;
    }

    for (int i = 0; i < num; ++i) {
      // Step 1: ensure unary variants.
      OP_REQUIRES(
          ctx, ctx->input(i).dims() == 0,
          errors::InvalidArgument(
              "AddN of non-scalar Tensor with dtype=DT_VARIANT is not "
              "supported; inputs[",
              i, "] has shape: ", ctx->input(i).shape().DebugString(), "."));
    }

    // Step 2: attempt to add using
    //   BinaryOpVariants(ADD_VARIANT_BINARY_OP, ...)
    // For the output create a default-constructed variant object.
    // TODO(ebrevdo): Perform summation in a tree-structure.
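    // The code below is a sequential left fold over the scalar variants;
    // writing `add` as shorthand for the BinaryOpVariants call (not a real
    // helper here), it computes
    //   *v_out = add(v[num-1], add(v[num-2], ... add(v[1], v[0]) ...))
    // e.g. for four inputs a, b, c, d: add(d, add(c, add(b, a))).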
    Tensor out(cpu_allocator(), DT_VARIANT, TensorShape({}));
    Variant* v_out = &(out.scalar<Variant>()());
    OP_REQUIRES_OK(
        ctx, BinaryOpVariants<Device>(
                 ctx, ADD_VARIANT_BINARY_OP, ctx->input(0).scalar<Variant>()(),
                 ctx->input(1).scalar<Variant>()(), v_out));
    for (int i = 2; i < num; ++i) {
      const Variant tmp = std::move(*v_out);
      const Variant& inp = ctx->input(i).scalar<Variant>()();
      OP_REQUIRES_OK(ctx, BinaryOpVariants<Device>(ctx, ADD_VARIANT_BINARY_OP,
                                                   inp, tmp, v_out));
    }
    ctx->set_output(0, out);
  }
};

#define REGISTER_ADDN(type, dev)                                   \
  REGISTER_KERNEL_BUILDER(                                         \
      Name("AddN").Device(DEVICE_##dev).TypeConstraint<type>("T"), \
      AddNOp<dev##Device, type>)

#define REGISTER_ADDN_CPU(type) REGISTER_ADDN(type, CPU)

TF_CALL_NUMBER_TYPES(REGISTER_ADDN_CPU);
REGISTER_ADDN_CPU(Variant);

#undef REGISTER_ADDN_CPU

#if GOOGLE_CUDA
#define REGISTER_ADDN_GPU(type) REGISTER_ADDN(type, GPU)
TF_CALL_GPU_NUMBER_TYPES(REGISTER_ADDN_GPU);
TF_CALL_int64(REGISTER_ADDN_GPU);
TF_CALL_complex64(REGISTER_ADDN_GPU);
TF_CALL_complex128(REGISTER_ADDN_GPU);
TF_CALL_variant(REGISTER_ADDN_GPU);
#undef REGISTER_ADDN_GPU

// A special GPU kernel for int32.
// TODO(b/25387198): Also enable int32 in device memory. This kernel
// registration requires all int32 inputs and outputs to be in host memory.
REGISTER_KERNEL_BUILDER(Name("AddN")
                            .Device(DEVICE_GPU)
                            .TypeConstraint<int32>("T")
                            .HostMemory("inputs")
                            .HostMemory("sum"),
                        AddNOp<CPUDevice, int32>);

#endif  // GOOGLE_CUDA

#ifdef TENSORFLOW_USE_SYCL
REGISTER_ADDN(float, SYCL);
REGISTER_ADDN(double, SYCL);

// A special SYCL kernel for int32.
// TODO(b/25387198): Also enable int32 in device memory. This kernel
// registration requires all int32 inputs and outputs to be in host memory.
REGISTER_KERNEL_BUILDER(Name("AddN")
                            .Device(DEVICE_SYCL)
                            .TypeConstraint<int32>("T")
                            .HostMemory("inputs")
                            .HostMemory("sum"),
                        AddNOp<CPUDevice, int32>);
#endif  // TENSORFLOW_USE_SYCL

#undef REGISTER_ADDN

}  // namespace tensorflow
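// For reference, a mechanical hand-expansion of the registration macro above
// (shown for float on CPU; DEVICE_##dev pastes to DEVICE_CPU and dev##Device
// to CPUDevice):
//   REGISTER_ADDN(float, CPU)
//     => REGISTER_KERNEL_BUILDER(
//            Name("AddN").Device(DEVICE_CPU).TypeConstraint<float>("T"),
//            AddNOp<CPUDevice, float>)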