/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include <functional>
#include <unordered_map>
#include <utility>

#include "absl/container/flat_hash_map.h"
#include "tensorflow/core/framework/bounds_check.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/hash/hash.h"
#include "tensorflow/core/platform/bfloat16.h"

namespace tensorflow {
namespace {

typedef Eigen::ThreadPoolDevice CPUDevice;

// `UniqueOpHashMap` defines the map type that is used when elements of type
// `T` are to be uniquified. By default, we use `absl::flat_hash_map<T, TIndex>`
// as the map type. Subsequent specializations are provided for
// performance and/or correctness.
template <typename T, typename TIndex>
struct UniqueOpHashMap {
  using map_type = absl::flat_hash_map<T, TIndex>;
};

// NOTE(mrry): For `tstring` elements, we use an `absl::string_view` key to
// avoid copying the input strings into the map.
template <typename TIndex>
struct UniqueOpHashMap<tstring, TIndex> {
  using map_type = absl::flat_hash_map<absl::string_view, TIndex>;
};

// NOTE(mrry): `absl::flat_hash_map<float, ...>` does not allow `NaN` as a key,
// because `NaN != NaN`, so we fall back to `std::unordered_map<>` for
// floating-point types.
template <typename TIndex>
struct UniqueOpHashMap<float, TIndex> {
  using map_type = std::unordered_map<float, TIndex>;
};
template <typename TIndex>
struct UniqueOpHashMap<double, TIndex> {
  using map_type = std::unordered_map<double, TIndex>;
};
template <typename TIndex>
struct UniqueOpHashMap<Eigen::half, TIndex> {
  using map_type = std::unordered_map<Eigen::half, TIndex>;
};
template <typename TIndex>
struct UniqueOpHashMap<bfloat16, TIndex> {
  using map_type = std::unordered_map<bfloat16, TIndex>;
};

// `UniqueOp` computes the unique elements in the input tensor.
//
// * `T` is the element type.
// * `TIndex` is the type used to represent indices in the output, either
//   `int32` or `int64`.
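//
// Worked example (added for clarity; it mirrors the documented behavior of
// the Unique op): for input x = [1, 1, 2, 4, 4, 4, 7, 8, 8], the outputs are
// y = [1, 2, 4, 7, 8] and idx = [0, 0, 1, 2, 2, 2, 3, 4, 4], so that
// y[idx[i]] == x[i] for every i. UniqueWithCounts additionally returns
// count = [2, 1, 3, 1, 2].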
template <typename T, typename TIndex>
class UniqueOp : public OpKernel {
 public:
  explicit UniqueOp(OpKernelConstruction* context) : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
    const Tensor& input = context->input(0);
    // TODO(dga): Make unique polymorphic for returning int32 and int64
    // vectors to support large tensors.
    OP_REQUIRES(context,
                input.NumElements() <= std::numeric_limits<int32>::max(),
                errors::InvalidArgument(
                    "unique does not support input tensors larger than ",
                    std::numeric_limits<int32>::max(), " elements"));

    int64_t axis = 0;
    std::vector<int64_t> new_sizes{1, input.NumElements(), 1};
    if (context->num_inputs() == 1) {
      OP_REQUIRES(context, TensorShapeUtils::IsVector(input.shape()),
                  errors::InvalidArgument("unique expects a 1D vector."));
    } else {
      // In case of UniqueV2, the axis is a 1D vector. The purpose is
      // to allow specifying either "no axis" or "axis". The `[]` means
      // "no axis", while `[x]` means `axis = x`.
      const Tensor& axis_tensor = context->input(1);
      OP_REQUIRES(context, TensorShapeUtils::IsVector(axis_tensor.shape()),
                  errors::InvalidArgument("axis expects a 1D vector."));
      OP_REQUIRES(
          context, axis_tensor.NumElements() <= 1,
          errors::InvalidArgument(
              "axis does not support input tensors larger than 1 element"));
      if (axis_tensor.NumElements() == 0) {
        OP_REQUIRES(context, TensorShapeUtils::IsVector(input.shape()),
                    errors::InvalidArgument("unique expects a 1D vector."));
      } else {
        OP_REQUIRES(context,
                    (axis_tensor.dtype() == DT_INT32 ||
                     axis_tensor.dtype() == DT_INT64),
                    errors::InvalidArgument(
                        "axis tensor should be int32 or int64, but got ",
                        DataTypeString(axis_tensor.dtype())));
        if (axis_tensor.dtype() == DT_INT32) {
          axis = internal::SubtleMustCopy(axis_tensor.scalar<int32>()());
        } else {
          axis = internal::SubtleMustCopy(axis_tensor.scalar<int64_t>()());
        }
        axis = axis < 0 ? axis + input.dims() : axis;
        OP_REQUIRES(context, 0 <= axis && axis < input.dims(),
                    errors::InvalidArgument("axis has to be between [0, ",
                                            input.dims(), ")"));
        if (axis > 0) {
          for (int64_t i = 0; i < axis; i++) {
            new_sizes[0] *= input.dim_size(i);
          }
        }
        new_sizes[1] = input.dim_size(axis);
        if (axis + 1 < input.dims()) {
          for (int64_t i = axis + 1; i < input.dims(); i++) {
            new_sizes[2] *= input.dim_size(i);
          }
        }
      }
    }

    Tensor* idx = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(
                                1, TensorShape({new_sizes[1]}), &idx));
    auto idx_vec = idx->template vec<TIndex>();

    int64_t uniq_size;
    if (new_sizes[0] == 1 && new_sizes[2] == 1) {
      // Specialized and faster implementation when unique is run over single
      // elements. Here we put T directly into the map rather than ints
      // pointing to them as in the general case.
      auto Tin = input.flat<T>();
      const int64_t N = static_cast<int64_t>(Tin.size());

      typename UniqueOpHashMap<T, TIndex>::map_type uniq;
      uniq.reserve(2 * N);
      for (Eigen::Index i = 0, j = 0; i < N; ++i) {
        auto it = uniq.emplace(Tin(i), j);
        idx_vec(i) = it.first->second;
        if (it.second) {
          ++j;
        }
      }

      uniq_size = static_cast<int64_t>(uniq.size());
      TensorShape output_shape(input.shape());
      output_shape.set_dim(axis, uniq_size);
      Tensor* output = nullptr;
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, output_shape, &output));
      auto Tout = output->flat<T>();

      for (const auto& it : uniq) {
        Tout(it.second) = it.first;
      }
    } else {
      // General implementation when unique is run over multiple elements.
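      //
      // Shape bookkeeping example (added comment, not in the original): for
      // an input of shape [2, 3, 5], `new_sizes` is {1, 2, 15} for axis = 0,
      // {2, 3, 5} for axis = 1, and {6, 5, 1} for axis = 2. Each index along
      // the middle dimension of the reshaped view below selects one slice
      // that is hashed and compared as a single key.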
      auto Tin = input.shaped<T, 3>(new_sizes);

      auto hash_fn = [&Tin](const Eigen::Index& key) {
        size_t h = 0;
        for (Eigen::Index i = 0; i < Tin.dimension(0); i++) {
          for (Eigen::Index j = 0; j < Tin.dimension(2); j++) {
            h = Hash64Combine(h, hash<T>{}(Tin(i, key, j)));
          }
        }
        return h;
      };

      auto equal_to_fn = [&Tin](const Eigen::Index& lhs,
                                const Eigen::Index& rhs) {
        for (Eigen::Index i = 0; i < Tin.dimension(0); i++) {
          for (Eigen::Index j = 0; j < Tin.dimension(2); j++) {
            if (Tin(i, lhs, j) != Tin(i, rhs, j)) {
              return false;
            }
          }
        }
        return true;
      };

      absl::flat_hash_map<int64_t, int64_t, decltype(hash_fn),
                          decltype(equal_to_fn)>
          uniq(0, hash_fn, equal_to_fn);

      uniq.reserve(2 * Tin.dimension(1));

      for (int64_t i = 0, j = 0; i < Tin.dimension(1); ++i) {
        auto it = uniq.emplace(i, j);
        idx_vec(i) = it.first->second;
        if (it.second) {
          ++j;
        }
      }

      uniq_size = static_cast<int64_t>(uniq.size());
      new_sizes[1] = uniq_size;
      TensorShape output_shape(input.shape());
      output_shape.set_dim(axis, uniq_size);
      Tensor* output = nullptr;
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, output_shape, &output));
      auto Tout = output->shaped<T, 3>(new_sizes);

      for (auto it : uniq) {
        Tout.chip(it.second, 1) = Tin.chip(it.first, 1);
      }
    }

    if (num_outputs() > 2) {
      Tensor* output = nullptr;
      OP_REQUIRES_OK(context, context->allocate_output(
                                  2, TensorShape({uniq_size}), &output));
      auto count_output_vec = output->template vec<TIndex>();
      count_output_vec.setZero();
      const int N = idx_vec.size();
      for (int64_t i = 0; i < N; ++i) {
        count_output_vec(idx_vec(i))++;
      }
    }
  }
};

#define REGISTER_UNIQUE(type)                                      \
  REGISTER_KERNEL_BUILDER(Name("Unique")                           \
                              .Device(DEVICE_CPU)                  \
                              .TypeConstraint<type>("T")           \
                              .TypeConstraint<int32>("out_idx"),   \
                          UniqueOp<type, int32>);                  \
  REGISTER_KERNEL_BUILDER(Name("Unique")                           \
                              .Device(DEVICE_CPU)                  \
                              .TypeConstraint<type>("T")           \
                              .TypeConstraint<int64_t>("out_idx"), \
                          UniqueOp<type, int64>);                  \
  REGISTER_KERNEL_BUILDER(Name("UniqueV2")                         \
                              .Device(DEVICE_CPU)                  \
                              .TypeConstraint<type>("T")           \
                              .TypeConstraint<int32>("out_idx"),   \
                          UniqueOp<type, int32>);                  \
  REGISTER_KERNEL_BUILDER(Name("UniqueV2")                         \
                              .Device(DEVICE_CPU)                  \
                              .TypeConstraint<type>("T")           \
                              .TypeConstraint<int64_t>("out_idx"), \
                          UniqueOp<type, int64>);                  \
  REGISTER_KERNEL_BUILDER(Name("UniqueWithCounts")                 \
                              .Device(DEVICE_CPU)                  \
                              .TypeConstraint<type>("T")           \
                              .TypeConstraint<int32>("out_idx"),   \
                          UniqueOp<type, int32>)                   \
  REGISTER_KERNEL_BUILDER(Name("UniqueWithCounts")                 \
                              .Device(DEVICE_CPU)                  \
                              .TypeConstraint<type>("T")           \
                              .TypeConstraint<int64_t>("out_idx"), \
                          UniqueOp<type, int64>);                  \
  REGISTER_KERNEL_BUILDER(Name("UniqueWithCountsV2")               \
                              .Device(DEVICE_CPU)                  \
                              .TypeConstraint<type>("T")           \
                              .TypeConstraint<int32>("out_idx"),   \
                          UniqueOp<type, int32>)                   \
  REGISTER_KERNEL_BUILDER(Name("UniqueWithCountsV2")               \
                              .Device(DEVICE_CPU)                  \
                              .TypeConstraint<type>("T")           \
                              .TypeConstraint<int64_t>("out_idx"), \
                          UniqueOp<type, int64>)
TF_CALL_REAL_NUMBER_TYPES(REGISTER_UNIQUE);
REGISTER_UNIQUE(tstring)
REGISTER_UNIQUE(bool)
#undef REGISTER_UNIQUE
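
// Registration summary (added comment): each `REGISTER_UNIQUE(type)`
// invocation above registers eight CPU kernels for that element type:
// "Unique", "UniqueV2", "UniqueWithCounts", and "UniqueWithCountsV2", each
// with `out_idx` constrained to either int32 or int64.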

// Fake integer GPU kernels so that the use of Unique in optimizers (to
// de-duplicate sparse gradient indices) does not conflict with gradients being
// located on a GPU. These kernels run on the CPU, their inputs and outputs
// residing in host (not GPU) memory.
#define REGISTER_UNIQUE_DEVICE(type)                               \
  REGISTER_KERNEL_BUILDER(Name("Unique")                           \
                              .Device(DEVICE_DEFAULT)              \
                              .TypeConstraint<type>("T")           \
                              .TypeConstraint<int32>("out_idx")    \
                              .HostMemory("x")                     \
                              .HostMemory("y")                     \
                              .HostMemory("idx"),                  \
                          UniqueOp<type, int32>);                  \
  REGISTER_KERNEL_BUILDER(Name("Unique")                           \
                              .Device(DEVICE_DEFAULT)              \
                              .TypeConstraint<type>("T")           \
                              .TypeConstraint<int64_t>("out_idx")  \
                              .HostMemory("x")                     \
                              .HostMemory("y")                     \
                              .HostMemory("idx"),                  \
                          UniqueOp<type, int64>);

TF_CALL_REAL_NUMBER_TYPES(REGISTER_UNIQUE_DEVICE);
REGISTER_UNIQUE_DEVICE(tstring)
REGISTER_UNIQUE_DEVICE(bool)
#undef REGISTER_UNIQUE_DEVICE

}  // namespace
}  // namespace tensorflow