/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_KERNELS_DEBUG_OPS_H_
#define TENSORFLOW_CORE_KERNELS_DEBUG_OPS_H_

#include <numeric>

#include "tensorflow/core/platform/bfloat16.h"

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
#include "tensorflow/core/common_runtime/gpu/gpu_util.h"
#endif

#if GOOGLE_CUDA
#include "tensorflow/core/platform/cuda.h"
#elif TENSORFLOW_USE_ROCM
#include "tensorflow/core/platform/rocm.h"
#endif

#include "tensorflow/core/debug/debug_io_utils.h"
#include "tensorflow/core/framework/device_base.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor_util.h"
#include "tensorflow/core/lib/core/notification.h"
#include "tensorflow/core/lib/strings/stringprintf.h"
#include "tensorflow/core/util/debug_events_writer.h"

namespace tensorflow {

// Copy op for debugging.
// Performs CPU-to-CPU or GPU-to-GPU deep-copying of a tensor, depending on
// the device on which the tensor is allocated.
class CopyOp : public OpKernel {
 public:
  explicit CopyOp(OpKernelConstruction* context) : OpKernel(context) {
    OP_REQUIRES_OK(context, context->GetAttr("tensor_name", &tensor_name_));

    std::vector<string> debug_ops_spec;
    OP_REQUIRES_OK(context,
                   context->GetAttr("debug_ops_spec", &debug_ops_spec));
    for (const string& debug_op_spec : debug_ops_spec) {
      // Assume debug_op_spec has the format
      // <debug_op>;<debug_url>;<gated_grpc>, e.g.,
      // DebugIdentity;grpc://localhost:3333;1
      const std::vector<string> items = str_util::Split(debug_op_spec, ";");
      OP_REQUIRES(
          context, items.size() == 3,
          errors::Internal(
              "Unexpected number of semicolons in debug_ops_spec element: ",
              debug_op_spec));
      debug_op_and_url_specs_.push_back(
          DebugWatchAndURLSpec(strings::StrCat(tensor_name_, ":", items[0]),
                               items[1], items[2] == "1"));
    }
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& src_tensor = context->input(0);

    if (src_tensor.IsInitialized() &&
        DataTypeCanUseMemcpy(src_tensor.dtype()) &&
        DebugIO::IsCopyNodeGateOpen(debug_op_and_url_specs_)) {
      // Source tensor is initialized and is mem-copyable. Make a copy.
      Tensor* copied_tensor;
      OP_REQUIRES_OK(context, context->allocate_output(0, src_tensor.shape(),
                                                       &copied_tensor));

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
      Device* device = static_cast<Device*>(context->device());
      // Determine if the input tensor is not on CPU (e.g., on GPU).
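      // Note: even on a GPU device the input may have been allocated in host
      // memory, which is what input_alloc_attr(0).on_host() reports; only
      // truly device-resident tensors take the GPU-to-GPU copy path below.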
      bool off_host_input = device->device_type() == DEVICE_GPU &&
                            !context->input_alloc_attr(0).on_host();

      if (off_host_input) {
        DeviceContext* device_ctxt = context->op_device_context();
        // Input is not on host: deep-copy it from GPU to the same GPU.
        Notification done_copy;
        GPUUtil::CopyGPUTensorToSameGPU(
            device, device_ctxt, &src_tensor, copied_tensor,
            [&done_copy](const Status& s) { done_copy.Notify(); });
        done_copy.WaitForNotification();
      } else {
        // The input tensor is on the host (CPU): deep-copy from CPU to CPU.
        *copied_tensor = tensor::DeepCopy(src_tensor);
      }
#else
      *copied_tensor = tensor::DeepCopy(src_tensor);
#endif
    } else {
      // Source tensor is NOT initialized and/or is not mem-copyable: Forward
      // the Tensor object.
      context->set_output(0, src_tensor);
    }
  }

  bool IsExpensive() override { return false; }

 private:
  string tensor_name_;
  std::vector<DebugWatchAndURLSpec> debug_op_and_url_specs_;
};

// Base class of all debug ops.
class BaseDebugOp : public OpKernel {
 public:
  explicit BaseDebugOp(const string& debug_op_name,
                       OpKernelConstruction* context)
      : OpKernel(context), debug_op_name_(debug_op_name) {
    OP_REQUIRES_OK(context, context->GetAttr("debug_urls", &debug_urls_));
    OP_REQUIRES_OK(context, context->GetAttr("gated_grpc", &gated_grpc_));

    string device_name;
    string tensor_name;
    OP_REQUIRES_OK(context, context->GetAttr("device_name", &device_name));
    OP_REQUIRES_OK(context, context->GetAttr("tensor_name", &tensor_name));

    std::vector<string> name_items = str_util::Split(tensor_name, ':');
    string node_name;
    int32 output_slot = 0;
    OP_REQUIRES(context, name_items.size() == 1 || name_items.size() == 2,
                errors::InvalidArgument("Failed to parse tensor name: \"",
                                        tensor_name, "\""));
    if (name_items.size() == 2) {
      node_name = name_items[0];
      OP_REQUIRES(
          context, strings::safe_strto32(name_items[1], &output_slot),
          errors::InvalidArgument("Invalid string value for output_slot: \"",
                                  name_items[1], "\""));
    } else if (name_items.size() == 1) {
      node_name = name_items[0];
    }

    debug_watch_key_.reset(
        new DebugNodeKey(device_name, node_name, output_slot, debug_op_name_));
  }

  bool IsExpensive() override { return false; }

 protected:
  // Apply gRPC gating (if the gated_grpc_ attribute is true).
  //
  // Returns false if and only if all grpc:// debug URLs of the debug op are
  // currently disabled (i.e., gated off), in which case the debug op emits an
  // empty (size {0}) tensor of undefined data type.
  bool ApplyGrpcGating(OpKernelContext* context) {
    if (gated_grpc_ && !DebugIO::IsDebugNodeGateOpen(
                           debug_watch_key_->debug_node_name, debug_urls_)) {
      // The entire node is gated off: Output an empty tensor and avoid
      // expensive computation.
      Tensor* output_tensor;
      TensorShape shape({0});
      if (!context->allocate_output(0, shape, &output_tensor).ok()) {
        LOG(ERROR) << "Debug node of watch key "
                   << debug_watch_key_->debug_node_name
                   << " failed to allocate empty tensor under gated-off state.";
      }
      return false;
    } else {
      return true;
    }
  }

  // Publishes a tensor to all debug URLs of the debug op.
  // Logs an error if publishing fails.
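  // Debug URLs typically use the file:// (dump to a local directory) or
  // grpc:// (stream to a debug server) scheme, e.g. "grpc://localhost:3333"
  // as in the debug_op_spec example above.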
  Status PublishTensor(const Tensor& tensor) {
    if (debug_urls_.empty()) {
      return Status::OK();
    } else {
      Status status = DebugIO::PublishDebugTensor(*debug_watch_key_, tensor,
                                                  Env::Default()->NowMicros(),
                                                  debug_urls_, gated_grpc_);
      if (!status.ok()) {
        LOG(ERROR) << "Debug node of watch key "
                   << debug_watch_key_->debug_node_name
                   << " failed to publish debug tensor data to all URLs "
                   << str_util::Join(debug_urls_, ", ")
                   << ", due to: " << status.error_message();
      }
      return status;
    }
  }

 private:
  const string debug_op_name_;
  std::unique_ptr<DebugNodeKey> debug_watch_key_;
  std::vector<string> debug_urls_;
  bool gated_grpc_;
};

// Identity op for debugging.
// Output slot 0 carries the debug signal and is always allocated on the
// host (CPU) as a non-Ref tensor. In the case of DebugIdentityOp,
// the debug signal is equal to the input tensor.
class DebugIdentityOp : public BaseDebugOp {
 public:
  explicit DebugIdentityOp(OpKernelConstruction* context)
      : BaseDebugOp("DebugIdentity", context) {}

  void Compute(OpKernelContext* context) override {
    if (!ApplyGrpcGating(context)) {
      return;
    }

    OP_REQUIRES_OK(context, PublishTensor(context->input(0)));
    context->set_output(0, context->input(0));
  }
};

// NaN-counter op for debugging.
template <typename T>
class DebugNanCountOp : public BaseDebugOp {
 public:
  explicit DebugNanCountOp(OpKernelConstruction* context)
      : BaseDebugOp("DebugNanCount", context) {}

  void Compute(OpKernelContext* context) override {
    if (!ApplyGrpcGating(context)) {
      return;
    }

    Tensor* output_tensor;
    const Tensor& input = context->input(0);

    // Use DT_INT64/int64 to be consistent with TensorShape::num_elements().
    int64 nan_count = 0;

    // If the input is an uninitialized tensor, let nan_count be 0.
    if (input.IsInitialized()) {
      // Count NaNs.
      const TensorShape& input_shape = input.shape();
      const T* input_flat = input.template flat<T>().data();

      for (int64 i = 0; i < input_shape.num_elements(); ++i) {
        if (Eigen::numext::isnan(static_cast<double>(input_flat[i]))) {
          nan_count++;
        }
      }
    }

    TensorShape shape({1});
    OP_REQUIRES_OK(context,
                   context->allocate_output(0, shape, &output_tensor));
    output_tensor->vec<int64>()(0) = nan_count;
    OP_REQUIRES_OK(context, PublishTensor(*output_tensor));
  }
};

// Numeric summary op for debugging.
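// The output is a 1-D double tensor with 14 + ndims elements, laid out as:
//   [0] is_initialized, [1] element_count, [2] nan_count,
//   [3] -inf_count, [4] negative_count, [5] zero_count, [6] positive_count,
//   [7] +inf_count, [8] min, [9] max, [10] mean, [11] variance,
//   [12] dtype (as double), [13] ndims, [14:14+ndims] dimension sizes.
// This mirrors the writes in Compute() below.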
template <typename T>
class DebugNumericSummaryOp : public BaseDebugOp {
 public:
  explicit DebugNumericSummaryOp(OpKernelConstruction* context)
      : BaseDebugOp("DebugNumericSummary", context) {
    OP_REQUIRES_OK(context, context->GetAttr("lower_bound", &lower_bound_));
    OP_REQUIRES_OK(context, context->GetAttr("upper_bound", &upper_bound_));
    OP_REQUIRES_OK(context,
                   context->GetAttr("mute_if_healthy", &mute_if_healthy_));
  }

  void Compute(OpKernelContext* context) override {
    if (!ApplyGrpcGating(context)) {
      return;
    }

    Tensor* output_tensor;
    const Tensor& input = context->input(0);

    int64 is_initialized = 0;
    int64 element_count = 0;
    int64 negative_inf_count = 0;
    int64 negative_count = 0;
    int64 zero_count = 0;
    int64 positive_count = 0;
    int64 positive_inf_count = 0;
    int64 nan_count = 0;
    double min = std::numeric_limits<double>::infinity();
    double max = -std::numeric_limits<double>::infinity();
    double sum = 0.0;
    double mean = std::numeric_limits<double>::quiet_NaN();
    double variance = std::numeric_limits<double>::quiet_NaN();

    // Equal to negative_count + zero_count + positive_count.
    int64 non_inf_nan_count = 0;

    const TensorShape& input_shape = input.shape();
    if (input.IsInitialized()) {
      is_initialized = 1;
      const T* input_flat = input.template flat<T>().data();

      element_count = input_shape.num_elements();
      const bool is_lower_bound_custom = !Eigen::numext::isinf(lower_bound_);
      const bool is_upper_bound_custom = !Eigen::numext::isinf(upper_bound_);

      for (int64 i = 0; i < element_count; ++i) {
        const double x = static_cast<double>(input_flat[i]);
        if (Eigen::numext::isnan(x)) {
          nan_count++;
        } else if (Eigen::numext::isinf(x)) {
          if (x < 0.0) {
            negative_inf_count++;
          } else {
            positive_inf_count++;
          }
        } else {
          if (is_lower_bound_custom && x <= lower_bound_) {
            negative_inf_count++;
          } else if (is_upper_bound_custom && x >= upper_bound_) {
            positive_inf_count++;
          } else if (x < 0.0) {
            negative_count++;
          } else if (x > 0.0) {
            positive_count++;
          } else {
            zero_count++;
          }

          if (x < min) {
            min = x;
          }
          if (x > max) {
            max = x;
          }

          non_inf_nan_count++;
          sum += x;
        }
      }

      if (non_inf_nan_count > 0) {
        mean = sum / non_inf_nan_count;

        // Do a second pass to compute variance.
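        // This is the population variance over the finite (non-inf, non-NaN)
        // elements: variance = sum_i (x_i - mean)^2 / non_inf_nan_count.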
        variance = 0.0;
        for (int64 i = 0; i < element_count; ++i) {
          const double x = static_cast<double>(input_flat[i]);
          if (!Eigen::numext::isnan(x) && !Eigen::numext::isinf(x)) {
            variance += (x - mean) * (x - mean);
          }
        }
        variance /= non_inf_nan_count;
      }
    }

    TensorShape shape({14 + input_shape.dims()});
    OP_REQUIRES_OK(context,
                   context->allocate_output(0, shape, &output_tensor));
    output_tensor->vec<double>()(0) = static_cast<double>(is_initialized);
    output_tensor->vec<double>()(1) = static_cast<double>(element_count);
    output_tensor->vec<double>()(2) = static_cast<double>(nan_count);
    output_tensor->vec<double>()(3) = static_cast<double>(negative_inf_count);
    output_tensor->vec<double>()(4) = static_cast<double>(negative_count);
    output_tensor->vec<double>()(5) = static_cast<double>(zero_count);
    output_tensor->vec<double>()(6) = static_cast<double>(positive_count);
    output_tensor->vec<double>()(7) = static_cast<double>(positive_inf_count);
    output_tensor->vec<double>()(8) = min;
    output_tensor->vec<double>()(9) = max;
    output_tensor->vec<double>()(10) = mean;
    output_tensor->vec<double>()(11) = variance;

    output_tensor->vec<double>()(12) = static_cast<double>(input.dtype());
    output_tensor->vec<double>()(13) = static_cast<double>(input_shape.dims());
    for (size_t d = 0; d < input_shape.dims(); ++d) {
      output_tensor->vec<double>()(14 + d) =
          static_cast<double>(input_shape.dim_sizes()[d]);
    }

    bool mute = mute_if_healthy_ && nan_count == 0 && negative_inf_count == 0 &&
                positive_inf_count == 0;
    if (!mute) {
      OP_REQUIRES_OK(context, PublishTensor(*output_tensor));
    }
  }

 private:
  float lower_bound_;
  float upper_bound_;
  bool mute_if_healthy_;
};

// Identity op for tfdbg v2: Writes debug data using DebugEventsWriter.
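// Each element of the "debug_urls" attribute must use the file:// scheme,
// e.g., "file:///tmp/tfdbg2_logdir" (a hypothetical path); the part after the
// scheme prefix becomes the dump root passed to the DebugEventsWriter.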
class DebugIdentityV2Op : public OpKernel {
 public:
  explicit DebugIdentityV2Op(OpKernelConstruction* context)
      : OpKernel(context),
        device_name_(context->device()->name()),
        output_slot_(-1),
        tensor_debug_mode_(0),
        tfdbg_run_id_() {
    std::vector<string> debug_urls;
    OP_REQUIRES_OK(context, context->GetAttr("debug_urls", &debug_urls));
    for (const string& debug_url : debug_urls) {
      if (absl::StartsWith(debug_url, DebugIO::kFileURLScheme)) {
        dump_roots_.emplace_back(
            debug_url.substr(strlen(DebugIO::kFileURLScheme)));
      } else {
        context->SetStatus(
            errors::Internal("Unsupported debug URL scheme in: ", debug_url));
      }
    }
    OP_REQUIRES_OK(context,
                   context->GetAttr("tfdbg_context_id", &tfdbg_context_id_));
    OP_REQUIRES_OK(context, context->GetAttr("op_name", &op_name_));
    OP_REQUIRES_OK(context, context->GetAttr("output_slot", &output_slot_));
    OP_REQUIRES_OK(context,
                   context->GetAttr("tensor_debug_mode", &tensor_debug_mode_));
    if (context->HasAttr("circular_buffer_size")) {
      OP_REQUIRES_OK(context, context->GetAttr("circular_buffer_size",
                                               &circular_buffer_size_));
    } else {
      circular_buffer_size_ =
          tfdbg::DebugEventsWriter::kDefaultCyclicBufferSize;
    }
    if (context->HasAttr("tfdbg_run_id")) {
      OP_REQUIRES_OK(context, context->GetAttr("tfdbg_run_id", &tfdbg_run_id_));
    }
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor = context->input(0);
    for (const string& dump_root : dump_roots_) {
      tfdbg::DebugEventsWriter* debug_events_writer =
          tfdbg::DebugEventsWriter::GetDebugEventsWriter(
              dump_root, tfdbg_run_id_, circular_buffer_size_);
      OP_REQUIRES_OK(context, debug_events_writer->WriteGraphExecutionTrace(
                                  tfdbg_context_id_, device_name_, op_name_,
                                  output_slot_, tensor_debug_mode_, tensor));
    }
    context->set_output(0, tensor);
  }

 private:
  std::vector<string> dump_roots_;
  string tfdbg_context_id_;
  string device_name_;
  string op_name_;
  int32 output_slot_;
  int32 tensor_debug_mode_;
  int64 circular_buffer_size_;
  string tfdbg_run_id_;
};

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
template <typename Tin, typename Tout>
struct CurtHealthLaunch {
  void Run(const GPUDevice& d, const Tin* data, int size, Tout output[1]);
};

extern template struct CurtHealthLaunch<Eigen::half, float>;
extern template struct CurtHealthLaunch<float, float>;
extern template struct CurtHealthLaunch<double, float>;
extern template struct CurtHealthLaunch<Eigen::half, double>;
extern template struct CurtHealthLaunch<float, double>;
extern template struct CurtHealthLaunch<double, double>;

template <typename Tin, typename Tout>
struct ConciseHealthLaunch {
  void Run(const GPUDevice& d, const Tin* data, int size, Tout output[3]);
};

extern template struct ConciseHealthLaunch<Eigen::half, float>;
extern template struct ConciseHealthLaunch<float, float>;
extern template struct ConciseHealthLaunch<double, float>;
extern template struct ConciseHealthLaunch<Eigen::half, double>;
extern template struct ConciseHealthLaunch<float, double>;
extern template struct ConciseHealthLaunch<double, double>;

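// FullHealthLaunch is expected to fill its six output slots with the
// [-inf, +inf, nan, negative, zero, positive] element counts, mirroring the
// CPU FULL_HEALTH path implemented below.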
template <typename Tin, typename Tout>
struct FullHealthLaunch {
  void Run(const GPUDevice& d, const Tin* data, int size, Tout output[6]);
};

extern template struct FullHealthLaunch<Eigen::half, float>;
extern template struct FullHealthLaunch<float, float>;
extern template struct FullHealthLaunch<double, float>;
extern template struct FullHealthLaunch<Eigen::half, double>;
extern template struct FullHealthLaunch<float, double>;
extern template struct FullHealthLaunch<double, double>;

template <typename Tin, typename Tout>
struct ReduceInfNanThreeSlotsLaunch {
  void Run(const GPUDevice& d, const Tin* data, int size, Tout output[3]);
};

extern template struct ReduceInfNanThreeSlotsLaunch<Eigen::half, float>;
extern template struct ReduceInfNanThreeSlotsLaunch<float, float>;
extern template struct ReduceInfNanThreeSlotsLaunch<double, float>;
extern template struct ReduceInfNanThreeSlotsLaunch<Eigen::half, double>;
extern template struct ReduceInfNanThreeSlotsLaunch<float, double>;
extern template struct ReduceInfNanThreeSlotsLaunch<double, double>;

#endif

template <typename Device, typename Tin, typename Tout>
class DebugNumericSummaryV2Op;

// Numeric summary op for tfdbg v2: CPU Kernel.
template <typename Tin, typename Tout>
class DebugNumericSummaryV2Op<CPUDevice, Tin, Tout> : public OpKernel {
 public:
  explicit DebugNumericSummaryV2Op(OpKernelConstruction* context)
      : OpKernel(context) {
    OP_REQUIRES_OK(context,
                   context->GetAttr("tensor_debug_mode", &tensor_debug_mode_));
    OP_REQUIRES_OK(context, context->GetAttr("tensor_id", &tensor_id_));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor = context->input(0);
    auto in = tensor.flat<Tin>();
    const Tin* data = in.data();
    const int64 size = in.size();
    Tensor* output_tensor;
    Tout tensor_id = static_cast<Tout>(tensor_id_);
    const Tout num_elem = static_cast<Tout>(context->input(0).NumElements());
    // Disregard lossy cast if mode is REDUCE_INF_NAN_THREE_SLOTS because
    // that mode does not make use of tensor_id.
    if (tensor_debug_mode_ != 8) {
      OP_REQUIRES(
          context, tensor_id_ <= kMaxTensorId,
          errors::InvalidArgument("DebugNumericSummaryV2Op requires "
                                  "tensor_id to be less than or equal to "
                                  "(2^",
                                  std::numeric_limits<Tout>::digits,
                                  "). Given tensor_id:", tensor_id_));
    }

    if (tensor_debug_mode_ == 2) {  // CURT_HEALTH
      TensorShape shape({2});
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, shape, &output_tensor));
      output_tensor->flat<Tout>()(0) = tensor_id;  // Slot tensor id
      output_tensor->flat<Tout>()(1) = 0.0;        // Has inf or nan
      int fp_props =
          std::accumulate(data, data + size, 0, [](const int x, const Tin& y) {
            return Eigen::numext::isfinite(y) ? x : 1;
          });
      if (fp_props) {
        output_tensor->flat<Tout>()(1) = 1.0;
      }
    } else if (tensor_debug_mode_ == 3) {  // CONCISE_HEALTH
      TensorShape shape({5});
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, shape, &output_tensor));
      output_tensor->flat<Tout>()(0) = tensor_id;
      output_tensor->flat<Tout>()(1) = num_elem;

      // Accumulator value [neg_inf_count, pos_inf_count, nan_count]
      Tout fp_props[3] = {0.0, 0.0, 0.0};
      std::for_each(data, data + size, [&fp_props](const Tin& y) {
        if (TF_PREDICT_TRUE(Eigen::numext::isfinite(y))) {
          // Do nothing: common case.
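          // CONCISE_HEALTH only tallies -inf, +inf, and nan; finite values
          // are accounted for solely through num_elem in slot 1.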
        } else if (Eigen::numext::isinf(y)) {
          if (y < static_cast<Tin>(0.f)) {
            ++fp_props[0];
          } else {
            ++fp_props[1];
          }
        } else if (Eigen::numext::isnan(y)) {
          ++fp_props[2];
        }
      });
      output_tensor->flat<Tout>()(2) = fp_props[0];  // Slot for -inf count
      output_tensor->flat<Tout>()(3) = fp_props[1];  // Slot for inf count
      output_tensor->flat<Tout>()(4) = fp_props[2];  // Slot for nan count
    } else if (tensor_debug_mode_ == 4) {  // FULL_HEALTH
      TensorShape shape({11});
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, shape, &output_tensor));
      int num_dims = tensor.dims();
      output_tensor->flat<Tout>()(0) = tensor_id;
      output_tensor->flat<Tout>()(1) = -1.0;  // TODO(144919262): Device ID
      output_tensor->flat<Tout>()(2) = static_cast<Tout>(tensor.dtype());
      output_tensor->flat<Tout>()(3) = static_cast<Tout>(num_dims);
      output_tensor->flat<Tout>()(4) = num_elem;

      // Accumulator value [neg_inf_count, pos_inf_count, nan_count, neg_count,
      // zero_count, pos_count]
      Tout fp_props[6] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
      std::for_each(data, data + size, [&fp_props](const Tin& y) {
        if (TF_PREDICT_TRUE(Eigen::numext::isfinite(y))) {
          if (y < static_cast<Tin>(0.f)) {
            ++fp_props[3];
          } else if (y == static_cast<Tin>(0.f)) {
            ++fp_props[4];
          } else {
            ++fp_props[5];
          }
        } else if (Eigen::numext::isinf(y)) {
          if (y < static_cast<Tin>(0.f)) {
            ++fp_props[0];
          } else {
            ++fp_props[1];
          }
        } else if (Eigen::numext::isnan(y)) {
          ++fp_props[2];
        }
      });
      output_tensor->flat<Tout>()(5) = fp_props[0];   // Slot for -inf count
      output_tensor->flat<Tout>()(6) = fp_props[1];   // Slot for inf count
      output_tensor->flat<Tout>()(7) = fp_props[2];   // Slot for nan count.
      output_tensor->flat<Tout>()(8) = fp_props[3];   // Slot for neg count.
      output_tensor->flat<Tout>()(9) = fp_props[4];   // Slot for zero count.
      output_tensor->flat<Tout>()(10) = fp_props[5];  // Slot for pos count.
    } else if (tensor_debug_mode_ == 5) {  // SHAPE
      TensorShape shape({10});
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, shape, &output_tensor));

      int num_dims = tensor.dims();
      output_tensor->flat<Tout>()(0) = tensor_id;
      output_tensor->flat<Tout>()(1) = static_cast<Tout>(tensor.dtype());
      output_tensor->flat<Tout>()(2) = static_cast<Tout>(num_dims);
      output_tensor->flat<Tout>()(3) = num_elem;

      // The tensor shape is stored in the remaining 6 slots: if num_dims is
      // less than 6, the shape is right-padded with zeros; if num_dims is
      // greater than 6, the leading (left-most) dimensions are truncated, as
      // they tend to be more predictable than the trailing ones (e.g., batch
      // size as the first dimension).
      int dim_idx = 4;
      for (int i = std::max(0, num_dims - kShapeDims);
           i < std::max(6, num_dims); ++i) {
        if (i < num_dims) {
          output_tensor->flat<Tout>()(dim_idx++) =
              static_cast<Tout>(tensor.dim_size(i));
        } else {
          output_tensor->flat<Tout>()(dim_idx++) = 0.0;
        }
      }
    } else if (tensor_debug_mode_ == 8) {  // REDUCE_INF_NAN_THREE_SLOTS.
      TensorShape shape({3});
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, shape, &output_tensor));
      output_tensor->flat<Tout>()(0) = 0.0;  // Slot for -inf.
      output_tensor->flat<Tout>()(1) = 0.0;  // Slot for inf.
      output_tensor->flat<Tout>()(2) = 0.0;  // Slot for nan.
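
      // Fold the whole tensor into a small bitmask: kNegInfBit (0x01),
      // kPosInfBit (0x02), and kNaNBit (0x04) record which abnormal value
      // kinds were seen; the bits are then expanded into the three slots.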
      int fp_props =
          std::accumulate(data, data + size, 0, [](const int x, const Tin& y) {
            int result = x;
            if (TF_PREDICT_TRUE(Eigen::numext::isfinite(y))) {
              // Do nothing: common case.
            } else if (Eigen::numext::isinf(y)) {
              result |= y < static_cast<Tin>(0.f) ? kNegInfBit : kPosInfBit;
            } else if (Eigen::numext::isnan(y)) {
              result |= kNaNBit;
            }
            return result;
          });

      if (fp_props & kNegInfBit) {
        output_tensor->flat<Tout>()(0) = -std::numeric_limits<Tout>::infinity();
      }
      if (fp_props & kPosInfBit) {
        output_tensor->flat<Tout>()(1) = std::numeric_limits<Tout>::infinity();
      }
      if (fp_props & kNaNBit) {
        output_tensor->flat<Tout>()(2) = std::numeric_limits<Tout>::quiet_NaN();
      }
    } else {
      // TODO(cais): Implement other tensor debug modes in debug_event.proto.
      context->SetStatus(errors::Unimplemented(
          "Unimplemented tensor debug mode: ", tensor_debug_mode_));
    }
  }

 private:
  int tensor_debug_mode_;
  int64 tensor_id_;
  static constexpr int kShapeDims = 6;
  static constexpr int kNegInfBit = 0x01;
  static constexpr int kPosInfBit = 0x02;
  static constexpr int kNaNBit = 0x04;
  static constexpr int64 kMaxTensorId = 1LL
                                        << std::numeric_limits<Tout>::digits;
};

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

template <typename Tin, typename Tout>
class DebugNumericSummaryV2Op<GPUDevice, Tin, Tout> : public AsyncOpKernel {
 public:
  typedef GPUDevice Device;

  explicit DebugNumericSummaryV2Op(OpKernelConstruction* context)
      : AsyncOpKernel(context) {
    OP_REQUIRES_OK(context,
                   context->GetAttr("tensor_debug_mode", &tensor_debug_mode_));
    OP_REQUIRES_OK(context, context->GetAttr("tensor_id", &tensor_id_));
  }

  void ComputeAsync(OpKernelContext* context, DoneCallback done) override {
    Tensor* output_tensor;
    Tout tensor_id = static_cast<Tout>(tensor_id_);
    const Tensor& tensor = context->input(0);
    const Tout num_elem = static_cast<Tout>(tensor.NumElements());
    const Device& d = context->eigen_device<Device>();
    auto input = tensor.flat<Tin>();
    auto check_cb = [this, done]() { done(); };
    // Disregard lossy cast if mode is REDUCE_INF_NAN_THREE_SLOTS because
    // that mode does not make use of tensor_id.
    if (tensor_debug_mode_ != 8) {
      OP_REQUIRES_ASYNC(
          context, tensor_id_ <= kMaxTensorId,
          errors::InvalidArgument("DebugNumericSummaryV2Op requires "
                                  "tensor_id to be less than or equal to "
                                  "(2^",
                                  std::numeric_limits<Tout>::digits,
                                  "). Given tensor_id:", tensor_id_),
          done);
    }

    if (tensor_debug_mode_ == 2) {  // CURT_HEALTH.
      TensorShape shape({2});
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, shape, &output_tensor));

      auto* stream = context->op_device_context()->stream();
      OP_REQUIRES_ASYNC(context, stream != nullptr,
                        errors::Internal("No GPU stream available."), done);

      se::DeviceMemoryBase output_tensor_ptr(
          output_tensor->flat<Tout>().data(),
          output_tensor->flat<Tout>().size());
      stream->ThenMemZero(&output_tensor_ptr, 2 * sizeof(Tout));
      // Copy tensor_id to slot zero.
      stream->ThenMemcpy(&output_tensor_ptr, &tensor_id, sizeof(Tout));
      if (num_elem == 0) {
        done();
        return;
      }

      // Call the GPU kernels for the numerical (inf/nan) checks.
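      // The kernel writes the single health bit at offset 1, leaving slot 0
      // (tensor_id) untouched; check_cb is run by the EventMgr once the
      // enqueued stream work completes.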
      auto input = context->input(0).flat<Tin>();
      CurtHealthLaunch<Tin, Tout>().Run(d, input.data(), input.size(),
                                        output_tensor->flat<Tout>().data() + 1);

      context->device()->tensorflow_gpu_device_info()->event_mgr->ThenExecute(
          stream, std::move(check_cb));
    } else if (tensor_debug_mode_ == 3) {  // CONCISE_HEALTH.
      TensorShape shape({5});
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, shape, &output_tensor));

      auto* stream = context->op_device_context()->stream();
      OP_REQUIRES_ASYNC(context, stream != nullptr,
                        errors::Internal("No GPU stream available."), done);

      se::DeviceMemoryBase output_tensor_ptr(
          output_tensor->flat<Tout>().data(),
          output_tensor->flat<Tout>().size());
      stream->ThenMemset32(&output_tensor_ptr, 0, 5 * sizeof(Tout));
      const Tout static_output[] = {tensor_id, num_elem};
      stream->ThenMemcpy(&output_tensor_ptr, &static_output, 2 * sizeof(Tout));
      if (num_elem == 0) {
        done();
        return;
      }

      // Call the GPU kernels for the numerical (inf/nan) checks.
      ConciseHealthLaunch<Tin, Tout>().Run(
          d, input.data(), input.size(),
          output_tensor->flat<Tout>().data() + 2);

      context->device()->tensorflow_gpu_device_info()->event_mgr->ThenExecute(
          stream, std::move(check_cb));
    } else if (tensor_debug_mode_ == 4) {  // FULL_HEALTH
      TensorShape shape({11});
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, shape, &output_tensor));

      auto* stream = context->op_device_context()->stream();
      OP_REQUIRES_ASYNC(context, stream != nullptr,
                        errors::Internal("No GPU stream available."), done);

      se::DeviceMemoryBase output_tensor_ptr(
          output_tensor->flat<Tout>().data(),
          output_tensor->flat<Tout>().size());
      stream->ThenMemset32(&output_tensor_ptr, 0, 11 * sizeof(Tout));

      int num_dims = tensor.dims();
      const Tout static_output[] = {tensor_id,
                                    -1.0,  // TODO(144919262): Device ID
                                    static_cast<Tout>(tensor.dtype()),
                                    static_cast<Tout>(num_dims), num_elem};
      stream->ThenMemcpy(&output_tensor_ptr, &static_output, 5 * sizeof(Tout));
      if (num_elem == 0) {
        done();
        return;
      }

      // Call the GPU kernels for the numerical (inf/nan) checks and
      // pos/neg/zero counts.
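      // The six counters are written starting at offset 5, right after the
      // [tensor_id, device_id, dtype, ndims, num_elem] header copied above.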
      FullHealthLaunch<Tin, Tout>().Run(d, input.data(), input.size(),
                                        output_tensor->flat<Tout>().data() + 5);

      context->device()->tensorflow_gpu_device_info()->event_mgr->ThenExecute(
          stream, std::move(check_cb));
    } else if (tensor_debug_mode_ == 5) {  // SHAPE
      TensorShape shape({10});
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, shape, &output_tensor));

      auto* stream = context->op_device_context()->stream();
      OP_REQUIRES_ASYNC(context, stream != nullptr,
                        errors::Internal("No GPU stream available."), done);

      se::DeviceMemoryBase output_tensor_ptr(
          output_tensor->flat<Tout>().data(),
          output_tensor->flat<Tout>().size());

      int num_dims = tensor.dims();
      Tout static_output[10] = {tensor_id,
                                static_cast<Tout>(tensor.dtype()),
                                static_cast<Tout>(num_dims),
                                num_elem,
                                0.0,
                                0.0,
                                0.0,
                                0.0,
                                0.0,
                                0.0};
      // Tensor shape: right-pad with zeros, truncate the head.
      int dim_idx = 4;
      for (int i = std::max(0, num_dims - 6); i < num_dims; ++i) {
        static_output[dim_idx++] = static_cast<Tout>(tensor.dim_size(i));
      }
      // Write to the device via the stream.
      stream->ThenMemcpy(&output_tensor_ptr, &static_output, sizeof(Tout) * 10);
      context->device()->tensorflow_gpu_device_info()->event_mgr->ThenExecute(
          stream, std::move(check_cb));
    } else if (tensor_debug_mode_ == 8) {  // REDUCE_INF_NAN_THREE_SLOTS.
      TensorShape shape({3});
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, shape, &output_tensor));

      auto* stream = context->op_device_context()->stream();
      OP_REQUIRES_ASYNC(context, stream != nullptr,
                        errors::Internal("No GPU stream available."), done);

      se::DeviceMemoryBase output_tensor_ptr(
          output_tensor->flat<Tout>().data(),
          output_tensor->flat<Tout>().size());
      stream->ThenMemset32(&output_tensor_ptr, 0,
                           output_tensor->flat<Tout>().size() * sizeof(Tout));
      if (num_elem == 0) {
        done();
        return;
      }

      // Call the GPU kernels for the numerical (inf/nan) checks.
      auto input = context->input(0).flat<Tin>();
      ReduceInfNanThreeSlotsLaunch<Tin, Tout>().Run(
          d, input.data(), input.size(), output_tensor->flat<Tout>().data());

      context->device()->tensorflow_gpu_device_info()->event_mgr->ThenExecute(
          stream, std::move(check_cb));
    } else {
      // TODO(cais): Implement other tensor debug modes in debug_event.proto.
      context->SetStatus(errors::Unimplemented(
          "Unimplemented tensor debug mode: ", tensor_debug_mode_));
      done();
    }
  }

 private:
  int tensor_debug_mode_;
  int64 tensor_id_;
  static constexpr int64 kMaxTensorId = 1LL
                                        << std::numeric_limits<Tout>::digits;
};

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

}  // namespace tensorflow

#endif  // TENSORFLOW_CORE_KERNELS_DEBUG_OPS_H_