/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_KERNELS_DEBUG_OPS_H_
#define TENSORFLOW_CORE_KERNELS_DEBUG_OPS_H_

#if GOOGLE_CUDA
#include "tensorflow/core/common_runtime/gpu/gpu_util.h"
#endif
#ifdef TENSORFLOW_USE_SYCL
#include "tensorflow/core/common_runtime/sycl/sycl_util.h"
#endif  // TENSORFLOW_USE_SYCL
#include "tensorflow/core/debug/debug_io_utils.h"
#include "tensorflow/core/framework/device_base.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor_util.h"
#include "tensorflow/core/lib/core/notification.h"
#include "tensorflow/core/lib/strings/stringprintf.h"

namespace tensorflow {

// Copy op for debugging.
// Performs CPU-to-CPU or GPU-to-GPU deep-copying of tensor, depending on the
// device on which the tensor is allocated.
class CopyOp : public OpKernel {
 public:
  explicit CopyOp(OpKernelConstruction* context) : OpKernel(context) {
    OP_REQUIRES_OK(context, context->GetAttr("tensor_name", &tensor_name_));

    std::vector<string> debug_ops_spec;
    OP_REQUIRES_OK(context,
                   context->GetAttr("debug_ops_spec", &debug_ops_spec));
    for (const string& debug_op_spec : debug_ops_spec) {
      // Assume debug_op_spec has the format
      // <debug_op>;<debug_url>;<gated_grpc>, e.g.,
      // DebugIdentity;grpc://localhost:3333;1
      const std::vector<string> items = str_util::Split(debug_op_spec, ";");
      OP_REQUIRES(
          context, items.size() == 3,
          errors::Internal(
              "Unexpected number of semicolons in debug_ops_spec element: ",
              debug_op_spec));
      debug_op_and_url_specs_.push_back(
          DebugWatchAndURLSpec(strings::StrCat(tensor_name_, ":", items[0]),
                               items[1], items[2] == "1"));
    }
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& src_tensor = context->input(0);

    if (src_tensor.IsInitialized() &&
        DataTypeCanUseMemcpy(src_tensor.dtype()) &&
        DebugIO::IsCopyNodeGateOpen(debug_op_and_url_specs_)) {
      // Source tensor is initialized and is mem-copyable. Make a copy.
      Tensor* copied_tensor;
      OP_REQUIRES_OK(context, context->allocate_output(0, src_tensor.shape(),
                                                       &copied_tensor));

#if GOOGLE_CUDA
      Device* device = static_cast<Device*>(context->device());
      // Determine if the input tensor is not on CPU (e.g., on GPU).
      bool off_host_input = device->device_type() == DEVICE_GPU &&
                            !context->input_alloc_attr(0).on_host();

      if (off_host_input) {
        DeviceContext* device_ctxt = context->op_device_context();
        // Input is not on host: deep-copy it from GPU to the same GPU.
        Notification done_copy;
        GPUUtil::CopyGPUTensorToSameGPU(
            device, device_ctxt, &src_tensor, copied_tensor,
            [&done_copy](const Status& s) { done_copy.Notify(); });
        done_copy.WaitForNotification();
      } else {
        // The input tensor is on the host (CPU): deep-copy from CPU to CPU.
        *copied_tensor = tensor::DeepCopy(src_tensor);
      }
#elif defined(TENSORFLOW_USE_SYCL)
      Device* device = static_cast<Device*>(context->device());
      // Determine if the input tensor is not on CPU (e.g., on GPU).
      const bool off_host_input = device->device_type() == DEVICE_SYCL &&
                                  !context->input_alloc_attr(0).on_host();

      if (off_host_input) {
        SYCLmemcpy(context->eigen_sycl_device(), src_tensor, copied_tensor);
      } else {
        *copied_tensor = tensor::DeepCopy(src_tensor);
      }
#else
      *copied_tensor = tensor::DeepCopy(src_tensor);
#endif
    } else {
      // Source tensor is NOT initialized and/or is not mem-copyable: Forward
      // the Tensor object.
      context->set_output(0, src_tensor);
    }
  }

  bool IsExpensive() override { return false; }

 private:
  string tensor_name_;
  std::vector<DebugWatchAndURLSpec> debug_op_and_url_specs_;
};

// Base class of all debug ops.
class BaseDebugOp : public OpKernel {
 public:
  explicit BaseDebugOp(const string& debug_op_name,
                       OpKernelConstruction* context)
      : OpKernel(context), debug_op_name_(debug_op_name) {
    OP_REQUIRES_OK(context, context->GetAttr("debug_urls", &debug_urls_));
    OP_REQUIRES_OK(context, context->GetAttr("gated_grpc", &gated_grpc_));

    string device_name;
    string tensor_name;
    OP_REQUIRES_OK(context, context->GetAttr("device_name", &device_name));
    OP_REQUIRES_OK(context, context->GetAttr("tensor_name", &tensor_name));

    std::vector<string> name_items = str_util::Split(tensor_name, ':');
    string node_name;
    int32 output_slot = 0;
    OP_REQUIRES(context, name_items.size() == 1 || name_items.size() == 2,
                errors::InvalidArgument("Failed to parse tensor name: \"",
                                        tensor_name, "\""));
    if (name_items.size() == 2) {
      node_name = name_items[0];
      OP_REQUIRES(
          context, strings::safe_strto32(name_items[1], &output_slot),
          errors::InvalidArgument("Invalid string value for output_slot: \"",
                                  name_items[1], "\""));
    } else if (name_items.size() == 1) {
      node_name = name_items[0];
    }

    debug_watch_key_.reset(
        new DebugNodeKey(device_name, node_name, output_slot, debug_op_name_));
  }

  bool IsExpensive() override { return false; }

 protected:
  // Apply gRPC gating (if gated_grpc_ attribute is true).
  //
  // Returns false if and only if all grpc:// debug URLs of the debug op are
  // disabled currently (i.e., gated off), in which case the debug op will emit
  // an empty (size {0}) tensor of undefined data type.
  bool ApplyGrpcGating(OpKernelContext* context) {
    if (gated_grpc_ && !DebugIO::IsDebugNodeGateOpen(
                           debug_watch_key_->debug_node_name, debug_urls_)) {
      // The entire node is gated off: Output an empty tensor and avoid
      // expensive computation.
      Tensor* output_tensor;
      TensorShape shape({0});
      if (!context->allocate_output(0, shape, &output_tensor).ok()) {
        LOG(ERROR) << "Debug node of watch key "
                   << debug_watch_key_->debug_node_name
                   << " failed to allocate empty tensor under gated-off state.";
      }
      return false;
    } else {
      return true;
    }
  }

  // Publish a tensor to all debug URLs of the debug op.
  // Log an error if the publishing failed.
  Status PublishTensor(const Tensor& tensor) {
    if (debug_urls_.empty()) {
      return Status::OK();
    } else {
      Status status = DebugIO::PublishDebugTensor(*debug_watch_key_, tensor,
                                                  Env::Default()->NowMicros(),
                                                  debug_urls_, gated_grpc_);
      if (!status.ok()) {
        LOG(ERROR) << "Debug node of watch key "
                   << debug_watch_key_->debug_node_name
                   << " failed to publish debug tensor data to all URLs "
                   << str_util::Join(debug_urls_, ", ")
                   << ", due to: " << status.error_message();
      }
      return status;
    }
  }

 private:
  const string debug_op_name_;
  std::unique_ptr<DebugNodeKey> debug_watch_key_;
  std::vector<string> debug_urls_;
  bool gated_grpc_;
};

// Identity op for debugging.
// Output slot 0 carries the debug signal and is always allocated on the
// host (CPU) as a non-Ref tensor. In the case of DebugIdentityOp,
// the debug signal is equal to the input tensor.
class DebugIdentityOp : public BaseDebugOp {
 public:
  explicit DebugIdentityOp(OpKernelConstruction* context)
      : BaseDebugOp("DebugIdentity", context) {}

  void Compute(OpKernelContext* context) override {
    if (!ApplyGrpcGating(context)) {
      return;
    }

    OP_REQUIRES_OK(context, PublishTensor(context->input(0)));
    context->set_output(0, context->input(0));
  }
};

// NaN-counter op for debugging.
template <typename T>
class DebugNanCountOp : public BaseDebugOp {
 public:
  explicit DebugNanCountOp(OpKernelConstruction* context)
      : BaseDebugOp("DebugNanCount", context) {}

  void Compute(OpKernelContext* context) override {
    if (!ApplyGrpcGating(context)) {
      return;
    }

    Tensor* output_tensor;
    const Tensor& input = context->input(0);

    // Use DT_INT64/int64 to be consistent with TensorShape::num_elements().
    int64 nan_count = 0;

    // If the input is an uninitialized tensor, let nan_count be 0.
    if (input.IsInitialized()) {
      // Count NaNs.
      const TensorShape& input_shape = input.shape();
      const T* input_flat = input.template flat<T>().data();

      for (int64 i = 0; i < input_shape.num_elements(); ++i) {
        if (Eigen::numext::isnan(static_cast<double>(input_flat[i]))) {
          nan_count++;
        }
      }
    }

    TensorShape shape({1});
    OP_REQUIRES_OK(context,
                   context->allocate_output(0, shape, &output_tensor));
    output_tensor->vec<int64>()(0) = nan_count;
    OP_REQUIRES_OK(context, PublishTensor(*output_tensor));
  }
};

// Numeric summary op for debugging.
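// Summary of the output layout, as filled in by the Compute() method below:
// the op emits a double tensor of shape [14 + ndims] whose elements are
//   [0]   is_initialized (0 or 1)
//   [1]   total element count
//   [2]   NaN count
//   [3]   -inf count (elements <= lower_bound are also counted here when a
//         finite lower_bound attribute is supplied)
//   [4]   negative finite count
//   [5]   zero count
//   [6]   positive finite count
//   [7]   +inf count (elements >= upper_bound are also counted here when a
//         finite upper_bound attribute is supplied)
//   [8]   min of finite elements
//   [9]   max of finite elements
//   [10]  mean of finite elements
//   [11]  variance of finite elements
//   [12]  DataType enum value of the input
//   [13]  ndims
//   [14:] size of each dimension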
template <typename T>
class DebugNumericSummaryOp : public BaseDebugOp {
 public:
  explicit DebugNumericSummaryOp(OpKernelConstruction* context)
      : BaseDebugOp("DebugNumericSummary", context) {
    OP_REQUIRES_OK(context, context->GetAttr("lower_bound", &lower_bound_));
    OP_REQUIRES_OK(context, context->GetAttr("upper_bound", &upper_bound_));
    OP_REQUIRES_OK(context,
                   context->GetAttr("mute_if_healthy", &mute_if_healthy_));
  }

  void Compute(OpKernelContext* context) override {
    if (!ApplyGrpcGating(context)) {
      return;
    }

    Tensor* output_tensor;
    const Tensor& input = context->input(0);

    int64 is_initialized = 0;
    int64 element_count = 0;
    int64 negative_inf_count = 0;
    int64 negative_count = 0;
    int64 zero_count = 0;
    int64 positive_count = 0;
    int64 positive_inf_count = 0;
    int64 nan_count = 0;
    double min = std::numeric_limits<double>::infinity();
    double max = -std::numeric_limits<double>::infinity();
    double sum = 0.0;
    double mean = std::numeric_limits<double>::quiet_NaN();
    double variance = std::numeric_limits<double>::quiet_NaN();

    // Equal to negative_count + zero_count + positive_count.
    int64 non_inf_nan_count = 0;

    const TensorShape& input_shape = input.shape();
    if (input.IsInitialized()) {
      is_initialized = 1;
      const T* input_flat = input.template flat<T>().data();

      element_count = input_shape.num_elements();
      const bool is_lower_bound_custom = !Eigen::numext::isinf(lower_bound_);
      const bool is_upper_bound_custom = !Eigen::numext::isinf(upper_bound_);

      for (int64 i = 0; i < element_count; ++i) {
        const double x = static_cast<double>(input_flat[i]);
        if (Eigen::numext::isnan(x)) {
          nan_count++;
        } else if (Eigen::numext::isinf(x)) {
          if (x < 0.0) {
            negative_inf_count++;
          } else {
            positive_inf_count++;
          }
        } else {
          if (is_lower_bound_custom && x <= lower_bound_) {
            negative_inf_count++;
          } else if (is_upper_bound_custom && x >= upper_bound_) {
            positive_inf_count++;
          } else if (x < 0.0) {
            negative_count++;
          } else if (x > 0.0) {
            positive_count++;
          } else {
            zero_count++;
          }

          if (x < min) {
            min = x;
          }
          if (x > max) {
            max = x;
          }

          non_inf_nan_count++;
          sum += x;
        }
      }

      if (non_inf_nan_count > 0) {
        mean = sum / non_inf_nan_count;

        // Do a second pass to compute variance.
        variance = 0.0;
        for (int64 i = 0; i < element_count; ++i) {
          const double x = static_cast<double>(input_flat[i]);
          if (!Eigen::numext::isnan(x) && !Eigen::numext::isinf(x)) {
            variance += (x - mean) * (x - mean);
          }
        }
        variance /= non_inf_nan_count;
      }
    }

    TensorShape shape({14 + input_shape.dims()});
    OP_REQUIRES_OK(context,
                   context->allocate_output(0, shape, &output_tensor));
    output_tensor->vec<double>()(0) = static_cast<double>(is_initialized);
    output_tensor->vec<double>()(1) = static_cast<double>(element_count);
    output_tensor->vec<double>()(2) = static_cast<double>(nan_count);
    output_tensor->vec<double>()(3) = static_cast<double>(negative_inf_count);
    output_tensor->vec<double>()(4) = static_cast<double>(negative_count);
    output_tensor->vec<double>()(5) = static_cast<double>(zero_count);
    output_tensor->vec<double>()(6) = static_cast<double>(positive_count);
    output_tensor->vec<double>()(7) = static_cast<double>(positive_inf_count);
    output_tensor->vec<double>()(8) = min;
    output_tensor->vec<double>()(9) = max;
    output_tensor->vec<double>()(10) = mean;
    output_tensor->vec<double>()(11) = variance;

    output_tensor->vec<double>()(12) = static_cast<double>(input.dtype());
    output_tensor->vec<double>()(13) = static_cast<double>(input_shape.dims());
    for (size_t d = 0; d < input_shape.dims(); ++d) {
      output_tensor->vec<double>()(14 + d) =
          static_cast<double>(input_shape.dim_sizes()[d]);
    }

    bool mute = mute_if_healthy_ && nan_count == 0 && negative_inf_count == 0 &&
                positive_inf_count == 0;
    if (!mute) {
      OP_REQUIRES_OK(context, PublishTensor(*output_tensor));
    }
  }

 private:
  float lower_bound_;
  float upper_bound_;
  bool mute_if_healthy_;
};

}  // namespace tensorflow

#endif  // TENSORFLOW_CORE_KERNELS_DEBUG_OPS_H_