/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_KERNELS_DEBUG_OPS_H_
#define TENSORFLOW_CORE_KERNELS_DEBUG_OPS_H_

#include <numeric>

#include "tensorflow/core/platform/bfloat16.h"

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
#include "tensorflow/core/common_runtime/gpu/gpu_util.h"
#endif

#if GOOGLE_CUDA
#include "tensorflow/core/platform/cuda.h"
#elif TENSORFLOW_USE_ROCM
#include "tensorflow/core/platform/rocm.h"
#endif

#include "tensorflow/core/debug/debug_io_utils.h"
#include "tensorflow/core/framework/device_base.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor_util.h"
#include "tensorflow/core/lib/core/notification.h"
#include "tensorflow/core/lib/strings/stringprintf.h"
#include "tensorflow/core/util/debug_events_writer.h"

namespace tensorflow {

// Copy op for debugging.
// Performs CPU-to-CPU or GPU-to-GPU deep-copying of a tensor, depending on
// the device on which the tensor is allocated.
class CopyOp : public OpKernel {
 public:
  explicit CopyOp(OpKernelConstruction* context) : OpKernel(context) {
    OP_REQUIRES_OK(context, context->GetAttr("tensor_name", &tensor_name_));

    std::vector<string> debug_ops_spec;
    OP_REQUIRES_OK(context,
                   context->GetAttr("debug_ops_spec", &debug_ops_spec));
    for (const string& debug_op_spec : debug_ops_spec) {
      // Assume debug_op_spec has the format
      // <debug_op>;<debug_url>;<gated_grpc>, e.g.,
      // DebugIdentity;grpc://localhost:3333;1
      const std::vector<string> items = str_util::Split(debug_op_spec, ";");
      OP_REQUIRES(
          context, items.size() == 3,
          errors::Internal(
              "Unexpected number of semicolons in debug_ops_spec element: ",
              debug_op_spec));
      debug_op_and_url_specs_.push_back(
          DebugWatchAndURLSpec(strings::StrCat(tensor_name_, ":", items[0]),
                               items[1], items[2] == "1"));
    }
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& src_tensor = context->input(0);

    if (src_tensor.IsInitialized() &&
        DataTypeCanUseMemcpy(src_tensor.dtype()) &&
        DebugIO::IsCopyNodeGateOpen(debug_op_and_url_specs_)) {
      // Source tensor is initialized and is mem-copyable. Make a copy.
      Tensor* copied_tensor;
      OP_REQUIRES_OK(context, context->allocate_output(0, src_tensor.shape(),
                                                       &copied_tensor));

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
      Device* device = static_cast<Device*>(context->device());
      // Determine whether the input tensor is not on CPU (e.g., on GPU).
      bool off_host_input = device->device_type() == DEVICE_GPU &&
                            !context->input_alloc_attr(0).on_host();

      if (off_host_input) {
        DeviceContext* device_ctxt = context->op_device_context();
        // Input is not on host: deep-copy it from GPU to the same GPU.
        Notification done_copy;
        GPUUtil::CopyGPUTensorToSameGPU(
            device, device_ctxt, &src_tensor, copied_tensor,
            [&done_copy](const Status& s) { done_copy.Notify(); });
        done_copy.WaitForNotification();
      } else {
        // The input tensor is on the host (CPU): deep-copy from CPU to CPU.
        *copied_tensor = tensor::DeepCopy(src_tensor);
      }
#else
      *copied_tensor = tensor::DeepCopy(src_tensor);
#endif
    } else {
      // Source tensor is NOT initialized and/or is not mem-copyable: Forward
      // the Tensor object.
      context->set_output(0, src_tensor);
    }
  }

  bool IsExpensive() override { return false; }

 private:
  string tensor_name_;
  std::vector<DebugWatchAndURLSpec> debug_op_and_url_specs_;
};
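
// Illustrative usage sketch (not part of this header): kernels declared here
// are registered in a companion .cc file. Assuming the op is named "Copy" and
// is registered for CPU, the registration pattern would look roughly like:
//
//   REGISTER_KERNEL_BUILDER(Name("Copy").Device(DEVICE_CPU), CopyOp);
//
// The actual op names, device constraints, and type constraints are defined
// by the real registrations, not by this sketch.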

// Base class of all debug ops.
class BaseDebugOp : public OpKernel {
 public:
  explicit BaseDebugOp(const string& debug_op_name,
                       OpKernelConstruction* context)
      : OpKernel(context), debug_op_name_(debug_op_name) {
    OP_REQUIRES_OK(context, context->GetAttr("debug_urls", &debug_urls_));
    OP_REQUIRES_OK(context, context->GetAttr("gated_grpc", &gated_grpc_));

    string device_name;
    string tensor_name;
    OP_REQUIRES_OK(context, context->GetAttr("device_name", &device_name));
    OP_REQUIRES_OK(context, context->GetAttr("tensor_name", &tensor_name));

    std::vector<string> name_items = str_util::Split(tensor_name, ':');
    string node_name;
    int32 output_slot = 0;
    OP_REQUIRES(context, name_items.size() == 1 || name_items.size() == 2,
                errors::InvalidArgument("Failed to parse tensor name: \"",
                                        tensor_name, "\""));
    if (name_items.size() == 2) {
      node_name = name_items[0];
      OP_REQUIRES(
          context, strings::safe_strto32(name_items[1], &output_slot),
          errors::InvalidArgument("Invalid string value for output_slot: \"",
                                  name_items[1], "\""));
    } else if (name_items.size() == 1) {
      node_name = name_items[0];
    }

    debug_watch_key_.reset(
        new DebugNodeKey(device_name, node_name, output_slot, debug_op_name_));
  }

  bool IsExpensive() override { return false; }

 protected:
  // Apply gRPC gating (if the gated_grpc_ attribute is true).
  //
  // Returns false if and only if all grpc:// debug URLs of the debug op are
  // currently disabled (i.e., gated off), in which case the debug op will
  // emit an empty (size {0}) tensor of undefined data type.
  bool ApplyGrpcGating(OpKernelContext* context) {
    if (gated_grpc_ && !DebugIO::IsDebugNodeGateOpen(
                           debug_watch_key_->debug_node_name, debug_urls_)) {
      // The entire node is gated off: Output an empty tensor and avoid
      // expensive computation.
      Tensor* output_tensor;
      TensorShape shape({0});
      if (!context->allocate_output(0, shape, &output_tensor).ok()) {
        LOG(ERROR) << "Debug node of watch key "
                   << debug_watch_key_->debug_node_name
                   << " failed to allocate empty tensor under gated-off state.";
      }
      return false;
    } else {
      return true;
    }
  }

  // Publish a tensor to all debug URLs of the debug op.
  // Log an error if the publishing fails.
  Status PublishTensor(const Tensor& tensor) {
    if (debug_urls_.empty()) {
      return Status::OK();
    } else {
      Status status = DebugIO::PublishDebugTensor(*debug_watch_key_, tensor,
                                                  Env::Default()->NowMicros(),
                                                  debug_urls_, gated_grpc_);
      if (!status.ok()) {
        LOG(ERROR) << "Debug node of watch key "
                   << debug_watch_key_->debug_node_name
                   << " failed to publish debug tensor data to all URLs "
                   << str_util::Join(debug_urls_, ", ")
                   << ", due to: " << status.error_message();
      }
      return status;
    }
  }

 private:
  const string debug_op_name_;
  std::unique_ptr<DebugNodeKey> debug_watch_key_;
  std::vector<string> debug_urls_;
  bool gated_grpc_;
};

// Identity op for debugging.
//   Output slot 0 carries the debug signal and is always allocated on the
//   host (CPU) as a non-Ref tensor. In the case of DebugIdentityOp,
//   the debug signal is equal to the input tensor.
class DebugIdentityOp : public BaseDebugOp {
 public:
  explicit DebugIdentityOp(OpKernelConstruction* context)
      : BaseDebugOp("DebugIdentity", context) {}

  void Compute(OpKernelContext* context) override {
    if (!ApplyGrpcGating(context)) {
      return;
    }

    OP_REQUIRES_OK(context, PublishTensor(context->input(0)));
    context->set_output(0, context->input(0));
  }
};

// NaN-counter op for debugging.
template <typename T>
class DebugNanCountOp : public BaseDebugOp {
 public:
  explicit DebugNanCountOp(OpKernelConstruction* context)
      : BaseDebugOp("DebugNanCount", context) {}

  void Compute(OpKernelContext* context) override {
    if (!ApplyGrpcGating(context)) {
      return;
    }

    Tensor* output_tensor;
    const Tensor& input = context->input(0);

    // Use DT_INT64/int64 to be consistent with TensorShape::num_elements().
    int64 nan_count = 0;

    // If the input is an uninitialized tensor, let nan_count be 0.
    if (input.IsInitialized()) {
      // Count NaNs.
      const TensorShape& input_shape = input.shape();
      const T* input_flat = input.template flat<T>().data();

      for (int64 i = 0; i < input_shape.num_elements(); ++i) {
        if (Eigen::numext::isnan(static_cast<double>(input_flat[i]))) {
          nan_count++;
        }
      }
    }

    TensorShape shape({1});
    OP_REQUIRES_OK(context, context->allocate_output(0, shape, &output_tensor));
    output_tensor->vec<int64>()(0) = nan_count;
    OP_REQUIRES_OK(context, PublishTensor(*output_tensor));
  }
};
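
// Illustrative example (derived from the logic above): given a float input
// tensor with values {1.0, NaN, -3.0, NaN}, DebugNanCountOp emits a shape-{1}
// int64 tensor whose single element is 2.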

// Numeric summary op for debugging.
template <typename T>
class DebugNumericSummaryOp : public BaseDebugOp {
 public:
  explicit DebugNumericSummaryOp(OpKernelConstruction* context)
      : BaseDebugOp("DebugNumericSummary", context) {
    OP_REQUIRES_OK(context, context->GetAttr("lower_bound", &lower_bound_));
    OP_REQUIRES_OK(context, context->GetAttr("upper_bound", &upper_bound_));
    OP_REQUIRES_OK(context,
                   context->GetAttr("mute_if_healthy", &mute_if_healthy_));
  }

  void Compute(OpKernelContext* context) override {
    if (!ApplyGrpcGating(context)) {
      return;
    }

    Tensor* output_tensor;
    const Tensor& input = context->input(0);

    int64 is_initialized = 0;
    int64 element_count = 0;
    int64 negative_inf_count = 0;
    int64 negative_count = 0;
    int64 zero_count = 0;
    int64 positive_count = 0;
    int64 positive_inf_count = 0;
    int64 nan_count = 0;
    double min = std::numeric_limits<double>::infinity();
    double max = -std::numeric_limits<double>::infinity();
    double sum = 0.0;
    double mean = std::numeric_limits<double>::quiet_NaN();
    double variance = std::numeric_limits<double>::quiet_NaN();

    // Equal to negative_count + zero_count + positive_count.
    int64 non_inf_nan_count = 0;

    const TensorShape& input_shape = input.shape();
    if (input.IsInitialized()) {
      is_initialized = 1;
      const T* input_flat = input.template flat<T>().data();

      element_count = input_shape.num_elements();
      const bool is_lower_bound_custom = !Eigen::numext::isinf(lower_bound_);
      const bool is_upper_bound_custom = !Eigen::numext::isinf(upper_bound_);

      for (int64 i = 0; i < element_count; ++i) {
        const double x = static_cast<double>(input_flat[i]);
        if (Eigen::numext::isnan(x)) {
          nan_count++;
        } else if (Eigen::numext::isinf(x)) {
          if (x < 0.0) {
            negative_inf_count++;
          } else {
            positive_inf_count++;
          }
        } else {
          if (is_lower_bound_custom && x <= lower_bound_) {
            negative_inf_count++;
          } else if (is_upper_bound_custom && x >= upper_bound_) {
            positive_inf_count++;
          } else if (x < 0.0) {
            negative_count++;
          } else if (x > 0.0) {
            positive_count++;
          } else {
            zero_count++;
          }

          if (x < min) {
            min = x;
          }
          if (x > max) {
            max = x;
          }

          non_inf_nan_count++;
          sum += x;
        }
      }

      if (non_inf_nan_count > 0) {
        mean = sum / non_inf_nan_count;

        // Do a second pass to compute variance.
        variance = 0.0;
        for (int64 i = 0; i < element_count; ++i) {
          const double x = static_cast<double>(input_flat[i]);
          if (!Eigen::numext::isnan(x) && !Eigen::numext::isinf(x)) {
            variance += (x - mean) * (x - mean);
          }
        }
        variance /= non_inf_nan_count;
      }
    }

    TensorShape shape({14 + input_shape.dims()});
    OP_REQUIRES_OK(context, context->allocate_output(0, shape, &output_tensor));
    output_tensor->vec<double>()(0) = static_cast<double>(is_initialized);
    output_tensor->vec<double>()(1) = static_cast<double>(element_count);
    output_tensor->vec<double>()(2) = static_cast<double>(nan_count);
    output_tensor->vec<double>()(3) = static_cast<double>(negative_inf_count);
    output_tensor->vec<double>()(4) = static_cast<double>(negative_count);
    output_tensor->vec<double>()(5) = static_cast<double>(zero_count);
    output_tensor->vec<double>()(6) = static_cast<double>(positive_count);
    output_tensor->vec<double>()(7) = static_cast<double>(positive_inf_count);
    output_tensor->vec<double>()(8) = min;
    output_tensor->vec<double>()(9) = max;
    output_tensor->vec<double>()(10) = mean;
    output_tensor->vec<double>()(11) = variance;

    output_tensor->vec<double>()(12) = static_cast<double>(input.dtype());
    output_tensor->vec<double>()(13) = static_cast<double>(input_shape.dims());
    for (size_t d = 0; d < input_shape.dims(); ++d) {
      output_tensor->vec<double>()(14 + d) =
          static_cast<double>(input_shape.dim_sizes()[d]);
    }

    bool mute = mute_if_healthy_ && nan_count == 0 && negative_inf_count == 0 &&
                positive_inf_count == 0;
    if (!mute) {
      OP_REQUIRES_OK(context, PublishTensor(*output_tensor));
    }
  }

 private:
  float lower_bound_;
  float upper_bound_;
  bool mute_if_healthy_;
};
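
// Illustrative example (derived from the logic above, not an additional
// contract): for an initialized 2x3 DT_FLOAT input
//   [[1.0, -2.0, 0.0], [NaN, 3.0, inf]]
// and lower_bound/upper_bound left at their infinite defaults, the emitted
// shape-{16} double vector is:
//   [1,                 // is_initialized
//    6,                 // element_count
//    1, 0,              // nan_count, negative_inf_count
//    1, 1, 2,           // negative_count, zero_count, positive_count
//    1,                 // positive_inf_count
//    -2, 3,             // min, max (over finite elements)
//    0.5, 3.25,         // mean, variance (over finite elements)
//    <DT_FLOAT enum>, 2, // data type, number of dims
//    2, 3]              // dimension sizes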

// Identity op for tfdbg v2: Writes debug data using DebugEventsWriter.
class DebugIdentityV2Op : public OpKernel {
 public:
  explicit DebugIdentityV2Op(OpKernelConstruction* context)
      : OpKernel(context),
        device_name_(context->device()->name()),
        output_slot_(-1),
        tensor_debug_mode_(0),
        tfdbg_run_id_() {
    std::vector<string> debug_urls;
    OP_REQUIRES_OK(context, context->GetAttr("debug_urls", &debug_urls));
    for (const string& debug_url : debug_urls) {
      if (absl::StartsWith(debug_url, DebugIO::kFileURLScheme)) {
        dump_roots_.emplace_back(
            debug_url.substr(strlen(DebugIO::kFileURLScheme)));
      } else {
        context->SetStatus(
            errors::Internal("Unsupported debug URL schema in: ", debug_url));
      }
    }
    OP_REQUIRES_OK(context,
                   context->GetAttr("tfdbg_context_id", &tfdbg_context_id_));
    OP_REQUIRES_OK(context, context->GetAttr("op_name", &op_name_));
    OP_REQUIRES_OK(context, context->GetAttr("output_slot", &output_slot_));
    OP_REQUIRES_OK(context,
                   context->GetAttr("tensor_debug_mode", &tensor_debug_mode_));
    if (context->HasAttr("circular_buffer_size")) {
      OP_REQUIRES_OK(context, context->GetAttr("circular_buffer_size",
                                               &circular_buffer_size_));
    } else {
      circular_buffer_size_ =
          tfdbg::DebugEventsWriter::kDefaultCyclicBufferSize;
    }
    if (context->HasAttr("tfdbg_run_id")) {
      OP_REQUIRES_OK(context, context->GetAttr("tfdbg_run_id", &tfdbg_run_id_));
    }
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor = context->input(0);
    for (const string& dump_root : dump_roots_) {
      tfdbg::DebugEventsWriter* debug_events_writer =
          tfdbg::DebugEventsWriter::GetDebugEventsWriter(
              dump_root, tfdbg_run_id_, circular_buffer_size_);
      OP_REQUIRES_OK(context, debug_events_writer->WriteGraphExecutionTrace(
                                  tfdbg_context_id_, device_name_, op_name_,
                                  output_slot_, tensor_debug_mode_, tensor));
    }
    context->set_output(0, tensor);
  }

 private:
  std::vector<string> dump_roots_;
  string tfdbg_context_id_;
  string device_name_;
  string op_name_;
  int32 output_slot_;
  int32 tensor_debug_mode_;
  int64 circular_buffer_size_;
  string tfdbg_run_id_;
};
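
// Illustrative example (assuming DebugIO::kFileURLScheme is the "file://"
// prefix): a debug_urls attribute of {"file:///tmp/tfdbg_dump"} makes
// DebugIdentityV2Op write graph-execution traces under the dump root
// "/tmp/tfdbg_dump"; a URL with any other scheme (e.g., "grpc://...") is
// rejected with an Internal error in the constructor above.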

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
// GPU launch functor used by the GPU DebugNumericSummaryV2Op below for the
// CURT_HEALTH mode: fills a single output slot with a flag indicating
// whether the input contains any inf or nan value.
template <typename Tin, typename Tout>
struct CurtHealthLaunch {
  void Run(const GPUDevice& d, const Tin* data, int size, Tout output[1]);
};

extern template struct CurtHealthLaunch<Eigen::half, float>;
extern template struct CurtHealthLaunch<float, float>;
extern template struct CurtHealthLaunch<double, float>;
extern template struct CurtHealthLaunch<Eigen::half, double>;
extern template struct CurtHealthLaunch<float, double>;
extern template struct CurtHealthLaunch<double, double>;

// GPU launch functor for the CONCISE_HEALTH mode: fills three output slots
// (-inf count, +inf count, nan count), mirroring the CPU implementation's
// layout.
template <typename Tin, typename Tout>
struct ConciseHealthLaunch {
  void Run(const GPUDevice& d, const Tin* data, int size, Tout output[3]);
};

extern template struct ConciseHealthLaunch<Eigen::half, float>;
extern template struct ConciseHealthLaunch<float, float>;
extern template struct ConciseHealthLaunch<double, float>;
extern template struct ConciseHealthLaunch<Eigen::half, double>;
extern template struct ConciseHealthLaunch<float, double>;
extern template struct ConciseHealthLaunch<double, double>;

// GPU launch functor for the FULL_HEALTH mode: fills six output slots
// (-inf count, +inf count, nan count, negative count, zero count, positive
// count), mirroring the CPU implementation's layout.
template <typename Tin, typename Tout>
struct FullHealthLaunch {
  void Run(const GPUDevice& d, const Tin* data, int size, Tout output[6]);
};

extern template struct FullHealthLaunch<Eigen::half, float>;
extern template struct FullHealthLaunch<float, float>;
extern template struct FullHealthLaunch<double, float>;
extern template struct FullHealthLaunch<Eigen::half, double>;
extern template struct FullHealthLaunch<float, double>;
extern template struct FullHealthLaunch<double, double>;

// GPU launch functor for the REDUCE_INF_NAN_THREE_SLOTS mode: fills three
// output slots that flag the presence of -inf, +inf, and nan values,
// respectively.
template <typename Tin, typename Tout>
struct ReduceInfNanThreeSlotsLaunch {
  void Run(const GPUDevice& d, const Tin* data, int size, Tout output[3]);
};

extern template struct ReduceInfNanThreeSlotsLaunch<Eigen::half, float>;
extern template struct ReduceInfNanThreeSlotsLaunch<float, float>;
extern template struct ReduceInfNanThreeSlotsLaunch<double, float>;
extern template struct ReduceInfNanThreeSlotsLaunch<Eigen::half, double>;
extern template struct ReduceInfNanThreeSlotsLaunch<float, double>;
extern template struct ReduceInfNanThreeSlotsLaunch<double, double>;

#endif

template <typename Device, typename Tin, typename Tout>
class DebugNumericSummaryV2Op;

// Numeric summary op for tfdbg v2: CPU Kernel.
template <typename Tin, typename Tout>
class DebugNumericSummaryV2Op<CPUDevice, Tin, Tout> : public OpKernel {
 public:
  explicit DebugNumericSummaryV2Op(OpKernelConstruction* context)
      : OpKernel(context) {
    OP_REQUIRES_OK(context,
                   context->GetAttr("tensor_debug_mode", &tensor_debug_mode_));
    OP_REQUIRES_OK(context, context->GetAttr("tensor_id", &tensor_id_));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor = context->input(0);
    auto in = tensor.flat<Tin>();
    const Tin* data = in.data();
    const int64 size = in.size();
    Tensor* output_tensor;
    Tout tensor_id = static_cast<Tout>(tensor_id_);
    const Tout num_elem = static_cast<Tout>(context->input(0).NumElements());
    // Disregard lossy cast if mode is REDUCE_INF_NAN_THREE_SLOTS because
    // that mode does not make use of tensor_id.
    if (tensor_debug_mode_ != 8) {
      OP_REQUIRES(
          context, tensor_id_ <= kMaxTensorId,
          errors::InvalidArgument("DebugNumericSummaryV2Op requires "
                                  "tensor_id to be less than or equal to "
                                  "(2^",
                                  std::numeric_limits<Tout>::digits,
                                  "). Given tensor_id:", tensor_id_));
    }

    if (tensor_debug_mode_ == 2) {  // CURT_HEALTH
      TensorShape shape({2});
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, shape, &output_tensor));
      output_tensor->flat<Tout>()(0) = tensor_id;  // Slot for tensor id
      output_tensor->flat<Tout>()(1) = 0.0;        // Has inf or nan
      int fp_props =
          std::accumulate(data, data + size, 0, [](const int x, const Tin& y) {
            return Eigen::numext::isfinite(y) ? x : 1;
          });
      if (fp_props) {
        output_tensor->flat<Tout>()(1) = 1.0;
      }
    } else if (tensor_debug_mode_ == 3) {  // CONCISE_HEALTH
      TensorShape shape({5});
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, shape, &output_tensor));
      output_tensor->flat<Tout>()(0) = tensor_id;
      output_tensor->flat<Tout>()(1) = num_elem;

      // Accumulator value [neg_inf_count, pos_inf_count, nan_count]
      Tout fp_props[3] = {0.0, 0.0, 0.0};
      std::for_each(data, data + size, [&fp_props](const Tin& y) {
        if (TF_PREDICT_TRUE(Eigen::numext::isfinite(y))) {
          // Do nothing: common case.
        } else if (Eigen::numext::isinf(y)) {
          if (y < static_cast<Tin>(0.f)) {
            ++fp_props[0];
          } else {
            ++fp_props[1];
          }
        } else if (Eigen::numext::isnan(y)) {
          ++fp_props[2];
        }
      });
      output_tensor->flat<Tout>()(2) = fp_props[0];  // Slot for -inf count
      output_tensor->flat<Tout>()(3) = fp_props[1];  // Slot for inf count
      output_tensor->flat<Tout>()(4) = fp_props[2];  // Slot for nan count
    } else if (tensor_debug_mode_ == 4) {            // FULL_HEALTH
      TensorShape shape({11});
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, shape, &output_tensor));
      int num_dims = tensor.dims();
      output_tensor->flat<Tout>()(0) = tensor_id;
      output_tensor->flat<Tout>()(1) = -1.0;  // TODO(144919262): Device ID
      output_tensor->flat<Tout>()(2) = static_cast<Tout>(tensor.dtype());
      output_tensor->flat<Tout>()(3) = static_cast<Tout>(num_dims);
      output_tensor->flat<Tout>()(4) = num_elem;

      // Accumulator value [neg_inf_count, pos_inf_count, nan_count, neg_count,
      //                   zero_count, pos_count]
      Tout fp_props[6] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
      std::for_each(data, data + size, [&fp_props](const Tin& y) {
        if (TF_PREDICT_TRUE(Eigen::numext::isfinite(y))) {
          if (y < static_cast<Tin>(0.f)) {
            ++fp_props[3];
          } else if (y == static_cast<Tin>(0.f)) {
            ++fp_props[4];
          } else {
            ++fp_props[5];
          }
        } else if (Eigen::numext::isinf(y)) {
          if (y < static_cast<Tin>(0.f)) {
            ++fp_props[0];
          } else {
            ++fp_props[1];
          }
        } else if (Eigen::numext::isnan(y)) {
          ++fp_props[2];
        }
      });
      output_tensor->flat<Tout>()(5) = fp_props[0];   // Slot for -inf count
      output_tensor->flat<Tout>()(6) = fp_props[1];   // Slot for inf count
      output_tensor->flat<Tout>()(7) = fp_props[2];   // Slot for nan count.
      output_tensor->flat<Tout>()(8) = fp_props[3];   // Slot for neg count.
      output_tensor->flat<Tout>()(9) = fp_props[4];   // Slot for zero count.
      output_tensor->flat<Tout>()(10) = fp_props[5];  // Slot for pos count.
    } else if (tensor_debug_mode_ == 5) {             // SHAPE
      TensorShape shape({10});
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, shape, &output_tensor));

      int num_dims = tensor.dims();
      output_tensor->flat<Tout>()(0) = tensor_id;
      output_tensor->flat<Tout>()(1) = static_cast<Tout>(tensor.dtype());
      output_tensor->flat<Tout>()(2) = static_cast<Tout>(num_dims);
      output_tensor->flat<Tout>()(3) = num_elem;

      // Tensor shape: stored in 6 output slots.
      // If num_dims is less than 6, the shape is right-padded with zeros.
      // If num_dims is greater than 6, the leading (leftmost) dimensions are
      // truncated, since they tend to be more predictable than the trailing
      // ones (e.g., batch size as the first dimension).
      int dim_idx = 4;
      for (int i = std::max(0, num_dims - kShapeDims);
           i < std::max(6, num_dims); ++i) {
        if (i < num_dims) {
          output_tensor->flat<Tout>()(dim_idx++) =
              static_cast<Tout>(tensor.dim_size(i));
        } else {
          output_tensor->flat<Tout>()(dim_idx++) = 0.0;
        }
      }
    } else if (tensor_debug_mode_ == 8) {  // REDUCE_INF_NAN_THREE_SLOTS.
      TensorShape shape({3});
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, shape, &output_tensor));
      output_tensor->flat<Tout>()(0) = 0.0;  // Slot for -inf.
      output_tensor->flat<Tout>()(1) = 0.0;  // Slot for inf.
      output_tensor->flat<Tout>()(2) = 0.0;  // Slot for nan.

      int fp_props =
          std::accumulate(data, data + size, 0, [](const int x, const Tin& y) {
            int result = x;
            if (TF_PREDICT_TRUE(Eigen::numext::isfinite(y))) {
              // Do nothing: common case.
            } else if (Eigen::numext::isinf(y)) {
              result |= y < static_cast<Tin>(0.f) ? kNegInfBit : kPosInfBit;
            } else if (Eigen::numext::isnan(y)) {
              result |= kNaNBit;
            }
            return result;
          });

      if (fp_props & kNegInfBit) {
        output_tensor->flat<Tout>()(0) = -std::numeric_limits<Tout>::infinity();
      }
      if (fp_props & kPosInfBit) {
        output_tensor->flat<Tout>()(1) = std::numeric_limits<Tout>::infinity();
      }
      if (fp_props & kNaNBit) {
        output_tensor->flat<Tout>()(2) = std::numeric_limits<Tout>::quiet_NaN();
      }
    } else {
      // TODO(cais): Implement other tensor debug modes in debug_event.proto.
      context->SetStatus(errors::Unimplemented(
          "Unimplemented tensor debug mode: ", tensor_debug_mode_));
    }
  }

 private:
  int tensor_debug_mode_;
  int64 tensor_id_;
  static constexpr int kShapeDims = 6;
  static constexpr int kNegInfBit = 0x01;
  static constexpr int kPosInfBit = 0x02;
  static constexpr int kNaNBit = 0x04;
  static constexpr int64 kMaxTensorId = 1LL
                                        << std::numeric_limits<Tout>::digits;
};
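
// Summary of the tensor_debug_mode values handled above (slot layouts as
// implemented in this file):
//   2 (CURT_HEALTH):    shape {2}:  [tensor_id, has_inf_or_nan]
//   3 (CONCISE_HEALTH): shape {5}:  [tensor_id, num_elem, -inf count,
//                                    +inf count, nan count]
//   4 (FULL_HEALTH):    shape {11}: [tensor_id, device_id (-1 for now),
//                                    dtype, ndims, num_elem, -inf count,
//                                    +inf count, nan count, neg count,
//                                    zero count, pos count]
//   5 (SHAPE):          shape {10}: [tensor_id, dtype, ndims, num_elem,
//                                    up to 6 dimension sizes]
//   8 (REDUCE_INF_NAN_THREE_SLOTS): shape {3}: slots set to -inf, +inf, and
//                                    nan if the corresponding values occur.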

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

template <typename Tin, typename Tout>
class DebugNumericSummaryV2Op<GPUDevice, Tin, Tout> : public AsyncOpKernel {
 public:
  typedef GPUDevice Device;

  explicit DebugNumericSummaryV2Op(OpKernelConstruction* context)
      : AsyncOpKernel(context) {
    OP_REQUIRES_OK(context,
                   context->GetAttr("tensor_debug_mode", &tensor_debug_mode_));
    OP_REQUIRES_OK(context, context->GetAttr("tensor_id", &tensor_id_));
  }

  void ComputeAsync(OpKernelContext* context, DoneCallback done) override {
    Tensor* output_tensor;
    Tout tensor_id = static_cast<Tout>(tensor_id_);
    const Tensor& tensor = context->input(0);
    const Tout num_elem = static_cast<Tout>(tensor.NumElements());
    const Device& d = context->eigen_device<Device>();
    auto input = tensor.flat<Tin>();
    auto check_cb = [this, done]() { done(); };
    // Disregard lossy cast if mode is REDUCE_INF_NAN_THREE_SLOTS because
    // that mode does not make use of tensor_id.
    if (tensor_debug_mode_ != 8) {
      OP_REQUIRES_ASYNC(
          context, tensor_id_ <= kMaxTensorId,
          errors::InvalidArgument("DebugNumericSummaryV2Op requires "
                                  "tensor_id to be less than or equal to "
                                  "(2^",
                                  std::numeric_limits<Tout>::digits,
                                  "). Given tensor_id:", tensor_id_),
          done);
    }

    if (tensor_debug_mode_ == 2) {  // CURT_HEALTH.
      TensorShape shape({2});
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, shape, &output_tensor));

      auto* stream = context->op_device_context()->stream();
      OP_REQUIRES_ASYNC(context, stream != nullptr,
                        errors::Internal("No GPU stream available."), done);

      se::DeviceMemoryBase output_tensor_ptr(
          output_tensor->flat<Tout>().data(),
          output_tensor->flat<Tout>().size());
      stream->ThenMemZero(&output_tensor_ptr, 2 * sizeof(Tout));
      // Copy tensor_id to slot zero.
      stream->ThenMemcpy(&output_tensor_ptr, &tensor_id, sizeof(Tout));
      if (num_elem == 0) {
        done();
        return;
      }

      // Call the GPU kernels for the numerical (inf/nan) checks.
      auto input = context->input(0).flat<Tin>();
      CurtHealthLaunch<Tin, Tout>().Run(d, input.data(), input.size(),
                                        output_tensor->flat<Tout>().data() + 1);

      context->device()->tensorflow_gpu_device_info()->event_mgr->ThenExecute(
          stream, std::move(check_cb));
    } else if (tensor_debug_mode_ == 3) {  // CONCISE_HEALTH.
      TensorShape shape({5});
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, shape, &output_tensor));

      auto* stream = context->op_device_context()->stream();
      OP_REQUIRES_ASYNC(context, stream != nullptr,
                        errors::Internal("No GPU stream available."), done);

      se::DeviceMemoryBase output_tensor_ptr(
          output_tensor->flat<Tout>().data(),
          output_tensor->flat<Tout>().size());
      stream->ThenMemset32(&output_tensor_ptr, 0, 5 * sizeof(Tout));
      const Tout static_output[] = {tensor_id, num_elem};
      stream->ThenMemcpy(&output_tensor_ptr, &static_output, 2 * sizeof(Tout));
      if (num_elem == 0) {
        done();
        return;
      }

      // Call the GPU kernels for the numerical (inf/nan) checks.
      ConciseHealthLaunch<Tin, Tout>().Run(
          d, input.data(), input.size(),
          output_tensor->flat<Tout>().data() + 2);

      context->device()->tensorflow_gpu_device_info()->event_mgr->ThenExecute(
          stream, std::move(check_cb));
    } else if (tensor_debug_mode_ == 4) {  // FULL_HEALTH
      TensorShape shape({11});
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, shape, &output_tensor));

      auto* stream = context->op_device_context()->stream();
      OP_REQUIRES_ASYNC(context, stream != nullptr,
                        errors::Internal("No GPU stream available."), done);

      se::DeviceMemoryBase output_tensor_ptr(
          output_tensor->flat<Tout>().data(),
          output_tensor->flat<Tout>().size());
      stream->ThenMemset32(&output_tensor_ptr, 0, 11 * sizeof(Tout));

      int num_dims = tensor.dims();
      const Tout static_output[] = {tensor_id,
                                    -1.0,  // TODO(144919262): Device ID
                                    static_cast<Tout>(tensor.dtype()),
                                    static_cast<Tout>(num_dims), num_elem};
      stream->ThenMemcpy(&output_tensor_ptr, &static_output, 5 * sizeof(Tout));
      if (num_elem == 0) {
        done();
        return;
      }

      // Call the GPU kernels for the numerical (inf/nan) checks and
      // pos/neg/zero counts.
      FullHealthLaunch<Tin, Tout>().Run(d, input.data(), input.size(),
                                        output_tensor->flat<Tout>().data() + 5);

      context->device()->tensorflow_gpu_device_info()->event_mgr->ThenExecute(
          stream, std::move(check_cb));
    } else if (tensor_debug_mode_ == 5) {  // SHAPE
      TensorShape shape({10});
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, shape, &output_tensor));

      auto* stream = context->op_device_context()->stream();
      OP_REQUIRES_ASYNC(context, stream != nullptr,
                        errors::Internal("No GPU stream available."), done);

      se::DeviceMemoryBase output_tensor_ptr(
          output_tensor->flat<Tout>().data(),
          output_tensor->flat<Tout>().size());

      int num_dims = tensor.dims();
      Tout static_output[10] = {tensor_id,
                                static_cast<Tout>(tensor.dtype()),
                                static_cast<Tout>(num_dims),
                                num_elem,
                                0.0,
                                0.0,
                                0.0,
                                0.0,
                                0.0,
                                0.0};
      // Tensor shape: right-pad with zeros, truncate the head.
      int dim_idx = 4;
      for (int i = std::max(0, num_dims - 6); i < num_dims; ++i) {
        static_output[dim_idx++] = static_cast<Tout>(tensor.dim_size(i));
      }
      // Write to the device output buffer via the stream.
      stream->ThenMemcpy(&output_tensor_ptr, &static_output, sizeof(Tout) * 10);
      context->device()->tensorflow_gpu_device_info()->event_mgr->ThenExecute(
          stream, std::move(check_cb));
    } else if (tensor_debug_mode_ == 8) {  // REDUCE_INF_NAN_THREE_SLOTS.
      TensorShape shape({3});
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, shape, &output_tensor));

      auto* stream = context->op_device_context()->stream();
      OP_REQUIRES_ASYNC(context, stream != nullptr,
                        errors::Internal("No GPU stream available."), done);

      se::DeviceMemoryBase output_tensor_ptr(
          output_tensor->flat<Tout>().data(),
          output_tensor->flat<Tout>().size());
      stream->ThenMemset32(&output_tensor_ptr, 0,
                           output_tensor->flat<Tout>().size() * sizeof(Tout));
      if (num_elem == 0) {
        done();
        return;
      }

      // Call the GPU kernels for the numerical (inf/nan) checks.
      auto input = context->input(0).flat<Tin>();
      ReduceInfNanThreeSlotsLaunch<Tin, Tout>().Run(
          d, input.data(), input.size(), output_tensor->flat<Tout>().data());

      context->device()->tensorflow_gpu_device_info()->event_mgr->ThenExecute(
          stream, std::move(check_cb));
    } else {
      // TODO(cais): Implement other tensor debug modes in debug_event.proto.
      context->SetStatus(errors::Unimplemented(
          "Unimplemented tensor debug mode: ", tensor_debug_mode_));
      done();
    }
  }

 private:
  int tensor_debug_mode_;
  int64 tensor_id_;
  static constexpr int64 kMaxTensorId = 1LL
                                        << std::numeric_limits<Tout>::digits;
};

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

}  // namespace tensorflow

#endif  // TENSORFLOW_CORE_KERNELS_DEBUG_OPS_H_