/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// See docs in ../ops/array_ops.cc.

// clang-format off
#include "tensorflow/core/platform/bfloat16.h"

#include <math.h>     // NOLINT
#include <algorithm>  // NOLINT
#include <numeric>    // NOLINT
// clang-format on

#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_reference.h"
#include "tensorflow/core/framework/types.h"

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#if GOOGLE_CUDA
#include "tensorflow/stream_executor/cuda/cuda_activation.h"
#elif TENSORFLOW_USE_ROCM
#include "tensorflow/core/platform/rocm.h"
#endif

namespace tensorflow {

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
template <typename T>
struct CheckNumericsLaunch {
  void Run(const GPUDevice& d, const T* data, int size,
           int abnormal_detected[2]);
};

extern template struct CheckNumericsLaunch<Eigen::half>;
extern template struct CheckNumericsLaunch<float>;
extern template struct CheckNumericsLaunch<double>;

template <typename T>
struct CheckNumericsLaunchV2 {
  void Run(const GPUDevice& d, const T* data, int size,
           int abnormal_detected[3]);
};

extern template struct CheckNumericsLaunchV2<Eigen::half>;
extern template struct CheckNumericsLaunchV2<float>;
extern template struct CheckNumericsLaunchV2<double>;
#endif

namespace {

const int kInfBit = 0x01;
const int kNaNBit = 0x02;
const int kNegativeInfBit = 0x04;
const int kPositiveInfBit = 0x08;

template <typename Device, typename T>
class CheckNumericsOp;

// Partial specialization for CPU
// TODO(jeff,rmlarsen): We should make this variant be an AsyncOpKernel, as
// was done for the GPU case below.
template <typename T>
class CheckNumericsOp<CPUDevice, T> : public OpKernel {
 public:
  explicit CheckNumericsOp(OpKernelConstruction* context) : OpKernel(context) {
    // message_ is used as the prefix for the assertion error message. For
    // instance, this can be the name of the input op that produced the tensor.
    OP_REQUIRES_OK(context, context->GetAttr("message", &message_));
  }

  void Compute(OpKernelContext* context) override {
    // Pass along the input to the output.
    context->set_output(0, context->input(0));

    auto in = context->input(0).flat<T>();
    const T* data = in.data();
    const int64 size = in.size();
    // Check to see if any element of the tensor is NaN or Inf.
    int fp_props = std::accumulate(
        data, data + size, 0,
        [this](const int x, const T& y) { return checkFloatingElement(x, y); });
    if (fp_props != 0) {
      const string& status = getErrorString(fp_props);
      if (!status.empty()) {
        context->SetStatus(errors::InvalidArgument(message_, " : Tensor had ",
                                                   status, " values"));
      }
    }
  }

 protected:
  virtual int checkFloatingElement(const int x, const T& y) {
    int result = x;
    if (TF_PREDICT_TRUE(Eigen::numext::isfinite(y))) {
      // Do nothing: common case.
    } else {
      if (Eigen::numext::isinf(y)) {
        result |= kInfBit;
      } else if (Eigen::numext::isnan(y)) {
        result |= kNaNBit;
      }
    }
    return result;
  }

  virtual const string getErrorString(const int fp_props) {
    string status;
    if ((fp_props & kInfBit) && (fp_props & kNaNBit)) {
      status = "Inf and NaN";
    } else {
      if (fp_props & kInfBit) {
        status = "Inf";
      }
      if (fp_props & kNaNBit) {
        status = "NaN";
      }
    }
    return status;
  }

 private:
  string message_;
};

template <typename Device, typename T>
class CheckNumericsV2Op;

// Partial specialization for CPU: v2.
// The v2 op differs from the v1 in that it distinguishes -inf and +inf.
template <typename T>
class CheckNumericsV2Op<CPUDevice, T> : public CheckNumericsOp<CPUDevice, T> {
 public:
  explicit CheckNumericsV2Op(OpKernelConstruction* context)
      : CheckNumericsOp<CPUDevice, T>(context) {}

 protected:
  int checkFloatingElement(const int x, const T& y) override {
    int result = x;
    if (TF_PREDICT_TRUE(Eigen::numext::isfinite(y))) {
      // Do nothing: common case.
    } else {
      if (Eigen::numext::isinf(y)) {
        result |= y < static_cast<T>(0.) ? kNegativeInfBit : kPositiveInfBit;
      } else if (Eigen::numext::isnan(y)) {
        result |= kNaNBit;
      }
    }
    return result;
  }

  const string getErrorString(const int fp_props) override {
    std::vector<string> anomalies;
    if (fp_props & kNegativeInfBit) {
      anomalies.push_back("-Inf");
    }
    if (fp_props & kPositiveInfBit) {
      anomalies.push_back("+Inf");
    }
    if (fp_props & kNaNBit) {
      anomalies.push_back("NaN");
    }
    if (anomalies.size() == 3) {
      return strings::StrCat(anomalies[0], ", ", anomalies[1], ", and ",
                             anomalies[2]);
    } else if (anomalies.size() == 2) {
      return strings::StrCat(anomalies[0], " and ", anomalies[1]);
    } else {
      return anomalies[0];
    }
  }
};

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
// Partial specialization for GPU
template <typename T>
class CheckNumericsOp<GPUDevice, T> : public AsyncOpKernel {
 public:
  typedef GPUDevice Device;

  explicit CheckNumericsOp(OpKernelConstruction* context)
      : AsyncOpKernel(context) {
    // message_ is used as the prefix for the assertion error message. For
    // instance, this can be the name of the input op that produced the tensor.
    OP_REQUIRES_OK(context, context->GetAttr("message", &message_));
  }

  void ComputeAsync(OpKernelContext* context, DoneCallback done) override {
    // Pass along the input to the output.
    context->set_output(0, context->input(0));
    if (context->input(0).NumElements() == 0) {
      done();
      return;
    }
    auto input = context->input(0).flat<T>();

    // Allocate and initialize the elements to hold the check results.
    Tensor abnormal_detected;
    const int abnormal_detected_size = getAnomalyIndicatorSize();
    OP_REQUIRES_OK(context, context->allocate_temp(
                                DT_INT32, TensorShape({abnormal_detected_size}),
                                &abnormal_detected));

    auto* stream = context->op_device_context()->stream();
    OP_REQUIRES_ASYNC(context, stream != nullptr,
                      errors::Internal("No GPU stream available."), done);

    se::DeviceMemoryBase abnormal_detected_ptr(
        abnormal_detected.flat<int>().data(),
        abnormal_detected.flat<int>().size());
    stream->ThenMemset32(&abnormal_detected_ptr, 0,
                         abnormal_detected.flat<int>().size() * sizeof(int));

    // Call the GPU kernels for the numerical checks.
    const Device& d = context->eigen_device<Device>();
    RunKernel(d, input.data(), input.size(),
              abnormal_detected.flat<int>().data());

    // Copy the results from device to host.
    AllocatorAttributes attr;
    attr.set_on_host(true);
    attr.set_gpu_compatible(true);
    Tensor abnormal_detected_host;
    OP_REQUIRES_OK_ASYNC(
        context,
        context->allocate_temp(DT_INT32, TensorShape({abnormal_detected_size}),
                               &abnormal_detected_host, attr),
        done);
    OP_REQUIRES_ASYNC(
        context,
        stream
            ->ThenMemcpy(abnormal_detected_host.flat<int>().data(),
                         abnormal_detected_ptr,
                         abnormal_detected_size * sizeof(int))
            .ok(),
        errors::Internal("GPU memcpy from device to host failed"), done);

    // We have observed crashes on some network stacks when not holding
    // this tensor reference.
    TensorReference abnormal_detected_ref(abnormal_detected);
    auto check_cb = [this, stream, abnormal_detected_ref,
                     abnormal_detected_host, context, done]() {
#if GOOGLE_CUDA
      se::cuda::ScopedActivateExecutorContext scoped_activation{
          stream->parent()};
#elif TENSORFLOW_USE_ROCM
      se::rocm::ScopedActivateExecutorContext scoped_activation{
          stream->parent()};
#endif
      TTypes<const int>::Vec abnormal_detected_host_flat =
          abnormal_detected_host.flat<int>();
      abnormal_detected_ref.Unref();
      checkForAnomalies(context, abnormal_detected_host_flat);
      done();
    };
    context->device()->tensorflow_gpu_device_info()->event_mgr->ThenExecute(
        stream, std::move(check_cb));
  }

 protected:
  virtual int getAnomalyIndicatorSize() { return 2; }

  virtual void RunKernel(const GPUDevice& d, const T* data, int size,
                         int* abnormal_detected) {
    CheckNumericsLaunch<T>().Run(d, data, size, abnormal_detected);
  }

  virtual void checkForAnomalies(
      OpKernelContext* context,
      const TTypes<const int>::Vec& abnormality_indicators) {
    const int is_nan = abnormality_indicators(0);
    const int is_inf = abnormality_indicators(1);
    if (is_nan || is_inf) {
      LOG(ERROR) << "abnormal_detected_host @" << abnormality_indicators.data()
                 << " = {" << is_nan << ", " << is_inf << "} " << message_;

      string anomalies;
      if (is_nan && is_inf) {
        anomalies = "Inf and NaN";
      } else if (is_nan) {
        anomalies = "NaN";
      } else if (is_inf) {
        anomalies = "Inf";
      }
      context->SetStatus(errors::InvalidArgument(message_, " : Tensor had ",
                                                 anomalies, " values"));
    }
  }

  string message_;
};

template <typename T>
class CheckNumericsV2Op<GPUDevice, T> : public CheckNumericsOp<GPUDevice, T> {
 public:
  CheckNumericsV2Op(OpKernelConstruction* context)
      : CheckNumericsOp<GPUDevice, T>(context) {}

 protected:
  int getAnomalyIndicatorSize() override { return 3; }

  void RunKernel(const GPUDevice& d, const T* data, int size,
                 int* abnormal_detected) override {
    CheckNumericsLaunchV2<T>().Run(d, data, size, abnormal_detected);
  }

  void checkForAnomalies(
      OpKernelContext* context,
      const TTypes<const int>::Vec& abnormality_indicators) override {
    const int is_nan = abnormality_indicators(0);
    const int is_negative_inf = abnormality_indicators(1);
    const int is_positive_inf = abnormality_indicators(2);
    if (is_negative_inf || is_positive_inf || is_nan) {
      std::vector<string> anomalies;
      if (is_negative_inf) {
        anomalies.push_back("-Inf");
      }
      if (is_positive_inf) {
        anomalies.push_back("+Inf");
      }
      if (is_nan) {
        anomalies.push_back("NaN");
      }
      string all_anomalies;
      if (anomalies.size() == 3) {
        all_anomalies = strings::StrCat(anomalies[0], ", ", anomalies[1],
                                        ", and ", anomalies[2]);
      } else if (anomalies.size() == 2) {
        all_anomalies = strings::StrCat(anomalies[0], " and ", anomalies[1]);
      } else {
        all_anomalies = anomalies[0];
      }
      context->SetStatus(errors::InvalidArgument(
          this->message_, " : Tensor had ", all_anomalies, " values"));
    }
  }

  static constexpr int abnormal_detected_size = 3;
};

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

}  // namespace

#define REGISTER_CPU_KERNEL(T)                                         \
  REGISTER_KERNEL_BUILDER(                                             \
      Name("CheckNumerics").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
      CheckNumericsOp<CPUDevice, T>);
TF_CALL_half(REGISTER_CPU_KERNEL);
TF_CALL_bfloat16(REGISTER_CPU_KERNEL);
TF_CALL_float(REGISTER_CPU_KERNEL);
TF_CALL_double(REGISTER_CPU_KERNEL);

#define REGISTER_V2_CPU_KERNEL(T)                                        \
  REGISTER_KERNEL_BUILDER(                                               \
      Name("CheckNumericsV2").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
      CheckNumericsV2Op<CPUDevice, T>);
TF_CALL_half(REGISTER_V2_CPU_KERNEL);
TF_CALL_bfloat16(REGISTER_V2_CPU_KERNEL);
TF_CALL_float(REGISTER_V2_CPU_KERNEL);
TF_CALL_double(REGISTER_V2_CPU_KERNEL);

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
REGISTER_KERNEL_BUILDER(
    Name("CheckNumerics").Device(DEVICE_GPU).TypeConstraint<Eigen::half>("T"),
    CheckNumericsOp<GPUDevice, Eigen::half>);
REGISTER_KERNEL_BUILDER(
    Name("CheckNumerics").Device(DEVICE_GPU).TypeConstraint<float>("T"),
    CheckNumericsOp<GPUDevice, float>);
REGISTER_KERNEL_BUILDER(
    Name("CheckNumerics").Device(DEVICE_GPU).TypeConstraint<double>("T"),
    CheckNumericsOp<GPUDevice, double>);

REGISTER_KERNEL_BUILDER(
    Name("CheckNumericsV2").Device(DEVICE_GPU).TypeConstraint<Eigen::half>("T"),
    CheckNumericsV2Op<GPUDevice, Eigen::half>);
REGISTER_KERNEL_BUILDER(
    Name("CheckNumericsV2").Device(DEVICE_GPU).TypeConstraint<float>("T"),
    CheckNumericsV2Op<GPUDevice, float>);
REGISTER_KERNEL_BUILDER(
    Name("CheckNumericsV2").Device(DEVICE_GPU).TypeConstraint<double>("T"),
    CheckNumericsV2Op<GPUDevice, double>);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

}  // namespace tensorflow