• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "cuda/include/cublas_v2.h"
17 #include "cuda/include/cuda.h"
18 
19 #define SE_CUDA_DATA_HALF CUDA_R_16F
20 
21 #include "tensorflow/stream_executor/cuda/cuda_blas.h"
22 
23 // Both Eigen Half.h and CUDA cuda_fp16.h provide similar typedef for __half. As
24 // such, there are two ways to get the typedef for __half:
25 //
26 // (1) Includes cuda_fp16.h and defines EIGEN_HAS_CUDA_FP16.
27 // (2) Neither includes cuda_fp16.h nor defines EIGEN_HAS_CUDA_FP16.
28 //
29 // Due to issue b/73793421, when the first approach is used and NVCC is used to
30 // compile this file, NVCC will complain duplicated definition for
31 // EIGEN_HAS_CUDA_FP16. On the other hand, when the second approach is used and
32 // clang is used to compile this file, clang will not understand __half
33 // due to missing the definition and macro EIGEN_HAS_CUDA_FP16.
34 //
35 // Because this file may be compiled with CLANG but will never be compiled with
36 // NVCC, we choose the first approach for CUDA < 9.0. For CUDA >= 9.0, we have
37 // to use the second approach because the data member in the __half defined
38 // by CUDA > 9.0 is `__x` while Eigen expects it to be `x`.
39 //
40 // TODO(b/73793421): Remove the following code block to switch to the second
41 // approach when the issue is fixed.
42 #if CUDA_VERSION < 9000
43 #include "cuda/include/cuda_fp16.h"
44 #define EIGEN_HAS_CUDA_FP16
45 #endif
46 
47 #include "third_party/eigen3/Eigen/Core"
48 
49 #include <assert.h>
50 #include <complex>
51 
52 #include "absl/strings/str_cat.h"
53 #include "tensorflow/core/util/env_var.h"
54 #include "tensorflow/stream_executor/cuda/cuda_activation.h"
55 #include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"
56 #include "tensorflow/stream_executor/cuda/cuda_helpers.h"
57 #include "tensorflow/stream_executor/cuda/cuda_platform_id.h"
58 #include "tensorflow/stream_executor/cuda/cuda_stream.h"
59 #include "tensorflow/stream_executor/cuda/cuda_timer.h"
60 #include "tensorflow/stream_executor/device_memory.h"
61 #include "tensorflow/stream_executor/lib/env.h"
62 #include "tensorflow/stream_executor/lib/initialize.h"
63 #include "tensorflow/stream_executor/lib/status.h"
64 #include "tensorflow/stream_executor/lib/status_macros.h"
65 #include "tensorflow/stream_executor/lib/stringprintf.h"
66 #include "tensorflow/stream_executor/platform/logging.h"
67 #include "tensorflow/stream_executor/platform/port.h"
68 #include "tensorflow/stream_executor/plugin_registry.h"
69 #include "tensorflow/stream_executor/scratch_allocator.h"
70 #include "tensorflow/stream_executor/stream_executor.h"
71 
72 namespace stream_executor {
73 namespace gpu {
74 
75 PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuBlasPlugin);
76 
ToString(cublasStatus_t status)77 static string ToString(cublasStatus_t status) {
78   switch (status) {
79     case CUBLAS_STATUS_SUCCESS:
80       return "CUBLAS_STATUS_SUCCESS";
81     case CUBLAS_STATUS_NOT_INITIALIZED:
82       return "CUBLAS_STATUS_NOT_INITIALIZED";
83     case CUBLAS_STATUS_ALLOC_FAILED:
84       return "CUBLAS_STATUS_ALLOC_FAILED";
85     case CUBLAS_STATUS_INVALID_VALUE:
86       return "CUBLAS_STATUS_INVALID_VALUE";
87     case CUBLAS_STATUS_ARCH_MISMATCH:
88       return "CUBLAS_STATUS_ARCH_MISMATCH";
89     case CUBLAS_STATUS_MAPPING_ERROR:
90       return "CUBLAS_STATUS_MAPPING_ERROR";
91     case CUBLAS_STATUS_EXECUTION_FAILED:
92       return "CUBLAS_STATUS_EXECUTION_FAILED";
93     case CUBLAS_STATUS_INTERNAL_ERROR:
94       return "CUBLAS_STATUS_INTERNAL_ERROR";
95 #if CUDA_VERSION >= 8000
96     case CUBLAS_STATUS_NOT_SUPPORTED:
97       return "CUBLAS_STATUS_NOT_SUPPORTED";
98     case CUBLAS_STATUS_LICENSE_ERROR:
99       return "CUBLAS_STATUS_LICENSE_ERROR";
100 #endif
101     default:
102       return absl::StrCat("<invalid cublas status: ", status, ">");
103   }
104 }
105 
106 // Decide whether to enable TENSOR_OP_MATH
TensorOpMathEnabled()107 static bool TensorOpMathEnabled() {
108   static bool is_enabled = [] {
109     bool is_disabled;
110     TF_CHECK_OK(
111         tensorflow::ReadBoolFromEnvVar("TF_DISABLE_CUBLAS_TENSOR_OP_MATH",
112                                        /*default_val=*/false, &is_disabled));
113     return !is_disabled;
114   }();
115   return is_enabled;
116 }
117 
118 // cuBLAS has interfaces that permit pointers to be passed from either the host
119 // memory space or the device memory space; however, you must instruct it as to
120 // which address space those pointers are in with cublasSetPointerMode.
121 //
122 // This helper sets the cuBLAS pointer mode to a desired value for a cuBLAS call
123 // you are about to perform in a given scope.
124 //
125 // The prior cuBLAS pointer mode is retained and restored when this object goes
126 // out of scope.
127 class ScopedCublasPointerMode {
128  public:
129   // Note that, because the setting of the cublas pointer mode is fallible,
130   // construction of this scoped datatype must be paired with a call to
131   // Init().
132   //
133   // Parameters:
134   //  handle: The cublas library handle to act upon in setting the pointer mode.
ScopedCublasPointerMode(cublasHandle_t handle)135   explicit ScopedCublasPointerMode(cublasHandle_t handle)
136       : handle_(handle), ok_(false) {}
137 
138   // Attempts the switch to the requested scoped pointer mode, new_mode.
139   //
140   // Note that when false is returned, an appropriate error has already been
141   // logged.
Init(cublasPointerMode_t new_mode)142   bool Init(cublasPointerMode_t new_mode) {
143     cublasStatus_t ret = cublasGetPointerMode(handle_, &old_mode_);
144     if (ret != CUBLAS_STATUS_SUCCESS) {
145       LOG(ERROR) << "failed to get old cublas pointer mode: " << ToString(ret);
146       return ok_ = false;
147     }
148 
149     ret = cublasSetPointerMode(handle_, new_mode);
150     if (ret != CUBLAS_STATUS_SUCCESS) {
151       LOG(ERROR) << "failed to set new cublas pointer mode: " << ToString(ret);
152       return ok_ = false;
153     }
154 
155     return ok_ = true;
156   }
157 
158   // Switches back to the prior pointer mode, if the switch operation was
159   // successful in the first place.
~ScopedCublasPointerMode()160   ~ScopedCublasPointerMode() {
161     if (ok_) {
162       cublasStatus_t ret = cublasSetPointerMode(handle_, old_mode_);
163       if (ret != CUBLAS_STATUS_SUCCESS) {
164         LOG(ERROR) << "failed to set former cublas pointer mode: "
165                    << ToString(ret);
166       }
167     }
168   }
169 
170  private:
171   cublasHandle_t handle_;  // Handle to the cuBLAS instance of interest.
172   cublasPointerMode_t old_mode_;  // Prior cuBLAS pointer mode, to be restored.
173   bool ok_;                       // Whether the change was successful.
174 };
175 
#if CUDA_VERSION >= 9000
// cuBLAS can be directed to use the Volta tensor-op hardware paths via the
// cublasGet/SetMathMode APIs (available from CUDA 9.0).
//
// This RAII helper switches the cuBLAS math mode for the duration of a scope
// and restores the previously active mode when it goes out of scope.
class ScopedCublasMathMode {
 public:
  // Construction performs no cuBLAS calls; the fallible mode switch happens
  // in Init(), which must be called (and its result checked) before relying
  // on the new mode.
  //
  // Parameters:
  //  handle: The cublas library handle whose math mode is manipulated.
  explicit ScopedCublasMathMode(cublasHandle_t handle)
      : handle_(handle), ok_(false) {}

  // Saves the current math mode, then switches to new_mode. Returns false
  // after logging if either cuBLAS call fails; in that case the destructor
  // performs no restore.
  bool Init(cublasMath_t new_mode) {
    cublasStatus_t status = cublasGetMathMode(handle_, &old_mode_);
    if (status != CUBLAS_STATUS_SUCCESS) {
      LOG(ERROR) << "failed to get old cublas math mode: " << ToString(status);
      return ok_ = false;
    }

    status = cublasSetMathMode(handle_, new_mode);
    if (status != CUBLAS_STATUS_SUCCESS) {
      LOG(ERROR) << "failed to set new cublas math mode: " << ToString(status);
      return ok_ = false;
    }
    return ok_ = true;
  }

  // Restores the saved math mode, but only if Init() fully succeeded.
  ~ScopedCublasMathMode() {
    if (!ok_) {
      return;
    }
    cublasStatus_t status = cublasSetMathMode(handle_, old_mode_);
    if (status != CUBLAS_STATUS_SUCCESS) {
      LOG(ERROR) << "failed to set former cublas math mode: "
                 << ToString(status);
    }
  }

 private:
  cublasHandle_t handle_;  // Handle to the cuBLAS instance of interest.
  cublasMath_t old_mode_;  // Prior cuBLAS math mode, to be restored.
  bool ok_;                // Whether Init() fully succeeded.
};
#endif  // CUDA_VERSION >= 9000
233 
Init()234 bool CUDABlas::Init() {
235   gpu::ScopedActivateExecutorContext sac{parent_};
236   cublasStatus_t ret = cublasCreate(&blas_);
237   if (ret != CUBLAS_STATUS_SUCCESS) {
238     LOG(ERROR) << "failed to create cublas handle: " << ToString(ret);
239     return false;
240   }
241 
242   return true;
243 }
244 
// Binds this blas support object to its owning (non-null) executor. The
// cuBLAS handle itself is created later, in Init().
CUDABlas::CUDABlas(gpu::GpuExecutor *parent)
    : parent_(CHECK_NOTNULL(parent)), blas_(nullptr) {}
247 
~CUDABlas()248 CUDABlas::~CUDABlas() {
249   if (blas_ != nullptr) {
250     gpu::ScopedActivateExecutorContext sac{parent_};
251     cublasDestroy(blas_);
252   }
253 }
254 
SetStream(Stream * stream)255 bool CUDABlas::SetStream(Stream *stream) {
256   CHECK(stream != nullptr);
257   CHECK(AsGpuStreamValue(stream) != nullptr);
258   CHECK(blas_ != nullptr);
259   gpu::ScopedActivateExecutorContext sac{parent_};
260   cublasStatus_t ret = cublasSetStream(blas_, AsGpuStreamValue(stream));
261   if (ret != CUBLAS_STATUS_SUCCESS) {
262     LOG(ERROR) << "failed to set stream for cuBLAS calls: " << ToString(ret);
263     return false;
264   }
265 
266   return true;
267 }
268 
269 namespace {
270 
271 // Helper functions transforming blas arguments into cuBLAS arguments.
272 
CUDABlasTranspose(blas::Transpose trans)273 cublasOperation_t CUDABlasTranspose(blas::Transpose trans) {
274   switch (trans) {
275     case blas::Transpose::kNoTranspose:
276       return CUBLAS_OP_N;
277     case blas::Transpose::kTranspose:
278       return CUBLAS_OP_T;
279     case blas::Transpose::kConjugateTranspose:
280       return CUBLAS_OP_C;
281     default:
282       LOG(FATAL) << "Invalid value of blas::Transpose.";
283   }
284 }
285 
CUDABlasUpperLower(blas::UpperLower uplo)286 cublasFillMode_t CUDABlasUpperLower(blas::UpperLower uplo) {
287   switch (uplo) {
288     case blas::UpperLower::kUpper:
289       return CUBLAS_FILL_MODE_UPPER;
290     case blas::UpperLower::kLower:
291       return CUBLAS_FILL_MODE_LOWER;
292     default:
293       LOG(FATAL) << "Invalid value of blas::UpperLower.";
294   }
295 }
296 
CUDABlasDiagonal(blas::Diagonal diag)297 cublasDiagType_t CUDABlasDiagonal(blas::Diagonal diag) {
298   switch (diag) {
299     case blas::Diagonal::kUnit:
300       return CUBLAS_DIAG_UNIT;
301     case blas::Diagonal::kNonUnit:
302       return CUBLAS_DIAG_NON_UNIT;
303     default:
304       LOG(FATAL) << "Invalid value of blas::Diagonal.";
305   }
306 }
307 
CUDABlasSide(blas::Side side)308 cublasSideMode_t CUDABlasSide(blas::Side side) {
309   switch (side) {
310     case blas::Side::kLeft:
311       return CUBLAS_SIDE_LEFT;
312     case blas::Side::kRight:
313       return CUBLAS_SIDE_RIGHT;
314     default:
315       LOG(FATAL) << "Invalid value of blas::Side.";
316   }
317 }
318 
// CUDADataType<T>::type translates from a C++ type (e.g. float) to a
// cudaDataType_t (e.g. CUDA_R_32F).  CUDAComputationType(ty) translates from a
// blas::ComputationType to a cudaDataType_t.
//
// These are used to build the argument type and computation type args to
// cublasGemmEx.
template <typename T>
struct CUDADataType;

// SE_CUDA_DATA_HALF is #defined to CUDA_R_16F near the top of this file,
// before cuda_blas.h is included.
template <>
struct CUDADataType<Eigen::half> {
  static constexpr cudaDataType_t type = SE_CUDA_DATA_HALF;
};

template <>
struct CUDADataType<std::complex<Eigen::half>> {
  static constexpr cudaDataType_t type = CUDA_C_16F;
};

template <>
struct CUDADataType<float> {
  static constexpr cudaDataType_t type = CUDA_R_32F;
};

template <>
struct CUDADataType<std::complex<float>> {
  static constexpr cudaDataType_t type = CUDA_C_32F;
};

template <>
struct CUDADataType<double> {
  static constexpr cudaDataType_t type = CUDA_R_64F;
};

template <>
struct CUDADataType<std::complex<double>> {
  static constexpr cudaDataType_t type = CUDA_C_64F;
};

// Integer variants (real and complex), as accepted by cublasGemmEx.
template <>
struct CUDADataType<int> {
  static constexpr cudaDataType_t type = CUDA_R_32I;
};

template <>
struct CUDADataType<int8> {
  static constexpr cudaDataType_t type = CUDA_R_8I;
};

template <>
struct CUDADataType<std::complex<int8>> {
  static constexpr cudaDataType_t type = CUDA_C_8I;
};

template <>
struct CUDADataType<uint8> {
  static constexpr cudaDataType_t type = CUDA_R_8U;
};

template <>
struct CUDADataType<std::complex<uint8>> {
  static constexpr cudaDataType_t type = CUDA_C_8U;
};
382 
CUDAComputationType(blas::ComputationType ty)383 cudaDataType_t CUDAComputationType(blas::ComputationType ty) {
384   switch (ty) {
385     case blas::ComputationType::kF16:
386       return CUDA_R_16F;
387     case blas::ComputationType::kF32:
388       return CUDA_R_32F;
389     case blas::ComputationType::kF64:
390       return CUDA_R_64F;
391     case blas::ComputationType::kI32:
392       return CUDA_R_32I;
393     case blas::ComputationType::kComplexF32:
394       return CUDA_C_32F;
395     case blas::ComputationType::kComplexF64:
396       return CUDA_C_64F;
397   }
398 }
399 }  // namespace
400 
401 template <typename FuncT, typename... Args>
DoBlasInternalImpl(FuncT cublas_func,Stream * stream,bool pointer_mode_host,bool err_on_failure,bool use_tensor_op_math,Args...args)402 bool CUDABlas::DoBlasInternalImpl(FuncT cublas_func, Stream *stream,
403                                   bool pointer_mode_host, bool err_on_failure,
404                                   bool use_tensor_op_math, Args... args) {
405   mutex_lock lock(mu_);
406 
407   CHECK(blas_ != nullptr);
408   if (!SetStream(stream)) {
409     return false;
410   }
411 
412   gpu::ScopedActivateExecutorContext sac{parent_};
413   ScopedCublasPointerMode pointer_mode{blas_};
414   if (!pointer_mode.Init(pointer_mode_host ? CUBLAS_POINTER_MODE_HOST
415                                            : CUBLAS_POINTER_MODE_DEVICE)) {
416     return false;
417   }
418 #if CUDA_VERSION >= 9000
419   ScopedCublasMathMode math_mode{blas_};
420   if (use_tensor_op_math) {
421     if (!math_mode.Init(CUBLAS_TENSOR_OP_MATH)) {
422       return false;
423     }
424   }
425 #endif
426   cublasStatus_t ret = cublas_func(blas_, args...);
427   if ((err_on_failure || VLOG_IS_ON(3)) && ret != CUBLAS_STATUS_SUCCESS) {
428     LOG(ERROR) << "failed to run cuBLAS routine: " << ToString(ret);
429   }
430   return ret == CUBLAS_STATUS_SUCCESS;
431 }
432 
DoBlasAsum(Stream * stream,uint64 elem_count,const DeviceMemory<float> & x,int incx,DeviceMemory<float> * result)433 bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count,
434                           const DeviceMemory<float> &x, int incx,
435                           DeviceMemory<float> *result) {
436   return DoBlasInternal(cublasSasum, stream, false /* = pointer_mode_host */,
437                         elem_count, GpuMemory(x), incx,
438                         GpuMemoryMutable(result));
439 }
440 
DoBlasAsum(Stream * stream,uint64 elem_count,const DeviceMemory<double> & x,int incx,DeviceMemory<double> * result)441 bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count,
442                           const DeviceMemory<double> &x, int incx,
443                           DeviceMemory<double> *result) {
444   return DoBlasInternal(cublasDasum, stream, false /* = pointer_mode_host */,
445                         elem_count, GpuMemory(x), incx,
446                         GpuMemoryMutable(result));
447 }
448 
DoBlasAsum(Stream * stream,uint64 elem_count,const DeviceMemory<std::complex<float>> & x,int incx,DeviceMemory<float> * result)449 bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count,
450                           const DeviceMemory<std::complex<float>> &x, int incx,
451                           DeviceMemory<float> *result) {
452   return DoBlasInternal(cublasScasum, stream, false /* = pointer_mode_host */,
453                         elem_count, GpuComplex(GpuMemory(x)), incx,
454                         GpuMemoryMutable(result));
455 }
456 
DoBlasAsum(Stream * stream,uint64 elem_count,const DeviceMemory<std::complex<double>> & x,int incx,DeviceMemory<double> * result)457 bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count,
458                           const DeviceMemory<std::complex<double>> &x, int incx,
459                           DeviceMemory<double> *result) {
460   return DoBlasInternal(cublasDzasum, stream, false /* = pointer_mode_host */,
461                         elem_count, GpuComplex(GpuMemory(x)), incx,
462                         GpuMemoryMutable(result));
463 }
464 
DoBlasAxpy(Stream * stream,uint64 elem_count,float alpha,const DeviceMemory<float> & x,int incx,DeviceMemory<float> * y,int incy)465 bool CUDABlas::DoBlasAxpy(Stream *stream, uint64 elem_count, float alpha,
466                           const DeviceMemory<float> &x, int incx,
467                           DeviceMemory<float> *y, int incy) {
468   return DoBlasInternal(cublasSaxpy, stream, true /* = pointer_mode_host */,
469                         elem_count, &alpha, GpuMemory(x), incx,
470                         GpuMemoryMutable(y), incy);
471 }
472 
DoBlasAxpy(Stream * stream,uint64 elem_count,double alpha,const DeviceMemory<double> & x,int incx,DeviceMemory<double> * y,int incy)473 bool CUDABlas::DoBlasAxpy(Stream *stream, uint64 elem_count, double alpha,
474                           const DeviceMemory<double> &x, int incx,
475                           DeviceMemory<double> *y, int incy) {
476   return DoBlasInternal(cublasDaxpy, stream, true /* = pointer_mode_host */,
477                         elem_count, &alpha, GpuMemory(x), incx,
478                         GpuMemoryMutable(y), incy);
479 }
480 
DoBlasAxpy(Stream * stream,uint64 elem_count,std::complex<float> alpha,const DeviceMemory<std::complex<float>> & x,int incx,DeviceMemory<std::complex<float>> * y,int incy)481 bool CUDABlas::DoBlasAxpy(Stream *stream, uint64 elem_count,
482                           std::complex<float> alpha,
483                           const DeviceMemory<std::complex<float>> &x, int incx,
484                           DeviceMemory<std::complex<float>> *y, int incy) {
485   return DoBlasInternal(cublasCaxpy, stream, true /* = pointer_mode_host */,
486                         elem_count, GpuComplex(&alpha),
487                         GpuComplex(GpuMemory(x)), incx,
488                         GpuComplex(GpuMemoryMutable(y)), incy);
489 }
490 
DoBlasAxpy(Stream * stream,uint64 elem_count,std::complex<double> alpha,const DeviceMemory<std::complex<double>> & x,int incx,DeviceMemory<std::complex<double>> * y,int incy)491 bool CUDABlas::DoBlasAxpy(Stream *stream, uint64 elem_count,
492                           std::complex<double> alpha,
493                           const DeviceMemory<std::complex<double>> &x, int incx,
494                           DeviceMemory<std::complex<double>> *y, int incy) {
495   return DoBlasInternal(cublasZaxpy, stream, true /* = pointer_mode_host */,
496                         elem_count, GpuComplex(&alpha),
497                         GpuComplex(GpuMemory(x)), incx,
498                         GpuComplex(GpuMemoryMutable(y)), incy);
499 }
500 
DoBlasCopy(Stream * stream,uint64 elem_count,const DeviceMemory<float> & x,int incx,DeviceMemory<float> * y,int incy)501 bool CUDABlas::DoBlasCopy(Stream *stream, uint64 elem_count,
502                           const DeviceMemory<float> &x, int incx,
503                           DeviceMemory<float> *y, int incy) {
504   return DoBlasInternal(cublasScopy, stream, true /* = pointer_mode_host */,
505                         elem_count, GpuMemory(x), incx, GpuMemoryMutable(y),
506                         incy);
507 }
508 
DoBlasCopy(Stream * stream,uint64 elem_count,const DeviceMemory<double> & x,int incx,DeviceMemory<double> * y,int incy)509 bool CUDABlas::DoBlasCopy(Stream *stream, uint64 elem_count,
510                           const DeviceMemory<double> &x, int incx,
511                           DeviceMemory<double> *y, int incy) {
512   return DoBlasInternal(cublasDcopy, stream, true /* = pointer_mode_host */,
513                         elem_count, GpuMemory(x), incx, GpuMemoryMutable(y),
514                         incy);
515 }
516 
DoBlasCopy(Stream * stream,uint64 elem_count,const DeviceMemory<std::complex<float>> & x,int incx,DeviceMemory<std::complex<float>> * y,int incy)517 bool CUDABlas::DoBlasCopy(Stream *stream, uint64 elem_count,
518                           const DeviceMemory<std::complex<float>> &x, int incx,
519                           DeviceMemory<std::complex<float>> *y, int incy) {
520   return DoBlasInternal(cublasCcopy, stream, true /* = pointer_mode_host */,
521                         elem_count, GpuComplex(GpuMemory(x)), incx,
522                         GpuComplex(GpuMemoryMutable(y)), incy);
523 }
524 
DoBlasCopy(Stream * stream,uint64 elem_count,const DeviceMemory<std::complex<double>> & x,int incx,DeviceMemory<std::complex<double>> * y,int incy)525 bool CUDABlas::DoBlasCopy(Stream *stream, uint64 elem_count,
526                           const DeviceMemory<std::complex<double>> &x, int incx,
527                           DeviceMemory<std::complex<double>> *y, int incy) {
528   return DoBlasInternal(cublasZcopy, stream, true /* = pointer_mode_host */,
529                         elem_count, GpuComplex(GpuMemory(x)), incx,
530                         GpuComplex(GpuMemoryMutable(y)), incy);
531 }
532 
DoBlasDot(Stream * stream,uint64 elem_count,const DeviceMemory<float> & x,int incx,const DeviceMemory<float> & y,int incy,DeviceMemory<float> * result)533 bool CUDABlas::DoBlasDot(Stream *stream, uint64 elem_count,
534                          const DeviceMemory<float> &x, int incx,
535                          const DeviceMemory<float> &y, int incy,
536                          DeviceMemory<float> *result) {
537   return DoBlasInternal(cublasSdot, stream, false /* = pointer_mode_host */,
538                         elem_count, GpuMemory(x), incx, GpuMemory(y), incy,
539                         GpuMemoryMutable(result));
540 }
541 
DoBlasDot(Stream * stream,uint64 elem_count,const DeviceMemory<double> & x,int incx,const DeviceMemory<double> & y,int incy,DeviceMemory<double> * result)542 bool CUDABlas::DoBlasDot(Stream *stream, uint64 elem_count,
543                          const DeviceMemory<double> &x, int incx,
544                          const DeviceMemory<double> &y, int incy,
545                          DeviceMemory<double> *result) {
546   return DoBlasInternal(cublasDdot, stream, false /* = pointer_mode_host */,
547                         elem_count, GpuMemory(x), incx, GpuMemory(y), incy,
548                         GpuMemoryMutable(result));
549 }
550 
DoBlasDotc(Stream * stream,uint64 elem_count,const DeviceMemory<std::complex<float>> & x,int incx,const DeviceMemory<std::complex<float>> & y,int incy,DeviceMemory<std::complex<float>> * result)551 bool CUDABlas::DoBlasDotc(Stream *stream, uint64 elem_count,
552                           const DeviceMemory<std::complex<float>> &x, int incx,
553                           const DeviceMemory<std::complex<float>> &y, int incy,
554                           DeviceMemory<std::complex<float>> *result) {
555   return DoBlasInternal(cublasCdotc, stream, false /* = pointer_mode_host */,
556                         elem_count, GpuComplex(GpuMemory(x)), incx,
557                         GpuComplex(GpuMemory(y)), incy,
558                         GpuComplex(GpuMemoryMutable(result)));
559 }
560 
DoBlasDotc(Stream * stream,uint64 elem_count,const DeviceMemory<std::complex<double>> & x,int incx,const DeviceMemory<std::complex<double>> & y,int incy,DeviceMemory<std::complex<double>> * result)561 bool CUDABlas::DoBlasDotc(Stream *stream, uint64 elem_count,
562                           const DeviceMemory<std::complex<double>> &x, int incx,
563                           const DeviceMemory<std::complex<double>> &y, int incy,
564                           DeviceMemory<std::complex<double>> *result) {
565   return DoBlasInternal(cublasZdotc, stream, false /* = pointer_mode_host */,
566                         elem_count, GpuComplex(GpuMemory(x)), incx,
567                         GpuComplex(GpuMemory(y)), incy,
568                         GpuComplex(GpuMemoryMutable(result)));
569 }
570 
DoBlasDotu(Stream * stream,uint64 elem_count,const DeviceMemory<std::complex<float>> & x,int incx,const DeviceMemory<std::complex<float>> & y,int incy,DeviceMemory<std::complex<float>> * result)571 bool CUDABlas::DoBlasDotu(Stream *stream, uint64 elem_count,
572                           const DeviceMemory<std::complex<float>> &x, int incx,
573                           const DeviceMemory<std::complex<float>> &y, int incy,
574                           DeviceMemory<std::complex<float>> *result) {
575   return DoBlasInternal(cublasCdotu, stream, false /* = pointer_mode_host */,
576                         elem_count, GpuComplex(GpuMemory(x)), incx,
577                         GpuComplex(GpuMemory(y)), incy,
578                         GpuComplex(GpuMemoryMutable(result)));
579 }
580 
DoBlasDotu(Stream * stream,uint64 elem_count,const DeviceMemory<std::complex<double>> & x,int incx,const DeviceMemory<std::complex<double>> & y,int incy,DeviceMemory<std::complex<double>> * result)581 bool CUDABlas::DoBlasDotu(Stream *stream, uint64 elem_count,
582                           const DeviceMemory<std::complex<double>> &x, int incx,
583                           const DeviceMemory<std::complex<double>> &y, int incy,
584                           DeviceMemory<std::complex<double>> *result) {
585   return DoBlasInternal(cublasZdotu, stream, false /* = pointer_mode_host */,
586                         elem_count, GpuComplex(GpuMemory(x)), incx,
587                         GpuComplex(GpuMemory(y)), incy,
588                         GpuComplex(GpuMemoryMutable(result)));
589 }
590 
DoBlasNrm2(Stream * stream,uint64 elem_count,const DeviceMemory<float> & x,int incx,DeviceMemory<float> * result)591 bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count,
592                           const DeviceMemory<float> &x, int incx,
593                           DeviceMemory<float> *result) {
594   return DoBlasInternal(cublasSnrm2, stream, false /* = pointer_mode_host */,
595                         elem_count, GpuMemory(x), incx,
596                         GpuMemoryMutable(result));
597 }
598 
DoBlasNrm2(Stream * stream,uint64 elem_count,const DeviceMemory<double> & x,int incx,DeviceMemory<double> * result)599 bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count,
600                           const DeviceMemory<double> &x, int incx,
601                           DeviceMemory<double> *result) {
602   return DoBlasInternal(cublasDnrm2, stream, false /* = pointer_mode_host */,
603                         elem_count, GpuMemory(x), incx,
604                         GpuMemoryMutable(result));
605 }
606 
DoBlasNrm2(Stream * stream,uint64 elem_count,const DeviceMemory<std::complex<float>> & x,int incx,DeviceMemory<float> * result)607 bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count,
608                           const DeviceMemory<std::complex<float>> &x, int incx,
609                           DeviceMemory<float> *result) {
610   return DoBlasInternal(cublasScnrm2, stream, false /* = pointer_mode_host */,
611                         elem_count, GpuComplex(GpuMemory(x)), incx,
612                         GpuMemoryMutable(result));
613 }
614 
DoBlasNrm2(Stream * stream,uint64 elem_count,const DeviceMemory<std::complex<double>> & x,int incx,DeviceMemory<double> * result)615 bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count,
616                           const DeviceMemory<std::complex<double>> &x, int incx,
617                           DeviceMemory<double> *result) {
618   return DoBlasInternal(cublasDznrm2, stream, false /* = pointer_mode_host */,
619                         elem_count, GpuComplex(GpuMemory(x)), incx,
620                         GpuMemoryMutable(result));
621 }
622 
// BLAS *rot: applies a plane (Givens) rotation defined by scalars (c, s) to
// the vector pair (x, y) in place.  The rotation scalars are passed by host
// address (&c, &s), matching pointer_mode_host == true.
bool CUDABlas::DoBlasRot(Stream *stream, uint64 elem_count,
                         DeviceMemory<float> *x, int incx,
                         DeviceMemory<float> *y, int incy, float c, float s) {
  return DoBlasInternal(cublasSrot, stream, true /* = pointer_mode_host */,
                        elem_count, GpuMemoryMutable(x), incx,
                        GpuMemoryMutable(y), incy, &c, &s);
}

// Double-precision variant (cublasDrot).
bool CUDABlas::DoBlasRot(Stream *stream, uint64 elem_count,
                         DeviceMemory<double> *x, int incx,
                         DeviceMemory<double> *y, int incy, double c,
                         double s) {
  return DoBlasInternal(cublasDrot, stream, true /* = pointer_mode_host */,
                        elem_count, GpuMemoryMutable(x), incx,
                        GpuMemoryMutable(y), incy, &c, &s);
}

// Single-precision complex vectors rotated by real scalars (cublasCsrot).
bool CUDABlas::DoBlasRot(Stream *stream, uint64 elem_count,
                         DeviceMemory<std::complex<float>> *x, int incx,
                         DeviceMemory<std::complex<float>> *y, int incy,
                         float c, float s) {
  return DoBlasInternal(cublasCsrot, stream, true /* = pointer_mode_host */,
                        elem_count, GpuComplex(GpuMemoryMutable(x)), incx,
                        GpuComplex(GpuMemoryMutable(y)), incy, &c, &s);
}

// Double-precision complex vectors rotated by real scalars (cublasZdrot).
bool CUDABlas::DoBlasRot(Stream *stream, uint64 elem_count,
                         DeviceMemory<std::complex<double>> *x, int incx,
                         DeviceMemory<std::complex<double>> *y, int incy,
                         double c, double s) {
  return DoBlasInternal(cublasZdrot, stream, true /* = pointer_mode_host */,
                        elem_count, GpuComplex(GpuMemoryMutable(x)), incx,
                        GpuComplex(GpuMemoryMutable(y)), incy, &c, &s);
}
657 
// BLAS *rotg: constructs a Givens rotation from (a, b); all four operands
// live in device memory, hence pointer_mode_host == false.
bool CUDABlas::DoBlasRotg(Stream *stream, DeviceMemory<float> *a,
                          DeviceMemory<float> *b, DeviceMemory<float> *c,
                          DeviceMemory<float> *s) {
  return DoBlasInternal(cublasSrotg, stream, false /* = pointer_mode_host */,
                        GpuMemoryMutable(a), GpuMemoryMutable(b),
                        GpuMemoryMutable(c), GpuMemoryMutable(s));
}

// Double-precision variant (cublasDrotg).
// NOTE(review): only `a` is wrapped in GpuComplex() here, unlike the float
// overload above which passes GpuMemoryMutable(a) directly — presumably
// GpuComplex() is an identity for real-typed pointers; confirm and make the
// two overloads consistent.
bool CUDABlas::DoBlasRotg(Stream *stream, DeviceMemory<double> *a,
                          DeviceMemory<double> *b, DeviceMemory<double> *c,
                          DeviceMemory<double> *s) {
  return DoBlasInternal(cublasDrotg, stream, false /* = pointer_mode_host */,
                        GpuComplex(GpuMemoryMutable(a)), GpuMemoryMutable(b),
                        GpuMemoryMutable(c), GpuMemoryMutable(s));
}

// Single-precision complex variant (cublasCrotg); c is real, a/b/s complex.
bool CUDABlas::DoBlasRotg(Stream *stream, DeviceMemory<std::complex<float>> *a,
                          DeviceMemory<std::complex<float>> *b,
                          DeviceMemory<float> *c,
                          DeviceMemory<std::complex<float>> *s) {
  return DoBlasInternal(
      cublasCrotg, stream, false /* = pointer_mode_host */,
      GpuComplex(GpuMemoryMutable(a)), GpuComplex(GpuMemoryMutable(b)),
      GpuComplex(GpuMemoryMutable(c)), GpuComplex(GpuMemoryMutable(s)));
}

// Double-precision complex variant (cublasZrotg); c is real, a/b/s complex.
bool CUDABlas::DoBlasRotg(Stream *stream, DeviceMemory<std::complex<double>> *a,
                          DeviceMemory<std::complex<double>> *b,
                          DeviceMemory<double> *c,
                          DeviceMemory<std::complex<double>> *s) {
  return DoBlasInternal(
      cublasZrotg, stream, false /* = pointer_mode_host */,
      GpuComplex(GpuMemoryMutable(a)), GpuComplex(GpuMemoryMutable(b)),
      GpuComplex(GpuMemoryMutable(c)), GpuComplex(GpuMemoryMutable(s)));
}
693 
// BLAS *rotm: applies a modified Givens rotation described by the 5-element
// `param` array (device-resident, hence pointer_mode_host == false) to the
// vector pair (x, y) in place.
bool CUDABlas::DoBlasRotm(Stream *stream, uint64 elem_count,
                          DeviceMemory<float> *x, int incx,
                          DeviceMemory<float> *y, int incy,
                          const DeviceMemory<float> &param) {
  return DoBlasInternal(cublasSrotm, stream, false /* = pointer_mode_host */,
                        elem_count, GpuMemoryMutable(x), incx,
                        GpuMemoryMutable(y), incy, GpuMemory(param));
}

// Double-precision variant (cublasDrotm).
bool CUDABlas::DoBlasRotm(Stream *stream, uint64 elem_count,
                          DeviceMemory<double> *x, int incx,
                          DeviceMemory<double> *y, int incy,
                          const DeviceMemory<double> &param) {
  return DoBlasInternal(cublasDrotm, stream, false /* = pointer_mode_host */,
                        elem_count, GpuMemoryMutable(x), incx,
                        GpuMemoryMutable(y), incy, GpuMemory(param));
}
711 
// BLAS *rotmg: constructs a modified Givens rotation; inputs/outputs
// (d1, d2, x1, y1, param) are all device-resident, hence
// pointer_mode_host == false.  Results are written to d1, d2, x1, and param.
bool CUDABlas::DoBlasRotmg(Stream *stream, DeviceMemory<float> *d1,
                           DeviceMemory<float> *d2, DeviceMemory<float> *x1,
                           const DeviceMemory<float> &y1,
                           DeviceMemory<float> *param) {
  return DoBlasInternal(cublasSrotmg, stream, false /* = pointer_mode_host */,
                        GpuMemoryMutable(d1), GpuMemoryMutable(d2),
                        GpuMemoryMutable(x1), GpuMemory(y1),
                        GpuMemoryMutable(param));
}

// Double-precision variant (cublasDrotmg).
bool CUDABlas::DoBlasRotmg(Stream *stream, DeviceMemory<double> *d1,
                           DeviceMemory<double> *d2, DeviceMemory<double> *x1,
                           const DeviceMemory<double> &y1,
                           DeviceMemory<double> *param) {
  return DoBlasInternal(cublasDrotmg, stream, false /* = pointer_mode_host */,
                        GpuMemoryMutable(d1), GpuMemoryMutable(d2),
                        GpuMemoryMutable(x1), GpuMemory(y1),
                        GpuMemoryMutable(param));
}
731 
// BLAS *scal: scales vector x in place by alpha.  alpha is passed by host
// address, matching pointer_mode_host == true.
bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count, float alpha,
                          DeviceMemory<float> *x, int incx) {
  return DoBlasInternal(cublasSscal, stream, true /* = pointer_mode_host */,
                        elem_count, &alpha, GpuMemoryMutable(x), incx);
}

// Double-precision variant (cublasDscal).
bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count, double alpha,
                          DeviceMemory<double> *x, int incx) {
  return DoBlasInternal(cublasDscal, stream, true /* = pointer_mode_host */,
                        elem_count, &alpha, GpuMemoryMutable(x), incx);
}

// Complex<float> vector scaled by a real float alpha (cublasCsscal).
bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count, float alpha,
                          DeviceMemory<std::complex<float>> *x, int incx) {
  return DoBlasInternal(cublasCsscal, stream, true /* = pointer_mode_host */,
                        elem_count, GpuComplex(&alpha),
                        GpuComplex(GpuMemoryMutable(x)), incx);
}

// Complex<double> vector scaled by a real double alpha (cublasZdscal).
bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count, double alpha,
                          DeviceMemory<std::complex<double>> *x, int incx) {
  return DoBlasInternal(cublasZdscal, stream, true /* = pointer_mode_host */,
                        elem_count, GpuComplex(&alpha),
                        GpuComplex(GpuMemoryMutable(x)), incx);
}

// Complex<float> vector scaled by a complex alpha (cublasCscal).
bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count,
                          std::complex<float> alpha,
                          DeviceMemory<std::complex<float>> *x, int incx) {
  return DoBlasInternal(cublasCscal, stream, true /* = pointer_mode_host */,
                        elem_count, GpuComplex(&alpha),
                        GpuComplex(GpuMemoryMutable(x)), incx);
}

// Complex<double> vector scaled by a complex alpha (cublasZscal).
bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count,
                          std::complex<double> alpha,
                          DeviceMemory<std::complex<double>> *x, int incx) {
  return DoBlasInternal(cublasZscal, stream, true /* = pointer_mode_host */,
                        elem_count, GpuComplex(&alpha),
                        GpuComplex(GpuMemoryMutable(x)), incx);
}
773 
// BLAS *swap: exchanges the contents of vectors x and y in place.
bool CUDABlas::DoBlasSwap(Stream *stream, uint64 elem_count,
                          DeviceMemory<float> *x, int incx,
                          DeviceMemory<float> *y, int incy) {
  return DoBlasInternal(cublasSswap, stream, true /* = pointer_mode_host */,
                        elem_count, GpuMemoryMutable(x), incx,
                        GpuMemoryMutable(y), incy);
}

// Double-precision variant (cublasDswap).
bool CUDABlas::DoBlasSwap(Stream *stream, uint64 elem_count,
                          DeviceMemory<double> *x, int incx,
                          DeviceMemory<double> *y, int incy) {
  return DoBlasInternal(cublasDswap, stream, true /* = pointer_mode_host */,
                        elem_count, GpuMemoryMutable(x), incx,
                        GpuMemoryMutable(y), incy);
}

// Single-precision complex variant (cublasCswap).
bool CUDABlas::DoBlasSwap(Stream *stream, uint64 elem_count,
                          DeviceMemory<std::complex<float>> *x, int incx,
                          DeviceMemory<std::complex<float>> *y, int incy) {
  return DoBlasInternal(cublasCswap, stream, true /* = pointer_mode_host */,
                        elem_count, GpuComplex(GpuMemoryMutable(x)), incx,
                        GpuComplex(GpuMemoryMutable(y)), incy);
}

// Double-precision complex variant (cublasZswap).
bool CUDABlas::DoBlasSwap(Stream *stream, uint64 elem_count,
                          DeviceMemory<std::complex<double>> *x, int incx,
                          DeviceMemory<std::complex<double>> *y, int incy) {
  return DoBlasInternal(cublasZswap, stream, true /* = pointer_mode_host */,
                        elem_count, GpuComplex(GpuMemoryMutable(x)), incx,
                        GpuComplex(GpuMemoryMutable(y)), incy);
}
805 
// BLAS i*amax: writes the index of the max-absolute-value element of x into
// `result` (device-resident, hence pointer_mode_host == false).
bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count,
                           const DeviceMemory<float> &x, int incx,
                           DeviceMemory<int> *result) {
  return DoBlasInternal(cublasIsamax, stream, false /* = pointer_mode_host */,
                        elem_count, GpuMemory(x), incx,
                        GpuMemoryMutable(result));
}

// Double-precision variant (cublasIdamax).
bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count,
                           const DeviceMemory<double> &x, int incx,
                           DeviceMemory<int> *result) {
  return DoBlasInternal(cublasIdamax, stream, false /* = pointer_mode_host */,
                        elem_count, GpuMemory(x), incx,
                        GpuMemoryMutable(result));
}

// Single-precision complex variant (cublasIcamax).
bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count,
                           const DeviceMemory<std::complex<float>> &x, int incx,
                           DeviceMemory<int> *result) {
  return DoBlasInternal(cublasIcamax, stream, false /* = pointer_mode_host */,
                        elem_count, GpuComplex(GpuMemory(x)), incx,
                        GpuMemoryMutable(result));
}

// Double-precision complex variant (cublasIzamax).
bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count,
                           const DeviceMemory<std::complex<double>> &x,
                           int incx, DeviceMemory<int> *result) {
  return DoBlasInternal(cublasIzamax, stream, false /* = pointer_mode_host */,
                        elem_count, GpuComplex(GpuMemory(x)), incx,
                        GpuMemoryMutable(result));
}
837 
// BLAS i*amin: writes the index of the min-absolute-value element of x into
// `result` (device-resident, hence pointer_mode_host == false).
// NOTE(review): the float/double overloads wrap x in GpuComplex(), whereas
// the corresponding DoBlasIamax overloads pass GpuMemory(x) directly —
// presumably GpuComplex() is an identity for real-typed pointers; confirm
// and make the two families consistent.
bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count,
                           const DeviceMemory<float> &x, int incx,
                           DeviceMemory<int> *result) {
  return DoBlasInternal(cublasIsamin, stream, false /* = pointer_mode_host */,
                        elem_count, GpuComplex(GpuMemory(x)), incx,
                        GpuMemoryMutable(result));
}

// Double-precision variant (cublasIdamin).
bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count,
                           const DeviceMemory<double> &x, int incx,
                           DeviceMemory<int> *result) {
  return DoBlasInternal(cublasIdamin, stream, false /* = pointer_mode_host */,
                        elem_count, GpuComplex(GpuMemory(x)), incx,
                        GpuMemoryMutable(result));
}

// Single-precision complex variant (cublasIcamin).
bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count,
                           const DeviceMemory<std::complex<float>> &x, int incx,
                           DeviceMemory<int> *result) {
  return DoBlasInternal(cublasIcamin, stream, false /* = pointer_mode_host */,
                        elem_count, GpuComplex(GpuMemory(x)), incx,
                        GpuMemoryMutable(result));
}

// Double-precision complex variant (cublasIzamin).
bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count,
                           const DeviceMemory<std::complex<double>> &x,
                           int incx, DeviceMemory<int> *result) {
  return DoBlasInternal(cublasIzamin, stream, false /* = pointer_mode_host */,
                        elem_count, GpuComplex(GpuMemory(x)), incx,
                        GpuMemoryMutable(result));
}
869 
// BLAS *gbmv: banded matrix-vector product
//   y = alpha * op(A) * x + beta * y
// where A is an m-by-n band matrix with kl sub- and ku super-diagonals, and
// op() is selected by `trans`.  alpha/beta are passed by host address
// (pointer_mode_host == true).
bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
                          uint64 n, uint64 kl, uint64 ku, float alpha,
                          const DeviceMemory<float> &a, int lda,
                          const DeviceMemory<float> &x, int incx, float beta,
                          DeviceMemory<float> *y, int incy) {
  return DoBlasInternal(cublasSgbmv, stream, true /* = pointer_mode_host */,
                        CUDABlasTranspose(trans), m, n, kl, ku, &alpha,
                        GpuMemory(a), lda, GpuMemory(x), incx, &beta,
                        GpuMemoryMutable(y), incy);
}

// Double-precision variant (cublasDgbmv).
bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
                          uint64 n, uint64 kl, uint64 ku, double alpha,
                          const DeviceMemory<double> &a, int lda,
                          const DeviceMemory<double> &x, int incx, double beta,
                          DeviceMemory<double> *y, int incy) {
  return DoBlasInternal(cublasDgbmv, stream, true /* = pointer_mode_host */,
                        CUDABlasTranspose(trans), m, n, kl, ku, &alpha,
                        GpuMemory(a), lda, GpuMemory(x), incx, &beta,
                        GpuMemoryMutable(y), incy);
}

// Single-precision complex variant (cublasCgbmv).
bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
                          uint64 n, uint64 kl, uint64 ku,
                          std::complex<float> alpha,
                          const DeviceMemory<std::complex<float>> &a, int lda,
                          const DeviceMemory<std::complex<float>> &x, int incx,
                          std::complex<float> beta,
                          DeviceMemory<std::complex<float>> *y, int incy) {
  return DoBlasInternal(cublasCgbmv, stream, true /* = pointer_mode_host */,
                        CUDABlasTranspose(trans), m, n, kl, ku,
                        GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
                        GpuComplex(GpuMemory(x)), incx, GpuComplex(&beta),
                        GpuComplex(GpuMemoryMutable(y)), incy);
}

// Double-precision complex variant (cublasZgbmv).
bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
                          uint64 n, uint64 kl, uint64 ku,
                          std::complex<double> alpha,
                          const DeviceMemory<std::complex<double>> &a, int lda,
                          const DeviceMemory<std::complex<double>> &x, int incx,
                          std::complex<double> beta,
                          DeviceMemory<std::complex<double>> *y, int incy) {
  return DoBlasInternal(cublasZgbmv, stream, true /* = pointer_mode_host */,
                        CUDABlasTranspose(trans), m, n, kl, ku,
                        GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
                        GpuComplex(GpuMemory(x)), incx, GpuComplex(&beta),
                        GpuComplex(GpuMemoryMutable(y)), incy);
}
919 
// BLAS *gemv: general matrix-vector product
//   y = alpha * op(A) * x + beta * y
// with op() selected by `trans`.  alpha/beta are passed by host address
// (pointer_mode_host == true).
bool CUDABlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m,
                          uint64 n, float alpha, const DeviceMemory<float> &a,
                          int lda, const DeviceMemory<float> &x, int incx,
                          float beta, DeviceMemory<float> *y, int incy) {
  return DoBlasInternal(cublasSgemv, stream, true /* = pointer_mode_host */,
                        CUDABlasTranspose(trans), m, n, &alpha, GpuMemory(a),
                        lda, GpuMemory(x), incx, &beta, GpuMemoryMutable(y),
                        incy);
}

// Double-precision variant (cublasDgemv).
bool CUDABlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m,
                          uint64 n, double alpha, const DeviceMemory<double> &a,
                          int lda, const DeviceMemory<double> &x, int incx,
                          double beta, DeviceMemory<double> *y, int incy) {
  return DoBlasInternal(cublasDgemv, stream, true /* = pointer_mode_host */,
                        CUDABlasTranspose(trans), m, n, &alpha, GpuMemory(a),
                        lda, GpuMemory(x), incx, &beta, GpuMemoryMutable(y),
                        incy);
}

// Single-precision complex variant (cublasCgemv).
bool CUDABlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m,
                          uint64 n, std::complex<float> alpha,
                          const DeviceMemory<std::complex<float>> &a, int lda,
                          const DeviceMemory<std::complex<float>> &x, int incx,
                          std::complex<float> beta,
                          DeviceMemory<std::complex<float>> *y, int incy) {
  return DoBlasInternal(cublasCgemv, stream, true /* = pointer_mode_host */,
                        CUDABlasTranspose(trans), m, n, GpuComplex(&alpha),
                        GpuComplex(GpuMemory(a)), lda, GpuComplex(GpuMemory(x)),
                        incx, GpuComplex(&beta),
                        GpuComplex(GpuMemoryMutable(y)), incy);
}

// Double-precision complex variant (cublasZgemv).
bool CUDABlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m,
                          uint64 n, std::complex<double> alpha,
                          const DeviceMemory<std::complex<double>> &a, int lda,
                          const DeviceMemory<std::complex<double>> &x, int incx,
                          std::complex<double> beta,
                          DeviceMemory<std::complex<double>> *y, int incy) {
  return DoBlasInternal(cublasZgemv, stream, true /* = pointer_mode_host */,
                        CUDABlasTranspose(trans), m, n, GpuComplex(&alpha),
                        GpuComplex(GpuMemory(a)), lda, GpuComplex(GpuMemory(x)),
                        incx, GpuComplex(&beta),
                        GpuComplex(GpuMemoryMutable(y)), incy);
}
965 
// BLAS *ger: rank-1 update of a general matrix
//   A = alpha * x * y^T + A
// alpha is passed by host address (pointer_mode_host == true).
bool CUDABlas::DoBlasGer(Stream *stream, uint64 m, uint64 n, float alpha,
                         const DeviceMemory<float> &x, int incx,
                         const DeviceMemory<float> &y, int incy,
                         DeviceMemory<float> *a, int lda) {
  return DoBlasInternal(cublasSger, stream, true /* = pointer_mode_host */, m,
                        n, &alpha, GpuMemory(x), incx, GpuMemory(y), incy,
                        GpuMemoryMutable(a), lda);
}

// Double-precision variant (cublasDger).
bool CUDABlas::DoBlasGer(Stream *stream, uint64 m, uint64 n, double alpha,
                         const DeviceMemory<double> &x, int incx,
                         const DeviceMemory<double> &y, int incy,
                         DeviceMemory<double> *a, int lda) {
  return DoBlasInternal(cublasDger, stream, true /* = pointer_mode_host */, m,
                        n, &alpha, GpuMemory(x), incx, GpuMemory(y), incy,
                        GpuMemoryMutable(a), lda);
}
983 
// BLAS *gerc: rank-1 update with conjugated y
//   A = alpha * x * y^H + A
// alpha is passed by host address (pointer_mode_host == true).
bool CUDABlas::DoBlasGerc(Stream *stream, uint64 m, uint64 n,
                          std::complex<float> alpha,
                          const DeviceMemory<std::complex<float>> &x, int incx,
                          const DeviceMemory<std::complex<float>> &y, int incy,
                          DeviceMemory<std::complex<float>> *a, int lda) {
  return DoBlasInternal(cublasCgerc, stream, true /* = pointer_mode_host */, m,
                        n, GpuComplex(&alpha), GpuComplex(GpuMemory(x)), incx,
                        GpuComplex(GpuMemory(y)), incy,
                        GpuComplex(GpuMemoryMutable(a)), lda);
}

// Double-precision variant (cublasZgerc).
bool CUDABlas::DoBlasGerc(Stream *stream, uint64 m, uint64 n,
                          std::complex<double> alpha,
                          const DeviceMemory<std::complex<double>> &x, int incx,
                          const DeviceMemory<std::complex<double>> &y, int incy,
                          DeviceMemory<std::complex<double>> *a, int lda) {
  return DoBlasInternal(cublasZgerc, stream, true /* = pointer_mode_host */, m,
                        n, GpuComplex(&alpha), GpuComplex(GpuMemory(x)), incx,
                        GpuComplex(GpuMemory(y)), incy,
                        GpuComplex(GpuMemoryMutable(a)), lda);
}
1005 
// BLAS *geru: rank-1 update with unconjugated y
//   A = alpha * x * y^T + A
// alpha is passed by host address (pointer_mode_host == true).
bool CUDABlas::DoBlasGeru(Stream *stream, uint64 m, uint64 n,
                          std::complex<float> alpha,
                          const DeviceMemory<std::complex<float>> &x, int incx,
                          const DeviceMemory<std::complex<float>> &y, int incy,
                          DeviceMemory<std::complex<float>> *a, int lda) {
  return DoBlasInternal(cublasCgeru, stream, true /* = pointer_mode_host */, m,
                        n, GpuComplex(&alpha), GpuComplex(GpuMemory(x)), incx,
                        GpuComplex(GpuMemory(y)), incy,
                        GpuComplex(GpuMemoryMutable(a)), lda);
}

// Double-precision variant (cublasZgeru).
bool CUDABlas::DoBlasGeru(Stream *stream, uint64 m, uint64 n,
                          std::complex<double> alpha,
                          const DeviceMemory<std::complex<double>> &x, int incx,
                          const DeviceMemory<std::complex<double>> &y, int incy,
                          DeviceMemory<std::complex<double>> *a, int lda) {
  return DoBlasInternal(cublasZgeru, stream, true /* = pointer_mode_host */, m,
                        n, GpuComplex(&alpha), GpuComplex(GpuMemory(x)), incx,
                        GpuComplex(GpuMemory(y)), incy,
                        GpuComplex(GpuMemoryMutable(a)), lda);
}
1027 
// BLAS *hbmv: Hermitian banded matrix-vector product
//   y = alpha * A * x + beta * y
// where A is an n-by-n Hermitian band matrix with k super-diagonals; `uplo`
// selects which triangle of A is stored.  alpha/beta are passed by host
// address (pointer_mode_host == true).
bool CUDABlas::DoBlasHbmv(Stream *stream, blas::UpperLower uplo, uint64 n,
                          uint64 k, std::complex<float> alpha,
                          const DeviceMemory<std::complex<float>> &a, int lda,
                          const DeviceMemory<std::complex<float>> &x, int incx,
                          std::complex<float> beta,
                          DeviceMemory<std::complex<float>> *y, int incy) {
  return DoBlasInternal(cublasChbmv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), n, k, GpuComplex(&alpha),
                        GpuComplex(GpuMemory(a)), lda, GpuComplex(GpuMemory(x)),
                        incx, GpuComplex(&beta),
                        GpuComplex(GpuMemoryMutable(y)), incy);
}

// Double-precision variant (cublasZhbmv).
bool CUDABlas::DoBlasHbmv(Stream *stream, blas::UpperLower uplo, uint64 n,
                          uint64 k, std::complex<double> alpha,
                          const DeviceMemory<std::complex<double>> &a, int lda,
                          const DeviceMemory<std::complex<double>> &x, int incx,
                          std::complex<double> beta,
                          DeviceMemory<std::complex<double>> *y, int incy) {
  return DoBlasInternal(cublasZhbmv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), n, k, GpuComplex(&alpha),
                        GpuComplex(GpuMemory(a)), lda, GpuComplex(GpuMemory(x)),
                        incx, GpuComplex(&beta),
                        GpuComplex(GpuMemoryMutable(y)), incy);
}
1053 
// BLAS *hemv: Hermitian matrix-vector product
//   y = alpha * A * x + beta * y
// where A is an n-by-n Hermitian matrix; `uplo` selects the stored triangle.
// alpha/beta are passed by host address (pointer_mode_host == true).
bool CUDABlas::DoBlasHemv(Stream *stream, blas::UpperLower uplo, uint64 n,
                          std::complex<float> alpha,
                          const DeviceMemory<std::complex<float>> &a, int lda,
                          const DeviceMemory<std::complex<float>> &x, int incx,
                          std::complex<float> beta,
                          DeviceMemory<std::complex<float>> *y, int incy) {
  return DoBlasInternal(cublasChemv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), n, GpuComplex(&alpha),
                        GpuComplex(GpuMemory(a)), lda, GpuComplex(GpuMemory(x)),
                        incx, GpuComplex(&beta),
                        GpuComplex(GpuMemoryMutable(y)), incy);
}

// Double-precision variant (cublasZhemv).
bool CUDABlas::DoBlasHemv(Stream *stream, blas::UpperLower uplo, uint64 n,
                          std::complex<double> alpha,
                          const DeviceMemory<std::complex<double>> &a, int lda,
                          const DeviceMemory<std::complex<double>> &x, int incx,
                          std::complex<double> beta,
                          DeviceMemory<std::complex<double>> *y, int incy) {
  return DoBlasInternal(cublasZhemv, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), n, GpuComplex(&alpha),
                        GpuComplex(GpuMemory(a)), lda, GpuComplex(GpuMemory(x)),
                        incx, GpuComplex(&beta),
                        GpuComplex(GpuMemoryMutable(y)), incy);
}
1079 
DoBlasHer(Stream * stream,blas::UpperLower uplo,uint64 n,float alpha,const DeviceMemory<std::complex<float>> & x,int incx,DeviceMemory<std::complex<float>> * a,int lda)1080 bool CUDABlas::DoBlasHer(Stream *stream, blas::UpperLower uplo, uint64 n,
1081                          float alpha,
1082                          const DeviceMemory<std::complex<float>> &x, int incx,
1083                          DeviceMemory<std::complex<float>> *a, int lda) {
1084   return DoBlasInternal(cublasCher, stream, true /* = pointer_mode_host */,
1085                         CUDABlasUpperLower(uplo), n, &alpha,
1086                         GpuComplex(GpuMemory(x)), incx,
1087                         GpuComplex(GpuMemoryMutable(a)), lda);
1088 }
1089 
DoBlasHer(Stream * stream,blas::UpperLower uplo,uint64 n,double alpha,const DeviceMemory<std::complex<double>> & x,int incx,DeviceMemory<std::complex<double>> * a,int lda)1090 bool CUDABlas::DoBlasHer(Stream *stream, blas::UpperLower uplo, uint64 n,
1091                          double alpha,
1092                          const DeviceMemory<std::complex<double>> &x, int incx,
1093                          DeviceMemory<std::complex<double>> *a, int lda) {
1094   return DoBlasInternal(cublasZher, stream, true /* = pointer_mode_host */,
1095                         CUDABlasUpperLower(uplo), n, &alpha,
1096                         GpuComplex(GpuMemory(x)), incx,
1097                         GpuComplex(GpuMemoryMutable(a)), lda);
1098 }
1099 
DoBlasHer2(Stream * stream,blas::UpperLower uplo,uint64 n,std::complex<float> alpha,const DeviceMemory<std::complex<float>> & x,int incx,const DeviceMemory<std::complex<float>> & y,int incy,DeviceMemory<std::complex<float>> * a,int lda)1100 bool CUDABlas::DoBlasHer2(Stream *stream, blas::UpperLower uplo, uint64 n,
1101                           std::complex<float> alpha,
1102                           const DeviceMemory<std::complex<float>> &x, int incx,
1103                           const DeviceMemory<std::complex<float>> &y, int incy,
1104                           DeviceMemory<std::complex<float>> *a, int lda) {
1105   return DoBlasInternal(cublasCher2, stream, true /* = pointer_mode_host */,
1106                         CUDABlasUpperLower(uplo), n, GpuComplex(&alpha),
1107                         GpuComplex(GpuMemory(x)), incx,
1108                         GpuComplex(GpuMemory(y)), incy,
1109                         GpuComplex(GpuMemoryMutable(a)), lda);
1110 }
1111 
DoBlasHer2(Stream * stream,blas::UpperLower uplo,uint64 n,std::complex<double> alpha,const DeviceMemory<std::complex<double>> & x,int incx,const DeviceMemory<std::complex<double>> & y,int incy,DeviceMemory<std::complex<double>> * a,int lda)1112 bool CUDABlas::DoBlasHer2(Stream *stream, blas::UpperLower uplo, uint64 n,
1113                           std::complex<double> alpha,
1114                           const DeviceMemory<std::complex<double>> &x, int incx,
1115                           const DeviceMemory<std::complex<double>> &y, int incy,
1116                           DeviceMemory<std::complex<double>> *a, int lda) {
1117   return DoBlasInternal(cublasZher2, stream, true /* = pointer_mode_host */,
1118                         CUDABlasUpperLower(uplo), n, GpuComplex(&alpha),
1119                         GpuComplex(GpuMemory(x)), incx,
1120                         GpuComplex(GpuMemory(y)), incy,
1121                         GpuComplex(GpuMemoryMutable(a)), lda);
1122 }
1123 
DoBlasHpmv(Stream * stream,blas::UpperLower uplo,uint64 n,std::complex<float> alpha,const DeviceMemory<std::complex<float>> & ap,const DeviceMemory<std::complex<float>> & x,int incx,std::complex<float> beta,DeviceMemory<std::complex<float>> * y,int incy)1124 bool CUDABlas::DoBlasHpmv(Stream *stream, blas::UpperLower uplo, uint64 n,
1125                           std::complex<float> alpha,
1126                           const DeviceMemory<std::complex<float>> &ap,
1127                           const DeviceMemory<std::complex<float>> &x, int incx,
1128                           std::complex<float> beta,
1129                           DeviceMemory<std::complex<float>> *y, int incy) {
1130   return DoBlasInternal(cublasChpmv, stream, true /* = pointer_mode_host */,
1131                         CUDABlasUpperLower(uplo), n, GpuComplex(&alpha),
1132                         GpuComplex(GpuMemory(ap)), GpuComplex(GpuMemory(x)),
1133                         incx, GpuComplex(&beta),
1134                         GpuComplex(GpuMemoryMutable(y)), incy);
1135 }
1136 
DoBlasHpmv(Stream * stream,blas::UpperLower uplo,uint64 n,std::complex<double> alpha,const DeviceMemory<std::complex<double>> & ap,const DeviceMemory<std::complex<double>> & x,int incx,std::complex<double> beta,DeviceMemory<std::complex<double>> * y,int incy)1137 bool CUDABlas::DoBlasHpmv(Stream *stream, blas::UpperLower uplo, uint64 n,
1138                           std::complex<double> alpha,
1139                           const DeviceMemory<std::complex<double>> &ap,
1140                           const DeviceMemory<std::complex<double>> &x, int incx,
1141                           std::complex<double> beta,
1142                           DeviceMemory<std::complex<double>> *y, int incy) {
1143   return DoBlasInternal(cublasZhpmv, stream, true /* = pointer_mode_host */,
1144                         CUDABlasUpperLower(uplo), n, GpuComplex(&alpha),
1145                         GpuComplex(GpuMemory(ap)), GpuComplex(GpuMemory(x)),
1146                         incx, GpuComplex(&beta),
1147                         GpuComplex(GpuMemoryMutable(y)), incy);
1148 }
1149 
DoBlasHpr(Stream * stream,blas::UpperLower uplo,uint64 n,float alpha,const DeviceMemory<std::complex<float>> & x,int incx,DeviceMemory<std::complex<float>> * ap)1150 bool CUDABlas::DoBlasHpr(Stream *stream, blas::UpperLower uplo, uint64 n,
1151                          float alpha,
1152                          const DeviceMemory<std::complex<float>> &x, int incx,
1153                          DeviceMemory<std::complex<float>> *ap) {
1154   return DoBlasInternal(cublasChpr, stream, true /* = pointer_mode_host */,
1155                         CUDABlasUpperLower(uplo), n, GpuComplex(&alpha),
1156                         GpuComplex(GpuMemory(x)), incx,
1157                         GpuComplex(GpuMemoryMutable(ap)));
1158 }
1159 
DoBlasHpr(Stream * stream,blas::UpperLower uplo,uint64 n,double alpha,const DeviceMemory<std::complex<double>> & x,int incx,DeviceMemory<std::complex<double>> * ap)1160 bool CUDABlas::DoBlasHpr(Stream *stream, blas::UpperLower uplo, uint64 n,
1161                          double alpha,
1162                          const DeviceMemory<std::complex<double>> &x, int incx,
1163                          DeviceMemory<std::complex<double>> *ap) {
1164   return DoBlasInternal(cublasZhpr, stream, true /* = pointer_mode_host */,
1165                         CUDABlasUpperLower(uplo), n, GpuComplex(&alpha),
1166                         GpuComplex(GpuMemory(x)), incx,
1167                         GpuComplex(GpuMemoryMutable(ap)));
1168 }
1169 
DoBlasHpr2(Stream * stream,blas::UpperLower uplo,uint64 n,std::complex<float> alpha,const DeviceMemory<std::complex<float>> & x,int incx,const DeviceMemory<std::complex<float>> & y,int incy,DeviceMemory<std::complex<float>> * ap)1170 bool CUDABlas::DoBlasHpr2(Stream *stream, blas::UpperLower uplo, uint64 n,
1171                           std::complex<float> alpha,
1172                           const DeviceMemory<std::complex<float>> &x, int incx,
1173                           const DeviceMemory<std::complex<float>> &y, int incy,
1174                           DeviceMemory<std::complex<float>> *ap) {
1175   return DoBlasInternal(
1176       cublasChpr2, stream, true /* = pointer_mode_host */,
1177       CUDABlasUpperLower(uplo), n, GpuComplex(&alpha), GpuComplex(GpuMemory(x)),
1178       incx, GpuComplex(GpuMemory(y)), incy, GpuComplex(GpuMemoryMutable(ap)));
1179 }
1180 
DoBlasHpr2(Stream * stream,blas::UpperLower uplo,uint64 n,std::complex<double> alpha,const DeviceMemory<std::complex<double>> & x,int incx,const DeviceMemory<std::complex<double>> & y,int incy,DeviceMemory<std::complex<double>> * ap)1181 bool CUDABlas::DoBlasHpr2(Stream *stream, blas::UpperLower uplo, uint64 n,
1182                           std::complex<double> alpha,
1183                           const DeviceMemory<std::complex<double>> &x, int incx,
1184                           const DeviceMemory<std::complex<double>> &y, int incy,
1185                           DeviceMemory<std::complex<double>> *ap) {
1186   return DoBlasInternal(
1187       cublasZhpr2, stream, true /* = pointer_mode_host */,
1188       CUDABlasUpperLower(uplo), n, GpuComplex(&alpha), GpuComplex(GpuMemory(x)),
1189       incx, GpuComplex(GpuMemory(y)), incy, GpuComplex(GpuMemoryMutable(ap)));
1190 }
1191 
DoBlasSbmv(Stream * stream,blas::UpperLower uplo,uint64 n,uint64 k,float alpha,const DeviceMemory<float> & a,int lda,const DeviceMemory<float> & x,int incx,float beta,DeviceMemory<float> * y,int incy)1192 bool CUDABlas::DoBlasSbmv(Stream *stream, blas::UpperLower uplo, uint64 n,
1193                           uint64 k, float alpha, const DeviceMemory<float> &a,
1194                           int lda, const DeviceMemory<float> &x, int incx,
1195                           float beta, DeviceMemory<float> *y, int incy) {
1196   return DoBlasInternal(cublasSsbmv, stream, true /* = pointer_mode_host */,
1197                         CUDABlasUpperLower(uplo), n, k, &alpha, GpuMemory(a),
1198                         lda, GpuMemory(x), incx, &beta, GpuMemoryMutable(y),
1199                         incy);
1200 }
1201 
DoBlasSbmv(Stream * stream,blas::UpperLower uplo,uint64 n,uint64 k,double alpha,const DeviceMemory<double> & a,int lda,const DeviceMemory<double> & x,int incx,double beta,DeviceMemory<double> * y,int incy)1202 bool CUDABlas::DoBlasSbmv(Stream *stream, blas::UpperLower uplo, uint64 n,
1203                           uint64 k, double alpha, const DeviceMemory<double> &a,
1204                           int lda, const DeviceMemory<double> &x, int incx,
1205                           double beta, DeviceMemory<double> *y, int incy) {
1206   return DoBlasInternal(cublasDsbmv, stream, true /* = pointer_mode_host */,
1207                         CUDABlasUpperLower(uplo), n, k, &alpha, GpuMemory(a),
1208                         lda, GpuMemory(x), incx, &beta, GpuMemoryMutable(y),
1209                         incy);
1210 }
1211 
DoBlasSpmv(Stream * stream,blas::UpperLower uplo,uint64 n,float alpha,const DeviceMemory<float> & ap,const DeviceMemory<float> & x,int incx,float beta,DeviceMemory<float> * y,int incy)1212 bool CUDABlas::DoBlasSpmv(Stream *stream, blas::UpperLower uplo, uint64 n,
1213                           float alpha, const DeviceMemory<float> &ap,
1214                           const DeviceMemory<float> &x, int incx, float beta,
1215                           DeviceMemory<float> *y, int incy) {
1216   return DoBlasInternal(cublasSspmv, stream, true /* = pointer_mode_host */,
1217                         CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(ap),
1218                         GpuMemory(x), incx, &beta, GpuMemoryMutable(y), incy);
1219 }
1220 
DoBlasSpmv(Stream * stream,blas::UpperLower uplo,uint64 n,double alpha,const DeviceMemory<double> & ap,const DeviceMemory<double> & x,int incx,double beta,DeviceMemory<double> * y,int incy)1221 bool CUDABlas::DoBlasSpmv(Stream *stream, blas::UpperLower uplo, uint64 n,
1222                           double alpha, const DeviceMemory<double> &ap,
1223                           const DeviceMemory<double> &x, int incx, double beta,
1224                           DeviceMemory<double> *y, int incy) {
1225   return DoBlasInternal(cublasDspmv, stream, true /* = pointer_mode_host */,
1226                         CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(ap),
1227                         GpuMemory(x), incx, &beta, GpuMemoryMutable(y), incy);
1228 }
1229 
DoBlasSpr(Stream * stream,blas::UpperLower uplo,uint64 n,float alpha,const DeviceMemory<float> & x,int incx,DeviceMemory<float> * ap)1230 bool CUDABlas::DoBlasSpr(Stream *stream, blas::UpperLower uplo, uint64 n,
1231                          float alpha, const DeviceMemory<float> &x, int incx,
1232                          DeviceMemory<float> *ap) {
1233   return DoBlasInternal(cublasSspr, stream, true /* = pointer_mode_host */,
1234                         CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(x), incx,
1235                         GpuMemoryMutable(ap));
1236 }
1237 
DoBlasSpr(Stream * stream,blas::UpperLower uplo,uint64 n,double alpha,const DeviceMemory<double> & x,int incx,DeviceMemory<double> * ap)1238 bool CUDABlas::DoBlasSpr(Stream *stream, blas::UpperLower uplo, uint64 n,
1239                          double alpha, const DeviceMemory<double> &x, int incx,
1240                          DeviceMemory<double> *ap) {
1241   return DoBlasInternal(cublasDspr, stream, true /* = pointer_mode_host */,
1242                         CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(x), incx,
1243                         GpuMemoryMutable(ap));
1244 }
1245 
DoBlasSpr2(Stream * stream,blas::UpperLower uplo,uint64 n,float alpha,const DeviceMemory<float> & x,int incx,const DeviceMemory<float> & y,int incy,DeviceMemory<float> * ap)1246 bool CUDABlas::DoBlasSpr2(Stream *stream, blas::UpperLower uplo, uint64 n,
1247                           float alpha, const DeviceMemory<float> &x, int incx,
1248                           const DeviceMemory<float> &y, int incy,
1249                           DeviceMemory<float> *ap) {
1250   return DoBlasInternal(cublasSspr2, stream, true /* = pointer_mode_host */,
1251                         CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(x), incx,
1252                         GpuMemory(y), incy, GpuMemoryMutable(ap));
1253 }
1254 
DoBlasSpr2(Stream * stream,blas::UpperLower uplo,uint64 n,double alpha,const DeviceMemory<double> & x,int incx,const DeviceMemory<double> & y,int incy,DeviceMemory<double> * ap)1255 bool CUDABlas::DoBlasSpr2(Stream *stream, blas::UpperLower uplo, uint64 n,
1256                           double alpha, const DeviceMemory<double> &x, int incx,
1257                           const DeviceMemory<double> &y, int incy,
1258                           DeviceMemory<double> *ap) {
1259   return DoBlasInternal(cublasDspr2, stream, true /* = pointer_mode_host */,
1260                         CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(x), incx,
1261                         GpuMemory(y), incy, GpuMemoryMutable(ap));
1262 }
1263 
DoBlasSymv(Stream * stream,blas::UpperLower uplo,uint64 n,float alpha,const DeviceMemory<float> & a,int lda,const DeviceMemory<float> & x,int incx,float beta,DeviceMemory<float> * y,int incy)1264 bool CUDABlas::DoBlasSymv(Stream *stream, blas::UpperLower uplo, uint64 n,
1265                           float alpha, const DeviceMemory<float> &a, int lda,
1266                           const DeviceMemory<float> &x, int incx, float beta,
1267                           DeviceMemory<float> *y, int incy) {
1268   return DoBlasInternal(cublasSsymv, stream, true /* = pointer_mode_host */,
1269                         CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(a), lda,
1270                         GpuMemory(x), incx, &beta, GpuMemoryMutable(y), incy);
1271 }
1272 
DoBlasSymv(Stream * stream,blas::UpperLower uplo,uint64 n,double alpha,const DeviceMemory<double> & a,int lda,const DeviceMemory<double> & x,int incx,double beta,DeviceMemory<double> * y,int incy)1273 bool CUDABlas::DoBlasSymv(Stream *stream, blas::UpperLower uplo, uint64 n,
1274                           double alpha, const DeviceMemory<double> &a, int lda,
1275                           const DeviceMemory<double> &x, int incx, double beta,
1276                           DeviceMemory<double> *y, int incy) {
1277   return DoBlasInternal(cublasDsymv, stream, true /* = pointer_mode_host */,
1278                         CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(a), lda,
1279                         GpuMemory(x), incx, &beta, GpuMemoryMutable(y), incy);
1280 }
1281 
DoBlasSyr(Stream * stream,blas::UpperLower uplo,uint64 n,float alpha,const DeviceMemory<float> & x,int incx,DeviceMemory<float> * a,int lda)1282 bool CUDABlas::DoBlasSyr(Stream *stream, blas::UpperLower uplo, uint64 n,
1283                          float alpha, const DeviceMemory<float> &x, int incx,
1284                          DeviceMemory<float> *a, int lda) {
1285   return DoBlasInternal(cublasSsyr, stream, true /* = pointer_mode_host */,
1286                         CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(x), incx,
1287                         GpuMemoryMutable(a), lda);
1288 }
1289 
DoBlasSyr(Stream * stream,blas::UpperLower uplo,uint64 n,double alpha,const DeviceMemory<double> & x,int incx,DeviceMemory<double> * a,int lda)1290 bool CUDABlas::DoBlasSyr(Stream *stream, blas::UpperLower uplo, uint64 n,
1291                          double alpha, const DeviceMemory<double> &x, int incx,
1292                          DeviceMemory<double> *a, int lda) {
1293   return DoBlasInternal(cublasDsyr, stream, true /* = pointer_mode_host */,
1294                         CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(x), incx,
1295                         GpuMemoryMutable(a), lda);
1296 }
1297 
DoBlasSyr2(Stream * stream,blas::UpperLower uplo,uint64 n,float alpha,const DeviceMemory<float> & x,int incx,const DeviceMemory<float> & y,int incy,DeviceMemory<float> * a,int lda)1298 bool CUDABlas::DoBlasSyr2(Stream *stream, blas::UpperLower uplo, uint64 n,
1299                           float alpha, const DeviceMemory<float> &x, int incx,
1300                           const DeviceMemory<float> &y, int incy,
1301                           DeviceMemory<float> *a, int lda) {
1302   return DoBlasInternal(cublasSsyr2, stream, true /* = pointer_mode_host */,
1303                         CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(x), incx,
1304                         GpuMemory(y), incy, GpuMemoryMutable(a), lda);
1305 }
1306 
DoBlasSyr2(Stream * stream,blas::UpperLower uplo,uint64 n,double alpha,const DeviceMemory<double> & x,int incx,const DeviceMemory<double> & y,int incy,DeviceMemory<double> * a,int lda)1307 bool CUDABlas::DoBlasSyr2(Stream *stream, blas::UpperLower uplo, uint64 n,
1308                           double alpha, const DeviceMemory<double> &x, int incx,
1309                           const DeviceMemory<double> &y, int incy,
1310                           DeviceMemory<double> *a, int lda) {
1311   return DoBlasInternal(cublasDsyr2, stream, true /* = pointer_mode_host */,
1312                         CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(x), incx,
1313                         GpuMemory(y), incy, GpuMemoryMutable(a), lda);
1314 }
1315 
DoBlasTbmv(Stream * stream,blas::UpperLower uplo,blas::Transpose trans,blas::Diagonal diag,uint64 n,uint64 k,const DeviceMemory<float> & a,int lda,DeviceMemory<float> * x,int incx)1316 bool CUDABlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo,
1317                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
1318                           uint64 k, const DeviceMemory<float> &a, int lda,
1319                           DeviceMemory<float> *x, int incx) {
1320   return DoBlasInternal(cublasStbmv, stream, true /* = pointer_mode_host */,
1321                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
1322                         CUDABlasDiagonal(diag), n, k, GpuMemory(a), lda,
1323                         GpuMemoryMutable(x), incx);
1324 }
1325 
DoBlasTbmv(Stream * stream,blas::UpperLower uplo,blas::Transpose trans,blas::Diagonal diag,uint64 n,uint64 k,const DeviceMemory<double> & a,int lda,DeviceMemory<double> * x,int incx)1326 bool CUDABlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo,
1327                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
1328                           uint64 k, const DeviceMemory<double> &a, int lda,
1329                           DeviceMemory<double> *x, int incx) {
1330   return DoBlasInternal(cublasDtbmv, stream, true /* = pointer_mode_host */,
1331                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
1332                         CUDABlasDiagonal(diag), n, k, GpuMemory(a), lda,
1333                         GpuMemoryMutable(x), incx);
1334 }
1335 
DoBlasTbmv(Stream * stream,blas::UpperLower uplo,blas::Transpose trans,blas::Diagonal diag,uint64 n,uint64 k,const DeviceMemory<std::complex<float>> & a,int lda,DeviceMemory<std::complex<float>> * x,int incx)1336 bool CUDABlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo,
1337                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
1338                           uint64 k, const DeviceMemory<std::complex<float>> &a,
1339                           int lda, DeviceMemory<std::complex<float>> *x,
1340                           int incx) {
1341   return DoBlasInternal(cublasCtbmv, stream, true /* = pointer_mode_host */,
1342                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
1343                         CUDABlasDiagonal(diag), n, k, GpuComplex(GpuMemory(a)),
1344                         lda, GpuComplex(GpuMemoryMutable(x)), incx);
1345 }
1346 
DoBlasTbmv(Stream * stream,blas::UpperLower uplo,blas::Transpose trans,blas::Diagonal diag,uint64 n,uint64 k,const DeviceMemory<std::complex<double>> & a,int lda,DeviceMemory<std::complex<double>> * x,int incx)1347 bool CUDABlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo,
1348                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
1349                           uint64 k, const DeviceMemory<std::complex<double>> &a,
1350                           int lda, DeviceMemory<std::complex<double>> *x,
1351                           int incx) {
1352   return DoBlasInternal(cublasZtbmv, stream, true /* = pointer_mode_host */,
1353                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
1354                         CUDABlasDiagonal(diag), n, k, GpuComplex(GpuMemory(a)),
1355                         lda, GpuComplex(GpuMemoryMutable(x)), incx);
1356 }
1357 
DoBlasTbsv(Stream * stream,blas::UpperLower uplo,blas::Transpose trans,blas::Diagonal diag,uint64 n,uint64 k,const DeviceMemory<float> & a,int lda,DeviceMemory<float> * x,int incx)1358 bool CUDABlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo,
1359                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
1360                           uint64 k, const DeviceMemory<float> &a, int lda,
1361                           DeviceMemory<float> *x, int incx) {
1362   return DoBlasInternal(cublasStbsv, stream, true /* = pointer_mode_host */,
1363                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
1364                         CUDABlasDiagonal(diag), n, k, GpuMemory(a), lda,
1365                         GpuMemoryMutable(x), incx);
1366 }
1367 
DoBlasTbsv(Stream * stream,blas::UpperLower uplo,blas::Transpose trans,blas::Diagonal diag,uint64 n,uint64 k,const DeviceMemory<double> & a,int lda,DeviceMemory<double> * x,int incx)1368 bool CUDABlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo,
1369                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
1370                           uint64 k, const DeviceMemory<double> &a, int lda,
1371                           DeviceMemory<double> *x, int incx) {
1372   return DoBlasInternal(cublasDtbsv, stream, true /* = pointer_mode_host */,
1373                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
1374                         CUDABlasDiagonal(diag), n, k, GpuMemory(a), lda,
1375                         GpuMemoryMutable(x), incx);
1376 }
1377 
DoBlasTbsv(Stream * stream,blas::UpperLower uplo,blas::Transpose trans,blas::Diagonal diag,uint64 n,uint64 k,const DeviceMemory<std::complex<float>> & a,int lda,DeviceMemory<std::complex<float>> * x,int incx)1378 bool CUDABlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo,
1379                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
1380                           uint64 k, const DeviceMemory<std::complex<float>> &a,
1381                           int lda, DeviceMemory<std::complex<float>> *x,
1382                           int incx) {
1383   return DoBlasInternal(cublasCtbsv, stream, true /* = pointer_mode_host */,
1384                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
1385                         CUDABlasDiagonal(diag), n, k, GpuComplex(GpuMemory(a)),
1386                         lda, GpuComplex(GpuMemoryMutable(x)), incx);
1387 }
1388 
DoBlasTbsv(Stream * stream,blas::UpperLower uplo,blas::Transpose trans,blas::Diagonal diag,uint64 n,uint64 k,const DeviceMemory<std::complex<double>> & a,int lda,DeviceMemory<std::complex<double>> * x,int incx)1389 bool CUDABlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo,
1390                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
1391                           uint64 k, const DeviceMemory<std::complex<double>> &a,
1392                           int lda, DeviceMemory<std::complex<double>> *x,
1393                           int incx) {
1394   return DoBlasInternal(cublasZtbsv, stream, true /* = pointer_mode_host */,
1395                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
1396                         CUDABlasDiagonal(diag), n, k, GpuComplex(GpuMemory(a)),
1397                         lda, GpuComplex(GpuMemoryMutable(x)), incx);
1398 }
1399 
DoBlasTpmv(Stream * stream,blas::UpperLower uplo,blas::Transpose trans,blas::Diagonal diag,uint64 n,const DeviceMemory<float> & ap,DeviceMemory<float> * x,int incx)1400 bool CUDABlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo,
1401                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
1402                           const DeviceMemory<float> &ap, DeviceMemory<float> *x,
1403                           int incx) {
1404   return DoBlasInternal(cublasStpmv, stream, true /* = pointer_mode_host */,
1405                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
1406                         CUDABlasDiagonal(diag), n, GpuMemory(ap),
1407                         GpuMemoryMutable(x), incx);
1408 }
1409 
DoBlasTpmv(Stream * stream,blas::UpperLower uplo,blas::Transpose trans,blas::Diagonal diag,uint64 n,const DeviceMemory<double> & ap,DeviceMemory<double> * x,int incx)1410 bool CUDABlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo,
1411                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
1412                           const DeviceMemory<double> &ap,
1413                           DeviceMemory<double> *x, int incx) {
1414   return DoBlasInternal(cublasDtpmv, stream, true /* = pointer_mode_host */,
1415                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
1416                         CUDABlasDiagonal(diag), n, GpuMemory(ap),
1417                         GpuMemoryMutable(x), incx);
1418 }
1419 
DoBlasTpmv(Stream * stream,blas::UpperLower uplo,blas::Transpose trans,blas::Diagonal diag,uint64 n,const DeviceMemory<std::complex<float>> & ap,DeviceMemory<std::complex<float>> * x,int incx)1420 bool CUDABlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo,
1421                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
1422                           const DeviceMemory<std::complex<float>> &ap,
1423                           DeviceMemory<std::complex<float>> *x, int incx) {
1424   return DoBlasInternal(cublasCtpmv, stream, true /* = pointer_mode_host */,
1425                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
1426                         CUDABlasDiagonal(diag), n, GpuComplex(GpuMemory(ap)),
1427                         GpuComplex(GpuMemoryMutable(x)), incx);
1428 }
1429 
DoBlasTpmv(Stream * stream,blas::UpperLower uplo,blas::Transpose trans,blas::Diagonal diag,uint64 n,const DeviceMemory<std::complex<double>> & ap,DeviceMemory<std::complex<double>> * x,int incx)1430 bool CUDABlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo,
1431                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
1432                           const DeviceMemory<std::complex<double>> &ap,
1433                           DeviceMemory<std::complex<double>> *x, int incx) {
1434   return DoBlasInternal(cublasZtpmv, stream, true /* = pointer_mode_host */,
1435                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
1436                         CUDABlasDiagonal(diag), n, GpuComplex(GpuMemory(ap)),
1437                         GpuComplex(GpuMemoryMutable(x)), incx);
1438 }
1439 
DoBlasTpsv(Stream * stream,blas::UpperLower uplo,blas::Transpose trans,blas::Diagonal diag,uint64 n,const DeviceMemory<float> & ap,DeviceMemory<float> * x,int incx)1440 bool CUDABlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo,
1441                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
1442                           const DeviceMemory<float> &ap, DeviceMemory<float> *x,
1443                           int incx) {
1444   return DoBlasInternal(cublasStpsv, stream, true /* = pointer_mode_host */,
1445                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
1446                         CUDABlasDiagonal(diag), n, GpuMemory(ap),
1447                         GpuMemoryMutable(x), incx);
1448 }
1449 
DoBlasTpsv(Stream * stream,blas::UpperLower uplo,blas::Transpose trans,blas::Diagonal diag,uint64 n,const DeviceMemory<double> & ap,DeviceMemory<double> * x,int incx)1450 bool CUDABlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo,
1451                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
1452                           const DeviceMemory<double> &ap,
1453                           DeviceMemory<double> *x, int incx) {
1454   return DoBlasInternal(cublasDtpsv, stream, true /* = pointer_mode_host */,
1455                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
1456                         CUDABlasDiagonal(diag), n, GpuMemory(ap),
1457                         GpuMemoryMutable(x), incx);
1458 }
1459 
DoBlasTpsv(Stream * stream,blas::UpperLower uplo,blas::Transpose trans,blas::Diagonal diag,uint64 n,const DeviceMemory<std::complex<float>> & ap,DeviceMemory<std::complex<float>> * x,int incx)1460 bool CUDABlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo,
1461                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
1462                           const DeviceMemory<std::complex<float>> &ap,
1463                           DeviceMemory<std::complex<float>> *x, int incx) {
1464   return DoBlasInternal(cublasCtpsv, stream, true /* = pointer_mode_host */,
1465                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
1466                         CUDABlasDiagonal(diag), n, GpuComplex(GpuMemory(ap)),
1467                         GpuComplex(GpuMemoryMutable(x)), incx);
1468 }
1469 
DoBlasTpsv(Stream * stream,blas::UpperLower uplo,blas::Transpose trans,blas::Diagonal diag,uint64 n,const DeviceMemory<std::complex<double>> & ap,DeviceMemory<std::complex<double>> * x,int incx)1470 bool CUDABlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo,
1471                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
1472                           const DeviceMemory<std::complex<double>> &ap,
1473                           DeviceMemory<std::complex<double>> *x, int incx) {
1474   return DoBlasInternal(cublasZtpsv, stream, true /* = pointer_mode_host */,
1475                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
1476                         CUDABlasDiagonal(diag), n, GpuComplex(GpuMemory(ap)),
1477                         GpuComplex(GpuMemoryMutable(x)), incx);
1478 }
1479 
DoBlasTrmv(Stream * stream,blas::UpperLower uplo,blas::Transpose trans,blas::Diagonal diag,uint64 n,const DeviceMemory<float> & a,int lda,DeviceMemory<float> * x,int incx)1480 bool CUDABlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo,
1481                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
1482                           const DeviceMemory<float> &a, int lda,
1483                           DeviceMemory<float> *x, int incx) {
1484   return DoBlasInternal(cublasStrmv, stream, true /* = pointer_mode_host */,
1485                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
1486                         CUDABlasDiagonal(diag), n, GpuMemory(a), lda,
1487                         GpuMemoryMutable(x), incx);
1488 }
1489 
DoBlasTrmv(Stream * stream,blas::UpperLower uplo,blas::Transpose trans,blas::Diagonal diag,uint64 n,const DeviceMemory<double> & a,int lda,DeviceMemory<double> * x,int incx)1490 bool CUDABlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo,
1491                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
1492                           const DeviceMemory<double> &a, int lda,
1493                           DeviceMemory<double> *x, int incx) {
1494   return DoBlasInternal(cublasDtrmv, stream, true /* = pointer_mode_host */,
1495                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
1496                         CUDABlasDiagonal(diag), n, GpuMemory(a), lda,
1497                         GpuMemoryMutable(x), incx);
1498 }
1499 
DoBlasTrmv(Stream * stream,blas::UpperLower uplo,blas::Transpose trans,blas::Diagonal diag,uint64 n,const DeviceMemory<std::complex<float>> & a,int lda,DeviceMemory<std::complex<float>> * x,int incx)1500 bool CUDABlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo,
1501                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
1502                           const DeviceMemory<std::complex<float>> &a, int lda,
1503                           DeviceMemory<std::complex<float>> *x, int incx) {
1504   return DoBlasInternal(cublasCtrmv, stream, true /* = pointer_mode_host */,
1505                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
1506                         CUDABlasDiagonal(diag), n, GpuComplex(GpuMemory(a)),
1507                         lda, GpuComplex(GpuMemoryMutable(x)), incx);
1508 }
1509 
DoBlasTrmv(Stream * stream,blas::UpperLower uplo,blas::Transpose trans,blas::Diagonal diag,uint64 n,const DeviceMemory<std::complex<double>> & a,int lda,DeviceMemory<std::complex<double>> * x,int incx)1510 bool CUDABlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo,
1511                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
1512                           const DeviceMemory<std::complex<double>> &a, int lda,
1513                           DeviceMemory<std::complex<double>> *x, int incx) {
1514   return DoBlasInternal(cublasZtrmv, stream, true /* = pointer_mode_host */,
1515                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
1516                         CUDABlasDiagonal(diag), n, GpuComplex(GpuMemory(a)),
1517                         lda, GpuComplex(GpuMemoryMutable(x)), incx);
1518 }
1519 
DoBlasTrsv(Stream * stream,blas::UpperLower uplo,blas::Transpose trans,blas::Diagonal diag,uint64 n,const DeviceMemory<float> & a,int lda,DeviceMemory<float> * x,int incx)1520 bool CUDABlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo,
1521                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
1522                           const DeviceMemory<float> &a, int lda,
1523                           DeviceMemory<float> *x, int incx) {
1524   return DoBlasInternal(cublasStrsv, stream, true /* = pointer_mode_host */,
1525                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
1526                         CUDABlasDiagonal(diag), n, GpuMemory(a), lda,
1527                         GpuMemoryMutable(x), incx);
1528 }
1529 
DoBlasTrsv(Stream * stream,blas::UpperLower uplo,blas::Transpose trans,blas::Diagonal diag,uint64 n,const DeviceMemory<double> & a,int lda,DeviceMemory<double> * x,int incx)1530 bool CUDABlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo,
1531                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
1532                           const DeviceMemory<double> &a, int lda,
1533                           DeviceMemory<double> *x, int incx) {
1534   return DoBlasInternal(cublasDtrsv, stream, true /* = pointer_mode_host */,
1535                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
1536                         CUDABlasDiagonal(diag), n, GpuMemory(a), lda,
1537                         GpuMemoryMutable(x), incx);
1538 }
1539 
DoBlasTrsv(Stream * stream,blas::UpperLower uplo,blas::Transpose trans,blas::Diagonal diag,uint64 n,const DeviceMemory<std::complex<float>> & a,int lda,DeviceMemory<std::complex<float>> * x,int incx)1540 bool CUDABlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo,
1541                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
1542                           const DeviceMemory<std::complex<float>> &a, int lda,
1543                           DeviceMemory<std::complex<float>> *x, int incx) {
1544   return DoBlasInternal(cublasCtrsv, stream, true /* = pointer_mode_host */,
1545                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
1546                         CUDABlasDiagonal(diag), n, GpuComplex(GpuMemory(a)),
1547                         lda, GpuComplex(GpuMemoryMutable(x)), incx);
1548 }
1549 
DoBlasTrsv(Stream * stream,blas::UpperLower uplo,blas::Transpose trans,blas::Diagonal diag,uint64 n,const DeviceMemory<std::complex<double>> & a,int lda,DeviceMemory<std::complex<double>> * x,int incx)1550 bool CUDABlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo,
1551                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
1552                           const DeviceMemory<std::complex<double>> &a, int lda,
1553                           DeviceMemory<std::complex<double>> *x, int incx) {
1554   return DoBlasInternal(cublasZtrsv, stream, true /* = pointer_mode_host */,
1555                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
1556                         CUDABlasDiagonal(diag), n, GpuComplex(GpuMemory(a)),
1557                         lda, GpuComplex(GpuMemoryMutable(x)), incx);
1558 }
1559 
DoBlasGemm(Stream * stream,blas::Transpose transa,blas::Transpose transb,uint64 m,uint64 n,uint64 k,float alpha,const DeviceMemory<Eigen::half> & a,int lda,const DeviceMemory<Eigen::half> & b,int ldb,float beta,DeviceMemory<Eigen::half> * c,int ldc)1560 bool CUDABlas::DoBlasGemm(
1561     Stream *stream, blas::Transpose transa,
1562     blas::Transpose transb, uint64 m, uint64 n, uint64 k,
1563     float alpha, const DeviceMemory<Eigen::half> &a, int lda,
1564     const DeviceMemory<Eigen::half> &b, int ldb, float beta,
1565     DeviceMemory<Eigen::half> *c, int ldc) {
1566 #if CUDA_VERSION >= 7050
1567   VLOG(1) << port::Printf(
1568       "doing cuBLAS SGEMM: at=%d bt=%d m=%llu n=%llu "
1569       "k=%llu alpha=%f a=%p lda=%d b=%p ldb=%d beta=%f "
1570       "c=%p ldc=%d",
1571       static_cast<int>(transa), static_cast<int>(transb), m, n, k, alpha,
1572       a.opaque(), lda, b.opaque(), ldb, beta, c->opaque(), ldc);
1573   if (transa == blas::Transpose::kNoTranspose) {
1574     if (lda < static_cast<int64>(m)) {
1575       LOG(WARNING) << "GEMM lda was smaller than m (no transpose case); "
1576                       "precondition violation";
1577     }
1578   } else {
1579     if (lda < static_cast<int64>(k)) {
1580       LOG(WARNING) << "GEMM lda (" << lda << ") was smaller than k (" << k
1581                    << ") (transpose case); precondition violation";
1582     }
1583   }
1584   if (transb == blas::Transpose::kNoTranspose) {
1585     if (ldb < static_cast<int64>(k)) {
1586       LOG(WARNING) << "GEMM ldb (" << ldb << ") was smaller than k (" << k
1587                    << ") (no transpose case); precondition violation";
1588     }
1589   } else {
1590     if (ldb < static_cast<int64>(n)) {
1591       LOG(WARNING) << "GEMM ldb was smaller than n (transpose case); "
1592                       "precondition violation";
1593     }
1594   }
1595 
1596   bool use_tensor_ops = false;
1597 #if CUDA_VERSION >= 9000
1598   int cc_major, cc_minor;
1599   stream->parent()->GetDeviceDescription().cuda_compute_capability(&cc_major,
1600                                                                    &cc_minor);
1601 
1602   // GPUs < sm_70 don't support tensor ops.
1603   if (cc_major >= 7 && TensorOpMathEnabled()) {
1604     use_tensor_ops = true;
1605   }
1606 #endif
1607 
1608   return DoBlasInternalImpl(
1609       cublasSgemmEx, stream, true /* = pointer_mode_host */,
1610       true /* = err_on_failure= */, use_tensor_ops, CUDABlasTranspose(transa),
1611       CUDABlasTranspose(transb), m, n, k, &alpha, GpuMemory(a),
1612       SE_CUDA_DATA_HALF, lda, GpuMemory(b), SE_CUDA_DATA_HALF, ldb, &beta,
1613       GpuMemoryMutable(c), SE_CUDA_DATA_HALF, ldc);
1614 
1615 #else
1616   LOG(ERROR) << "fp16 sgemm is not implemented in this cuBLAS version "
1617              << "(need at least CUDA 7.5)";
1618   return false;
1619 #endif
1620 }
1621 
DoBlasGemm(Stream * stream,blas::Transpose transa,blas::Transpose transb,uint64 m,uint64 n,uint64 k,float alpha,const DeviceMemory<float> & a,int lda,const DeviceMemory<float> & b,int ldb,float beta,DeviceMemory<float> * c,int ldc)1622 bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
1623                           blas::Transpose transb, uint64 m, uint64 n, uint64 k,
1624                           float alpha, const DeviceMemory<float> &a, int lda,
1625                           const DeviceMemory<float> &b, int ldb, float beta,
1626                           DeviceMemory<float> *c, int ldc) {
1627   VLOG(1) << port::Printf(
1628       "doing cuBLAS SGEMM: at=%d bt=%d m=%llu n=%llu "
1629       "k=%llu alpha=%f a=%p lda=%d b=%p ldb=%d beta=%f "
1630       "c=%p ldc=%d",
1631       static_cast<int>(transa), static_cast<int>(transb), m, n, k, alpha,
1632       a.opaque(), lda, b.opaque(), ldb, beta, c->opaque(), ldc);
1633   if (transa == blas::Transpose::kNoTranspose) {
1634     if (lda < static_cast<int64>(m)) {
1635       LOG(WARNING) << "GEMM lda was smaller than m (no transpose case); "
1636                       "precondition violation";
1637     }
1638   } else {
1639     if (lda < static_cast<int64>(k)) {
1640       LOG(WARNING) << "GEMM lda (" << lda << ") was smaller than k (" << k
1641                    << ") (transpose case); precondition violation";
1642     }
1643   }
1644   if (transb == blas::Transpose::kNoTranspose) {
1645     if (ldb < static_cast<int64>(k)) {
1646       LOG(WARNING) << "GEMM ldb (" << ldb << ") was smaller than k (" << k
1647                    << ") (no transpose case); precondition violation";
1648     }
1649   } else {
1650     if (ldb < static_cast<int64>(n)) {
1651       LOG(WARNING) << "GEMM ldb was smaller than n (transpose case); "
1652                       "precondition violation";
1653     }
1654   }
1655   return DoBlasInternal(cublasSgemm, stream, true /* = pointer_mode_host */,
1656                         CUDABlasTranspose(transa), CUDABlasTranspose(transb), m,
1657                         n, k, &alpha, GpuMemory(a), lda, GpuMemory(b), ldb,
1658                         &beta, GpuMemoryMutable(c), ldc);
1659 }
1660 
DoBlasGemm(Stream * stream,blas::Transpose transa,blas::Transpose transb,uint64 m,uint64 n,uint64 k,double alpha,const DeviceMemory<double> & a,int lda,const DeviceMemory<double> & b,int ldb,double beta,DeviceMemory<double> * c,int ldc)1661 bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
1662                           blas::Transpose transb, uint64 m, uint64 n, uint64 k,
1663                           double alpha, const DeviceMemory<double> &a, int lda,
1664                           const DeviceMemory<double> &b, int ldb, double beta,
1665                           DeviceMemory<double> *c, int ldc) {
1666   return DoBlasInternal(cublasDgemm, stream, true /* = pointer_mode_host */,
1667                         CUDABlasTranspose(transa), CUDABlasTranspose(transb), m,
1668                         n, k, &alpha, GpuMemory(a), lda, GpuMemory(b), ldb,
1669                         &beta, GpuMemoryMutable(c), ldc);
1670 }
1671 
DoBlasGemm(Stream * stream,blas::Transpose transa,blas::Transpose transb,uint64 m,uint64 n,uint64 k,std::complex<float> alpha,const DeviceMemory<std::complex<float>> & a,int lda,const DeviceMemory<std::complex<float>> & b,int ldb,std::complex<float> beta,DeviceMemory<std::complex<float>> * c,int ldc)1672 bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
1673                           blas::Transpose transb, uint64 m, uint64 n, uint64 k,
1674                           std::complex<float> alpha,
1675                           const DeviceMemory<std::complex<float>> &a, int lda,
1676                           const DeviceMemory<std::complex<float>> &b, int ldb,
1677                           std::complex<float> beta,
1678                           DeviceMemory<std::complex<float>> *c, int ldc) {
1679   return DoBlasInternal(cublasCgemm, stream, true /* = pointer_mode_host */,
1680                         CUDABlasTranspose(transa), CUDABlasTranspose(transb), m,
1681                         n, k, GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
1682                         GpuComplex(GpuMemory(b)), ldb, GpuComplex(&beta),
1683                         GpuComplex(GpuMemoryMutable(c)), ldc);
1684 }
1685 
DoBlasGemm(Stream * stream,blas::Transpose transa,blas::Transpose transb,uint64 m,uint64 n,uint64 k,std::complex<double> alpha,const DeviceMemory<std::complex<double>> & a,int lda,const DeviceMemory<std::complex<double>> & b,int ldb,std::complex<double> beta,DeviceMemory<std::complex<double>> * c,int ldc)1686 bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
1687                           blas::Transpose transb, uint64 m, uint64 n, uint64 k,
1688                           std::complex<double> alpha,
1689                           const DeviceMemory<std::complex<double>> &a, int lda,
1690                           const DeviceMemory<std::complex<double>> &b, int ldb,
1691                           std::complex<double> beta,
1692                           DeviceMemory<std::complex<double>> *c, int ldc) {
1693   return DoBlasInternal(cublasZgemm, stream, true /* = pointer_mode_host */,
1694                         CUDABlasTranspose(transa), CUDABlasTranspose(transb), m,
1695                         n, k, GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
1696                         GpuComplex(GpuMemory(b)), ldb, GpuComplex(&beta),
1697                         GpuComplex(GpuMemoryMutable(c)), ldc);
1698 }
1699 
DoBlasGemvWithProfiling(Stream * stream,blas::Transpose trans,uint64 m,uint64 n,float alpha,const DeviceMemory<float> & a,int lda,const DeviceMemory<float> & x,int incx,float beta,DeviceMemory<float> * y,int incy,blas::ProfileResult * output_profile_result)1700 bool CUDABlas::DoBlasGemvWithProfiling(
1701     Stream *stream, blas::Transpose trans, uint64 m, uint64 n, float alpha,
1702     const DeviceMemory<float> &a, int lda, const DeviceMemory<float> &x,
1703     int incx, float beta, DeviceMemory<float> *y, int incy,
1704     blas::ProfileResult *output_profile_result) {
1705   return DoBlasGemvWithProfilingImpl(stream, trans, m, n, alpha, a, lda, x,
1706                                      incx, beta, y, incy,
1707                                      output_profile_result);
1708 }
1709 
DoBlasGemvWithProfiling(Stream * stream,blas::Transpose trans,uint64 m,uint64 n,double alpha,const DeviceMemory<double> & a,int lda,const DeviceMemory<double> & x,int incx,double beta,DeviceMemory<double> * y,int incy,blas::ProfileResult * output_profile_result)1710 bool CUDABlas::DoBlasGemvWithProfiling(
1711     Stream *stream, blas::Transpose trans, uint64 m, uint64 n, double alpha,
1712     const DeviceMemory<double> &a, int lda, const DeviceMemory<double> &x,
1713     int incx, double beta, DeviceMemory<double> *y, int incy,
1714     blas::ProfileResult *output_profile_result) {
1715   return DoBlasGemvWithProfilingImpl(stream, trans, m, n, alpha, a, lda, x,
1716                                      incx, beta, y, incy,
1717                                      output_profile_result);
1718 }
1719 
DoBlasGemvWithProfiling(Stream * stream,blas::Transpose trans,uint64 m,uint64 n,std::complex<float> alpha,const DeviceMemory<std::complex<float>> & a,int lda,const DeviceMemory<std::complex<float>> & x,int incx,std::complex<float> beta,DeviceMemory<std::complex<float>> * y,int incy,blas::ProfileResult * output_profile_result)1720 bool CUDABlas::DoBlasGemvWithProfiling(
1721     Stream *stream, blas::Transpose trans, uint64 m, uint64 n,
1722     std::complex<float> alpha, const DeviceMemory<std::complex<float>> &a,
1723     int lda, const DeviceMemory<std::complex<float>> &x, int incx,
1724     std::complex<float> beta, DeviceMemory<std::complex<float>> *y, int incy,
1725     blas::ProfileResult *output_profile_result) {
1726   return DoBlasGemvWithProfilingImpl(stream, trans, m, n, alpha, a, lda, x,
1727                                      incx, beta, y, incy,
1728                                      output_profile_result);
1729 }
1730 
DoBlasGemvWithProfiling(Stream * stream,blas::Transpose trans,uint64 m,uint64 n,std::complex<double> alpha,const DeviceMemory<std::complex<double>> & a,int lda,const DeviceMemory<std::complex<double>> & x,int incx,std::complex<double> beta,DeviceMemory<std::complex<double>> * y,int incy,blas::ProfileResult * output_profile_result)1731 bool CUDABlas::DoBlasGemvWithProfiling(
1732     Stream *stream, blas::Transpose trans, uint64 m, uint64 n,
1733     std::complex<double> alpha, const DeviceMemory<std::complex<double>> &a,
1734     int lda, const DeviceMemory<std::complex<double>> &x, int incx,
1735     std::complex<double> beta, DeviceMemory<std::complex<double>> *y, int incy,
1736     blas::ProfileResult *output_profile_result) {
1737   return DoBlasGemvWithProfilingImpl(stream, trans, m, n, alpha, a, lda, x,
1738                                      incx, beta, y, incy,
1739                                      output_profile_result);
1740 }
1741 
DoBlasGemmWithProfiling(Stream * stream,blas::Transpose transa,blas::Transpose transb,uint64 m,uint64 n,uint64 k,float alpha,const DeviceMemory<Eigen::half> & a,int lda,const DeviceMemory<Eigen::half> & b,int ldb,float beta,DeviceMemory<Eigen::half> * c,int ldc,blas::ProfileResult * output_profile_result)1742 bool CUDABlas::DoBlasGemmWithProfiling(
1743     Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
1744     uint64 n, uint64 k, float alpha, const DeviceMemory<Eigen::half> &a,
1745     int lda, const DeviceMemory<Eigen::half> &b, int ldb, float beta,
1746     DeviceMemory<Eigen::half> *c, int ldc,
1747     blas::ProfileResult *output_profile_result) {
1748   return DoBlasGemmWithProfilingImpl(stream, transa, transb, m, n, k, alpha, a,
1749                                      lda, b, ldb, beta, c, ldc,
1750                                      output_profile_result);
1751 }
1752 
DoBlasGemmWithProfiling(Stream * stream,blas::Transpose transa,blas::Transpose transb,uint64 m,uint64 n,uint64 k,float alpha,const DeviceMemory<float> & a,int lda,const DeviceMemory<float> & b,int ldb,float beta,DeviceMemory<float> * c,int ldc,blas::ProfileResult * output_profile_result)1753 bool CUDABlas::DoBlasGemmWithProfiling(
1754     Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
1755     uint64 n, uint64 k, float alpha, const DeviceMemory<float> &a, int lda,
1756     const DeviceMemory<float> &b, int ldb, float beta, DeviceMemory<float> *c,
1757     int ldc, blas::ProfileResult *output_profile_result) {
1758   return DoBlasGemmWithProfilingImpl(stream, transa, transb, m, n, k, alpha, a,
1759                                      lda, b, ldb, beta, c, ldc,
1760                                      output_profile_result);
1761 }
1762 
DoBlasGemmWithProfiling(Stream * stream,blas::Transpose transa,blas::Transpose transb,uint64 m,uint64 n,uint64 k,double alpha,const DeviceMemory<double> & a,int lda,const DeviceMemory<double> & b,int ldb,double beta,DeviceMemory<double> * c,int ldc,blas::ProfileResult * output_profile_result)1763 bool CUDABlas::DoBlasGemmWithProfiling(
1764     Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
1765     uint64 n, uint64 k, double alpha, const DeviceMemory<double> &a, int lda,
1766     const DeviceMemory<double> &b, int ldb, double beta,
1767     DeviceMemory<double> *c, int ldc,
1768     blas::ProfileResult *output_profile_result) {
1769   return DoBlasGemmWithProfilingImpl(stream, transa, transb, m, n, k, alpha, a,
1770                                      lda, b, ldb, beta, c, ldc,
1771                                      output_profile_result);
1772 }
1773 
DoBlasGemmWithProfiling(Stream * stream,blas::Transpose transa,blas::Transpose transb,uint64 m,uint64 n,uint64 k,std::complex<float> alpha,const DeviceMemory<std::complex<float>> & a,int lda,const DeviceMemory<std::complex<float>> & b,int ldb,std::complex<float> beta,DeviceMemory<std::complex<float>> * c,int ldc,blas::ProfileResult * output_profile_result)1774 bool CUDABlas::DoBlasGemmWithProfiling(
1775     Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
1776     uint64 n, uint64 k, std::complex<float> alpha,
1777     const DeviceMemory<std::complex<float>> &a, int lda,
1778     const DeviceMemory<std::complex<float>> &b, int ldb,
1779     std::complex<float> beta, DeviceMemory<std::complex<float>> *c, int ldc,
1780     blas::ProfileResult *output_profile_result) {
1781   return DoBlasGemmWithProfilingImpl(stream, transa, transb, m, n, k, alpha, a,
1782                                      lda, b, ldb, beta, c, ldc,
1783                                      output_profile_result);
1784 }
1785 
DoBlasGemmWithProfiling(Stream * stream,blas::Transpose transa,blas::Transpose transb,uint64 m,uint64 n,uint64 k,std::complex<double> alpha,const DeviceMemory<std::complex<double>> & a,int lda,const DeviceMemory<std::complex<double>> & b,int ldb,std::complex<double> beta,DeviceMemory<std::complex<double>> * c,int ldc,blas::ProfileResult * output_profile_result)1786 bool CUDABlas::DoBlasGemmWithProfiling(
1787     Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
1788     uint64 n, uint64 k, std::complex<double> alpha,
1789     const DeviceMemory<std::complex<double>> &a, int lda,
1790     const DeviceMemory<std::complex<double>> &b, int ldb,
1791     std::complex<double> beta, DeviceMemory<std::complex<double>> *c, int ldc,
1792     blas::ProfileResult *output_profile_result) {
1793   return DoBlasGemmWithProfilingImpl(stream, transa, transb, m, n, k, alpha, a,
1794                                      lda, b, ldb, beta, c, ldc,
1795                                      output_profile_result);
1796 }
1797 
// Shared implementation for the DoBlasGemvWithProfiling overloads: runs
// DoBlasGemv and, when |output_profile_result| is non-null, brackets the call
// with a GpuTimer and records the elapsed time under algorithm
// blas::kDefaultBlasGemv.
//
// Returns false if the timer cannot be started/stopped or if the underlying
// GEMV fails; the profile result is only marked valid on full success.
template <typename T>
bool CUDABlas::DoBlasGemvWithProfilingImpl(
    Stream *stream, blas::Transpose trans, uint64 m, uint64 n, const T &alpha,
    const DeviceMemory<T> &a, int lda, const DeviceMemory<T> &x, int incx,
    const T &beta, DeviceMemory<T> *y, int incy,
    blas::ProfileResult *output_profile_result) {
  // Only pay for timer setup when the caller actually wants profiling data.
  std::unique_ptr<GpuTimer, GpuTimerDeleter> timer;
  if (output_profile_result != nullptr) {
    timer.reset(new GpuTimer(parent_));
    if (!timer->Init() || !timer->Start(AsGpuStream(stream))) {
      return false;
    }
  }

  // Call blasGemv.
  bool result =
      DoBlasGemv(stream, trans, m, n, alpha, a, lda, x, incx, beta, y, incy);

  if (timer != nullptr && result) {
    // GpuTimer will CHECK-fail if we Stop() it while the stream is in an error
    // state.
    if (!timer->Stop(AsGpuStream(stream))) {
      return false;
    }
    output_profile_result->set_is_valid(true);
    output_profile_result->set_algorithm(blas::kDefaultBlasGemv);
    output_profile_result->set_elapsed_time_in_ms(
        timer->GetElapsedMilliseconds());
  }
  return result;
}
1829 
// Shared implementation for the DoBlasGemmWithProfiling overloads: runs
// DoBlasGemm and, when |output_profile_result| is non-null, brackets the call
// with a GpuTimer and records the elapsed time under algorithm
// blas::kDefaultBlasGemm.  ParamType allows the scalar type (e.g. float) to
// differ from the element type T (e.g. Eigen::half).
//
// Returns false if the timer cannot be started/stopped or if the underlying
// GEMM fails; the profile result is only marked valid on full success.
template <typename T, typename ParamType>
bool CUDABlas::DoBlasGemmWithProfilingImpl(
    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
    uint64 n, uint64 k, const ParamType &alpha, const DeviceMemory<T> &a,
    int lda, const DeviceMemory<T> &b, int ldb, const ParamType &beta,
    DeviceMemory<T> *c, int ldc, blas::ProfileResult *output_profile_result) {
  // Only pay for timer setup when the caller actually wants profiling data.
  std::unique_ptr<GpuTimer, GpuTimerDeleter> timer;
  if (output_profile_result != nullptr) {
    timer.reset(new GpuTimer(parent_));
    if (!timer->Init() || !timer->Start(AsGpuStream(stream))) {
      return false;
    }
  }

  // Call blasGemm.
  bool result = DoBlasGemm(stream, transa, transb, m, n, k, alpha, a, lda, b,
                           ldb, beta, c, ldc);

  if (timer != nullptr && result) {
    // GpuTimer will CHECK-fail if we Stop() it while the stream is in an error
    // state.
    if (!timer->Stop(AsGpuStream(stream))) {
      return false;
    }
    output_profile_result->set_is_valid(true);
    output_profile_result->set_algorithm(blas::kDefaultBlasGemm);
    output_profile_result->set_elapsed_time_in_ms(
        timer->GetElapsedMilliseconds());
  }
  return result;
}
1861 
UsesTensorOps(blas::AlgorithmType algo)1862 static bool UsesTensorOps(blas::AlgorithmType algo) {
1863 #if CUDA_VERSION >= 9000
1864   cublasGemmAlgo_t cublas_algo = static_cast<cublasGemmAlgo_t>(algo);
1865   return cublas_algo >= CUBLAS_GEMM_DEFAULT_TENSOR_OP;
1866 #else
1867   return false;
1868 #endif
1869 }
1870 
// Returns true if tensor-op math may be used for gemm inputs of type InType on
// a device with the given major compute capability.
template <typename InType>
static bool TensorOpsAvailable(int cc_major) {
#if CUDA_VERSION >= 9000
  // cublas *does* allow tensor ops on inputs that are not fp16, so this is not
  // strictly correct.  We can't simply enable it, though, as that would change
  // clients' behavior significantly: Using tensor ops on fp32 inputs cause them
  // to be rounded to fp16.
  return cc_major >= 7 && TensorOpMathEnabled() &&
         std::is_same<InType, Eigen::half>::value;
#else
  return false;
#endif
}
1885 
// Shared implementation behind the type-specific DoBlasGemmWithAlgorithm
// overloads: runs C = alpha * op(A) * op(B) + beta * C through cublasGemmEx
// with an explicitly requested algorithm, optionally timing the call into
// |output_profile_result|.
//
// Returns false without calling cuBLAS when any precondition fails: the device
// is older than sm_50, the algorithm needs tensor ops that are unavailable for
// InT on this device, alpha/beta disagree about host-vs-device placement, the
// timer cannot be created, or a known cuBLAS bug would be triggered.
template <typename InT, typename OutT, typename CompT>
bool CUDABlas::DoBlasGemmWithAlgorithmImpl(
    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
    uint64 n, uint64 k, const HostOrDeviceScalar<CompT> &alpha,
    const DeviceMemory<InT> &a, int lda, const DeviceMemory<InT> &b, int ldb,
    const HostOrDeviceScalar<CompT> &beta, DeviceMemory<OutT> *c, int ldc,
    blas::ComputationType computation_type, blas::AlgorithmType algorithm,
    blas::ProfileResult *output_profile_result) {
  // GPUs < sm_50 don't support cublasGemmEx.
  int cc_major, cc_minor;
  if (stream->parent()->GetDeviceDescription().cuda_compute_capability(
          &cc_major, &cc_minor) &&
      cc_major < 5) {
    VLOG(2) << "DoBlasGemmWithAlgorithm returning false because sm" << cc_major
            << cc_minor << " devices don't support explicit gemm algorithms.";
    return false;
  }

  // Reject tensor-op algorithms when the device or the input type can't use
  // them; the two log messages distinguish the cause for easier debugging.
  if (UsesTensorOps(algorithm) && !TensorOpsAvailable<InT>(cc_major)) {
    if (std::is_same<InT, Eigen::half>::value) {
      VLOG(2) << "DoBlasGemmWithAlgorithm returning false because algorithm "
              << algorithm
              << " uses tensor ops, but tensor ops are not available in sm"
              << cc_major << "X devices.";
    } else {
      VLOG(2) << "DoBlasGemmWithAlgorithm returning false because algorithm "
              << algorithm
              << " uses tensor ops, but the input data type is not fp16.";
    }
    return false;
  }

  // Either both 'alpha' and 'beta' need to be pointers to device memory, or
  // they need to be both host scalars.
  if (alpha.is_pointer() != beta.is_pointer()) {
    VLOG(2) << "DoBlasGemmWithAlgorithm returning false because one of `alpha` "
               "and `beta` is a pointer, but the other is not.";
    return false;
  }

  // Timer is only created when the caller asked for profiling output.
  std::unique_ptr<GpuTimer, GpuTimerDeleter> timer;
  if (output_profile_result != nullptr) {
    timer.reset(new GpuTimer(parent_));
    if (!timer->Init() || !timer->Start(AsGpuStream(stream))) {
      VLOG(2) << "DoBlasGemmWithAlgorithm returning false because "
                 "output_profile_result was given, but we were unable to "
                 "create a GpuTimer.";
      return false;
    }
  }

  // Return false if we might be hitting a cuBLAS bug that produces the wrong
  // result. See nvbugs/2156201, b/79126339.
#if CUDA_VERSION >= 9000 && CUDA_VERSION < 9020
  if ((algorithm == CUBLAS_GEMM_DEFAULT || algorithm >= CUBLAS_GEMM_ALGO13) &&
      std::max({m, n, k}) >= 2097153 && cc_major < 7) {
    VLOG(2) << "DoBlasGemmWithAlgorithm returning false to work around cudnn "
               "<9.2 bug with m, n, or k >= 2097153.  See b/79126339.";
    return false;
  }
#endif

  cudaDataType_t cuda_in_type = CUDADataType<InT>::type;
  // Since we are converting 'algorithm' to cublasGemmAlgo_t by static_cast,
  // we do the following compile-time check on the default value:
  static_assert(blas::kDefaultGemmAlgo == CUBLAS_GEMM_DFALT, "");
  // If 'alpha' and 'beta' are host scalars and CompT is Eigen::half, we
  // essentially reinterpet_cast to __half, which is safe because Eigen::half
  // inherits from __half.
  bool result = DoBlasInternalFailureOK(
      cublasGemmEx, stream, /* pointer_mode_host = */ !alpha.is_pointer(),
      CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k,
      alpha.is_pointer() ? GpuMemory(alpha.pointer()) : &alpha.value(),
      GpuMemory(a), cuda_in_type, lda, GpuMemory(b), cuda_in_type, ldb,
      beta.is_pointer() ? GpuMemory(beta.pointer()) : &beta.value(),
      GpuMemoryMutable(c), CUDADataType<OutT>::type, ldc,
      CUDAComputationType(computation_type),
      static_cast<cublasGemmAlgo_t>(algorithm));

  if (timer != nullptr && result) {
    // GpuTimer will CHECK-fail if we Stop() it while the stream is in an error
    // state.
    if (!timer->Stop(AsGpuStream(stream))) {
      VLOG(2) << "DoBlasGemmWithAlgorithm returning false; unable to stop "
                 "GpuTimer.";
      return false;
    }
    output_profile_result->set_is_valid(true);
    output_profile_result->set_algorithm(algorithm);
    output_profile_result->set_elapsed_time_in_ms(
        timer->GetElapsedMilliseconds());
  }
  return result;
}
1980 
GetBlasGemmAlgorithms(std::vector<blas::AlgorithmType> * out_algorithms)1981 bool CUDABlas::GetBlasGemmAlgorithms(
1982     std::vector<blas::AlgorithmType> *out_algorithms) {
1983   // cublasGemmAlgo_t (and the function that accepts this type, cublasGemmEx)
1984   // were first introduced in CUDA 8.
1985   //
1986   // Note that when CUDA version and compute capability is not sufficient, we
1987   // still return the out_algorithms. Caller needs to make sure that in this
1988   // case, the returned vector is empty.
1989   *out_algorithms = {
1990     CUBLAS_GEMM_DFALT,
1991     CUBLAS_GEMM_ALGO0,
1992     CUBLAS_GEMM_ALGO1,
1993     CUBLAS_GEMM_ALGO2,
1994     CUBLAS_GEMM_ALGO3,
1995     CUBLAS_GEMM_ALGO4,
1996     CUBLAS_GEMM_ALGO5,
1997     CUBLAS_GEMM_ALGO6,
1998     CUBLAS_GEMM_ALGO7,
1999 #if CUDA_VERSION >= 9000
2000     CUBLAS_GEMM_ALGO8,
2001     CUBLAS_GEMM_ALGO9,
2002     CUBLAS_GEMM_ALGO10,
2003     CUBLAS_GEMM_ALGO11,
2004     CUBLAS_GEMM_ALGO12,
2005     CUBLAS_GEMM_ALGO13,
2006     CUBLAS_GEMM_ALGO14,
2007     CUBLAS_GEMM_ALGO15,
2008     CUBLAS_GEMM_ALGO16,
2009     CUBLAS_GEMM_ALGO17,
2010     CUBLAS_GEMM_DFALT_TENSOR_OP,
2011     CUBLAS_GEMM_ALGO0_TENSOR_OP,
2012     CUBLAS_GEMM_ALGO1_TENSOR_OP,
2013     CUBLAS_GEMM_ALGO2_TENSOR_OP,
2014     CUBLAS_GEMM_ALGO3_TENSOR_OP,
2015     CUBLAS_GEMM_ALGO4_TENSOR_OP,
2016 #endif
2017 #if CUDA_VERSION >= 9020
2018     CUBLAS_GEMM_ALGO18,
2019     CUBLAS_GEMM_ALGO19,
2020     CUBLAS_GEMM_ALGO20,
2021     CUBLAS_GEMM_ALGO21,
2022     CUBLAS_GEMM_ALGO22,
2023     CUBLAS_GEMM_ALGO23,
2024     CUBLAS_GEMM_ALGO5_TENSOR_OP,
2025     CUBLAS_GEMM_ALGO6_TENSOR_OP,
2026     CUBLAS_GEMM_ALGO7_TENSOR_OP,
2027     CUBLAS_GEMM_ALGO8_TENSOR_OP,
2028     CUBLAS_GEMM_ALGO9_TENSOR_OP,
2029     CUBLAS_GEMM_ALGO10_TENSOR_OP,
2030     CUBLAS_GEMM_ALGO11_TENSOR_OP,
2031     CUBLAS_GEMM_ALGO12_TENSOR_OP,
2032     CUBLAS_GEMM_ALGO13_TENSOR_OP,
2033     CUBLAS_GEMM_ALGO14_TENSOR_OP,
2034     CUBLAS_GEMM_ALGO15_TENSOR_OP,
2035 #endif
2036   };
2037   return true;
2038 }
2039 
// int8 GEMM with int32 output and explicit algorithm selection.  Forwards
// directly to the templated DoBlasGemmWithAlgorithmImpl, which issues
// cublasGemmEx with the data types inferred from the argument types.
bool CUDABlas::DoBlasGemmWithAlgorithm(
    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
    uint64 n, uint64 k, const HostOrDeviceScalar<int> &alpha,
    const DeviceMemory<int8> &a, int lda, const DeviceMemory<int8> &b, int ldb,
    const HostOrDeviceScalar<int> &beta, DeviceMemory<int> *c, int ldc,
    blas::ComputationType computation_type, blas::AlgorithmType algorithm,
    blas::ProfileResult *output_profile_result) {
  return DoBlasGemmWithAlgorithmImpl(
      stream, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
      computation_type, algorithm, output_profile_result);
}
2051 
DoBlasGemmWithAlgorithm(Stream * stream,blas::Transpose transa,blas::Transpose transb,uint64 m,uint64 n,uint64 k,const HostOrDeviceScalar<Eigen::half> & alpha,const DeviceMemory<Eigen::half> & a,int lda,const DeviceMemory<Eigen::half> & b,int ldb,const HostOrDeviceScalar<Eigen::half> & beta,DeviceMemory<Eigen::half> * c,int ldc,blas::ComputationType computation_type,blas::AlgorithmType algorithm,blas::ProfileResult * output_profile_result)2052 bool CUDABlas::DoBlasGemmWithAlgorithm(
2053     Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
2054     uint64 n, uint64 k, const HostOrDeviceScalar<Eigen::half> &alpha,
2055     const DeviceMemory<Eigen::half> &a, int lda,
2056     const DeviceMemory<Eigen::half> &b, int ldb,
2057     const HostOrDeviceScalar<Eigen::half> &beta, DeviceMemory<Eigen::half> *c,
2058     int ldc, blas::ComputationType computation_type,
2059     blas::AlgorithmType algorithm, blas::ProfileResult *output_profile_result) {
2060   if (computation_type == blas::ComputationType::kF32) {
2061     if (alpha.is_pointer() || beta.is_pointer()) {
2062       // We cannot easily convert a pointer to f16 memory to a pointer to f32
2063       // memory from here, so we don't support this for now.
2064       // TODO(akuegel): Investigate whether we can do the conversion before
2065       // calling DoBlasGemmWithAlgorithm.
2066       return false;
2067     }
2068     HostOrDeviceScalar<float> float_alpha(static_cast<float>(alpha.value()));
2069     HostOrDeviceScalar<float> float_beta(static_cast<float>(beta.value()));
2070     return DoBlasGemmWithAlgorithmImpl(
2071         stream, transa, transb, m, n, k, float_alpha, a, lda, b, ldb,
2072         float_beta, c, ldc, computation_type, algorithm, output_profile_result);
2073   }
2074 
2075   CHECK_EQ(computation_type, blas::ComputationType::kF16);
2076   return DoBlasGemmWithAlgorithmImpl(
2077       stream, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
2078       computation_type, algorithm, output_profile_result);
2079 }
2080 
// fp32 GEMM with explicit algorithm selection; forwards directly to the
// templated DoBlasGemmWithAlgorithmImpl (cublasGemmEx).
bool CUDABlas::DoBlasGemmWithAlgorithm(
    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
    uint64 n, uint64 k, const HostOrDeviceScalar<float> &alpha,
    const DeviceMemory<float> &a, int lda, const DeviceMemory<float> &b,
    int ldb, const HostOrDeviceScalar<float> &beta, DeviceMemory<float> *c,
    int ldc, blas::ComputationType computation_type,
    blas::AlgorithmType algorithm, blas::ProfileResult *output_profile_result) {
  return DoBlasGemmWithAlgorithmImpl(
      stream, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
      computation_type, algorithm, output_profile_result);
}
2092 
// fp64 GEMM with explicit algorithm selection; forwards directly to the
// templated DoBlasGemmWithAlgorithmImpl (cublasGemmEx).
bool CUDABlas::DoBlasGemmWithAlgorithm(
    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
    uint64 n, uint64 k, const HostOrDeviceScalar<double> &alpha,
    const DeviceMemory<double> &a, int lda, const DeviceMemory<double> &b,
    int ldb, const HostOrDeviceScalar<double> &beta, DeviceMemory<double> *c,
    int ldc, blas::ComputationType computation_type,
    blas::AlgorithmType algorithm, blas::ProfileResult *output_profile_result) {
  return DoBlasGemmWithAlgorithmImpl(
      stream, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
      computation_type, algorithm, output_profile_result);
}
2104 
// complex64 GEMM with explicit algorithm selection; forwards directly to the
// templated DoBlasGemmWithAlgorithmImpl (cublasGemmEx).
bool CUDABlas::DoBlasGemmWithAlgorithm(
    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
    uint64 n, uint64 k, const HostOrDeviceScalar<std::complex<float>> &alpha,
    const DeviceMemory<std::complex<float>> &a, int lda,
    const DeviceMemory<std::complex<float>> &b, int ldb,
    const HostOrDeviceScalar<std::complex<float>> &beta,
    DeviceMemory<std::complex<float>> *c, int ldc,
    blas::ComputationType computation_type, blas::AlgorithmType algorithm,
    blas::ProfileResult *output_profile_result) {
  return DoBlasGemmWithAlgorithmImpl(
      stream, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
      computation_type, algorithm, output_profile_result);
}
2118 
// complex128 GEMM with explicit algorithm selection; forwards directly to the
// templated DoBlasGemmWithAlgorithmImpl (cublasGemmEx).
bool CUDABlas::DoBlasGemmWithAlgorithm(
    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
    uint64 n, uint64 k, const HostOrDeviceScalar<std::complex<double>> &alpha,
    const DeviceMemory<std::complex<double>> &a, int lda,
    const DeviceMemory<std::complex<double>> &b, int ldb,
    const HostOrDeviceScalar<std::complex<double>> &beta,
    DeviceMemory<std::complex<double>> *c, int ldc,
    blas::ComputationType computation_type, blas::AlgorithmType algorithm,
    blas::ProfileResult *output_profile_result) {
  return DoBlasGemmWithAlgorithmImpl(
      stream, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
      computation_type, algorithm, output_profile_result);
}
2132 
2133 template <typename T>
2134 struct HalfAsFloat {
2135   typedef T type;
2136 };
2137 
2138 template <>
2139 struct HalfAsFloat<Eigen::half> {
2140   typedef float type;
2141 };
2142 
// Shared implementation behind every DoBlasGemmBatched overload.
//
// Stages device-side arrays of the per-batch A/B/C matrix pointers (allocated
// from `scratch_allocator` when provided, otherwise from stream-owned
// temporary memory), then dispatches to the most capable cuBLAS entry point
// available:
//   * CUDA >= 9.1 on SM >= 5.0: cublasGemmBatchedEx (fp16 data computed with
//     an fp32 compute type, optionally using tensor ops);
//   * otherwise, for non-fp16 element types: `cublas_func`, i.e. the classic
//     cublas{S,D,C,Z}gemmBatched passed in by the caller;
//   * otherwise (fp16 on older CUDA/hardware): a loop of per-batch
//     DoBlasGemm calls.
// Returns an INTERNAL error on allocation, copy, or BLAS failure.
template <typename T, typename Scalar, typename FuncT>
port::Status CUDABlas::DoBlasGemmBatchedInternal(
    FuncT cublas_func, Stream *stream, blas::Transpose transa,
    blas::Transpose transb, uint64 m, uint64 n, uint64 k, Scalar alpha,
    const port::ArraySlice<DeviceMemory<T> *> &a_ptrs_to_wrappers, int lda,
    const port::ArraySlice<DeviceMemory<T> *> &b_ptrs_to_wrappers, int ldb,
    Scalar beta, const port::ArraySlice<DeviceMemory<T> *> &c_ptrs_to_wrappers,
    int ldc, int batch_count, ScratchAllocator *scratch_allocator) {
  // Gather the raw device pointers on the host; they are mirrored to device
  // memory below, since the batched cuBLAS APIs read the pointer arrays from
  // the device.
  std::vector<T *> a_raw_ptrs, b_raw_ptrs, c_raw_ptrs;
  for (int i = 0; i < batch_count; ++i) {
    a_raw_ptrs.push_back(static_cast<T *>(a_ptrs_to_wrappers[i]->opaque()));
    b_raw_ptrs.push_back(static_cast<T *>(b_ptrs_to_wrappers[i]->opaque()));
    c_raw_ptrs.push_back(static_cast<T *>(c_ptrs_to_wrappers[i]->opaque()));
  }

  // Element type used for the typed pointer arrays.  Note for Eigen::half
  // this is float (via HalfAsFloat); the fp16 paths below either consume the
  // arrays as void** (cublasGemmBatchedEx) or bypass them entirely (the
  // per-batch fallback loop), so the nominal type is not dereferenced as-is.
  typedef typename HalfAsFloat<typename GpuComplexT<T>::type>::type CUDA_T;

  // Byte size of one pointer array (one pointer per batch entry).
  const size_t size = batch_count * sizeof(CUDA_T *);

  // Device-side copy of pointers to matrices.
  DeviceMemory<CUDA_T *> a;
  DeviceMemory<CUDA_T *> b;
  DeviceMemory<CUDA_T *> c;

  // If temporary space is allocated for device-side copies of pointers to
  // matrices, that temporary space should not be freed until this function
  // returns. Although the values for these unique_ptrs are not set here, they
  // are declared at this scope so they will be destroyed when the function
  // returns.
  //
  // If a scratch allocator is provided, these pointers will not be used at all.
  std::unique_ptr<TemporaryDeviceMemory<CUDA_T *>> a_temporary;
  std::unique_ptr<TemporaryDeviceMemory<CUDA_T *>> b_temporary;
  std::unique_ptr<TemporaryDeviceMemory<CUDA_T *>> c_temporary;

  // Decide how to allocate device-side copy of pointers to matrices based on
  // whether a scratch allocator was passed.
  if (scratch_allocator != nullptr) {
    SE_ASSIGN_OR_RETURN(DeviceMemory<uint8> a_bytes,
                        scratch_allocator->AllocateBytes(stream, size));
    SE_ASSIGN_OR_RETURN(DeviceMemory<uint8> b_bytes,
                        scratch_allocator->AllocateBytes(stream, size));
    SE_ASSIGN_OR_RETURN(DeviceMemory<uint8> c_bytes,
                        scratch_allocator->AllocateBytes(stream, size));
    a = DeviceMemory<CUDA_T *>(a_bytes);
    b = DeviceMemory<CUDA_T *>(b_bytes);
    c = DeviceMemory<CUDA_T *>(c_bytes);
  } else {
    SE_ASSIGN_OR_RETURN(a_temporary,
                        stream->AllocateTemporaryArray<CUDA_T *>(batch_count));
    SE_ASSIGN_OR_RETURN(b_temporary,
                        stream->AllocateTemporaryArray<CUDA_T *>(batch_count));
    SE_ASSIGN_OR_RETURN(c_temporary,
                        stream->AllocateTemporaryArray<CUDA_T *>(batch_count));
    a = DeviceMemory<CUDA_T *>(*a_temporary->mutable_device_memory());
    b = DeviceMemory<CUDA_T *>(*b_temporary->mutable_device_memory());
    c = DeviceMemory<CUDA_T *>(*c_temporary->mutable_device_memory());
  }

  // Copy the host-side pointer arrays into the device buffers.
  if (!stream->ThenMemcpy(&a, a_raw_ptrs.data(), size).ok() ||
      !stream->ThenMemcpy(&b, b_raw_ptrs.data(), size).ok() ||
      !stream->ThenMemcpy(&c, c_raw_ptrs.data(), size).ok()) {
    return port::Status(port::error::INTERNAL,
                        "failed to copy memory from host to device in "
                        "CUDABlas::DoBlasGemmBatched");
  }

  cudaDataType_t data_type = CUDADataType<T>::type;

#if CUDA_VERSION >= 9010
  int cc_major, cc_minor;
  if (stream->parent()->GetDeviceDescription().cuda_compute_capability(
          &cc_major, &cc_minor) &&
      cc_major >= 5) {
    // Preferred path: cublasGemmBatchedEx.  fp16 inputs are computed with an
    // fp32 compute type and may use tensor ops when enabled.
    bool use_tensor_ops = TensorOpMathEnabled() && data_type == CUDA_R_16F;
    cublasGemmAlgo_t algo =
        (use_tensor_ops ? CUBLAS_GEMM_DFALT_TENSOR_OP : CUBLAS_GEMM_DFALT);
    cudaDataType_t compute_type =
        (data_type == CUDA_R_16F ? CUDA_R_32F : data_type);
    // cublasGemmBatchedEx takes untyped pointer arrays.
    const void **a_void_ptrs = reinterpret_cast<const void **>(
        const_cast<const CUDA_T **>(GpuMemory(a)));
    const void **b_void_ptrs = reinterpret_cast<const void **>(
        const_cast<const CUDA_T **>(GpuMemory(b)));
    void **c_void_ptrs =
        reinterpret_cast<void **>(const_cast<CUDA_T **>(GpuMemory(c)));
    bool ok;
    ok = DoBlasInternalImpl(
        cublasGemmBatchedEx, stream, true /* = pointer_mode_host */,
        true /* = err_on_failure */, use_tensor_ops, CUDABlasTranspose(transa),
        CUDABlasTranspose(transb), m, n, k, &alpha, a_void_ptrs, data_type, lda,
        b_void_ptrs, data_type, ldb, &beta, c_void_ptrs, data_type, ldc,
        batch_count, compute_type, algo);
    if (ok) {
      return port::Status::OK();
    }
    return port::Status(port::error::INTERNAL,
                        "failed BLAS call, see log for details");
  }
#endif
  // either CUDA_VERSION < 9.1 or SM < 5.0
  if (data_type != CUDA_R_16F) {
    // Classic typed batched GEMM supplied by the caller
    // (cublas{S,D,C,Z}gemmBatched).
    bool ok = DoBlasInternal(
        cublas_func, stream, true /* = pointer_mode_host */,
        CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k,
        GpuComplex(&alpha), const_cast<const CUDA_T **>(GpuMemory(a)), lda,
        const_cast<const CUDA_T **>(GpuMemory(b)), ldb, GpuComplex(&beta),
        const_cast<CUDA_T **>(GpuMemory(c)), ldc, batch_count);
    if (ok) {
      return port::Status::OK();
    }
    return port::Status(port::error::INTERNAL,
                        "failed BLAS call, see log for details");
  } else {
    // Fall back to a loop for fp16
    for (int b = 0; b < batch_count; ++b) {
      const DeviceMemory<T> &a_matrix = *a_ptrs_to_wrappers[b];
      const DeviceMemory<T> &b_matrix = *b_ptrs_to_wrappers[b];
      DeviceMemory<T> *c_matrix = c_ptrs_to_wrappers[b];
      bool ok = DoBlasGemm(stream, transa, transb, m, n, k, alpha, a_matrix,
                           lda, b_matrix, ldb, beta, c_matrix, ldc);
      if (!ok) {
        return port::Status(port::error::INTERNAL,
                            "failed BLAS call, see log for details");
      }
    }
    return port::Status::OK();
  }
}
2271 
// Batched fp16 GEMM with fp32 alpha/beta; delegates to
// DoBlasGemmBatchedInternal and converts its status to bool, logging on
// failure.
bool CUDABlas::DoBlasGemmBatched(
    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
    uint64 n, uint64 k, float alpha,
    const port::ArraySlice<DeviceMemory<Eigen::half> *> &a_array, int lda,
    const port::ArraySlice<DeviceMemory<Eigen::half> *> &b_array, int ldb,
    float beta, const port::ArraySlice<DeviceMemory<Eigen::half> *> &c_array,
    int ldc, int batch_count, ScratchAllocator *scratch_allocator) {
  // Note: The func passed here (cublasSgemmBatched) is not actually called,
  // due to special handling of fp16 inside DoBlasGemmBatchedInternal.
  port::Status status = DoBlasGemmBatchedInternal(
      cublasSgemmBatched, stream, transa, transb, m, n, k, alpha, a_array, lda,
      b_array, ldb, beta, c_array, ldc, batch_count, scratch_allocator);
  if (!status.ok()) {
    LOG(ERROR) << status;
  }
  return status.ok();
}
2289 
// Batched fp32 GEMM; delegates to DoBlasGemmBatchedInternal with
// cublasSgemmBatched and converts its status to bool, logging on failure.
bool CUDABlas::DoBlasGemmBatched(
    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
    uint64 n, uint64 k, float alpha,
    const port::ArraySlice<DeviceMemory<float> *> &a_array, int lda,
    const port::ArraySlice<DeviceMemory<float> *> &b_array, int ldb, float beta,
    const port::ArraySlice<DeviceMemory<float> *> &c_array, int ldc,
    int batch_count, ScratchAllocator *scratch_allocator) {
  port::Status status = DoBlasGemmBatchedInternal(
      cublasSgemmBatched, stream, transa, transb, m, n, k, alpha, a_array, lda,
      b_array, ldb, beta, c_array, ldc, batch_count, scratch_allocator);
  if (!status.ok()) {
    LOG(ERROR) << status;
  }
  return status.ok();
}
2305 
// Batched fp64 GEMM; delegates to DoBlasGemmBatchedInternal with
// cublasDgemmBatched and converts its status to bool, logging on failure.
bool CUDABlas::DoBlasGemmBatched(
    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
    uint64 n, uint64 k, double alpha,
    const port::ArraySlice<DeviceMemory<double> *> &a_array, int lda,
    const port::ArraySlice<DeviceMemory<double> *> &b_array, int ldb,
    double beta, const port::ArraySlice<DeviceMemory<double> *> &c_array,
    int ldc, int batch_count, ScratchAllocator *scratch_allocator) {
  port::Status status = DoBlasGemmBatchedInternal(
      cublasDgemmBatched, stream, transa, transb, m, n, k, alpha, a_array, lda,
      b_array, ldb, beta, c_array, ldc, batch_count, scratch_allocator);
  if (!status.ok()) {
    LOG(ERROR) << status;
  }
  return status.ok();
}
2321 
// Batched complex64 GEMM; delegates to DoBlasGemmBatchedInternal with
// cublasCgemmBatched and converts its status to bool, logging on failure.
bool CUDABlas::DoBlasGemmBatched(
    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
    uint64 n, uint64 k, std::complex<float> alpha,
    const port::ArraySlice<DeviceMemory<std::complex<float>> *> &a_array,
    int lda,
    const port::ArraySlice<DeviceMemory<std::complex<float>> *> &b_array,
    int ldb, std::complex<float> beta,
    const port::ArraySlice<DeviceMemory<std::complex<float>> *> &c_array,
    int ldc, int batch_count, ScratchAllocator *scratch_allocator) {
  port::Status status = DoBlasGemmBatchedInternal(
      cublasCgemmBatched, stream, transa, transb, m, n, k, alpha, a_array, lda,
      b_array, ldb, beta, c_array, ldc, batch_count, scratch_allocator);
  if (!status.ok()) {
    LOG(ERROR) << status;
  }
  return status.ok();
}
2339 
// Batched complex128 GEMM; delegates to DoBlasGemmBatchedInternal with
// cublasZgemmBatched and converts its status to bool, logging on failure.
bool CUDABlas::DoBlasGemmBatched(
    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
    uint64 n, uint64 k, std::complex<double> alpha,
    const port::ArraySlice<DeviceMemory<std::complex<double>> *> &a_array,
    int lda,
    const port::ArraySlice<DeviceMemory<std::complex<double>> *> &b_array,
    int ldb, std::complex<double> beta,
    const port::ArraySlice<DeviceMemory<std::complex<double>> *> &c_array,
    int ldc, int batch_count, ScratchAllocator *scratch_allocator) {
  port::Status status = DoBlasGemmBatchedInternal(
      cublasZgemmBatched, stream, transa, transb, m, n, k, alpha, a_array, lda,
      b_array, ldb, beta, c_array, ldc, batch_count, scratch_allocator);
  if (!status.ok()) {
    LOG(ERROR) << status;
  }
  return status.ok();
}
2357 
// Strided-batched fp16 GEMM with fp32 alpha/beta.
//
// On CUDA >= 9.1 with SM >= 5.0 this issues a single
// cublasGemmStridedBatchedEx call (fp32 compute type, tensor ops when
// SM >= 7.0 and enabled); otherwise it falls back to one cublasSgemmEx call
// per batch entry, offsetting the base pointers by the given strides
// (strides are in elements, as the pointer arithmetic below shows).
bool CUDABlas::DoBlasGemmStridedBatched(
    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
    uint64 n, uint64 k, float alpha, const DeviceMemory<Eigen::half> &a,
    int lda, int64 stride_a, const DeviceMemory<Eigen::half> &b, int ldb,
    int64 stride_b, float beta, DeviceMemory<Eigen::half> *c, int ldc,
    int64 stride_c, int batch_count) {
  bool use_tensor_ops = false;
#if CUDA_VERSION >= 9000
  int cc_major, cc_minor;
  if (stream->parent()->GetDeviceDescription().cuda_compute_capability(
          &cc_major, &cc_minor)) {
    // GPUs < sm_70 don't support tensor ops.
    if (cc_major >= 7 && TensorOpMathEnabled()) {
      use_tensor_ops = true;
    }
#if CUDA_VERSION >= 9010
    if (cc_major >= 5) {
      cublasGemmAlgo_t algo =
          (use_tensor_ops ? CUBLAS_GEMM_DFALT_TENSOR_OP : CUBLAS_GEMM_DFALT);
      bool ok = DoBlasInternalImpl(
          cublasGemmStridedBatchedEx, stream, true /* = pointer_mode_host */,
          true /* = err_on_failure */, use_tensor_ops,
          CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha,
          GpuMemory(a), CUDA_R_16F, lda, stride_a, GpuMemory(b), CUDA_R_16F,
          ldb, stride_b, &beta, GpuMemoryMutable(c), CUDA_R_16F, ldc, stride_c,
          batch_count, CUDA_R_32F, algo);
      if (ok) {
        return true;
      }
      LOG(ERROR) << "failed BLAS call, see log for details";
      return false;
    }
#endif
  }
#endif
  // Either CUDA_VERSION < 9.1 or SM < 5.0. Fall back to a loop.
  for (int batch = 0; batch < batch_count; ++batch) {
    // Per-batch base pointers, offset in elements by the given strides.
    const auto *a_matrix =
        reinterpret_cast<const __half *>(GpuMemory(a) + batch * stride_a);
    const auto *b_matrix =
        reinterpret_cast<const __half *>(GpuMemory(b) + batch * stride_b);
    auto *c_matrix =
        reinterpret_cast<__half *>(GpuMemoryMutable(c) + batch * stride_c);
    bool ok = DoBlasInternalImpl(
        cublasSgemmEx, stream, true /* = pointer_mode_host */,
        true /* = err_on_failure= */, use_tensor_ops, CUDABlasTranspose(transa),
        CUDABlasTranspose(transb), m, n, k, &alpha, a_matrix, SE_CUDA_DATA_HALF,
        lda, b_matrix, SE_CUDA_DATA_HALF, ldb, &beta, c_matrix,
        SE_CUDA_DATA_HALF, ldc);
    if (!ok) {
      LOG(ERROR) << "failed BLAS call, see log for details";
      return false;
    }
  }
  return true;
}
2414 
// Strided-batched fp32 GEMM; a thin wrapper over cublasSgemmStridedBatched
// with host-resident alpha/beta.
bool CUDABlas::DoBlasGemmStridedBatched(
    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
    uint64 n, uint64 k, float alpha, const DeviceMemory<float> &a, int lda,
    int64 stride_a, const DeviceMemory<float> &b, int ldb, int64 stride_b,
    float beta, DeviceMemory<float> *c, int ldc, int64 stride_c,
    int batch_count) {
  return DoBlasInternal(
      cublasSgemmStridedBatched, stream, true /* = pointer_mode_host */,
      CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha,
      GpuMemory(a), lda, stride_a, GpuMemory(b), ldb, stride_b, &beta,
      GpuMemoryMutable(c), ldc, stride_c, batch_count);
}
2427 
// Strided-batched fp64 GEMM; a thin wrapper over cublasDgemmStridedBatched
// with host-resident alpha/beta.
bool CUDABlas::DoBlasGemmStridedBatched(
    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
    uint64 n, uint64 k, double alpha, const DeviceMemory<double> &a, int lda,
    int64 stride_a, const DeviceMemory<double> &b, int ldb, int64 stride_b,
    double beta, DeviceMemory<double> *c, int ldc, int64 stride_c,
    int batch_count) {
  return DoBlasInternal(
      cublasDgemmStridedBatched, stream, true /* = pointer_mode_host */,
      CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha,
      GpuMemory(a), lda, stride_a, GpuMemory(b), ldb, stride_b, &beta,
      GpuMemoryMutable(c), ldc, stride_c, batch_count);
}
2440 
// Strided-batched complex64 GEMM; a thin wrapper over
// cublasCgemmStridedBatched, with std::complex arguments adapted to the CUDA
// complex types via GpuComplex.
bool CUDABlas::DoBlasGemmStridedBatched(
    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
    uint64 n, uint64 k, std::complex<float> alpha,
    const DeviceMemory<std::complex<float>> &a, int lda, int64 stride_a,
    const DeviceMemory<std::complex<float>> &b, int ldb, int64 stride_b,
    std::complex<float> beta, DeviceMemory<std::complex<float>> *c, int ldc,
    int64 stride_c, int batch_count) {
  return DoBlasInternal(
      cublasCgemmStridedBatched, stream, true /* = pointer_mode_host */,
      CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k,
      GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda, stride_a,
      GpuComplex(GpuMemory(b)), ldb, stride_b, GpuComplex(&beta),
      GpuComplex(GpuMemoryMutable(c)), ldc, stride_c, batch_count);
}
2455 
// Strided-batched complex128 GEMM; a thin wrapper over
// cublasZgemmStridedBatched, with std::complex arguments adapted to the CUDA
// complex types via GpuComplex.
bool CUDABlas::DoBlasGemmStridedBatched(
    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
    uint64 n, uint64 k, std::complex<double> alpha,
    const DeviceMemory<std::complex<double>> &a, int lda, int64 stride_a,
    const DeviceMemory<std::complex<double>> &b, int ldb, int64 stride_b,
    std::complex<double> beta, DeviceMemory<std::complex<double>> *c, int ldc,
    int64 stride_c, int batch_count) {
  return DoBlasInternal(
      cublasZgemmStridedBatched, stream, true /* = pointer_mode_host */,
      CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k,
      GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda, stride_a,
      GpuComplex(GpuMemory(b)), ldb, stride_b, GpuComplex(&beta),
      GpuComplex(GpuMemoryMutable(c)), ldc, stride_c, batch_count);
}
2470 
// Hermitian matrix-matrix multiply (single-precision complex): thin wrapper
// around cublasChemm. `side` selects whether the Hermitian matrix `a` appears
// on the left or right; `uplo` selects which triangle of `a` is referenced.
bool CUDABlas::DoBlasHemm(Stream *stream, blas::Side side,
                          blas::UpperLower uplo, uint64 m, uint64 n,
                          std::complex<float> alpha,
                          const DeviceMemory<std::complex<float>> &a, int lda,
                          const DeviceMemory<std::complex<float>> &b, int ldb,
                          std::complex<float> beta,
                          DeviceMemory<std::complex<float>> *c, int ldc) {
  return DoBlasInternal(cublasChemm, stream, true /* = pointer_mode_host */,
                        CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n,
                        GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
                        GpuComplex(GpuMemory(b)), ldb, GpuComplex(&beta),
                        GpuComplex(GpuMemoryMutable(c)), ldc);
}
2484 
// Hermitian matrix-matrix multiply (double-precision complex): thin wrapper
// around cublasZhemm; see the complex<float> overload above.
bool CUDABlas::DoBlasHemm(Stream *stream, blas::Side side,
                          blas::UpperLower uplo, uint64 m, uint64 n,
                          std::complex<double> alpha,
                          const DeviceMemory<std::complex<double>> &a, int lda,
                          const DeviceMemory<std::complex<double>> &b, int ldb,
                          std::complex<double> beta,
                          DeviceMemory<std::complex<double>> *c, int ldc) {
  return DoBlasInternal(cublasZhemm, stream, true /* = pointer_mode_host */,
                        CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n,
                        GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
                        GpuComplex(GpuMemory(b)), ldb, GpuComplex(&beta),
                        GpuComplex(GpuMemoryMutable(c)), ldc);
}
2498 
// Hermitian rank-k update (single-precision complex): wraps cublasCherk.
// Note alpha and beta are real (float) scalars per the herk contract; alpha
// is passed through GpuComplex while beta is passed as a plain pointer,
// matching cublasCherk's parameter types.
bool CUDABlas::DoBlasHerk(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, uint64 n, uint64 k,
                          float alpha,
                          const DeviceMemory<std::complex<float>> &a, int lda,
                          float beta, DeviceMemory<std::complex<float>> *c,
                          int ldc) {
  return DoBlasInternal(cublasCherk, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
                        k, GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
                        &beta, GpuComplex(GpuMemoryMutable(c)), ldc);
}
2510 
// Hermitian rank-k update (double-precision complex): wraps cublasZherk.
// As with the single-precision overload, alpha/beta are real scalars.
bool CUDABlas::DoBlasHerk(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, uint64 n, uint64 k,
                          double alpha,
                          const DeviceMemory<std::complex<double>> &a, int lda,
                          double beta, DeviceMemory<std::complex<double>> *c,
                          int ldc) {
  return DoBlasInternal(cublasZherk, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
                        k, GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
                        &beta, GpuComplex(GpuMemoryMutable(c)), ldc);
}
2522 
// Hermitian rank-2k update (single-precision complex): wraps cublasCher2k.
// alpha is complex but beta is real (float), per the her2k contract.
bool CUDABlas::DoBlasHer2k(Stream *stream, blas::UpperLower uplo,
                           blas::Transpose trans, uint64 n, uint64 k,
                           std::complex<float> alpha,
                           const DeviceMemory<std::complex<float>> &a, int lda,
                           const DeviceMemory<std::complex<float>> &b, int ldb,
                           float beta, DeviceMemory<std::complex<float>> *c,
                           int ldc) {
  return DoBlasInternal(cublasCher2k, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
                        k, GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
                        GpuComplex(GpuMemory(b)), ldb, &beta,
                        GpuComplex(GpuMemoryMutable(c)), ldc);
}
2536 
// Hermitian rank-2k update (double-precision complex): wraps cublasZher2k.
// alpha is complex but beta is real (double), per the her2k contract.
bool CUDABlas::DoBlasHer2k(Stream *stream, blas::UpperLower uplo,
                           blas::Transpose trans, uint64 n, uint64 k,
                           std::complex<double> alpha,
                           const DeviceMemory<std::complex<double>> &a, int lda,
                           const DeviceMemory<std::complex<double>> &b, int ldb,
                           double beta, DeviceMemory<std::complex<double>> *c,
                           int ldc) {
  return DoBlasInternal(cublasZher2k, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
                        k, GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
                        GpuComplex(GpuMemory(b)), ldb, &beta,
                        GpuComplex(GpuMemoryMutable(c)), ldc);
}
2550 
// Symmetric matrix-matrix multiply (float): thin wrapper around cublasSsymm.
// `side` selects whether the symmetric matrix `a` multiplies from the left or
// right; `uplo` selects which triangle of `a` is referenced.
bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side,
                          blas::UpperLower uplo, uint64 m, uint64 n,
                          float alpha, const DeviceMemory<float> &a, int lda,
                          const DeviceMemory<float> &b, int ldb, float beta,
                          DeviceMemory<float> *c, int ldc) {
  return DoBlasInternal(cublasSsymm, stream, true /* = pointer_mode_host */,
                        CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n,
                        &alpha, GpuMemory(a), lda, GpuMemory(b), ldb, &beta,
                        GpuMemoryMutable(c), ldc);
}
2561 
// Symmetric matrix-matrix multiply (double): thin wrapper around cublasDsymm;
// see the float overload above.
bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side,
                          blas::UpperLower uplo, uint64 m, uint64 n,
                          double alpha, const DeviceMemory<double> &a, int lda,
                          const DeviceMemory<double> &b, int ldb, double beta,
                          DeviceMemory<double> *c, int ldc) {
  return DoBlasInternal(cublasDsymm, stream, true /* = pointer_mode_host */,
                        CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n,
                        &alpha, GpuMemory(a), lda, GpuMemory(b), ldb, &beta,
                        GpuMemoryMutable(c), ldc);
}
2572 
// Symmetric matrix-matrix multiply (single-precision complex): thin wrapper
// around cublasCsymm (symmetric, not Hermitian — see DoBlasHemm for that).
bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side,
                          blas::UpperLower uplo, uint64 m, uint64 n,
                          std::complex<float> alpha,
                          const DeviceMemory<std::complex<float>> &a, int lda,
                          const DeviceMemory<std::complex<float>> &b, int ldb,
                          std::complex<float> beta,
                          DeviceMemory<std::complex<float>> *c, int ldc) {
  return DoBlasInternal(cublasCsymm, stream, true /* = pointer_mode_host */,
                        CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n,
                        GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
                        GpuComplex(GpuMemory(b)), ldb, GpuComplex(&beta),
                        GpuComplex(GpuMemoryMutable(c)), ldc);
}
2586 
// Symmetric matrix-matrix multiply (double-precision complex): thin wrapper
// around cublasZsymm; see the complex<float> overload above.
bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side,
                          blas::UpperLower uplo, uint64 m, uint64 n,
                          std::complex<double> alpha,
                          const DeviceMemory<std::complex<double>> &a, int lda,
                          const DeviceMemory<std::complex<double>> &b, int ldb,
                          std::complex<double> beta,
                          DeviceMemory<std::complex<double>> *c, int ldc) {
  return DoBlasInternal(cublasZsymm, stream, true /* = pointer_mode_host */,
                        CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n,
                        GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
                        GpuComplex(GpuMemory(b)), ldb, GpuComplex(&beta),
                        GpuComplex(GpuMemoryMutable(c)), ldc);
}
2600 
// Symmetric rank-k update (float): C = alpha*op(A)*op(A)^T + beta*C via
// cublasSsyrk; only the `uplo` triangle of C is referenced/updated.
bool CUDABlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, uint64 n, uint64 k,
                          float alpha, const DeviceMemory<float> &a, int lda,
                          float beta, DeviceMemory<float> *c, int ldc) {
  return DoBlasInternal(cublasSsyrk, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
                        k, &alpha, GpuMemory(a), lda, &beta,
                        GpuMemoryMutable(c), ldc);
}
2610 
// Symmetric rank-k update (double): wraps cublasDsyrk; see the float
// overload above.
bool CUDABlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, uint64 n, uint64 k,
                          double alpha, const DeviceMemory<double> &a, int lda,
                          double beta, DeviceMemory<double> *c, int ldc) {
  return DoBlasInternal(cublasDsyrk, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
                        k, &alpha, GpuMemory(a), lda, &beta,
                        GpuMemoryMutable(c), ldc);
}
2620 
// Symmetric rank-k update (single-precision complex): wraps cublasCsyrk.
bool CUDABlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, uint64 n, uint64 k,
                          std::complex<float> alpha,
                          const DeviceMemory<std::complex<float>> &a, int lda,
                          std::complex<float> beta,
                          DeviceMemory<std::complex<float>> *c, int ldc) {
  return DoBlasInternal(cublasCsyrk, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
                        k, GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
                        GpuComplex(&beta), GpuComplex(GpuMemoryMutable(c)),
                        ldc);
}
2633 
// Symmetric rank-k update (double-precision complex): wraps cublasZsyrk.
bool CUDABlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo,
                          blas::Transpose trans, uint64 n, uint64 k,
                          std::complex<double> alpha,
                          const DeviceMemory<std::complex<double>> &a, int lda,
                          std::complex<double> beta,
                          DeviceMemory<std::complex<double>> *c, int ldc) {
  return DoBlasInternal(cublasZsyrk, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
                        k, GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
                        GpuComplex(&beta), GpuComplex(GpuMemoryMutable(c)),
                        ldc);
}
2646 
// Symmetric rank-2k update (float): wraps cublasSsyr2k; only the `uplo`
// triangle of C is referenced/updated.
bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
                           blas::Transpose trans, uint64 n, uint64 k,
                           float alpha, const DeviceMemory<float> &a, int lda,
                           const DeviceMemory<float> &b, int ldb, float beta,
                           DeviceMemory<float> *c, int ldc) {
  return DoBlasInternal(cublasSsyr2k, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
                        k, &alpha, GpuMemory(a), lda, GpuMemory(b), ldb, &beta,
                        GpuMemoryMutable(c), ldc);
}
2657 
// Symmetric rank-2k update (double): wraps cublasDsyr2k; see the float
// overload above.
bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
                           blas::Transpose trans, uint64 n, uint64 k,
                           double alpha, const DeviceMemory<double> &a, int lda,
                           const DeviceMemory<double> &b, int ldb, double beta,
                           DeviceMemory<double> *c, int ldc) {
  return DoBlasInternal(cublasDsyr2k, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
                        k, &alpha, GpuMemory(a), lda, GpuMemory(b), ldb, &beta,
                        GpuMemoryMutable(c), ldc);
}
2668 
// Symmetric rank-2k update (single-precision complex): wraps cublasCsyr2k.
bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
                           blas::Transpose trans, uint64 n, uint64 k,
                           std::complex<float> alpha,
                           const DeviceMemory<std::complex<float>> &a, int lda,
                           const DeviceMemory<std::complex<float>> &b, int ldb,
                           std::complex<float> beta,
                           DeviceMemory<std::complex<float>> *c, int ldc) {
  return DoBlasInternal(cublasCsyr2k, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
                        k, GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
                        GpuComplex(GpuMemory(b)), ldb, GpuComplex(&beta),
                        GpuComplex(GpuMemoryMutable(c)), ldc);
}
2682 
// Symmetric rank-2k update (double-precision complex): wraps cublasZsyr2k.
bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
                           blas::Transpose trans, uint64 n, uint64 k,
                           std::complex<double> alpha,
                           const DeviceMemory<std::complex<double>> &a, int lda,
                           const DeviceMemory<std::complex<double>> &b, int ldb,
                           std::complex<double> beta,
                           DeviceMemory<std::complex<double>> *c, int ldc) {
  return DoBlasInternal(cublasZsyr2k, stream, true /* = pointer_mode_host */,
                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
                        k, GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
                        GpuComplex(GpuMemory(b)), ldb, GpuComplex(&beta),
                        GpuComplex(GpuMemoryMutable(c)), ldc);
}
2696 
// Triangular matrix-matrix multiply (float), in-place: B = alpha*op(A)*B (or
// B*op(A), per `side`). cublas<t>trmm is out-of-place with distinct B (input)
// and C (output) arguments, so `b` is deliberately passed twice — once as the
// input operand and once as the output — to recover the in-place BLAS
// semantics this interface promises.
bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side,
                          blas::UpperLower uplo, blas::Transpose transa,
                          blas::Diagonal diag, uint64 m, uint64 n, float alpha,
                          const DeviceMemory<float> &a, int lda,
                          DeviceMemory<float> *b, int ldb) {
  return DoBlasInternal(cublasStrmm, stream, true /* = pointer_mode_host */,
                        CUDABlasSide(side), CUDABlasUpperLower(uplo),
                        CUDABlasTranspose(transa), CUDABlasDiagonal(diag), m, n,
                        &alpha, GpuMemory(a), lda, GpuMemoryMutable(b), ldb,
                        GpuMemoryMutable(b), ldb);
}
2708 
// Triangular matrix-matrix multiply (double), in-place. As in the float
// overload, `b` is passed as both input and output because cublasDtrmm is
// out-of-place while this interface is in-place.
bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side,
                          blas::UpperLower uplo, blas::Transpose transa,
                          blas::Diagonal diag, uint64 m, uint64 n, double alpha,
                          const DeviceMemory<double> &a, int lda,
                          DeviceMemory<double> *b, int ldb) {
  return DoBlasInternal(cublasDtrmm, stream, true /* = pointer_mode_host */,
                        CUDABlasSide(side), CUDABlasUpperLower(uplo),
                        CUDABlasTranspose(transa), CUDABlasDiagonal(diag), m, n,
                        &alpha, GpuMemory(a), lda, GpuMemoryMutable(b), ldb,
                        GpuMemoryMutable(b), ldb);
}
2720 
// Triangular matrix-matrix multiply (single-precision complex), in-place.
// `b` is passed as both input and output because cublasCtrmm is out-of-place
// while this interface is in-place.
bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side,
                          blas::UpperLower uplo, blas::Transpose transa,
                          blas::Diagonal diag, uint64 m, uint64 n,
                          std::complex<float> alpha,
                          const DeviceMemory<std::complex<float>> &a, int lda,
                          DeviceMemory<std::complex<float>> *b, int ldb) {
  return DoBlasInternal(cublasCtrmm, stream, true /* = pointer_mode_host */,
                        CUDABlasSide(side), CUDABlasUpperLower(uplo),
                        CUDABlasTranspose(transa), CUDABlasDiagonal(diag), m, n,
                        GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
                        GpuComplex(GpuMemoryMutable(b)), ldb,
                        GpuComplex(GpuMemoryMutable(b)), ldb);
}
2734 
// Triangular matrix-matrix multiply (double-precision complex), in-place.
// `b` is passed as both input and output because cublasZtrmm is out-of-place
// while this interface is in-place.
bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side,
                          blas::UpperLower uplo, blas::Transpose transa,
                          blas::Diagonal diag, uint64 m, uint64 n,
                          std::complex<double> alpha,
                          const DeviceMemory<std::complex<double>> &a, int lda,
                          DeviceMemory<std::complex<double>> *b, int ldb) {
  return DoBlasInternal(cublasZtrmm, stream, true /* = pointer_mode_host */,
                        CUDABlasSide(side), CUDABlasUpperLower(uplo),
                        CUDABlasTranspose(transa), CUDABlasDiagonal(diag), m, n,
                        GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
                        GpuComplex(GpuMemoryMutable(b)), ldb,
                        GpuComplex(GpuMemoryMutable(b)), ldb);
}
2748 
// Triangular solve with multiple right-hand sides (float): solves
// op(A)*X = alpha*B (or X*op(A) = alpha*B, per `side`) in place, overwriting
// `b` with the solution X. Thin wrapper around cublasStrsm.
bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side,
                          blas::UpperLower uplo, blas::Transpose transa,
                          blas::Diagonal diag, uint64 m, uint64 n, float alpha,
                          const DeviceMemory<float> &a, int lda,
                          DeviceMemory<float> *b, int ldb) {
  return DoBlasInternal(cublasStrsm, stream, true /* = pointer_mode_host */,
                        CUDABlasSide(side), CUDABlasUpperLower(uplo),
                        CUDABlasTranspose(transa), CUDABlasDiagonal(diag), m, n,
                        &alpha, GpuMemory(a), lda, GpuMemoryMutable(b), ldb);
}
2759 
// Triangular solve with multiple right-hand sides (double): wraps
// cublasDtrsm, overwriting `b` with the solution; see the float overload.
bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side,
                          blas::UpperLower uplo, blas::Transpose transa,
                          blas::Diagonal diag, uint64 m, uint64 n, double alpha,
                          const DeviceMemory<double> &a, int lda,
                          DeviceMemory<double> *b, int ldb) {
  return DoBlasInternal(cublasDtrsm, stream, true /* = pointer_mode_host */,
                        CUDABlasSide(side), CUDABlasUpperLower(uplo),
                        CUDABlasTranspose(transa), CUDABlasDiagonal(diag), m, n,
                        &alpha, GpuMemory(a), lda, GpuMemoryMutable(b), ldb);
}
2770 
// Triangular solve with multiple right-hand sides (single-precision complex):
// wraps cublasCtrsm, overwriting `b` with the solution.
bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side,
                          blas::UpperLower uplo, blas::Transpose transa,
                          blas::Diagonal diag, uint64 m, uint64 n,
                          std::complex<float> alpha,
                          const DeviceMemory<std::complex<float>> &a, int lda,
                          DeviceMemory<std::complex<float>> *b, int ldb) {
  return DoBlasInternal(cublasCtrsm, stream, true /* = pointer_mode_host */,
                        CUDABlasSide(side), CUDABlasUpperLower(uplo),
                        CUDABlasTranspose(transa), CUDABlasDiagonal(diag), m, n,
                        GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
                        GpuComplex(GpuMemoryMutable(b)), ldb);
}
2783 
// Triangular solve with multiple right-hand sides (double-precision complex):
// wraps cublasZtrsm, overwriting `b` with the solution.
bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side,
                          blas::UpperLower uplo, blas::Transpose transa,
                          blas::Diagonal diag, uint64 m, uint64 n,
                          std::complex<double> alpha,
                          const DeviceMemory<std::complex<double>> &a, int lda,
                          DeviceMemory<std::complex<double>> *b, int ldb) {
  return DoBlasInternal(cublasZtrsm, stream, true /* = pointer_mode_host */,
                        CUDABlasSide(side), CUDABlasUpperLower(uplo),
                        CUDABlasTranspose(transa), CUDABlasDiagonal(diag), m, n,
                        GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
                        GpuComplex(GpuMemoryMutable(b)), ldb);
}
2796 
2797 }  // namespace gpu
2798 
// Registers the cuBLAS-backed BlasSupport implementation with the plugin
// registry for the CUDA platform and makes it the default BLAS plugin.
// Invoked once at module-initialization time (see the
// REGISTER_MODULE_INITIALIZER at the bottom of this file).
void initialize_cublas() {
  port::Status status =
      PluginRegistry::Instance()->RegisterFactory<PluginRegistry::BlasFactory>(
          cuda::kCudaPlatformId, gpu::kCuBlasPlugin, "cuBLAS",
          // Factory: builds a CUDABlas bound to the given executor, or
          // returns nullptr (ownership of a successful result passes to the
          // caller of the factory).
          [](internal::StreamExecutorInterface *parent) -> blas::BlasSupport * {
            gpu::GpuExecutor *cuda_executor =
                dynamic_cast<gpu::GpuExecutor *>(parent);
            // The factory may be handed a non-CUDA executor; refuse rather
            // than crash.
            if (cuda_executor == nullptr) {
              LOG(ERROR)
                  << "Attempting to initialize an instance of the cuBLAS "
                  << "support library with a non-CUDA StreamExecutor";
              return nullptr;
            }

            gpu::CUDABlas *blas = new gpu::CUDABlas(cuda_executor);
            if (!blas->Init()) {
              // Note: Init() will log a more specific error.
              delete blas;
              return nullptr;
            }
            return blas;
          });

  // Registration failure is non-fatal: log and continue so the process can
  // still run without a BLAS plugin.
  if (!status.ok()) {
    LOG(ERROR) << "Unable to register cuBLAS factory: "
               << status.error_message();
  }

  PluginRegistry::Instance()->SetDefaultFactory(
      cuda::kCudaPlatformId, PluginKind::kBlas, gpu::kCuBlasPlugin);
}
2830 
2831 }  // namespace stream_executor
2832 
2833 REGISTER_MODULE_INITIALIZER(register_cublas,
2834                             { stream_executor::initialize_cublas(); });
2835