/external/pytorch/torch/csrc/cuda/ |
D | CUDAPluggableAllocator.h | 14 using MallocFuncType = void*(size_t, int, cudaStream_t); 15 using FreeFuncType = void(void*, size_t, int, cudaStream_t); 28 cudaStream_t stream); 37 cudaStream_t stream_; 63 cudaStream_t stream); 66 cudaStream_t stream; 88 std::function<void(void* ptr, cudaStream_t stream)> record_stream_fn); 92 void(int, c10::cuda::MempoolId_t, std::function<bool(cudaStream_t)>)> 101 void* malloc(size_t size, c10::DeviceIndex device, cudaStream_t stream); 107 void* raw_alloc_with_stream(size_t nbytes, cudaStream_t stream) override; [all …]
|
D | CUDAPluggableAllocator.cpp | 16 cudaStream_t stream) in CUDAPluggableAllocatorDeleterContext() 38 cudaStream_t stream) in _AllocationMetadata() 81 std::function<void(void* ptr, cudaStream_t stream)> record_stream_fn) { in set_record_stream_fn() 87 void(int, c10::cuda::MempoolId_t, std::function<bool(cudaStream_t)>)> in set_begin_allocate_to_pool() 105 cudaStream_t stream) { in malloc() 117 cudaStream_t stream = c10::cuda::getCurrentCUDAStream(device); in allocate() 133 cudaStream_t stream = c10::cuda::getCurrentCUDAStream(device); in raw_alloc() 139 cudaStream_t stream) { in raw_alloc_with_stream() 146 cudaStream_t stream{}; in raw_delete() 262 std::function<bool(cudaStream_t)> filter) { in beginAllocateToPool() [all …]
|
/external/pytorch/aten/src/ATen/native/transformers/cuda/flash_attn/ |
D | flash_bwd_launch_template.h | 74 void run_flash_bwd_seqk_parallel(Flash_bwd_params &params, cudaStream_t stream) { in run_flash_bwd_seqk_parallel() 130 void run_flash_bwd(Flash_bwd_params &params, cudaStream_t stream) { in run_flash_bwd() 137 void run_mha_bwd_hdim32(Flash_bwd_params &params, cudaStream_t stream) { in run_mha_bwd_hdim32() 161 void run_mha_bwd_hdim64(Flash_bwd_params &params, cudaStream_t stream) { in run_mha_bwd_hdim64() 206 void run_mha_bwd_hdim96(Flash_bwd_params &params, cudaStream_t stream) { in run_mha_bwd_hdim96() 232 void run_mha_bwd_hdim128(Flash_bwd_params &params, cudaStream_t stream) { in run_mha_bwd_hdim128() 266 void run_mha_bwd_hdim160(Flash_bwd_params &params, cudaStream_t stream) { in run_mha_bwd_hdim160() 286 void run_mha_bwd_hdim192(Flash_bwd_params &params, cudaStream_t stream) { in run_mha_bwd_hdim192() 306 void run_mha_bwd_hdim224(Flash_bwd_params &params, cudaStream_t stream) { in run_mha_bwd_hdim224() 314 void run_mha_bwd_hdim256(Flash_bwd_params &params, cudaStream_t stream) { in run_mha_bwd_hdim256()
|
D | flash_fwd_launch_template.h | 58 void run_flash_fwd(Flash_fwd_params &params, cudaStream_t stream) { in run_flash_fwd() 103 void run_flash_splitkv_fwd(Flash_fwd_params &params, cudaStream_t stream) { in run_flash_splitkv_fwd() 165 void run_mha_fwd_splitkv_dispatch(Flash_fwd_params &params, cudaStream_t stream) { in run_mha_fwd_splitkv_dispatch() 175 void run_mha_fwd_hdim32(Flash_fwd_params &params, cudaStream_t stream) { in run_mha_fwd_hdim32() 185 void run_mha_fwd_hdim64(Flash_fwd_params &params, cudaStream_t stream) { in run_mha_fwd_hdim64() 207 void run_mha_fwd_hdim96(Flash_fwd_params &params, cudaStream_t stream) { in run_mha_fwd_hdim96() 233 void run_mha_fwd_hdim128(Flash_fwd_params &params, cudaStream_t stream) { in run_mha_fwd_hdim128() 270 void run_mha_fwd_hdim160(Flash_fwd_params &params, cudaStream_t stream) { in run_mha_fwd_hdim160() 300 void run_mha_fwd_hdim192(Flash_fwd_params &params, cudaStream_t stream) { in run_mha_fwd_hdim192() 319 void run_mha_fwd_hdim224(Flash_fwd_params &params, cudaStream_t stream) { in run_mha_fwd_hdim224() [all …]
|
D | flash.h | 185 template<typename T, int Headdim> void run_mha_fwd_(Flash_fwd_params &params, cudaStream_t stream); 186 …e T, int Headdim> void run_mha_fwd_splitkv_dispatch(Flash_fwd_params &params, cudaStream_t stream); 188 template<typename T, int Headdim> void run_mha_bwd_(Flash_bwd_params &params, cudaStream_t stream);
|
/external/tensorflow/tensorflow/compiler/xla/stream_executor/cuda/ |
D | cuda_runtime_9_0.inc | 292 extern __host__ cudaError_t CUDARTAPI cudaStreamCreate(cudaStream_t *pStream) { 293 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *); 300 cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags) { 301 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *, unsigned int); 308 cudaStreamCreateWithPriority(cudaStream_t *pStream, unsigned int flags, 310 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *, unsigned int, int); 317 cudaStreamGetPriority(cudaStream_t hStream, int *priority) { 318 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, int *); 325 cudaStreamGetFlags(cudaStream_t hStream, unsigned int *flags) { 326 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, unsigned int *); [all …]
|
D | cuda_runtime_11_2.inc | 325 extern __host__ cudaError_t CUDARTAPI cudaStreamCreate(cudaStream_t *pStream) { 326 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *); 333 cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags) { 334 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *, unsigned int); 341 cudaStreamCreateWithPriority(cudaStream_t *pStream, unsigned int flags, 343 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *, unsigned int, int); 350 cudaStreamGetPriority(cudaStream_t hStream, int *priority) { 351 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, int *); 358 cudaStreamGetFlags(cudaStream_t hStream, unsigned int *flags) { 359 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, unsigned int *); [all …]
|
D | cuda_runtime_10_1.inc | 293 extern __host__ cudaError_t CUDARTAPI cudaStreamCreate(cudaStream_t *pStream) { 294 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *); 301 cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags) { 302 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *, unsigned int); 309 cudaStreamCreateWithPriority(cudaStream_t *pStream, unsigned int flags, 311 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *, unsigned int, int); 318 cudaStreamGetPriority(cudaStream_t hStream, int *priority) { 319 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, int *); 326 cudaStreamGetFlags(cudaStream_t hStream, unsigned int *flags) { 327 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, unsigned int *); [all …]
|
D | cuda_runtime_10_0.inc | 293 extern __host__ cudaError_t CUDARTAPI cudaStreamCreate(cudaStream_t *pStream) { 294 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *); 301 cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags) { 302 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *, unsigned int); 309 cudaStreamCreateWithPriority(cudaStream_t *pStream, unsigned int flags, 311 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *, unsigned int, int); 318 cudaStreamGetPriority(cudaStream_t hStream, int *priority) { 319 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, int *); 326 cudaStreamGetFlags(cudaStream_t hStream, unsigned int *flags) { 327 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, unsigned int *); [all …]
|
D | cuda_runtime_11_0.inc | 358 extern __host__ cudaError_t CUDARTAPI cudaStreamCreate(cudaStream_t *pStream) { 359 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *); 366 cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags) { 367 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *, unsigned int); 374 cudaStreamCreateWithPriority(cudaStream_t *pStream, unsigned int flags, 376 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *, unsigned int, int); 383 cudaStreamGetPriority(cudaStream_t hStream, int *priority) { 384 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, int *); 391 cudaStreamGetFlags(cudaStream_t hStream, unsigned int *flags) { 392 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, unsigned int *); [all …]
|
D | cuda_runtime_10_2.inc | 302 extern __host__ cudaError_t CUDARTAPI cudaStreamCreate(cudaStream_t *pStream) { 303 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *); 310 cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags) { 311 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *, unsigned int); 318 cudaStreamCreateWithPriority(cudaStream_t *pStream, unsigned int flags, 320 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *, unsigned int, int); 327 cudaStreamGetPriority(cudaStream_t hStream, int *priority) { 328 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, int *); 335 cudaStreamGetFlags(cudaStream_t hStream, unsigned int *flags) { 336 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, unsigned int *); [all …]
|
/external/pytorch/c10/cuda/ |
D | CUDACachingAllocator.h | 79 cudaStream_t stream = nullptr; 118 cudaStream_t stream, 133 cudaStream_t stream_{}; 203 virtual void* raw_alloc_with_stream(size_t nbytes, cudaStream_t stream) = 0; 220 std::function<bool(cudaStream_t)> filter) = 0; 283 cudaStream_t stream, 310 inline void* raw_alloc_with_stream(size_t nbytes, cudaStream_t stream) { in raw_alloc_with_stream() 375 std::function<bool(cudaStream_t)> filter) { in beginAllocateToPool() 439 cudaStream_t stream, in memcpyAsync()
|
D | CUDAStream.h | 84 operator cudaStream_t() const { in cudaStream_t() function 144 cudaStream_t stream() const; 226 getStreamFromExternal(cudaStream_t ext_stream, DeviceIndex device_index);
|
D | CUDAMallocAsyncAllocator.cpp | 33 cudaStream_t stream; 36 UsageStream(cudaStream_t s, c10::DeviceIndex d) : stream(s), device(d) {} in UsageStream() 180 inline void sync_raw(cudaStream_t dependency, cudaStream_t dependent) { in sync_raw() 319 cudaStream_t stream) { in mallocAsync() 772 std::function<bool(cudaStream_t)>) override { in beginAllocateToPool() 843 void* raw_alloc_with_stream(size_t nbytes, cudaStream_t stream) override { in raw_alloc_with_stream() 878 cudaStream_t stream, in memcpyAsync()
|
D | CUDAStream.cpp | 51 std::array<cudaStream_t, kStreamsPerPool>, 269 cudaStream_t CUDAStream::stream() const { in stream() 287 return reinterpret_cast<cudaStream_t>(stream_id); in stream() 347 cudaStream_t ext_stream, in getStreamFromExternal()
|
D | CUDAFunctions.h | 82 cudaStream_t stream) { in memcpy_and_sync() 100 C10_CUDA_API void __inline__ stream_synchronize(cudaStream_t stream) { in stream_synchronize()
|
/external/pytorch/aten/src/ATen/test/ |
D | cuda_stream_test.cpp | 37 cudaStream_t cuda_stream; in TEST() 181 std::unordered_set<cudaStream_t> stream_set{}; in TEST() 184 cudaStream_t cuda_stream = streams[i]; in TEST() 305 cudaStream_t cuda_stream; in TEST() 327 cudaStream_t cuda_stream_0; in TEST() 328 cudaStream_t cuda_stream_1; in TEST() 358 cudaStream_t a_cuda_stream; in TEST() 359 cudaStream_t another_cuda_stream; in TEST() 400 cudaStream_t cuda_stream_a; in TEST() 401 cudaStream_t cuda_stream_b; in TEST()
|
/external/clang/test/SemaCUDA/Inputs/ |
D | cuda.h | 20 typedef struct cudaStream *cudaStream_t; typedef 23 cudaStream_t stream = 0);
|
/external/clang/test/PCH/Inputs/ |
D | cuda.h | 17 typedef struct cudaStream *cudaStream_t; typedef 20 cudaStream_t stream = 0);
|
/external/clang/test/CodeGenCUDA/Inputs/ |
D | cuda.h | 17 typedef struct cudaStream *cudaStream_t; typedef 20 cudaStream_t stream = 0);
|
/external/pytorch/torch/csrc/inductor/aoti_torch/ |
D | shim_cuda.cpp | 38 static_cast<cudaStream_t>(stream), device_index)); in aoti_torch_create_cuda_stream_guard() 52 *(cudaStream_t*)(ret_stream) = at::cuda::getCurrentCUDAStream(device_index); in aoti_torch_get_current_cuda_stream()
|
/external/pytorch/torch/csrc/cuda/shared/ |
D | cudart.cpp | 90 return C10_CUDA_ERROR_HANDLED(cudaStreamCreate((cudaStream_t*)ptr)); in initCudartBindings() 97 return C10_CUDA_ERROR_HANDLED(cudaStreamDestroy((cudaStream_t)ptr)); in initCudartBindings()
|
/external/pytorch/aten/src/ATen/native/cuda/ |
D | LegacyThrustHelpers.cu | 24 const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); in index_put_with_sort_kernel_thrust_helper() 48 cudaStream_t stream = at::cuda::getCurrentCUDAStream(); in embedding_dense_backward_cuda_scan()
|
/external/pytorch/aten/src/ATen/native/transformers/cuda/flash_attn/kernels/ |
D | flash_fwd_split_hdim256_bf16_sm80.cu | 11 …_mha_fwd_splitkv_dispatch<cutlass::bfloat16_t, 256>(Flash_fwd_params &params, cudaStream_t stream);
|
D | flash_fwd_split_hdim32_fp16_sm80.cu | 11 …d run_mha_fwd_splitkv_dispatch<cutlass::half_t, 32>(Flash_fwd_params &params, cudaStream_t stream);
|