Home
last modified time | relevance | path

Searched refs:cudaStream_t (Results 1 – 25 of 177) sorted by relevance

12345678

/external/pytorch/torch/csrc/cuda/
DCUDAPluggableAllocator.h14 using MallocFuncType = void*(size_t, int, cudaStream_t);
15 using FreeFuncType = void(void*, size_t, int, cudaStream_t);
28 cudaStream_t stream);
37 cudaStream_t stream_;
63 cudaStream_t stream);
66 cudaStream_t stream;
88 std::function<void(void* ptr, cudaStream_t stream)> record_stream_fn);
92 void(int, c10::cuda::MempoolId_t, std::function<bool(cudaStream_t)>)>
101 void* malloc(size_t size, c10::DeviceIndex device, cudaStream_t stream);
107 void* raw_alloc_with_stream(size_t nbytes, cudaStream_t stream) override;
[all …]
DCUDAPluggableAllocator.cpp16 cudaStream_t stream) in CUDAPluggableAllocatorDeleterContext()
38 cudaStream_t stream) in _AllocationMetadata()
81 std::function<void(void* ptr, cudaStream_t stream)> record_stream_fn) { in set_record_stream_fn()
87 void(int, c10::cuda::MempoolId_t, std::function<bool(cudaStream_t)>)> in set_begin_allocate_to_pool()
105 cudaStream_t stream) { in malloc()
117 cudaStream_t stream = c10::cuda::getCurrentCUDAStream(device); in allocate()
133 cudaStream_t stream = c10::cuda::getCurrentCUDAStream(device); in raw_alloc()
139 cudaStream_t stream) { in raw_alloc_with_stream()
146 cudaStream_t stream{}; in raw_delete()
262 std::function<bool(cudaStream_t)> filter) { in beginAllocateToPool()
[all …]
/external/pytorch/aten/src/ATen/native/transformers/cuda/flash_attn/
Dflash_bwd_launch_template.h74 void run_flash_bwd_seqk_parallel(Flash_bwd_params &params, cudaStream_t stream) { in run_flash_bwd_seqk_parallel()
130 void run_flash_bwd(Flash_bwd_params &params, cudaStream_t stream) { in run_flash_bwd()
137 void run_mha_bwd_hdim32(Flash_bwd_params &params, cudaStream_t stream) { in run_mha_bwd_hdim32()
161 void run_mha_bwd_hdim64(Flash_bwd_params &params, cudaStream_t stream) { in run_mha_bwd_hdim64()
206 void run_mha_bwd_hdim96(Flash_bwd_params &params, cudaStream_t stream) { in run_mha_bwd_hdim96()
232 void run_mha_bwd_hdim128(Flash_bwd_params &params, cudaStream_t stream) { in run_mha_bwd_hdim128()
266 void run_mha_bwd_hdim160(Flash_bwd_params &params, cudaStream_t stream) { in run_mha_bwd_hdim160()
286 void run_mha_bwd_hdim192(Flash_bwd_params &params, cudaStream_t stream) { in run_mha_bwd_hdim192()
306 void run_mha_bwd_hdim224(Flash_bwd_params &params, cudaStream_t stream) { in run_mha_bwd_hdim224()
314 void run_mha_bwd_hdim256(Flash_bwd_params &params, cudaStream_t stream) { in run_mha_bwd_hdim256()
Dflash_fwd_launch_template.h58 void run_flash_fwd(Flash_fwd_params &params, cudaStream_t stream) { in run_flash_fwd()
103 void run_flash_splitkv_fwd(Flash_fwd_params &params, cudaStream_t stream) { in run_flash_splitkv_fwd()
165 void run_mha_fwd_splitkv_dispatch(Flash_fwd_params &params, cudaStream_t stream) { in run_mha_fwd_splitkv_dispatch()
175 void run_mha_fwd_hdim32(Flash_fwd_params &params, cudaStream_t stream) { in run_mha_fwd_hdim32()
185 void run_mha_fwd_hdim64(Flash_fwd_params &params, cudaStream_t stream) { in run_mha_fwd_hdim64()
207 void run_mha_fwd_hdim96(Flash_fwd_params &params, cudaStream_t stream) { in run_mha_fwd_hdim96()
233 void run_mha_fwd_hdim128(Flash_fwd_params &params, cudaStream_t stream) { in run_mha_fwd_hdim128()
270 void run_mha_fwd_hdim160(Flash_fwd_params &params, cudaStream_t stream) { in run_mha_fwd_hdim160()
300 void run_mha_fwd_hdim192(Flash_fwd_params &params, cudaStream_t stream) { in run_mha_fwd_hdim192()
319 void run_mha_fwd_hdim224(Flash_fwd_params &params, cudaStream_t stream) { in run_mha_fwd_hdim224()
[all …]
Dflash.h185 template<typename T, int Headdim> void run_mha_fwd_(Flash_fwd_params &params, cudaStream_t stream);
186 …e T, int Headdim> void run_mha_fwd_splitkv_dispatch(Flash_fwd_params &params, cudaStream_t stream);
188 template<typename T, int Headdim> void run_mha_bwd_(Flash_bwd_params &params, cudaStream_t stream);
/external/tensorflow/tensorflow/compiler/xla/stream_executor/cuda/
Dcuda_runtime_9_0.inc292 extern __host__ cudaError_t CUDARTAPI cudaStreamCreate(cudaStream_t *pStream) {
293 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *);
300 cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags) {
301 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *, unsigned int);
308 cudaStreamCreateWithPriority(cudaStream_t *pStream, unsigned int flags,
310 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *, unsigned int, int);
317 cudaStreamGetPriority(cudaStream_t hStream, int *priority) {
318 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, int *);
325 cudaStreamGetFlags(cudaStream_t hStream, unsigned int *flags) {
326 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, unsigned int *);
[all …]
Dcuda_runtime_11_2.inc325 extern __host__ cudaError_t CUDARTAPI cudaStreamCreate(cudaStream_t *pStream) {
326 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *);
333 cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags) {
334 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *, unsigned int);
341 cudaStreamCreateWithPriority(cudaStream_t *pStream, unsigned int flags,
343 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *, unsigned int, int);
350 cudaStreamGetPriority(cudaStream_t hStream, int *priority) {
351 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, int *);
358 cudaStreamGetFlags(cudaStream_t hStream, unsigned int *flags) {
359 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, unsigned int *);
[all …]
Dcuda_runtime_10_1.inc293 extern __host__ cudaError_t CUDARTAPI cudaStreamCreate(cudaStream_t *pStream) {
294 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *);
301 cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags) {
302 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *, unsigned int);
309 cudaStreamCreateWithPriority(cudaStream_t *pStream, unsigned int flags,
311 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *, unsigned int, int);
318 cudaStreamGetPriority(cudaStream_t hStream, int *priority) {
319 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, int *);
326 cudaStreamGetFlags(cudaStream_t hStream, unsigned int *flags) {
327 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, unsigned int *);
[all …]
Dcuda_runtime_10_0.inc293 extern __host__ cudaError_t CUDARTAPI cudaStreamCreate(cudaStream_t *pStream) {
294 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *);
301 cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags) {
302 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *, unsigned int);
309 cudaStreamCreateWithPriority(cudaStream_t *pStream, unsigned int flags,
311 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *, unsigned int, int);
318 cudaStreamGetPriority(cudaStream_t hStream, int *priority) {
319 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, int *);
326 cudaStreamGetFlags(cudaStream_t hStream, unsigned int *flags) {
327 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, unsigned int *);
[all …]
Dcuda_runtime_11_0.inc358 extern __host__ cudaError_t CUDARTAPI cudaStreamCreate(cudaStream_t *pStream) {
359 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *);
366 cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags) {
367 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *, unsigned int);
374 cudaStreamCreateWithPriority(cudaStream_t *pStream, unsigned int flags,
376 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *, unsigned int, int);
383 cudaStreamGetPriority(cudaStream_t hStream, int *priority) {
384 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, int *);
391 cudaStreamGetFlags(cudaStream_t hStream, unsigned int *flags) {
392 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, unsigned int *);
[all …]
Dcuda_runtime_10_2.inc302 extern __host__ cudaError_t CUDARTAPI cudaStreamCreate(cudaStream_t *pStream) {
303 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *);
310 cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags) {
311 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *, unsigned int);
318 cudaStreamCreateWithPriority(cudaStream_t *pStream, unsigned int flags,
320 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *, unsigned int, int);
327 cudaStreamGetPriority(cudaStream_t hStream, int *priority) {
328 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, int *);
335 cudaStreamGetFlags(cudaStream_t hStream, unsigned int *flags) {
336 using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, unsigned int *);
[all …]
/external/pytorch/c10/cuda/
DCUDACachingAllocator.h79 cudaStream_t stream = nullptr;
118 cudaStream_t stream,
133 cudaStream_t stream_{};
203 virtual void* raw_alloc_with_stream(size_t nbytes, cudaStream_t stream) = 0;
220 std::function<bool(cudaStream_t)> filter) = 0;
283 cudaStream_t stream,
310 inline void* raw_alloc_with_stream(size_t nbytes, cudaStream_t stream) { in raw_alloc_with_stream()
375 std::function<bool(cudaStream_t)> filter) { in beginAllocateToPool()
439 cudaStream_t stream, in memcpyAsync()
DCUDAStream.h84 operator cudaStream_t() const { in cudaStream_t() function
144 cudaStream_t stream() const;
226 getStreamFromExternal(cudaStream_t ext_stream, DeviceIndex device_index);
DCUDAMallocAsyncAllocator.cpp33 cudaStream_t stream;
36 UsageStream(cudaStream_t s, c10::DeviceIndex d) : stream(s), device(d) {} in UsageStream()
180 inline void sync_raw(cudaStream_t dependency, cudaStream_t dependent) { in sync_raw()
319 cudaStream_t stream) { in mallocAsync()
772 std::function<bool(cudaStream_t)>) override { in beginAllocateToPool()
843 void* raw_alloc_with_stream(size_t nbytes, cudaStream_t stream) override { in raw_alloc_with_stream()
878 cudaStream_t stream, in memcpyAsync()
DCUDAStream.cpp51 std::array<cudaStream_t, kStreamsPerPool>,
269 cudaStream_t CUDAStream::stream() const { in stream()
287 return reinterpret_cast<cudaStream_t>(stream_id); in stream()
347 cudaStream_t ext_stream, in getStreamFromExternal()
DCUDAFunctions.h82 cudaStream_t stream) { in memcpy_and_sync()
100 C10_CUDA_API void __inline__ stream_synchronize(cudaStream_t stream) { in stream_synchronize()
/external/pytorch/aten/src/ATen/test/
Dcuda_stream_test.cpp37 cudaStream_t cuda_stream; in TEST()
181 std::unordered_set<cudaStream_t> stream_set{}; in TEST()
184 cudaStream_t cuda_stream = streams[i]; in TEST()
305 cudaStream_t cuda_stream; in TEST()
327 cudaStream_t cuda_stream_0; in TEST()
328 cudaStream_t cuda_stream_1; in TEST()
358 cudaStream_t a_cuda_stream; in TEST()
359 cudaStream_t another_cuda_stream; in TEST()
400 cudaStream_t cuda_stream_a; in TEST()
401 cudaStream_t cuda_stream_b; in TEST()
/external/clang/test/SemaCUDA/Inputs/
Dcuda.h20 typedef struct cudaStream *cudaStream_t; typedef
23 cudaStream_t stream = 0);
/external/clang/test/PCH/Inputs/
Dcuda.h17 typedef struct cudaStream *cudaStream_t; typedef
20 cudaStream_t stream = 0);
/external/clang/test/CodeGenCUDA/Inputs/
Dcuda.h17 typedef struct cudaStream *cudaStream_t; typedef
20 cudaStream_t stream = 0);
/external/pytorch/torch/csrc/inductor/aoti_torch/
Dshim_cuda.cpp38 static_cast<cudaStream_t>(stream), device_index)); in aoti_torch_create_cuda_stream_guard()
52 *(cudaStream_t*)(ret_stream) = at::cuda::getCurrentCUDAStream(device_index); in aoti_torch_get_current_cuda_stream()
/external/pytorch/torch/csrc/cuda/shared/
Dcudart.cpp90 return C10_CUDA_ERROR_HANDLED(cudaStreamCreate((cudaStream_t*)ptr)); in initCudartBindings()
97 return C10_CUDA_ERROR_HANDLED(cudaStreamDestroy((cudaStream_t)ptr)); in initCudartBindings()
/external/pytorch/aten/src/ATen/native/cuda/
DLegacyThrustHelpers.cu24 const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); in index_put_with_sort_kernel_thrust_helper()
48 cudaStream_t stream = at::cuda::getCurrentCUDAStream(); in embedding_dense_backward_cuda_scan()
/external/pytorch/aten/src/ATen/native/transformers/cuda/flash_attn/kernels/
Dflash_fwd_split_hdim256_bf16_sm80.cu11 …_mha_fwd_splitkv_dispatch<cutlass::bfloat16_t, 256>(Flash_fwd_params &params, cudaStream_t stream);
Dflash_fwd_split_hdim32_fp16_sm80.cu11 …d run_mha_fwd_splitkv_dispatch<cutlass::half_t, 32>(Flash_fwd_params &params, cudaStream_t stream);

12345678