Searched refs: block_work_size (Results 1 – 9 of 9), sorted by relevance

/external/pytorch/aten/src/ATen/native/cuda/
CUDALoops.cuh
    60   int remaining = N - block_work_size() * blockIdx.x;               in C10_LAUNCH_BOUNDS_1()
    62   if (remaining < block_work_size()) { // if this block handles the reminder,   in C10_LAUNCH_BOUNDS_1()
    99   int remaining = N - block_work_size() * blockIdx.x;               in C10_LAUNCH_BOUNDS_1()
   114   int64_t grid = (N + block_work_size() - 1) / block_work_size();   in launch_vectorized_kernel()
   161   int64_t grid = (N + block_work_size() - 1) / block_work_size();   in launch_unrolled_kernel()

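Two idioms recur across these results: launchers size the grid by rounding N up to a multiple of block_work_size() (the `(N + block_work_size() - 1) / block_work_size()` expression), and each block compares its remaining element count against block_work_size() to pick either a fully unrolled fast path or a bounds-checked tail loop. A minimal sketch of that shape, assuming illustrative constants and a trivial kernel body (not the PyTorch implementation, which dispatches through functors and vectorized policies):

    constexpr int num_threads_ = 128;     // assumption; PyTorch derives this from C10_WARP_SIZE
    constexpr int thread_work_size_ = 4;  // assumption: elements handled per thread
    constexpr int block_work_size_ = thread_work_size_ * num_threads_;

    __global__ void elementwise_sketch(int N, float* out, const float* in) {
      int base = block_work_size_ * blockIdx.x;
      int remaining = N - base;
      if (remaining < block_work_size_) {
        // tail block: every access must be bounds-checked
        for (int i = 0; i < thread_work_size_; i++) {
          int idx = threadIdx.x + i * num_threads_;
          if (idx < remaining) out[base + idx] = 2.0f * in[base + idx];
        }
      } else {
        // full block: all block_work_size_ elements are in range
        for (int i = 0; i < thread_work_size_; i++) {
          int idx = threadIdx.x + i * num_threads_;
          out[base + idx] = 2.0f * in[base + idx];
        }
      }
    }

    // launch: one block per block_work_size_ elements, rounded up
    // int64_t grid = (N + block_work_size_ - 1) / block_work_size_;
    // elementwise_sketch<<<grid, num_threads_>>>(N, out, in);
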
MemoryAccess.cuh
    64   auto ptr = reinterpret_cast<arg_t *>(self.data[arg_index + 1]) + block_work_size() * idx;   in apply()
   212   int linear_idx = thread_idx + block_work_size() * idx;   in load()
   227   int linear_idx = thread_idx + block_work_size() * idx;   in store()
   277   scalar_t *to = reinterpret_cast<scalar_t *>(data[0]) + block_work_size() * idx;   in store()
   319   int linear_idx = thread_idx + block_work_size() * idx;   in load()
   335   int linear_idx = thread_idx + block_work_size() * idx;   in store()

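In these load()/store() policies, idx is the block index passed in by the caller, and thread_idx starts at threadIdx.x and advances by num_threads() each unrolled iteration, so linear_idx walks this thread's share of the block's contiguous block_work_size()-element chunk. A hedged sketch of that loop shape, reusing the illustrative constants from the sketch above (the real policies are templated classes with vectorization and alignment handling):

    // each thread copies thread_work_size_ elements out of its block's chunk
    __device__ void load_chunk_sketch(float* regs /* per-thread registers */,
                                      const float* src, int idx /* block index */) {
      int thread_idx = threadIdx.x;
      #pragma unroll
      for (int i = 0; i < thread_work_size_; i++) {
        int linear_idx = thread_idx + block_work_size_ * idx;
        regs[i] = src[linear_idx];
        thread_idx += num_threads_;  // stride to this thread's next element
      }
    }
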
CUDAJitLoops.cuh
    93   const uint32_t grid = (N + block_work_size() - 1) / block_work_size();   in launch_jitted_unrolled_kernel()
   119   const uint32_t grid = (N + block_work_size() - 1) / block_work_size();   in launch_jitted_vectorized_kernel()

RangeFactories.cu
    36   constexpr int block_work_size = thread_work_size * num_threads();   (variable definition)
    43   index_t idx = block_work_size * blockIdx.x + num_threads() * i + threadIdx.x;   in C10_LAUNCH_BOUNDS_1()
    56   int64_t grid = (N + block_work_size - 1) / block_work_size;   in gpu_kernel_with_index()

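Unlike the other files, RangeFactories.cu defines block_work_size as a file-local constexpr variable rather than using the shared function from thread_constants.h (hit 36 is flagged as a variable). The index expression on line 43 combines all three constants into a per-element position. A sketch of a range-style kernel built around that indexing, again with the illustrative constants above (the real gpu_kernel_with_index takes a generic functor, not a hard-coded start/step):

    template <typename index_t>
    __global__ void range_sketch(index_t N, float start, float step, float* out) {
      #pragma unroll
      for (int i = 0; i < thread_work_size_; i++) {
        index_t idx = block_work_size_ * blockIdx.x + num_threads_ * i + threadIdx.x;
        if (idx < N) out[idx] = start + step * idx;
      }
    }
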
thread_constants.h
    22   constexpr int block_work_size() { return thread_work_size() * num_threads(); }   (function definition)

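This one-liner is the definition every other hit resolves to: a block's work equals per-thread work times the number of threads. Assuming the values PyTorch has typically used on NVIDIA hardware (num_threads() == 4 * C10_WARP_SIZE == 128 and thread_work_size() == 4; worth verifying against the current header), the grid arithmetic seen throughout the results works out like this:

    // block_work_size() == 4 * 128 == 512 elements per block
    // for N == 10000 elements:
    //   grid == (10000 + 511) / 512 == 20 blocks
    //   the last block sees remaining == 10000 - 19 * 512 == 272 < 512,
    //   so it takes the bounds-checked tail path from CUDALoops.cuh
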
Loops.cuh
   260   int remaining = N - block_work_size() * blockIdx.x;   in C10_LAUNCH_BOUNDS_1()
   267   int64_t grid = (N + block_work_size() - 1) / block_work_size();   in launch_unrolled_kernel_for_multi_outputs()

DistributionTemplates.h
   196   int base_index = block_work_size() * blockIdx.x;   in distribution_binary_elementwise_kernel()
   197   int remaining = std::min<int>(numel - base_index, block_work_size());   in distribution_binary_elementwise_kernel()
   259   int64_t grid = (numel + block_work_size() - 1) / block_work_size();   in distribution_binary_kernel()

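The distribution kernels express the remainder logic slightly differently: rather than branching on remaining < block_work_size(), they clamp remaining up front and loop over at most that many elements. A fragment-level sketch of that variant, with the surrounding kernel elided (std::min<int> is usable in device code, as the hit on line 197 shows):

    int base_index = block_work_size_ * blockIdx.x;
    int remaining  = std::min<int>(numel - base_index, block_work_size_);
    // all threads then iterate over at most `remaining` elements,
    // so no separate fast/tail code paths are needed
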
/external/pytorch/aten/src/ATen/test/
cuda_vectorized_test.cu
    95   policy.load_single_arg(accessor, src + block_work_size() * blockIdx.x);   in vectorized_copy()
   111   vectorized_copy<double, 4><<<total_work_size / block_work_size(), num_threads()>>>(b2, b1);   in TEST()
   125   vectorized_copy<double, 2><<<total_work_size / block_work_size(), num_threads()>>>(b2, b1);   in TEST()
   139   vectorized_copy<double, 1><<<total_work_size / block_work_size(), num_threads()>>>(b2, b1);   in TEST()

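Note that the test sizes its grid with plain division, total_work_size / block_work_size(), not the rounded-up form used by the launchers above; that only covers every element when total_work_size is an exact multiple of block_work_size(), which the test presumably arranges by construction. A usage sketch under that assumption (the 4096 value is illustrative; b1/b2 are the test's buffers):

    constexpr int total_work_size_ = 4096;  // assumption: exact multiple of block_work_size_
    // grid == total_work_size_ / block_work_size_ == 8, no tail block required
    // vectorized_copy<double, 4><<<total_work_size_ / block_work_size_, num_threads_>>>(b2, b1);
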
/external/pytorch/aten/src/ATen/cuda/
jiterator.cu
    21   const uint32_t grid = (N + block_work_size() - 1) / block_work_size();   in launch_jitted_vectorized_kernel_dynamic()
   126   const uint32_t grid = (N + block_work_size() - 1) / block_work_size();   in launch_jitted_unrolled_kernel_dynamic()