
Searched refs:block_work_size (Results 1 – 9 of 9) sorted by relevance

/external/pytorch/aten/src/ATen/native/cuda/
CUDALoops.cuh
   60  int remaining = N - block_work_size() * blockIdx.x;              [in C10_LAUNCH_BOUNDS_1()]
   62  if (remaining < block_work_size()) { // if this block handles the remainder,  [in C10_LAUNCH_BOUNDS_1()]
   99  int remaining = N - block_work_size() * blockIdx.x;              [in C10_LAUNCH_BOUNDS_1()]
  114  int64_t grid = (N + block_work_size() - 1) / block_work_size();  [in launch_vectorized_kernel()]
  161  int64_t grid = (N + block_work_size() - 1) / block_work_size();  [in launch_unrolled_kernel()]
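The hits above show the core dispatch pattern of PyTorch's elementwise CUDA kernels: each block processes block_work_size() elements, the grid is sized by ceiling division, and a block whose chunk runs past N takes a bounds-checked path. A minimal self-contained sketch of that pattern (the constants and the fill_kernel body are illustrative assumptions, not PyTorch's actual code):

    #include <cstdint>

    constexpr int num_threads() { return 128; }      // assumed value
    constexpr int thread_work_size() { return 4; }   // assumed value
    constexpr int block_work_size() { return thread_work_size() * num_threads(); }

    __global__ void fill_kernel(float* out, int N) {
      int remaining = N - block_work_size() * blockIdx.x;
      if (remaining < block_work_size()) {
        // This block handles the remainder: bounds-check every element.
        for (int i = 0; i < thread_work_size(); ++i) {
          int idx = block_work_size() * blockIdx.x + num_threads() * i + threadIdx.x;
          if (idx < N) out[idx] = 1.0f;
        }
      } else {
        // Full block: every index is in range, no checks needed.
        for (int i = 0; i < thread_work_size(); ++i) {
          int idx = block_work_size() * blockIdx.x + num_threads() * i + threadIdx.x;
          out[idx] = 1.0f;
        }
      }
    }

    int main() {
      int N = 1000;
      float* out;
      cudaMalloc(&out, N * sizeof(float));
      // Ceiling division: enough blocks to cover all N elements (cf. lines 114/161 above).
      int64_t grid = (N + block_work_size() - 1) / block_work_size();
      fill_kernel<<<grid, num_threads()>>>(out, N);
      cudaDeviceSynchronize();
      cudaFree(out);
    }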
MemoryAccess.cuh
   64  auto ptr = reinterpret_cast<arg_t *>(self.data[arg_index + 1]) + block_work_size() * idx;  [in apply()]
  212  int linear_idx = thread_idx + block_work_size() * idx;  [in load()]
  227  int linear_idx = thread_idx + block_work_size() * idx;  [in store()]
  277  scalar_t *to = reinterpret_cast<scalar_t *>(data[0]) + block_work_size() * idx;  [in store()]
  319  int linear_idx = thread_idx + block_work_size() * idx;  [in load()]
  335  int linear_idx = thread_idx + block_work_size() * idx;  [in store()]
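These hits are the addressing arithmetic inside the load/store policies: `idx` is the block index, so `block_work_size() * idx` is the first element a block owns, and each thread advances by num_threads() through its slice of that chunk. A hedged sketch of the idiom (the kernel and constant names are illustrative, not the actual policy classes):

    constexpr int num_threads() { return 128; }      // assumed value
    constexpr int thread_work_size() { return 4; }   // assumed value
    constexpr int block_work_size() { return thread_work_size() * num_threads(); }

    // Sketch of an unrolled load/store loop in the style of the policies above.
    __global__ void copy_kernel(float* dst, const float* src, int N) {
      int idx = blockIdx.x;            // block index, as passed into load()/store()
      int thread_idx = threadIdx.x;
      for (int i = 0; i < thread_work_size(); ++i) {
        int linear_idx = thread_idx + block_work_size() * idx;  // global element index
        if (linear_idx < N) dst[linear_idx] = src[linear_idx];
        thread_idx += num_threads();   // step to this thread's next element
      }
    }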
CUDAJitLoops.cuh
   93  const uint32_t grid = (N + block_work_size() - 1) / block_work_size();  [in launch_jitted_unrolled_kernel()]
  119  const uint32_t grid = (N + block_work_size() - 1) / block_work_size();  [in launch_jitted_vectorized_kernel()]
RangeFactories.cu
   36  constexpr int block_work_size = thread_work_size * num_threads();  [variable]
   43  index_t idx = block_work_size * blockIdx.x + num_threads() * i + threadIdx.x;  [in C10_LAUNCH_BOUNDS_1()]
   56  int64_t grid = (N + block_work_size - 1) / block_work_size;  [in gpu_kernel_with_index()]
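Note that RangeFactories.cu declares block_work_size as a file-local constexpr variable rather than calling the function from thread_constants.h, which is why the hits at lines 43 and 56 have no parentheses. A hedged sketch of an index-driven kernel in the style of gpu_kernel_with_index() (the arange-like functor and the constant values are assumptions):

    #include <cstdint>

    constexpr int num_threads() { return 64; }   // assumed value
    constexpr int thread_work_size = 4;          // assumed value
    constexpr int block_work_size = thread_work_size * num_threads();

    // Each thread computes the global index of every element it owns and
    // writes f(idx); here f(idx) = start + idx * step, i.e. a simple arange.
    __global__ void arange_kernel(float* out, int64_t N, float start, float step) {
      for (int i = 0; i < thread_work_size; ++i) {
        int64_t idx = int64_t(block_work_size) * blockIdx.x
                    + int64_t(num_threads()) * i + threadIdx.x;
        if (idx < N) out[idx] = start + idx * step;
      }
    }

    // Launch with the same ceiling division as line 56:
    //   int64_t grid = (N + block_work_size - 1) / block_work_size;
    //   arange_kernel<<<grid, num_threads()>>>(out, N, 0.0f, 1.0f);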
thread_constants.h
   22  constexpr int block_work_size() { return thread_work_size() * num_threads(); }  [function]
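This is the canonical definition that the other hits call. Purely to illustrate how the three constants compose (the concrete values below are assumptions; the real ones depend on the build):

    constexpr int num_threads() { return 128; }      // assumed value
    constexpr int thread_work_size() { return 4; }   // assumed value
    constexpr int block_work_size() { return thread_work_size() * num_threads(); }
    static_assert(block_work_size() == 512,
                  "a block of 128 threads doing 4 elements each covers 512 elements");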
Loops.cuh
  260  int remaining = N - block_work_size() * blockIdx.x;              [in C10_LAUNCH_BOUNDS_1()]
  267  int64_t grid = (N + block_work_size() - 1) / block_work_size();  [in launch_unrolled_kernel_for_multi_outputs()]
DistributionTemplates.h
  196  int base_index = block_work_size() * blockIdx.x;  [in distribution_binary_elementwise_kernel()]
  197  int remaining = std::min<int>(numel - base_index, block_work_size());  [in distribution_binary_elementwise_kernel()]
  259  int64_t grid = (numel + block_work_size() - 1) / block_work_size();  [in distribution_binary_kernel()]
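Lines 196-197 show an alternative to branching on full block vs. remainder block: clamp the number of valid elements for this block up front and bound the per-thread loop with it. A hedged sketch (the binary functor and constant values are stand-ins):

    constexpr int num_threads() { return 128; }      // assumed value
    constexpr int thread_work_size() { return 4; }   // assumed value
    constexpr int block_work_size() { return thread_work_size() * num_threads(); }

    __global__ void binary_kernel(float* out, const float* a, const float* b, int numel) {
      int base_index = block_work_size() * blockIdx.x;             // first element of this block
      int remaining = min(numel - base_index, block_work_size());  // valid elements in this block
      for (int i = threadIdx.x; i < remaining; i += num_threads()) {
        out[base_index + i] = a[base_index + i] + b[base_index + i];  // stand-in for the functor
      }
    }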
/external/pytorch/aten/src/ATen/test/
cuda_vectorized_test.cu
   95  policy.load_single_arg(accessor, src + block_work_size() * blockIdx.x);  [in vectorized_copy()]
  111  vectorized_copy<double, 4><<<total_work_size / block_work_size(), num_threads()>>>(b2, b1);  [in TEST()]
  125  vectorized_copy<double, 2><<<total_work_size / block_work_size(), num_threads()>>>(b2, b1);  [in TEST()]
  139  vectorized_copy<double, 1><<<total_work_size / block_work_size(), num_threads()>>>(b2, b1);  [in TEST()]
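In the test, total_work_size is chosen as an exact multiple of block_work_size(), so the grid is computed by plain division with no remainder block. A hedged sketch of that launch pattern; the kernel body is a scalar stand-in for the real vectorized load policy, and the constants are assumptions:

    constexpr int num_threads() { return 64; }       // assumed value
    constexpr int thread_work_size() { return 4; }   // assumed value
    constexpr int block_work_size() { return thread_work_size() * num_threads(); }

    template <typename scalar_t, int vec_size>
    __global__ void vectorized_copy(scalar_t* dst, const scalar_t* src) {
      // Each block copies exactly block_work_size() contiguous elements,
      // vec_size at a time per thread (scalar loop standing in for vector loads).
      const scalar_t* from = src + block_work_size() * blockIdx.x;
      scalar_t* to = dst + block_work_size() * blockIdx.x;
      for (int i = threadIdx.x * vec_size; i < block_work_size();
           i += num_threads() * vec_size) {
        for (int v = 0; v < vec_size; ++v) {
          to[i + v] = from[i + v];
        }
      }
    }

    // Usage mirroring lines 111/125/139 (buffer setup omitted, sizes assumed):
    //   constexpr int total_work_size = 1024 * block_work_size();
    //   vectorized_copy<double, 4><<<total_work_size / block_work_size(), num_threads()>>>(b2, b1);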
/external/pytorch/aten/src/ATen/cuda/
jiterator.cu
   21  const uint32_t grid = (N + block_work_size() - 1) / block_work_size();  [in launch_jitted_vectorized_kernel_dynamic()]
  126  const uint32_t grid = (N + block_work_size() - 1) / block_work_size();  [in launch_jitted_unrolled_kernel_dynamic()]