1 #include <cuda.h>
2 #include <cuda_runtime.h>
3 #include <c10/cuda/CUDAException.h>
4 
5 #include <ATen/ATen.h>
6 
7 #include "cuda_dlink_extension_add.cuh"
8 
// Element-wise kernel: each thread handles at most one element index.
//
// The flat index is blockIdx.x * blockDim.x + threadIdx.x (1-D launch).
// Threads whose index is at or beyond `size` exit without doing anything, so
// the grid is allowed to overshoot the element count.
//
// The per-element work is delegated to add(), which is not defined in this
// file (presumably declared in the included cuda_dlink_extension_add.cuh and
// computing *output = *a + *b -- confirm against that header).
__global__ void add_kernel(const float* a, const float* b, float* output, int size) {
  const int idx = threadIdx.x + blockDim.x * blockIdx.x;
  if (idx >= size) {
    return;  // tail threads of the last block have no element to process
  }
  add(&a[idx], &b[idx], &output[idx]);
}
15 
// Host-side launcher for add_kernel over `size` elements.
//
// Launches one thread per element using 1-D blocks of 1024 threads. No stream
// is given in the execution configuration, so the launch goes to the default
// stream. The per-element operation is whatever add() does inside add_kernel
// (presumably output[i] = a[i] + b[i] -- confirm against its definition).
//
// a, b, output: pointers forwarded unchanged to the kernel (presumably device
//               buffers of at least `size` floats each -- confirm at callers).
// size:         number of elements to process; values <= 0 are a no-op.
void add_cuda(const float* a, const float* b, float* output, int size) {
  // A grid with zero blocks is an invalid launch configuration, so an empty
  // input must skip the launch rather than surface an error.
  if (size <= 0) {
    return;
  }
  const int threads = 1024;
  // Ceil-div written so it cannot overflow int when size is close to INT_MAX
  // (the usual (size + threads - 1) / threads form can). Safe because size > 0.
  const int blocks = (size - 1) / threads + 1;
  add_kernel<<<blocks, threads>>>(a, b, output, size);
  // Reports launch failures (e.g. a bad configuration) right after the launch.
  C10_CUDA_KERNEL_LAUNCH_CHECK();
}
23