1 /**
2 * Copyright 2020 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "convert_gradient_impl.cuh"
18
/**
 * Gather kernel: copies `size` elements from `input_addr` into `output_addr`, regrouping them into
 * consecutive tiles of `height_h * height_w` elements.
 *
 * For each destination index p in [0, size):
 *   tile      = p / (height_h * height_w)
 *   tile_row  = tile / batchwidth,  tile_col = tile % batchwidth
 *   row       = (p % (height_h * height_w)) / height_w
 *   col       = (p % (height_h * height_w)) % height_w
 *   output_addr[p] = input_addr[(tile_row * height_h + row) * width + tile_col * height_w + col]
 *
 * The loop is a grid-stride loop over a 1-D launch, so any grid/block size covers all `size` elements.
 * No shared memory is used.
 *
 * @param size        Number of destination elements to produce.
 * @param height_h    Tile height (rows per tile).
 * @param height_w    Tile width (columns per tile).
 * @param batchwidth  Number of tiles per tile-row.
 * @param width       Row stride, in elements, used when indexing `input_addr`.
 * @param input_addr  Source buffer (read only by this kernel).
 * @param output_addr Destination buffer; indices [0, size) are written.
 */
template <typename T>
__global__ void ConvertGradientKernel(const size_t size, const size_t height_h, const size_t height_w,
                                      const size_t batchwidth, const size_t width, T *input_addr, T *output_addr) {
  // blockIdx.x, blockDim.x and gridDim.x are 32-bit unsigned values; widen before multiplying so the
  // start index and the stride cannot wrap around before being stored in a size_t.
  const size_t stride = static_cast<size_t>(blockDim.x) * static_cast<size_t>(gridDim.x);
  const size_t first = static_cast<size_t>(blockIdx.x) * static_cast<size_t>(blockDim.x) + threadIdx.x;
  // Loop-invariant number of elements per tile.
  const size_t tile_elems = height_h * height_w;

  for (size_t pointIdx = first; pointIdx < size; pointIdx += stride) {
    const size_t tile = pointIdx / tile_elems;
    const size_t offset_in_tile = pointIdx - tile * tile_elems;
    const size_t tile_row = tile / batchwidth;
    const size_t tile_col = tile % batchwidth;
    const size_t row = offset_in_tile / height_w;
    const size_t col = offset_in_tile % height_w;
    const size_t src_coordinate = tile_row * height_h * width + row * width + tile_col * height_w + col;
    output_addr[pointIdx] = input_addr[src_coordinate];
  }
}
32
/**
 * Scatter kernel: the inverse index mapping of ConvertGradientKernel. Reads `size` consecutive elements
 * from `input_addr` (laid out as tiles of `height_h * height_w` elements) and writes each one to its
 * strided position in `output_addr`.
 *
 * For each source index p in [0, size):
 *   tile      = p / (height_h * height_w)
 *   tile_row  = tile / batchwidth,  tile_col = tile % batchwidth
 *   row       = (p % (height_h * height_w)) / height_w
 *   col       = (p % (height_h * height_w)) % height_w
 *   output_addr[(tile_row * height_h + row) * width + tile_col * height_w + col] = input_addr[p]
 *
 * The loop is a grid-stride loop over a 1-D launch. No shared memory is used.
 *
 * @param size        Number of source elements to scatter.
 * @param height_h    Tile height (rows per tile).
 * @param height_w    Tile width (columns per tile).
 * @param batchwidth  Number of tiles per tile-row.
 * @param width       Row stride, in elements, used when indexing `output_addr`.
 * @param input_addr  Source buffer; indices [0, size) are read.
 * @param output_addr Destination buffer, written at the computed strided offsets.
 */
template <typename T>
__global__ void ConvertGradientBackKernel(const size_t size, const size_t height_h, const size_t height_w,
                                          const size_t batchwidth, const size_t width, T *input_addr,
                                          T *output_addr) {
  // blockIdx.x, blockDim.x and gridDim.x are 32-bit unsigned values; widen before multiplying so the
  // start index and the stride cannot wrap around before being stored in a size_t.
  const size_t stride = static_cast<size_t>(blockDim.x) * static_cast<size_t>(gridDim.x);
  const size_t first = static_cast<size_t>(blockIdx.x) * static_cast<size_t>(blockDim.x) + threadIdx.x;
  // Loop-invariant number of elements per tile.
  const size_t tile_elems = height_h * height_w;

  for (size_t pointIdx = first; pointIdx < size; pointIdx += stride) {
    const size_t tile = pointIdx / tile_elems;
    const size_t offset_in_tile = pointIdx - tile * tile_elems;
    const size_t tile_row = tile / batchwidth;
    const size_t tile_col = tile % batchwidth;
    const size_t row = offset_in_tile / height_w;
    const size_t col = offset_in_tile % height_w;
    const size_t dst_coordinate = tile_row * height_h * width + row * width + tile_col * height_w + col;
    output_addr[dst_coordinate] = input_addr[pointIdx];
  }
}
46
/**
 * Cropping scatter kernel: reads `size` consecutive elements from `input_addr` (laid out as tiles of
 * `height_h * height_w` elements) and writes each one into an `ori_w`-strided output, skipping any element
 * whose 2-D position falls outside `ori_h` x `ori_w`.
 *
 * For each source index p in [0, size):
 *   tile      = p / (height_h * height_w)
 *   tile_row  = tile / batchwidth,  tile_col = tile % batchwidth
 *   src_x     = tile_row * height_h + (p % (height_h * height_w)) / height_w
 *   src_y     = tile_col * height_w + (p % (height_h * height_w)) % height_w
 *   if (src_x < ori_h && src_y < ori_w) output_addr[src_x * ori_w + src_y] = input_addr[p]
 *
 * The loop is a grid-stride loop over a 1-D launch. No shared memory is used.
 *
 * @param size        Number of source elements to visit.
 * @param height_h    Tile height (rows per tile).
 * @param height_w    Tile width (columns per tile).
 * @param ori_h       Row bound of the output; positions with src_x >= ori_h are skipped.
 * @param ori_w       Column bound and row stride of the output; positions with src_y >= ori_w are skipped.
 * @param batchwidth  Number of tiles per tile-row.
 * @param width       Not used by this kernel; kept so the signature stays compatible with callers.
 * @param input_addr  Source buffer; indices [0, size) are read.
 * @param output_addr Destination buffer, written only at in-bounds positions.
 */
template <typename T>
__global__ void ConvertGradientBackKernel(const size_t size, const size_t height_h, const size_t height_w,
                                          const size_t ori_h, const size_t ori_w, const size_t batchwidth,
                                          const size_t width, T *input_addr, T *output_addr) {
  // blockIdx.x, blockDim.x and gridDim.x are 32-bit unsigned values; widen before multiplying so the
  // start index and the stride cannot wrap around before being stored in a size_t.
  const size_t stride = static_cast<size_t>(blockDim.x) * static_cast<size_t>(gridDim.x);
  const size_t first = static_cast<size_t>(blockIdx.x) * static_cast<size_t>(blockDim.x) + threadIdx.x;
  // Loop-invariant number of elements per tile.
  const size_t tile_elems = height_h * height_w;

  for (size_t pointIdx = first; pointIdx < size; pointIdx += stride) {
    const size_t tile = pointIdx / tile_elems;
    const size_t offset_in_tile = pointIdx - tile * tile_elems;
    const size_t tile_row = tile / batchwidth;
    const size_t tile_col = tile % batchwidth;
    const size_t src_x = tile_row * height_h + offset_in_tile / height_w;
    const size_t src_y = tile_col * height_w + offset_in_tile % height_w;
    // Elements that land outside the ori_h x ori_w region are dropped.
    if (src_x < ori_h && src_y < ori_w) {
      output_addr[src_x * ori_w + src_y] = input_addr[pointIdx];
    }
  }
}
65
/**
 * Host launcher for ConvertGradientKernel.
 *
 * Enqueues the kernel on `cuda_stream` with GET_BLOCKS(size) blocks of GET_THREADS threads and no dynamic
 * shared memory, forwarding every argument unchanged, and returns the value reported by GetCudaStatus().
 * The function itself does not synchronize the stream.
 */
template <typename T>
cudaError_t ConvertGradient(const size_t size, const size_t height_h, const size_t height_w, const size_t batchwidth,
                            const size_t width, T *input_addr, T *output_addr, cudaStream_t cuda_stream) {
  // Launch configuration comes from the project-provided helpers.
  const auto grid_dim = GET_BLOCKS(size);
  const auto block_dim = GET_THREADS;

  ConvertGradientKernel<T>
    <<<grid_dim, block_dim, 0, cuda_stream>>>(size, height_h, height_w, batchwidth, width, input_addr, output_addr);

  return GetCudaStatus();
}
73
/**
 * Host launcher for the seven-argument ConvertGradientBackKernel (scatter without cropping).
 *
 * Enqueues the kernel on `cuda_stream` with GET_BLOCKS(size) blocks of GET_THREADS threads and no dynamic
 * shared memory, forwarding every argument unchanged, and returns the value reported by GetCudaStatus().
 * The function itself does not synchronize the stream.
 */
template <typename T>
cudaError_t ConvertGradientBack(const size_t size, const size_t height_h, const size_t height_w,
                                const size_t batchwidth, const size_t width, T *input_addr, T *output_addr,
                                cudaStream_t cuda_stream) {
  // Launch configuration comes from the project-provided helpers.
  const auto grid_dim = GET_BLOCKS(size);
  const auto block_dim = GET_THREADS;

  ConvertGradientBackKernel<T>
    <<<grid_dim, block_dim, 0, cuda_stream>>>(size, height_h, height_w, batchwidth, width, input_addr, output_addr);

  return GetCudaStatus();
}
82
/**
 * Host launcher for the nine-argument ConvertGradientBackKernel (scatter that skips positions outside
 * ori_h x ori_w).
 *
 * Enqueues the kernel on `cuda_stream` with GET_BLOCKS(size) blocks of GET_THREADS threads and no dynamic
 * shared memory, forwarding every argument unchanged, and returns the value reported by GetCudaStatus().
 * The function itself does not synchronize the stream.
 */
template <typename T>
cudaError_t ConvertGradientBack(const size_t size, const size_t height_h, const size_t height_w, const size_t ori_h,
                                const size_t ori_w, const size_t batchwidth, const size_t width, T *input_addr,
                                T *output_addr, cudaStream_t cuda_stream) {
  // Launch configuration comes from the project-provided helpers.
  const auto grid_dim = GET_BLOCKS(size);
  const auto block_dim = GET_THREADS;

  ConvertGradientBackKernel<T><<<grid_dim, block_dim, 0, cuda_stream>>>(size, height_h, height_w, ori_h, ori_w,
                                                                        batchwidth, width, input_addr, output_addr);

  return GetCudaStatus();
}
91
// Explicit instantiations for T = float, so the launcher definitions in this translation unit are
// emitted and exported for callers that only see the declarations.

// ConvertGradient<float>
template CUDA_LIB_EXPORT cudaError_t ConvertGradient<float>(
  const size_t size, const size_t height_h, const size_t height_w, const size_t batchwidth, const size_t width,
  float *input_addr, float *output_addr, cudaStream_t cuda_stream);

// ConvertGradientBack<float> without the ori_h / ori_w bounds
template CUDA_LIB_EXPORT cudaError_t ConvertGradientBack<float>(
  const size_t size, const size_t height_h, const size_t height_w, const size_t batchwidth, const size_t width,
  float *input_addr, float *output_addr, cudaStream_t cuda_stream);

// ConvertGradientBack<float> with the ori_h / ori_w bounds
template CUDA_LIB_EXPORT cudaError_t ConvertGradientBack<float>(
  const size_t size, const size_t height_h, const size_t height_w, const size_t ori_h, const size_t ori_w,
  const size_t batchwidth, const size_t width, float *input_addr, float *output_addr, cudaStream_t cuda_stream);
107