• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2020 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #include "nnacl/fp16/pooling_fp16.h"
17 #include <float.h>
18 #include "nnacl/errorcode.h"
19 
/**
 * Average pooling for float16 data stored with channels innermost
 * (element offset = ((batch * H + h) * W + w) * C + c).
 *
 * The output positions of each batch are grouped into tiles of TILE_NUM; this
 * call processes tiles task_id, task_id + thread_num_, task_id + 2 * thread_num_, ...
 * Each output value is the mean of the window elements that lie inside the
 * input (padded positions are not counted in the divisor), clamped to
 * [min, max].
 *
 * @param input_ptr      source tensor
 * @param output_ptr     destination tensor
 * @param pooling_param  window, stride, padding, shape and thread settings
 * @param task_id        index of the first tile handled by this call
 * @param min            lower clamp bound applied to every result
 * @param max            upper clamp bound applied to every result
 * @return NNACL_OK on success, NNACL_ERR when a pooling window covers no input
 *         element. A zero output_w_ is rejected through
 *         NNACL_CHECK_ZERO_RETURN_ERR before any work is done.
 */
int AvgPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, const PoolingParameter *pooling_param,
                   int task_id, float16_t min, float16_t max) {
  const int kernel_w = pooling_param->window_w_;
  const int kernel_h = pooling_param->window_h_;
  const int channels = pooling_param->input_channel_;
  const int vec_groups = channels / C8NUM;
  const int src_w = pooling_param->input_w_;
  const int src_h = pooling_param->input_h_;
  const int dst_w = pooling_param->output_w_;
  const int dst_h = pooling_param->output_h_;
  const int dst_plane = dst_w * dst_h;
  const int tile_total = UP_DIV(dst_plane, TILE_NUM);

#ifdef ENABLE_NEON
  const MS_FLOAT16X8 lower = MS_MOVQ_F16(min);
  const MS_FLOAT16X8 upper = MS_MOVQ_F16(max);
#endif

  // dst_w is used as a divisor below when splitting a flat index into (row, column).
  NNACL_CHECK_ZERO_RETURN_ERR(dst_w);
  for (int n = 0; n < pooling_param->output_batch_; n++) {
    const float16_t *batch_src = input_ptr + n * src_h * src_w * channels;
    float16_t *batch_dst = output_ptr + n * dst_h * dst_w * channels;
    for (int tile = task_id; tile < tile_total; tile += pooling_param->thread_num_) {
      const int first = tile * TILE_NUM;
      // The last tile may be only partially filled.
      const int count = MSMIN(dst_plane - first, TILE_NUM);
      for (int pos = first; pos < first + count; pos++) {
        const int out_col = pos % dst_w;
        const int out_row = pos / dst_w;
        // Top-left corner of the window in input coordinates; negative inside the padding.
        const int origin_x = out_col * pooling_param->stride_w_ - pooling_param->pad_l_;
        const int origin_y = out_row * pooling_param->stride_h_ - pooling_param->pad_u_;
        float16_t *pixel_dst = batch_dst + pos * channels;

        // Window range clipped to the part that overlaps the input.
        const int ky_begin = MSMAX(0, -origin_y);
        const int ky_end = MSMIN(kernel_h, src_h - origin_y);
        const int kx_begin = MSMAX(0, -origin_x);
        const int kx_end = MSMIN(kernel_w, src_w - origin_x);

        // Channels handled C8NUM at a time.
        for (int g = 0; g < vec_groups; g++) {
          const float16_t *group_src = batch_src + g * C8NUM;
          float16_t *group_dst = pixel_dst + g * C8NUM;
#ifdef ENABLE_NEON
          MS_FLOAT16X8 acc = MS_MOVQ_F16(0);
#else
          float16_t acc[C8NUM] = {0};
#endif
          int taps = 0;
          for (int ky = ky_begin; ky < ky_end; ky++) {
            const int row_base = (origin_y + ky) * src_w + origin_x;
            for (int kx = kx_begin; kx < kx_end; kx++) {
              const float16_t *tap = group_src + (row_base + kx) * channels;
#ifdef ENABLE_NEON
              acc = MS_ADDQ_F16(acc, MS_LDQ_F16(tap));
#else
              for (int lane = 0; lane < C8NUM; lane++) {
                acc[lane] += tap[lane];
              }
#endif
              ++taps;
            }
          }
          if (taps == 0) {
            return NNACL_ERR;
          }
#ifdef ENABLE_NEON
          acc = MS_DIVQ_F16(acc, MS_MOVQ_F16((float16_t)taps));
          MS_STQ_F16(group_dst, MS_MINQ_F16(MS_MAXQ_F16(acc, lower), upper));
#else
          for (int lane = 0; lane < C8NUM; lane++) {
            group_dst[lane] = MSMIN(MSMAX(acc[lane] / (float16_t)taps, min), max);
          }
#endif
        }  // groups of C8NUM channels

        // Remaining channels, one at a time.
        for (int c = vec_groups * C8NUM; c < channels; c++) {
          float16_t total = 0;
          int taps = 0;
          for (int ky = ky_begin; ky < ky_end; ky++) {
            const int row_base = (origin_y + ky) * src_w + origin_x;
            for (int kx = kx_begin; kx < kx_end; kx++) {
              total += batch_src[c + (row_base + kx) * channels];
              ++taps;
            }
          }
          if (taps == 0) {
            return NNACL_ERR;
          }
          float16_t mean = total / (float16_t)taps;
          mean = fmax(mean, min);
          mean = fmin(mean, max);
          pixel_dst[c] = mean;
        }  // leftover channels
      }    // positions inside the tile
    }      // tiles assigned to this task
  }        // batches
  return NNACL_OK;
}
137 
/**
 * Max-pools, for a single output position, all channels that fit into whole
 * groups of C8NUM.
 *
 * The running maximum of every lane starts at `min`, is updated with each
 * window element in rows [real_win_h_start, real_win_h_end) and columns
 * [real_win_w_start, real_win_w_end), and is finally clamped to [min, max].
 *
 * @param input_ptr         base of the input tensor
 * @param output_ptr        base of the output tensor
 * @param pooling_param     supplies input_channel_ and input_w_
 * @param min               lower clamp bound (also the initial maximum)
 * @param max               upper clamp bound
 * @param in_batch_offset   element offset of the current batch in input_ptr
 * @param out_plane_offset  element offset of the current output position in output_ptr
 * @param real_win_h_start  first window row to read
 * @param real_win_h_end    one past the last window row to read
 * @param real_win_w_start  first window column to read
 * @param real_win_w_end    one past the last window column to read
 * @param in_h_index        input row of the window's top edge
 * @param in_w_index        input column of the window's left edge
 */
void MaxPoolingC8Fp16(const float16_t *input_ptr, float16_t *output_ptr, const PoolingParameter *pooling_param,
                      float16_t min, float16_t max, int in_batch_offset, int out_plane_offset, int real_win_h_start,
                      int real_win_h_end, int real_win_w_start, int real_win_w_end, int in_h_index, int in_w_index) {
  const int channels = pooling_param->input_channel_;
  const int src_w = pooling_param->input_w_;
  const int groups = channels / C8NUM;
#ifdef ENABLE_NEON
  const float16x8_t lower = vdupq_n_f16(min);
  const float16x8_t upper = vdupq_n_f16(max);
#endif
  for (int g = 0; g < groups; g++) {
    const float16_t *group_src = input_ptr + in_batch_offset + g * C8NUM;
    float16_t *group_dst = output_ptr + out_plane_offset + g * C8NUM;
#ifdef ENABLE_NEON
    float16x8_t best = vdupq_n_f16(min);
#else
    float16_t best[C8NUM];
    for (int lane = 0; lane < C8NUM; lane++) {
      best[lane] = min;
    }
#endif
    for (int ky = real_win_h_start; ky < real_win_h_end; ky++) {
      const int row_base = (in_h_index + ky) * src_w + in_w_index;
      for (int kx = real_win_w_start; kx < real_win_w_end; kx++) {
        const float16_t *tap = group_src + (row_base + kx) * channels;
#ifdef ENABLE_NEON
        best = vmaxq_f16(best, vld1q_f16(tap));
#else
        for (int lane = 0; lane < C8NUM; lane++) {
          best[lane] = fmax(best[lane], tap[lane]);
        }
#endif
      }  // window columns
    }    // window rows
#ifdef ENABLE_NEON
    best = vmaxq_f16(best, lower);
    best = vminq_f16(best, upper);
    vst1q_f16(group_dst, best);
#else
    for (int lane = 0; lane < C8NUM; ++lane) {
      best[lane] = fmax(best[lane], min);
      best[lane] = fmin(best[lane], max);
      group_dst[lane] = best[lane];
    }
#endif
  }  // groups of C8NUM channels
}
181 
/**
 * Max-pools, for a single output position, the groups of C4NUM channels that
 * remain after all whole groups of C8NUM channels have been taken out.
 *
 * The running maximum of every lane starts at `min`, is updated with each
 * window element in rows [real_win_h_start, real_win_h_end) and columns
 * [real_win_w_start, real_win_w_end), and is finally clamped to [min, max].
 *
 * @param input_ptr         base of the input tensor
 * @param output_ptr        base of the output tensor
 * @param pooling_param     supplies input_channel_ and input_w_
 * @param min               lower clamp bound (also the initial maximum)
 * @param max               upper clamp bound
 * @param in_batch_offset   element offset of the current batch in input_ptr
 * @param out_plane_offset  element offset of the current output position in output_ptr
 * @param real_win_h_start  first window row to read
 * @param real_win_h_end    one past the last window row to read
 * @param real_win_w_start  first window column to read
 * @param real_win_w_end    one past the last window column to read
 * @param in_h_index        input row of the window's top edge
 * @param in_w_index        input column of the window's left edge
 */
void MaxPoolingC4Fp16(const float16_t *input_ptr, float16_t *output_ptr, const PoolingParameter *pooling_param,
                      float16_t min, float16_t max, int in_batch_offset, int out_plane_offset, int real_win_h_start,
                      int real_win_h_end, int real_win_w_start, int real_win_w_end, int in_h_index, int in_w_index) {
  const int channels = pooling_param->input_channel_;
  const int src_w = pooling_param->input_w_;
  // Channels [0, first_channel) belong to the C8NUM-wide helper.
  const int first_channel = (channels / C8NUM) * C8NUM;
  const int groups = (channels % C8NUM) / C4NUM;
#ifdef ENABLE_NEON
  const float16x4_t lower = vdup_n_f16(min);
  const float16x4_t upper = vdup_n_f16(max);
#endif
  for (int g = 0; g < groups; g++) {
    const float16_t *group_src = input_ptr + in_batch_offset + first_channel + g * C4NUM;
    float16_t *group_dst = output_ptr + out_plane_offset + first_channel + g * C4NUM;
#ifdef ENABLE_NEON
    float16x4_t best = vdup_n_f16(min);
#else
    float16_t best[C4NUM];
    for (int lane = 0; lane < C4NUM; lane++) {
      best[lane] = min;
    }
#endif
    for (int ky = real_win_h_start; ky < real_win_h_end; ky++) {
      const int row_base = (in_h_index + ky) * src_w + in_w_index;
      for (int kx = real_win_w_start; kx < real_win_w_end; kx++) {
        const float16_t *tap = group_src + (row_base + kx) * channels;
#ifdef ENABLE_NEON
        best = vmax_f16(best, vld1_f16(tap));
#else
        for (int lane = 0; lane < C4NUM; lane++) {
          best[lane] = fmax(best[lane], tap[lane]);
        }
#endif
      }  // window columns
    }    // window rows
#ifdef ENABLE_NEON
    best = vmax_f16(best, lower);
    best = vmin_f16(best, upper);
    vst1_f16(group_dst, best);
#else
    for (int lane = 0; lane < C4NUM; ++lane) {
      best[lane] = fmax(best[lane], min);
      best[lane] = fmin(best[lane], max);
      group_dst[lane] = best[lane];
    }
#endif
  }  // groups of C4NUM channels
}
/**
 * Max-pools, for a single output position, the trailing channels that are not
 * covered by the C8NUM-wide and C4NUM-wide helpers, one channel at a time.
 *
 * The running maximum starts at `min`, is updated with each window element in
 * rows [real_win_h_start, real_win_h_end) and columns
 * [real_win_w_start, real_win_w_end), and is finally clamped to [min, max].
 *
 * @param input_ptr         base of the input tensor
 * @param output_ptr        base of the output tensor
 * @param pooling_param     supplies input_channel_ and input_w_
 * @param min               lower clamp bound (also the initial maximum)
 * @param max               upper clamp bound
 * @param in_batch_offset   element offset of the current batch in input_ptr
 * @param out_plane_offset  element offset of the current output position in output_ptr
 * @param real_win_h_start  first window row to read
 * @param real_win_h_end    one past the last window row to read
 * @param real_win_w_start  first window column to read
 * @param real_win_w_end    one past the last window column to read
 * @param in_h_index        input row of the window's top edge
 * @param in_w_index        input column of the window's left edge
 */
void MaxPoolingC1Fp16(const float16_t *input_ptr, float16_t *output_ptr, const PoolingParameter *pooling_param,
                      float16_t min, float16_t max, int in_batch_offset, int out_plane_offset, int real_win_h_start,
                      int real_win_h_end, int real_win_w_start, int real_win_w_end, int in_h_index, int in_w_index) {
  int channel = pooling_param->input_channel_;
  int in_w = pooling_param->input_w_;
  int c8 = channel / C8NUM;
  int c8_res = channel % C8NUM;
  int c4 = c8_res / C4NUM;
  // First channel not already handled by the vectorised helpers.
  int channel_s = c8 * C8NUM + c4 * C4NUM;
  for (int k = channel_s; k < channel; k++) {
    int in_channel_offset = in_batch_offset + k;
    int out_channel_offset = out_plane_offset + k;
    // Seed with the lower clamp bound, as the C8/C4 helpers do. A float-range
    // sentinel such as -FLT_MAX cannot be represented in float16_t, and the
    // result is clamped to at least `min` below anyway.
    float16_t tmp_max = min;
    for (int h = real_win_h_start; h < real_win_h_end; h++) {
      for (int w = real_win_w_start; w < real_win_w_end; w++) {
        int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel;
        tmp_max = fmax(tmp_max, *(input_ptr + in_offset));
      }  // win_w loop
    }    // win_h loop
    tmp_max = fmax(tmp_max, min);
    tmp_max = fmin(tmp_max, max);
    output_ptr[out_channel_offset] = tmp_max;
  }  // channel_res loop
}
252 
/**
 * Max pooling for float16 data stored with channels innermost.
 *
 * The output positions of each batch are grouped into tiles of TILE_NUM; this
 * call processes tiles task_id, task_id + thread_num_, task_id + 2 * thread_num_, ...
 * For every output position the pooling window is clipped to the input and the
 * channels are handled by MaxPoolingC8Fp16, MaxPoolingC4Fp16 and
 * MaxPoolingC1Fp16 in turn. Results are clamped to [min, max] by those helpers.
 *
 * @param input_ptr      source tensor
 * @param output_ptr     destination tensor
 * @param pooling_param  window, stride, padding, shape and thread settings
 * @param task_id        index of the first tile handled by this call
 * @param min            lower clamp bound
 * @param max            upper clamp bound
 */
void MaxPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, const PoolingParameter *pooling_param,
                    int task_id, float16_t min, float16_t max) {
  const int step_w = pooling_param->stride_w_;
  const int step_h = pooling_param->stride_h_;
  const int pad_left = pooling_param->pad_l_;
  const int pad_top = pooling_param->pad_u_;
  const int kernel_w = pooling_param->window_w_;
  const int kernel_h = pooling_param->window_h_;
  const int channels = pooling_param->input_channel_;
  const int src_w = pooling_param->input_w_;
  const int src_h = pooling_param->input_h_;
  const int dst_w = pooling_param->output_w_;
  const int dst_h = pooling_param->output_h_;
  const int batches = pooling_param->output_batch_;
  const int dst_plane = dst_w * dst_h;
  const int tile_total = UP_DIV(dst_plane, TILE_NUM);
  const int task_stride = pooling_param->thread_num_;

  // The channel count is the same for input and output.
  // dst_w is used as a divisor below when splitting a flat index into (row, column).
  NNACL_CHECK_ZERO_RETURN(dst_w);
  for (int n = 0; n < batches; n++) {
    const int src_batch_offset = n * src_h * src_w * channels;
    const int dst_batch_offset = n * dst_h * dst_w * channels;
    for (int tile = task_id; tile < tile_total; tile += task_stride) {
      const int first = tile * TILE_NUM;
      // The last tile may be only partially filled.
      const int count = MSMIN(dst_plane - first, TILE_NUM);
      for (int pos = first; pos < first + count; pos++) {
        const int out_col = pos % dst_w;
        const int out_row = pos / dst_w;
        // Top-left corner of the window in input coordinates; negative inside the padding.
        const int origin_x = out_col * step_w - pad_left;
        const int origin_y = out_row * step_h - pad_top;
        const int dst_offset = dst_batch_offset + pos * channels;
        // Window range clipped to the part that overlaps the input.
        const int ky_begin = MSMAX(0, -origin_y);
        const int ky_end = MSMIN(kernel_h, src_h - origin_y);
        const int kx_begin = MSMAX(0, -origin_x);
        const int kx_end = MSMIN(kernel_w, src_w - origin_x);
        MaxPoolingC8Fp16(input_ptr, output_ptr, pooling_param, min, max, src_batch_offset, dst_offset, ky_begin,
                         ky_end, kx_begin, kx_end, origin_y, origin_x);
        MaxPoolingC4Fp16(input_ptr, output_ptr, pooling_param, min, max, src_batch_offset, dst_offset, ky_begin,
                         ky_end, kx_begin, kx_end, origin_y, origin_x);
        MaxPoolingC1Fp16(input_ptr, output_ptr, pooling_param, min, max, src_batch_offset, dst_offset, ky_begin,
                         ky_end, kx_begin, kx_end, origin_y, origin_x);
      }  // positions inside the tile
    }    // tiles assigned to this task
  }      // batches
}
300