1 /**
2 * Copyright 2020 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 #include "nnacl/base/minimal_filtering_generator.h"
17 #include <string.h>
18 #include <math.h>
19 #include "nnacl/fp32/pack_fp32.h"
20 #include "nnacl/errorcode.h"
21 #include "nnacl/intrinsics/ms_simd_instructions.h"
22
Polynomial(const float * interval,float * m,int degree)23 void Polynomial(const float *interval, float *m, int degree) {
24 for (int i = 0; i < degree; ++i) {
25 float mul = 1;
26 for (int j = 0; j < degree; ++j) {
27 if (i == j) {
28 continue;
29 }
30 mul *= (interval[i] - interval[j]);
31 }
32 m[i] = mul;
33 }
34 }
35
DiagonalPlusMatrix(const float * matrix,float * diagonal_matrix,int degree)36 void DiagonalPlusMatrix(const float *matrix, float *diagonal_matrix, int degree) {
37 int data_num = (degree + 1) * (degree + 1);
38 memset(diagonal_matrix, 0, data_num * sizeof(float));
39 for (int i = 0; i < degree; ++i) {
40 for (int j = 0; j < degree; ++j) {
41 if (j == i) {
42 diagonal_matrix[i * (degree + 1) + j] = matrix[i];
43 }
44 }
45 }
46 diagonal_matrix[data_num - 1] = 1;
47 }
48
ResidueMatrix(const float * interval,float * b,int row,int col)49 void ResidueMatrix(const float *interval, float *b, int row, int col) {
50 // row : input unit, col : output_unit
51 // result : matrix b
52 int len = row * col;
53 memset(b, 0, len * sizeof(float));
54 for (int i = 0; i < row - 1; ++i) {
55 for (int j = 0; j < col; ++j) {
56 b[i * col + j] = pow(interval[i], j);
57 }
58 }
59 b[len - 1] = 1;
60 }
61
LT(const float * poly_array,float * matrix_lt,int n)62 int LT(const float *poly_array, float *matrix_lt, int n) {
63 if (n > MAX_LEN) {
64 return NNACL_ERR;
65 }
66 float coefficient_array[MAX_LEN]; // n
67 float poly[MAX_LEN]; // n
68
69 Polynomial(poly_array, poly, n);
70 for (int i = 0; i < n; ++i) {
71 // get coefficient
72 int index = 1;
73 memset(coefficient_array, 0, n * sizeof(float));
74 coefficient_array[0] = 1;
75 for (int j = 0; j < n; ++j) {
76 if (j == i) continue;
77 float poly_coe = poly_array[j] == 0 ? 0 : -poly_array[j];
78 coefficient_array[index] = 1;
79 for (int k = index - 1; k > 0; --k) {
80 coefficient_array[k] = coefficient_array[k] * poly_coe + coefficient_array[k - 1];
81 }
82 coefficient_array[0] *= poly_coe;
83 index++;
84 }
85
86 // lx[i, 0].nth(j) / f[i]
87 int setp = i * n;
88 for (int l = 0; l < n; ++l) {
89 matrix_lt[setp + l] = coefficient_array[l] / poly[i];
90 }
91 } // matrix L row loop
92 return NNACL_OK;
93 }
94
T(const float * poly_array,float * matrix_t,int n)95 void T(const float *poly_array, float *matrix_t, int n) {
96 memset(matrix_t, 0, n * (n + 1) * sizeof(float));
97 for (int i = 0; i < n; ++i) {
98 for (int j = 0; j < n + 1; ++j) {
99 if (j == i) matrix_t[i * (n + 1) + j] = 1;
100 if (j == n) {
101 if (poly_array[i] == 0) {
102 matrix_t[i * (n + 1) + j] = 0;
103 } else {
104 matrix_t[i * (n + 1) + j] = -pow(poly_array[i], n);
105 }
106 }
107 }
108 }
109 }
110
B(const float * poly_array,float * matrix_b,int in_unit)111 int B(const float *poly_array, float *matrix_b, int in_unit) {
112 memset(matrix_b, 0, in_unit * in_unit * sizeof(float));
113 int n = in_unit - 1;
114 if ((n * n) > MAX_LEN || (n * in_unit) > MAX_LEN) {
115 return NNACL_ERR;
116 }
117 float matrix_l[MAX_LEN]; // n * n
118 float matrix_lt[MAX_LEN]; // n * n
119 float matrix_t[MAX_LEN]; // n * in_unit
120
121 T(poly_array, matrix_t, n);
122 if (LT(poly_array, matrix_lt, n) != NNACL_OK) {
123 return NNACL_ERR;
124 }
125 MatrixTranspose(matrix_lt, matrix_l, n, n);
126 MatrixMultiply(matrix_l, matrix_t, matrix_b, n, n, in_unit);
127 matrix_b[in_unit * in_unit - 1] = 1;
128 return NNACL_OK;
129 }
130
131 #if !defined(ENABLE_ARM) && !defined(ENABLE_SSE)
MatrixMultiplyWinograd(const float * matix_a,const float * matrix_b,float * matrix_c,int m,int k,int n,int in_channel,int c4_channel)132 void MatrixMultiplyWinograd(const float *matix_a, const float *matrix_b, float *matrix_c, int m, int k, int n,
133 int in_channel, int c4_channel) {
134 int cnt = 0;
135 for (int i = 0; i < m; ++i) {
136 for (int j = 0; j < n; ++j) {
137 for (int y = 0; y < in_channel; ++y) {
138 float tmp = 0;
139 for (int z = 0; z < k; ++z) {
140 tmp += matix_a[z * in_channel + y + i * in_channel * k] * matrix_b[j + z * n];
141 }
142 matrix_c[cnt++] = tmp;
143 }
144 cnt += c4_channel / 4 - in_channel;
145 }
146 }
147 }
148 #endif
149
GenerateIntervalArray(float * array,float interval,int degree)150 void GenerateIntervalArray(float *array, float interval, int degree) {
151 array[0] = 0;
152 for (int i = 1; i < degree; ++i) {
153 int coefficient = pow(-1, i - 1);
154 array[i] = array[i - 1] + interval * i * coefficient;
155 }
156 }
157
MatrixTranspose(const float * matrix,float * trans_matrix,int row,int col)158 void MatrixTranspose(const float *matrix, float *trans_matrix, int row, int col) {
159 for (int i = 0; i < col; ++i) {
160 for (int j = 0; j < row; ++j) {
161 trans_matrix[i * row + j] = matrix[j * col + i];
162 }
163 }
164 }
165
MatrixMultiply(const float * matrix_a,const float * matrix_b,float * matrix_c,int m,int k,int n)166 void MatrixMultiply(const float *matrix_a, const float *matrix_b, float *matrix_c, int m, int k, int n) {
167 int count = 0;
168 for (int h = 0; h < m; h++) {
169 int h_offset = h * k;
170 for (int w = 0; w < n; w++) {
171 float res = 0;
172 for (int i = 0; i < k; i++) {
173 res += *(matrix_a + h_offset + i) * *(matrix_b + w + i * n);
174 }
175 *(matrix_c + count) = res;
176 count++;
177 }
178 }
179 }
180
CookToomFilter(float * matrix_a,float * matrix_at,float * matrix_b,float * matrix_bt,float * matrix_g,float * matrix_gt,float coefficient,int out_unit,int filter_size)181 int CookToomFilter(float *matrix_a, float *matrix_at, float *matrix_b, float *matrix_bt, float *matrix_g,
182 float *matrix_gt, float coefficient, int out_unit, int filter_size) {
183 int in_unit = out_unit + filter_size - 1;
184 int degree = in_unit - 1;
185 if (degree > MAX_LEN || (in_unit * in_unit) > MAX_LEN || (in_unit * filter_size) > MAX_LEN) {
186 return NNACL_ERR;
187 }
188 float polynomial_m[MAX_LEN]; // degree
189 float diagonal_matrix[MAX_LEN]; // input_unit * input_unit
190 float inverse_diagonal_matrix[MAX_LEN]; // input_unit * input_unit
191
192 // get diagonal matrix
193 float interval[MAX_LEN]; // degree
194 GenerateIntervalArray(interval, coefficient, degree);
195 Polynomial(interval, polynomial_m, degree);
196 DiagonalPlusMatrix(polynomial_m, diagonal_matrix, degree);
197 if (diagonal_matrix[0] < 0) {
198 for (int i = 0; i < in_unit; ++i) {
199 if (diagonal_matrix[i] != 0) diagonal_matrix[i] *= -1;
200 }
201 }
202
203 // inverse diagonal matrix
204 for (int j = 0; j < in_unit * in_unit; ++j) {
205 if (diagonal_matrix[j] != 0) {
206 inverse_diagonal_matrix[j] = 1.0 / diagonal_matrix[j];
207 } else {
208 inverse_diagonal_matrix[j] = 0;
209 }
210 }
211
212 // get matrix A && AT
213 ResidueMatrix(interval, matrix_a, in_unit, out_unit);
214 MatrixTranspose(matrix_a, matrix_at, in_unit, out_unit);
215
216 // get matrix B
217 int ret = B(interval, matrix_bt, in_unit);
218 if (ret != NNACL_OK) {
219 return ret;
220 }
221 MatrixTranspose(matrix_bt, matrix_b, in_unit, in_unit);
222 MatrixMultiply(diagonal_matrix, matrix_b, matrix_bt, in_unit, in_unit, in_unit);
223 MatrixTranspose(matrix_bt, matrix_b, in_unit, in_unit);
224
225 // get matrix G && GT
226 float tmp_g[MAX_LEN]; // in_unit * filter_size
227 ResidueMatrix(interval, matrix_g, in_unit, filter_size);
228 MatrixTranspose(matrix_g, tmp_g, in_unit, filter_size);
229 MatrixMultiply(tmp_g, inverse_diagonal_matrix, matrix_gt, filter_size, in_unit, in_unit);
230 MatrixTranspose(matrix_gt, matrix_g, filter_size, in_unit);
231 return NNACL_OK;
232 }
233
234 #if defined(ENABLE_ARM) || defined(ENABLE_SSE)
MatrixMultiplyVec(const MS_FLOAT32X4 * matrix_a,const MS_FLOAT32X4 * matrix_b,MS_FLOAT32X4 * matrix_c,const float * bias,int m,int k,int n)235 void MatrixMultiplyVec(const MS_FLOAT32X4 *matrix_a, const MS_FLOAT32X4 *matrix_b, MS_FLOAT32X4 *matrix_c,
236 const float *bias, int m, int k, int n) {
237 int count = 0;
238 MS_FLOAT32X4 bias_ptr = MS_MOVQ_F32(0);
239 if (bias != NULL) {
240 bias_ptr = MS_LDQ_F32(bias);
241 }
242 for (int h = 0; h < m; h++) {
243 int h_offset = h * k;
244 for (int w = 0; w < n; w++) {
245 MS_FLOAT32X4 res = MS_MOVQ_F32(0);
246 for (int i = 0; i < k; i++) {
247 res = MS_MLAQ_F32(res, matrix_a[h_offset + i], matrix_b[w + i * n]);
248 }
249 matrix_c[count] = MS_ADDQ_F32(res, bias_ptr);
250 count++;
251 }
252 }
253 }
254 #endif
255
WinogradWeightTransform(const float * weight_data,float * winograd_data,float * matrix_g,const float * matrix_gt,int oc_block,int input_unit,int kernel_unit,int channel,int batch,bool pack)256 int WinogradWeightTransform(const float *weight_data, float *winograd_data, float *matrix_g, const float *matrix_gt,
257 int oc_block, int input_unit, int kernel_unit, int channel, int batch, bool pack) {
258 if (oc_block == 0) {
259 return NNACL_PARAM_INVALID;
260 }
261 // original weight format : ohwi
262 int oc_block_num = UP_DIV(batch, oc_block);
263 int block_stride = channel * oc_block;
264 int block_num_stride = block_stride * oc_block_num;
265
266 // trans_filter = G*g*GT (g represents weight_data)
267 // separate into two steps ===> tmp = (g * GT)T ===> trans = (tmp * GT)T use same function:MatrixMultiplyWinograd
268 float *tmp_data = (float *)(malloc(channel * input_unit * kernel_unit * sizeof(float)));
269 if (tmp_data == NULL) {
270 return NNACL_ERR;
271 }
272 float *trans_out_data = (float *)(malloc(channel * input_unit * input_unit * sizeof(float)));
273 if (trans_out_data == NULL) {
274 free(tmp_data);
275 return NNACL_ERR;
276 }
277
278 #ifndef ENABLE_ARM
279 float *tmp_data1 = (float *)(malloc(channel * input_unit * kernel_unit * sizeof(float)));
280 if (tmp_data1 == NULL) {
281 free(tmp_data);
282 free(trans_out_data);
283 return NNACL_ERR;
284 }
285 float *trans_out_data1 = (float *)(malloc(channel * input_unit * input_unit * sizeof(float)));
286 if (trans_out_data1 == NULL) {
287 free(tmp_data);
288 free(tmp_data1);
289 free(trans_out_data);
290 return NNACL_ERR;
291 }
292 #endif
293
294 int input_oz_offset = kernel_unit * kernel_unit * channel;
295 for (int i = 0; i < batch; i++) {
296 int out_c_block = i / oc_block;
297 int out_c_res = i % oc_block;
298 int output_oz_offset = out_c_block * block_stride + out_c_res;
299
300 #ifndef ENABLE_ARM
301 // tmp_data = g * GT
302 MatrixMultiplyWinograd(weight_data + i * input_oz_offset, matrix_gt, tmp_data, kernel_unit, kernel_unit, input_unit,
303 channel, channel * 4);
304 // tmp_data1 = (tmp_data)T
305 PackHWCToWHC(tmp_data, tmp_data1, kernel_unit, input_unit, channel);
306 // trans_out_data1 = tmp * GT
307 MatrixMultiplyWinograd(tmp_data1, matrix_gt, trans_out_data1, input_unit, kernel_unit, input_unit, channel,
308 channel * 4);
309 // trans_out_data = (trans_out_data1)T
310 PackHWCToWHC(trans_out_data1, trans_out_data, input_unit, input_unit, channel);
311 #else
312 // tmp = (g * GT)T
313 MatrixMultiplyWinograd(weight_data + i * input_oz_offset, matrix_gt, tmp_data, kernel_unit, kernel_unit, input_unit,
314 channel, channel * 4);
315 // trans = (tmp * GT)T
316 MatrixMultiplyWinograd(tmp_data, matrix_gt, trans_out_data, input_unit, kernel_unit, input_unit, channel,
317 channel * 4);
318 #endif
319 if (pack) {
320 int in_offset = 0;
321 for (int j = 0; j < input_unit; ++j) {
322 for (int k = 0; k < input_unit; ++k) {
323 for (int c = 0; c < channel; ++c) {
324 *(winograd_data + output_oz_offset + c * oc_block) = trans_out_data[in_offset + c];
325 }
326 in_offset += channel;
327 output_oz_offset += block_num_stride;
328 }
329 }
330 } else {
331 memcpy(winograd_data + i * channel * input_unit * input_unit, trans_out_data,
332 channel * input_unit * input_unit * sizeof(float));
333 }
334 }
335 #ifndef ENABLE_ARM
336 free(tmp_data1);
337 free(trans_out_data1);
338 #endif
339 free(tmp_data);
340 free(trans_out_data);
341 return NNACL_OK;
342 }
343