• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2020 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #include "nnacl/base/minimal_filtering_generator.h"
17 #include <string.h>
18 #include <math.h>
19 #include "nnacl/fp32/pack_fp32.h"
20 #include "nnacl/errorcode.h"
21 #include "nnacl/intrinsics/ms_simd_instructions.h"
22 
Polynomial(const float * interval,float * m,int degree)23 void Polynomial(const float *interval, float *m, int degree) {
24   for (int i = 0; i < degree; ++i) {
25     float mul = 1;
26     for (int j = 0; j < degree; ++j) {
27       if (i == j) {
28         continue;
29       }
30       mul *= (interval[i] - interval[j]);
31     }
32     m[i] = mul;
33   }
34 }
35 
DiagonalPlusMatrix(const float * matrix,float * diagonal_matrix,int degree)36 void DiagonalPlusMatrix(const float *matrix, float *diagonal_matrix, int degree) {
37   int data_num = (degree + 1) * (degree + 1);
38   memset(diagonal_matrix, 0, data_num * sizeof(float));
39   for (int i = 0; i < degree; ++i) {
40     for (int j = 0; j < degree; ++j) {
41       if (j == i) {
42         diagonal_matrix[i * (degree + 1) + j] = matrix[i];
43       }
44     }
45   }
46   diagonal_matrix[data_num - 1] = 1;
47 }
48 
ResidueMatrix(const float * interval,float * b,int row,int col)49 void ResidueMatrix(const float *interval, float *b, int row, int col) {
50   // row : input unit, col : output_unit
51   // result : matrix b
52   int len = row * col;
53   memset(b, 0, len * sizeof(float));
54   for (int i = 0; i < row - 1; ++i) {
55     for (int j = 0; j < col; ++j) {
56       b[i * col + j] = pow(interval[i], j);
57     }
58   }
59   b[len - 1] = 1;
60 }
61 
LT(const float * poly_array,float * matrix_lt,int n)62 int LT(const float *poly_array, float *matrix_lt, int n) {
63   if (n > MAX_LEN) {
64     return NNACL_ERR;
65   }
66   float coefficient_array[MAX_LEN];  // n
67   float poly[MAX_LEN];               // n
68 
69   Polynomial(poly_array, poly, n);
70   for (int i = 0; i < n; ++i) {
71     // get coefficient
72     int index = 1;
73     memset(coefficient_array, 0, n * sizeof(float));
74     coefficient_array[0] = 1;
75     for (int j = 0; j < n; ++j) {
76       if (j == i) continue;
77       float poly_coe = poly_array[j] == 0 ? 0 : -poly_array[j];
78       coefficient_array[index] = 1;
79       for (int k = index - 1; k > 0; --k) {
80         coefficient_array[k] = coefficient_array[k] * poly_coe + coefficient_array[k - 1];
81       }
82       coefficient_array[0] *= poly_coe;
83       index++;
84     }
85 
86     // lx[i, 0].nth(j) / f[i]
87     int setp = i * n;
88     for (int l = 0; l < n; ++l) {
89       matrix_lt[setp + l] = coefficient_array[l] / poly[i];
90     }
91   }  // matrix L row loop
92   return NNACL_OK;
93 }
94 
T(const float * poly_array,float * matrix_t,int n)95 void T(const float *poly_array, float *matrix_t, int n) {
96   memset(matrix_t, 0, n * (n + 1) * sizeof(float));
97   for (int i = 0; i < n; ++i) {
98     for (int j = 0; j < n + 1; ++j) {
99       if (j == i) matrix_t[i * (n + 1) + j] = 1;
100       if (j == n) {
101         if (poly_array[i] == 0) {
102           matrix_t[i * (n + 1) + j] = 0;
103         } else {
104           matrix_t[i * (n + 1) + j] = -pow(poly_array[i], n);
105         }
106       }
107     }
108   }
109 }
110 
B(const float * poly_array,float * matrix_b,int in_unit)111 int B(const float *poly_array, float *matrix_b, int in_unit) {
112   memset(matrix_b, 0, in_unit * in_unit * sizeof(float));
113   int n = in_unit - 1;
114   if ((n * n) > MAX_LEN || (n * in_unit) > MAX_LEN) {
115     return NNACL_ERR;
116   }
117   float matrix_l[MAX_LEN];   // n * n
118   float matrix_lt[MAX_LEN];  // n * n
119   float matrix_t[MAX_LEN];   // n * in_unit
120 
121   T(poly_array, matrix_t, n);
122   if (LT(poly_array, matrix_lt, n) != NNACL_OK) {
123     return NNACL_ERR;
124   }
125   MatrixTranspose(matrix_lt, matrix_l, n, n);
126   MatrixMultiply(matrix_l, matrix_t, matrix_b, n, n, in_unit);
127   matrix_b[in_unit * in_unit - 1] = 1;
128   return NNACL_OK;
129 }
130 
131 #if !defined(ENABLE_ARM) && !defined(ENABLE_SSE)
MatrixMultiplyWinograd(const float * matix_a,const float * matrix_b,float * matrix_c,int m,int k,int n,int in_channel,int c4_channel)132 void MatrixMultiplyWinograd(const float *matix_a, const float *matrix_b, float *matrix_c, int m, int k, int n,
133                             int in_channel, int c4_channel) {
134   int cnt = 0;
135   for (int i = 0; i < m; ++i) {
136     for (int j = 0; j < n; ++j) {
137       for (int y = 0; y < in_channel; ++y) {
138         float tmp = 0;
139         for (int z = 0; z < k; ++z) {
140           tmp += matix_a[z * in_channel + y + i * in_channel * k] * matrix_b[j + z * n];
141         }
142         matrix_c[cnt++] = tmp;
143       }
144       cnt += c4_channel / 4 - in_channel;
145     }
146   }
147 }
148 #endif
149 
GenerateIntervalArray(float * array,float interval,int degree)150 void GenerateIntervalArray(float *array, float interval, int degree) {
151   array[0] = 0;
152   for (int i = 1; i < degree; ++i) {
153     int coefficient = pow(-1, i - 1);
154     array[i] = array[i - 1] + interval * i * coefficient;
155   }
156 }
157 
MatrixTranspose(const float * matrix,float * trans_matrix,int row,int col)158 void MatrixTranspose(const float *matrix, float *trans_matrix, int row, int col) {
159   for (int i = 0; i < col; ++i) {
160     for (int j = 0; j < row; ++j) {
161       trans_matrix[i * row + j] = matrix[j * col + i];
162     }
163   }
164 }
165 
MatrixMultiply(const float * matrix_a,const float * matrix_b,float * matrix_c,int m,int k,int n)166 void MatrixMultiply(const float *matrix_a, const float *matrix_b, float *matrix_c, int m, int k, int n) {
167   int count = 0;
168   for (int h = 0; h < m; h++) {
169     int h_offset = h * k;
170     for (int w = 0; w < n; w++) {
171       float res = 0;
172       for (int i = 0; i < k; i++) {
173         res += *(matrix_a + h_offset + i) * *(matrix_b + w + i * n);
174       }
175       *(matrix_c + count) = res;
176       count++;
177     }
178   }
179 }
180 
CookToomFilter(float * matrix_a,float * matrix_at,float * matrix_b,float * matrix_bt,float * matrix_g,float * matrix_gt,float coefficient,int out_unit,int filter_size)181 int CookToomFilter(float *matrix_a, float *matrix_at, float *matrix_b, float *matrix_bt, float *matrix_g,
182                    float *matrix_gt, float coefficient, int out_unit, int filter_size) {
183   int in_unit = out_unit + filter_size - 1;
184   int degree = in_unit - 1;
185   if (degree > MAX_LEN || (in_unit * in_unit) > MAX_LEN || (in_unit * filter_size) > MAX_LEN) {
186     return NNACL_ERR;
187   }
188   float polynomial_m[MAX_LEN];             // degree
189   float diagonal_matrix[MAX_LEN];          // input_unit * input_unit
190   float inverse_diagonal_matrix[MAX_LEN];  // input_unit * input_unit
191 
192   // get diagonal matrix
193   float interval[MAX_LEN];  // degree
194   GenerateIntervalArray(interval, coefficient, degree);
195   Polynomial(interval, polynomial_m, degree);
196   DiagonalPlusMatrix(polynomial_m, diagonal_matrix, degree);
197   if (diagonal_matrix[0] < 0) {
198     for (int i = 0; i < in_unit; ++i) {
199       if (diagonal_matrix[i] != 0) diagonal_matrix[i] *= -1;
200     }
201   }
202 
203   // inverse diagonal matrix
204   for (int j = 0; j < in_unit * in_unit; ++j) {
205     if (diagonal_matrix[j] != 0) {
206       inverse_diagonal_matrix[j] = 1.0 / diagonal_matrix[j];
207     } else {
208       inverse_diagonal_matrix[j] = 0;
209     }
210   }
211 
212   // get matrix A && AT
213   ResidueMatrix(interval, matrix_a, in_unit, out_unit);
214   MatrixTranspose(matrix_a, matrix_at, in_unit, out_unit);
215 
216   // get matrix B
217   int ret = B(interval, matrix_bt, in_unit);
218   if (ret != NNACL_OK) {
219     return ret;
220   }
221   MatrixTranspose(matrix_bt, matrix_b, in_unit, in_unit);
222   MatrixMultiply(diagonal_matrix, matrix_b, matrix_bt, in_unit, in_unit, in_unit);
223   MatrixTranspose(matrix_bt, matrix_b, in_unit, in_unit);
224 
225   // get matrix G && GT
226   float tmp_g[MAX_LEN];  // in_unit * filter_size
227   ResidueMatrix(interval, matrix_g, in_unit, filter_size);
228   MatrixTranspose(matrix_g, tmp_g, in_unit, filter_size);
229   MatrixMultiply(tmp_g, inverse_diagonal_matrix, matrix_gt, filter_size, in_unit, in_unit);
230   MatrixTranspose(matrix_gt, matrix_g, filter_size, in_unit);
231   return NNACL_OK;
232 }
233 
234 #if defined(ENABLE_ARM) || defined(ENABLE_SSE)
MatrixMultiplyVec(const MS_FLOAT32X4 * matrix_a,const MS_FLOAT32X4 * matrix_b,MS_FLOAT32X4 * matrix_c,const float * bias,int m,int k,int n)235 void MatrixMultiplyVec(const MS_FLOAT32X4 *matrix_a, const MS_FLOAT32X4 *matrix_b, MS_FLOAT32X4 *matrix_c,
236                        const float *bias, int m, int k, int n) {
237   int count = 0;
238   MS_FLOAT32X4 bias_ptr = MS_MOVQ_F32(0);
239   if (bias != NULL) {
240     bias_ptr = MS_LDQ_F32(bias);
241   }
242   for (int h = 0; h < m; h++) {
243     int h_offset = h * k;
244     for (int w = 0; w < n; w++) {
245       MS_FLOAT32X4 res = MS_MOVQ_F32(0);
246       for (int i = 0; i < k; i++) {
247         res = MS_MLAQ_F32(res, matrix_a[h_offset + i], matrix_b[w + i * n]);
248       }
249       matrix_c[count] = MS_ADDQ_F32(res, bias_ptr);
250       count++;
251     }
252   }
253 }
254 #endif
255 
WinogradWeightTransform(const float * weight_data,float * winograd_data,float * matrix_g,const float * matrix_gt,int oc_block,int input_unit,int kernel_unit,int channel,int batch,bool pack)256 int WinogradWeightTransform(const float *weight_data, float *winograd_data, float *matrix_g, const float *matrix_gt,
257                             int oc_block, int input_unit, int kernel_unit, int channel, int batch, bool pack) {
258   if (oc_block == 0) {
259     return NNACL_PARAM_INVALID;
260   }
261   // original weight format : ohwi
262   int oc_block_num = UP_DIV(batch, oc_block);
263   int block_stride = channel * oc_block;
264   int block_num_stride = block_stride * oc_block_num;
265 
266   // trans_filter = G*g*GT (g represents weight_data)
267   // separate into two steps ===> tmp = (g * GT)T ===> trans = (tmp * GT)T   use same function:MatrixMultiplyWinograd
268   float *tmp_data = (float *)(malloc(channel * input_unit * kernel_unit * sizeof(float)));
269   if (tmp_data == NULL) {
270     return NNACL_ERR;
271   }
272   float *trans_out_data = (float *)(malloc(channel * input_unit * input_unit * sizeof(float)));
273   if (trans_out_data == NULL) {
274     free(tmp_data);
275     return NNACL_ERR;
276   }
277 
278 #ifndef ENABLE_ARM
279   float *tmp_data1 = (float *)(malloc(channel * input_unit * kernel_unit * sizeof(float)));
280   if (tmp_data1 == NULL) {
281     free(tmp_data);
282     free(trans_out_data);
283     return NNACL_ERR;
284   }
285   float *trans_out_data1 = (float *)(malloc(channel * input_unit * input_unit * sizeof(float)));
286   if (trans_out_data1 == NULL) {
287     free(tmp_data);
288     free(tmp_data1);
289     free(trans_out_data);
290     return NNACL_ERR;
291   }
292 #endif
293 
294   int input_oz_offset = kernel_unit * kernel_unit * channel;
295   for (int i = 0; i < batch; i++) {
296     int out_c_block = i / oc_block;
297     int out_c_res = i % oc_block;
298     int output_oz_offset = out_c_block * block_stride + out_c_res;
299 
300 #ifndef ENABLE_ARM
301     // tmp_data = g * GT
302     MatrixMultiplyWinograd(weight_data + i * input_oz_offset, matrix_gt, tmp_data, kernel_unit, kernel_unit, input_unit,
303                            channel, channel * 4);
304     // tmp_data1 = (tmp_data)T
305     PackHWCToWHC(tmp_data, tmp_data1, kernel_unit, input_unit, channel);
306     // trans_out_data1 = tmp * GT
307     MatrixMultiplyWinograd(tmp_data1, matrix_gt, trans_out_data1, input_unit, kernel_unit, input_unit, channel,
308                            channel * 4);
309     // trans_out_data = (trans_out_data1)T
310     PackHWCToWHC(trans_out_data1, trans_out_data, input_unit, input_unit, channel);
311 #else
312     // tmp = (g * GT)T
313     MatrixMultiplyWinograd(weight_data + i * input_oz_offset, matrix_gt, tmp_data, kernel_unit, kernel_unit, input_unit,
314                            channel, channel * 4);
315     // trans = (tmp * GT)T
316     MatrixMultiplyWinograd(tmp_data, matrix_gt, trans_out_data, input_unit, kernel_unit, input_unit, channel,
317                            channel * 4);
318 #endif
319     if (pack) {
320       int in_offset = 0;
321       for (int j = 0; j < input_unit; ++j) {
322         for (int k = 0; k < input_unit; ++k) {
323           for (int c = 0; c < channel; ++c) {
324             *(winograd_data + output_oz_offset + c * oc_block) = trans_out_data[in_offset + c];
325           }
326           in_offset += channel;
327           output_oz_offset += block_num_stride;
328         }
329       }
330     } else {
331       memcpy(winograd_data + i * channel * input_unit * input_unit, trans_out_data,
332              channel * input_unit * input_unit * sizeof(float));
333     }
334   }
335 #ifndef ENABLE_ARM
336   free(tmp_data1);
337   free(trans_out_data1);
338 #endif
339   free(tmp_data);
340   free(trans_out_data);
341   return NNACL_OK;
342 }
343