1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_svm_polynomial_predict_f32.c
4 * Description: SVM Polynomial Classifier
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/svm_functions.h"
30 #include <limits.h>
31 #include <math.h>
32
33 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
34 #include "arm_vec_math.h"
35 #endif
36
37 /**
38 * @addtogroup polysvm
39 * @{
40 */
41
42
43 /**
44 * @brief SVM polynomial prediction
45 * @param[in] S Pointer to an instance of the polynomial SVM structure.
46 * @param[in] in Pointer to input vector
47 * @param[out] pResult Decision value
48 * @return none.
49 *
50 */
51
52 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
53
54 #include "arm_helium_utils.h"
55 #include "arm_vec_math.h"
56
arm_svm_polynomial_predict_f32(const arm_svm_polynomial_instance_f32 * S,const float32_t * in,int32_t * pResult)57 void arm_svm_polynomial_predict_f32(
58 const arm_svm_polynomial_instance_f32 *S,
59 const float32_t * in,
60 int32_t * pResult)
61 {
62 /* inlined Matrix x Vector function interleaved with dot prod */
63 uint32_t numRows = S->nbOfSupportVectors;
64 uint32_t numCols = S->vectorDimension;
65 const float32_t *pSupport = S->supportVectors;
66 const float32_t *pSrcA = pSupport;
67 const float32_t *pInA0;
68 const float32_t *pInA1;
69 uint32_t row;
70 uint32_t blkCnt; /* loop counters */
71 const float32_t *pDualCoef = S->dualCoefficients;
72 float32_t sum = S->intercept;
73 f32x4_t vSum = vdupq_n_f32(0.0f);
74
75 row = numRows;
76
77 /*
78 * compute 4 rows in parrallel
79 */
80 while (row >= 4) {
81 const float32_t *pInA2, *pInA3;
82 float32_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
83 f32x4_t vecIn, acc0, acc1, acc2, acc3;
84 float32_t const *pSrcVecPtr = in;
85
86 /*
87 * Initialize the pointers to 4 consecutive MatrixA rows
88 */
89 pInA0 = pSrcA;
90 pInA1 = pInA0 + numCols;
91 pInA2 = pInA1 + numCols;
92 pInA3 = pInA2 + numCols;
93 /*
94 * Initialize the vector pointer
95 */
96 pInVec = pSrcVecPtr;
97 /*
98 * reset accumulators
99 */
100 acc0 = vdupq_n_f32(0.0f);
101 acc1 = vdupq_n_f32(0.0f);
102 acc2 = vdupq_n_f32(0.0f);
103 acc3 = vdupq_n_f32(0.0f);
104
105 pSrcA0Vec = pInA0;
106 pSrcA1Vec = pInA1;
107 pSrcA2Vec = pInA2;
108 pSrcA3Vec = pInA3;
109
110 blkCnt = numCols >> 2;
111 while (blkCnt > 0U) {
112 f32x4_t vecA;
113
114 vecIn = vld1q(pInVec);
115 pInVec += 4;
116 vecA = vld1q(pSrcA0Vec);
117 pSrcA0Vec += 4;
118 acc0 = vfmaq(acc0, vecIn, vecA);
119 vecA = vld1q(pSrcA1Vec);
120 pSrcA1Vec += 4;
121 acc1 = vfmaq(acc1, vecIn, vecA);
122 vecA = vld1q(pSrcA2Vec);
123 pSrcA2Vec += 4;
124 acc2 = vfmaq(acc2, vecIn, vecA);
125 vecA = vld1q(pSrcA3Vec);
126 pSrcA3Vec += 4;
127 acc3 = vfmaq(acc3, vecIn, vecA);
128
129 blkCnt--;
130 }
131 /*
132 * tail
133 * (will be merged thru tail predication)
134 */
135 blkCnt = numCols & 3;
136 if (blkCnt > 0U) {
137 mve_pred16_t p0 = vctp32q(blkCnt);
138 f32x4_t vecA;
139
140 vecIn = vldrwq_z_f32(pInVec, p0);
141 vecA = vldrwq_z_f32(pSrcA0Vec, p0);
142 acc0 = vfmaq(acc0, vecIn, vecA);
143 vecA = vldrwq_z_f32(pSrcA1Vec, p0);
144 acc1 = vfmaq(acc1, vecIn, vecA);
145 vecA = vldrwq_z_f32(pSrcA2Vec, p0);
146 acc2 = vfmaq(acc2, vecIn, vecA);
147 vecA = vldrwq_z_f32(pSrcA3Vec, p0);
148 acc3 = vfmaq(acc3, vecIn, vecA);
149 }
150 /*
151 * Sum the partial parts
152 */
153 f32x4_t vtmp = vuninitializedq_f32();
154 vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);
155 vtmp = vsetq_lane(vecAddAcrossF32Mve(acc1), vtmp, 1);
156 vtmp = vsetq_lane(vecAddAcrossF32Mve(acc2), vtmp, 2);
157 vtmp = vsetq_lane(vecAddAcrossF32Mve(acc3), vtmp, 3);
158
159 vSum = vfmaq_f32(vSum, vld1q(pDualCoef),
160 arm_vec_exponent_f32
161 (vaddq_n_f32(vmulq_n_f32(vtmp, S->gamma), S->coef0), S->degree));
162
163 pDualCoef += 4;
164
165 pSrcA += numCols * 4;
166 /*
167 * Decrement the row loop counter
168 */
169 row -= 4;
170 }
171
172 /*
173 * compute 2 rows in parrallel
174 */
175 if (row >= 2) {
176 float32_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
177 f32x4_t vecIn, acc0, acc1;
178 float32_t const *pSrcVecPtr = in;
179
180 /*
181 * Initialize the pointers to 2 consecutive MatrixA rows
182 */
183 pInA0 = pSrcA;
184 pInA1 = pInA0 + numCols;
185 /*
186 * Initialize the vector pointer
187 */
188 pInVec = pSrcVecPtr;
189 /*
190 * reset accumulators
191 */
192 acc0 = vdupq_n_f32(0.0f);
193 acc1 = vdupq_n_f32(0.0f);
194 pSrcA0Vec = pInA0;
195 pSrcA1Vec = pInA1;
196
197 blkCnt = numCols >> 2;
198 while (blkCnt > 0U) {
199 f32x4_t vecA;
200
201 vecIn = vld1q(pInVec);
202 pInVec += 4;
203 vecA = vld1q(pSrcA0Vec);
204 pSrcA0Vec += 4;
205 acc0 = vfmaq(acc0, vecIn, vecA);
206 vecA = vld1q(pSrcA1Vec);
207 pSrcA1Vec += 4;
208 acc1 = vfmaq(acc1, vecIn, vecA);
209
210 blkCnt--;
211 }
212 /*
213 * tail
214 * (will be merged thru tail predication)
215 */
216 blkCnt = numCols & 3;
217 if (blkCnt > 0U) {
218 mve_pred16_t p0 = vctp32q(blkCnt);
219 f32x4_t vecA;
220
221 vecIn = vldrwq_z_f32(pInVec, p0);
222 vecA = vldrwq_z_f32(pSrcA0Vec, p0);
223 acc0 = vfmaq(acc0, vecIn, vecA);
224 vecA = vldrwq_z_f32(pSrcA1Vec, p0);
225 acc1 = vfmaq(acc1, vecIn, vecA);
226 }
227 /*
228 * Sum the partial parts
229 */
230 f32x4_t vtmp = vuninitializedq_f32();
231 vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);
232 vtmp = vsetq_lane(vecAddAcrossF32Mve(acc1), vtmp, 1);
233
234 vSum = vfmaq_m_f32(vSum, vld1q(pDualCoef),
235 arm_vec_exponent_f32
236 (vaddq_n_f32(vmulq_n_f32(vtmp, S->gamma), S->coef0), S->degree),
237 vctp32q(2));
238
239 pDualCoef += 2;
240 pSrcA += numCols * 2;
241 row -= 2;
242 }
243
244 if (row >= 1) {
245 f32x4_t vecIn, acc0;
246 float32_t const *pSrcA0Vec, *pInVec;
247 float32_t const *pSrcVecPtr = in;
248 /*
249 * Initialize the pointers to last MatrixA row
250 */
251 pInA0 = pSrcA;
252 /*
253 * Initialize the vector pointer
254 */
255 pInVec = pSrcVecPtr;
256 /*
257 * reset accumulators
258 */
259 acc0 = vdupq_n_f32(0.0f);
260
261 pSrcA0Vec = pInA0;
262
263 blkCnt = numCols >> 2;
264 while (blkCnt > 0U) {
265 f32x4_t vecA;
266
267 vecIn = vld1q(pInVec);
268 pInVec += 4;
269 vecA = vld1q(pSrcA0Vec);
270 pSrcA0Vec += 4;
271 acc0 = vfmaq(acc0, vecIn, vecA);
272
273 blkCnt--;
274 }
275 /*
276 * tail
277 * (will be merged thru tail predication)
278 */
279 blkCnt = numCols & 3;
280 if (blkCnt > 0U) {
281 mve_pred16_t p0 = vctp32q(blkCnt);
282 f32x4_t vecA;
283
284 vecIn = vldrwq_z_f32(pInVec, p0);
285 vecA = vldrwq_z_f32(pSrcA0Vec, p0);
286 acc0 = vfmaq(acc0, vecIn, vecA);
287 }
288 /*
289 * Sum the partial parts
290 */
291 f32x4_t vtmp = vuninitializedq_f32();
292 vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);
293 vSum = vfmaq_m_f32(vSum, vld1q(pDualCoef),
294 arm_vec_exponent_f32
295 (vaddq_n_f32(vmulq_n_f32(vtmp, S->gamma), S->coef0), S->degree),
296 vctp32q(1));
297 }
298 sum += vecAddAcrossF32Mve(vSum);
299
300
301 *pResult = S->classes[STEP(sum)];
302 }
303
304 #else
305 #if defined(ARM_MATH_NEON)
arm_svm_polynomial_predict_f32(const arm_svm_polynomial_instance_f32 * S,const float32_t * in,int32_t * pResult)306 void arm_svm_polynomial_predict_f32(
307 const arm_svm_polynomial_instance_f32 *S,
308 const float32_t * in,
309 int32_t * pResult)
310 {
311 float32_t sum = S->intercept;
312
313 float32_t dot;
314 float32x4_t dotV;
315
316 float32x4_t accuma,accumb,accumc,accumd,accum;
317 float32x2_t accum2;
318 float32x4_t vec1;
319 float32x4_t coef0 = vdupq_n_f32(S->coef0);
320
321 float32x4_t vec2,vec2a,vec2b,vec2c,vec2d;
322
323 uint32_t blkCnt;
324 uint32_t vectorBlkCnt;
325
326 const float32_t *pIn = in;
327
328 const float32_t *pSupport = S->supportVectors;
329
330 const float32_t *pSupporta = S->supportVectors;
331 const float32_t *pSupportb;
332 const float32_t *pSupportc;
333 const float32_t *pSupportd;
334
335 pSupportb = pSupporta + S->vectorDimension;
336 pSupportc = pSupportb + S->vectorDimension;
337 pSupportd = pSupportc + S->vectorDimension;
338
339 const float32_t *pDualCoefs = S->dualCoefficients;
340
341 vectorBlkCnt = S->nbOfSupportVectors >> 2;
342 while (vectorBlkCnt > 0U)
343 {
344 accuma = vdupq_n_f32(0);
345 accumb = vdupq_n_f32(0);
346 accumc = vdupq_n_f32(0);
347 accumd = vdupq_n_f32(0);
348
349 pIn = in;
350
351 blkCnt = S->vectorDimension >> 2;
352 while (blkCnt > 0U)
353 {
354
355 vec1 = vld1q_f32(pIn);
356 vec2a = vld1q_f32(pSupporta);
357 vec2b = vld1q_f32(pSupportb);
358 vec2c = vld1q_f32(pSupportc);
359 vec2d = vld1q_f32(pSupportd);
360
361 pIn += 4;
362 pSupporta += 4;
363 pSupportb += 4;
364 pSupportc += 4;
365 pSupportd += 4;
366
367 accuma = vmlaq_f32(accuma, vec1,vec2a);
368 accumb = vmlaq_f32(accumb, vec1,vec2b);
369 accumc = vmlaq_f32(accumc, vec1,vec2c);
370 accumd = vmlaq_f32(accumd, vec1,vec2d);
371
372 blkCnt -- ;
373 }
374 accum2 = vpadd_f32(vget_low_f32(accuma),vget_high_f32(accuma));
375 dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,0);
376
377 accum2 = vpadd_f32(vget_low_f32(accumb),vget_high_f32(accumb));
378 dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,1);
379
380 accum2 = vpadd_f32(vget_low_f32(accumc),vget_high_f32(accumc));
381 dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,2);
382
383 accum2 = vpadd_f32(vget_low_f32(accumd),vget_high_f32(accumd));
384 dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,3);
385
386
387 blkCnt = S->vectorDimension & 3;
388 while (blkCnt > 0U)
389 {
390 dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,0) + *pIn * *pSupporta++, dotV,0);
391 dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,1) + *pIn * *pSupportb++, dotV,1);
392 dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,2) + *pIn * *pSupportc++, dotV,2);
393 dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,3) + *pIn * *pSupportd++, dotV,3);
394
395 pIn++;
396
397 blkCnt -- ;
398 }
399
400 vec1 = vld1q_f32(pDualCoefs);
401 pDualCoefs += 4;
402
403 // To vectorize later
404 dotV = vmulq_n_f32(dotV, S->gamma);
405 dotV = vaddq_f32(dotV, coef0);
406
407 dotV = arm_vec_exponent_f32(dotV,S->degree);
408
409 accum = vmulq_f32(vec1,dotV);
410 accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
411 sum += vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);
412
413 pSupporta += 3*S->vectorDimension;
414 pSupportb += 3*S->vectorDimension;
415 pSupportc += 3*S->vectorDimension;
416 pSupportd += 3*S->vectorDimension;
417
418 vectorBlkCnt -- ;
419 }
420
421 pSupport = pSupporta;
422 vectorBlkCnt = S->nbOfSupportVectors & 3;
423
424 while (vectorBlkCnt > 0U)
425 {
426 accum = vdupq_n_f32(0);
427 dot = 0.0f;
428 pIn = in;
429
430 blkCnt = S->vectorDimension >> 2;
431 while (blkCnt > 0U)
432 {
433
434 vec1 = vld1q_f32(pIn);
435 vec2 = vld1q_f32(pSupport);
436 pIn += 4;
437 pSupport += 4;
438
439 accum = vmlaq_f32(accum, vec1,vec2);
440
441 blkCnt -- ;
442 }
443 accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
444 dot = vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);
445
446
447 blkCnt = S->vectorDimension & 3;
448 while (blkCnt > 0U)
449 {
450 dot = dot + *pIn++ * *pSupport++;
451
452 blkCnt -- ;
453 }
454
455 sum += *pDualCoefs++ * arm_exponent_f32(S->gamma * dot + S->coef0, S->degree);
456 vectorBlkCnt -- ;
457 }
458
459 *pResult=S->classes[STEP(sum)];
460 }
461 #else
arm_svm_polynomial_predict_f32(const arm_svm_polynomial_instance_f32 * S,const float32_t * in,int32_t * pResult)462 void arm_svm_polynomial_predict_f32(
463 const arm_svm_polynomial_instance_f32 *S,
464 const float32_t * in,
465 int32_t * pResult)
466 {
467 float32_t sum=S->intercept;
468 float32_t dot=0;
469 uint32_t i,j;
470 const float32_t *pSupport = S->supportVectors;
471
472 for(i=0; i < S->nbOfSupportVectors; i++)
473 {
474 dot=0;
475 for(j=0; j < S->vectorDimension; j++)
476 {
477 dot = dot + in[j]* *pSupport++;
478 }
479 sum += S->dualCoefficients[i] * arm_exponent_f32(S->gamma * dot + S->coef0, S->degree);
480 }
481
482 *pResult=S->classes[STEP(sum)];
483 }
484 #endif
485 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
486
487
488 /**
489 * @} end of polysvm group
490 */
491