/* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_svm_rbf_predict_f32.c
 * Description:  SVM Radial Basis Function Classifier
 *
 * $Date:        23 April 2021
 * $Revision:    V1.9.0
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
/*
 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "dsp/svm_functions.h"
#include <limits.h>
#include <math.h>


/**
 * @addtogroup rbfsvm
 * @{
 */

/**
 * @brief SVM rbf prediction
 * @param[in]    S         Pointer to an instance of the rbf SVM structure.
 * @param[in]    in        Pointer to input vector
 * @param[out]   pResult   Decision value
 * @return none.
 *
 */
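
/*
 * All three implementations below (Helium/MVE, Neon and plain C) evaluate
 * the same decision value
 *
 *   sum = intercept + sum_i dualCoefficients[i] * exp(-gamma * ||in - supportVector_i||^2)
 *
 * and select classes[0] or classes[1] depending on the sign of that sum
 * (via the STEP() macro). The vector variants only differ in how the
 * squared Euclidean distances to the support vectors are accumulated.
 */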

#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)

#include "arm_helium_utils.h"
#include "arm_vec_math.h"

void arm_svm_rbf_predict_f32(
    const arm_svm_rbf_instance_f32 *S,
    const float32_t * in,
    int32_t * pResult)
{
    /* inlined Matrix x Vector function interleaved with dot prod */
    uint32_t numRows = S->nbOfSupportVectors;
    uint32_t numCols = S->vectorDimension;
    const float32_t *pSupport = S->supportVectors;
    const float32_t *pSrcA = pSupport;
    const float32_t *pInA0;
    const float32_t *pInA1;
    uint32_t row;
    uint32_t blkCnt;     /* loop counters */
    const float32_t *pDualCoef = S->dualCoefficients;
    float32_t sum = S->intercept;
    f32x4_t vSum = vdupq_n_f32(0);

    row = numRows;

    /*
     * compute 4 rows in parallel
     */
    while (row >= 4) {
        const float32_t *pInA2, *pInA3;
        float32_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
        f32x4_t vecIn, acc0, acc1, acc2, acc3;
        float32_t const *pSrcVecPtr = in;

        /*
         * Initialize the pointers to 4 consecutive MatrixA rows
         */
        pInA0 = pSrcA;
        pInA1 = pInA0 + numCols;
        pInA2 = pInA1 + numCols;
        pInA3 = pInA2 + numCols;
        /*
         * Initialize the vector pointer
         */
        pInVec = pSrcVecPtr;
        /*
         * reset accumulators
         */
        acc0 = vdupq_n_f32(0.0f);
        acc1 = vdupq_n_f32(0.0f);
        acc2 = vdupq_n_f32(0.0f);
        acc3 = vdupq_n_f32(0.0f);

        pSrcA0Vec = pInA0;
        pSrcA1Vec = pInA1;
        pSrcA2Vec = pInA2;
        pSrcA3Vec = pInA3;

        blkCnt = numCols >> 2;
        while (blkCnt > 0U) {
            f32x4_t vecA;
            f32x4_t vecDif;

            vecIn = vld1q(pInVec);
            pInVec += 4;
            vecA = vld1q(pSrcA0Vec);
            pSrcA0Vec += 4;
            vecDif = vsubq(vecIn, vecA);
            acc0 = vfmaq(acc0, vecDif, vecDif);
            vecA = vld1q(pSrcA1Vec);
            pSrcA1Vec += 4;
            vecDif = vsubq(vecIn, vecA);
            acc1 = vfmaq(acc1, vecDif, vecDif);
            vecA = vld1q(pSrcA2Vec);
            pSrcA2Vec += 4;
            vecDif = vsubq(vecIn, vecA);
            acc2 = vfmaq(acc2, vecDif, vecDif);
            vecA = vld1q(pSrcA3Vec);
            pSrcA3Vec += 4;
            vecDif = vsubq(vecIn, vecA);
            acc3 = vfmaq(acc3, vecDif, vecDif);

            blkCnt--;
        }
        /*
         * tail
         * (will be merged through tail predication)
         */
        blkCnt = numCols & 3;
        if (blkCnt > 0U) {
            mve_pred16_t p0 = vctp32q(blkCnt);
            f32x4_t vecA;
            f32x4_t vecDif;

            vecIn = vldrwq_z_f32(pInVec, p0);
            vecA = vldrwq_z_f32(pSrcA0Vec, p0);
            vecDif = vsubq(vecIn, vecA);
            acc0 = vfmaq(acc0, vecDif, vecDif);
            vecA = vldrwq_z_f32(pSrcA1Vec, p0);
            vecDif = vsubq(vecIn, vecA);
            acc1 = vfmaq(acc1, vecDif, vecDif);
            vecA = vldrwq_z_f32(pSrcA2Vec, p0);
            vecDif = vsubq(vecIn, vecA);
            acc2 = vfmaq(acc2, vecDif, vecDif);
            vecA = vldrwq_z_f32(pSrcA3Vec, p0);
            vecDif = vsubq(vecIn, vecA);
            acc3 = vfmaq(acc3, vecDif, vecDif);
        }
        /*
         * Sum the partial parts
         */

        //sum += *pDualCoef++ * expf(-S->gamma * vecReduceF32Mve(acc0));
        f32x4_t vtmp = vuninitializedq_f32();
        vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);
        vtmp = vsetq_lane(vecAddAcrossF32Mve(acc1), vtmp, 1);
        vtmp = vsetq_lane(vecAddAcrossF32Mve(acc2), vtmp, 2);
        vtmp = vsetq_lane(vecAddAcrossF32Mve(acc3), vtmp, 3);

        vSum = vfmaq_f32(vSum, vld1q(pDualCoef),
                         vexpq_f32(vmulq_n_f32(vtmp, -S->gamma)));
        pDualCoef += 4;
        pSrcA += numCols * 4;
        /*
         * Decrement the row loop counter
         */
        row -= 4;
    }

    /*
     * compute 2 rows in parallel
     */
    if (row >= 2) {
        float32_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
        f32x4_t vecIn, acc0, acc1;
        float32_t const *pSrcVecPtr = in;

        /*
         * Initialize the pointers to 2 consecutive MatrixA rows
         */
        pInA0 = pSrcA;
        pInA1 = pInA0 + numCols;
        /*
         * Initialize the vector pointer
         */
        pInVec = pSrcVecPtr;
        /*
         * reset accumulators
         */
        acc0 = vdupq_n_f32(0.0f);
        acc1 = vdupq_n_f32(0.0f);
        pSrcA0Vec = pInA0;
        pSrcA1Vec = pInA1;

        blkCnt = numCols >> 2;
        while (blkCnt > 0U) {
            f32x4_t vecA;
            f32x4_t vecDif;

            vecIn = vld1q(pInVec);
            pInVec += 4;
            vecA = vld1q(pSrcA0Vec);
            pSrcA0Vec += 4;
            vecDif = vsubq(vecIn, vecA);
            acc0 = vfmaq(acc0, vecDif, vecDif);
            vecA = vld1q(pSrcA1Vec);
            pSrcA1Vec += 4;
            vecDif = vsubq(vecIn, vecA);
            acc1 = vfmaq(acc1, vecDif, vecDif);

            blkCnt--;
        }
        /*
         * tail
         * (will be merged through tail predication)
         */
        blkCnt = numCols & 3;
        if (blkCnt > 0U) {
            mve_pred16_t p0 = vctp32q(blkCnt);
            f32x4_t vecA, vecDif;

            vecIn = vldrwq_z_f32(pInVec, p0);
            vecA = vldrwq_z_f32(pSrcA0Vec, p0);
            vecDif = vsubq(vecIn, vecA);
            acc0 = vfmaq(acc0, vecDif, vecDif);
            vecA = vldrwq_z_f32(pSrcA1Vec, p0);
            vecDif = vsubq(vecIn, vecA);
            acc1 = vfmaq(acc1, vecDif, vecDif);
        }
        /*
         * Sum the partial parts
         */
        f32x4_t vtmp = vuninitializedq_f32();
        vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);
        vtmp = vsetq_lane(vecAddAcrossF32Mve(acc1), vtmp, 1);

        vSum = vfmaq_m_f32(vSum, vld1q(pDualCoef),
                           vexpq_f32(vmulq_n_f32(vtmp, -S->gamma)), vctp32q(2));
        pDualCoef += 2;

        pSrcA += numCols * 2;
        row -= 2;
    }

    if (row >= 1) {
        f32x4_t vecIn, acc0;
        float32_t const *pSrcA0Vec, *pInVec;
        float32_t const *pSrcVecPtr = in;
        /*
         * Initialize the pointers to last MatrixA row
         */
        pInA0 = pSrcA;
        /*
         * Initialize the vector pointer
         */
        pInVec = pSrcVecPtr;
        /*
         * reset accumulators
         */
        acc0 = vdupq_n_f32(0.0f);

        pSrcA0Vec = pInA0;

        blkCnt = numCols >> 2;
        while (blkCnt > 0U) {
            f32x4_t vecA, vecDif;

            vecIn = vld1q(pInVec);
            pInVec += 4;
            vecA = vld1q(pSrcA0Vec);
            pSrcA0Vec += 4;
            vecDif = vsubq(vecIn, vecA);
            acc0 = vfmaq(acc0, vecDif, vecDif);

            blkCnt--;
        }
        /*
         * tail
         * (will be merged through tail predication)
         */
        blkCnt = numCols & 3;
        if (blkCnt > 0U) {
            mve_pred16_t p0 = vctp32q(blkCnt);
            f32x4_t vecA, vecDif;

            vecIn = vldrwq_z_f32(pInVec, p0);
            vecA = vldrwq_z_f32(pSrcA0Vec, p0);
            vecDif = vsubq(vecIn, vecA);
            acc0 = vfmaq(acc0, vecDif, vecDif);
        }
        /*
         * Sum the partial parts
         */
        f32x4_t vtmp = vuninitializedq_f32();
        vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);

        vSum = vfmaq_m_f32(vSum, vld1q(pDualCoef),
                           vexpq_f32(vmulq_n_f32(vtmp, -S->gamma)), vctp32q(1));
    }


    sum += vecAddAcrossF32Mve(vSum);
    *pResult = S->classes[STEP(sum)];
}


#else
#if defined(ARM_MATH_NEON)

#include "NEMath.h"

void arm_svm_rbf_predict_f32(
    const arm_svm_rbf_instance_f32 *S,
    const float32_t * in,
    int32_t * pResult)
{
    float32_t sum = S->intercept;

    float32_t dot;
    float32x4_t dotV = vdupq_n_f32(0.0f);

    float32x4_t accuma, accumb, accumc, accumd, accum;
    float32x2_t accum2;
    float32x4_t temp;
    float32x4_t vec1;

    float32x4_t vec2, vec2a, vec2b, vec2c, vec2d;

    uint32_t blkCnt;
    uint32_t vectorBlkCnt;

    const float32_t *pIn = in;

    const float32_t *pSupport = S->supportVectors;

    const float32_t *pSupporta = S->supportVectors;
    const float32_t *pSupportb;
    const float32_t *pSupportc;
    const float32_t *pSupportd;

    pSupportb = pSupporta + S->vectorDimension;
    pSupportc = pSupportb + S->vectorDimension;
    pSupportd = pSupportc + S->vectorDimension;

    const float32_t *pDualCoefs = S->dualCoefficients;


    vectorBlkCnt = S->nbOfSupportVectors >> 2;
    while (vectorBlkCnt > 0U)
    {
        accuma = vdupq_n_f32(0);
        accumb = vdupq_n_f32(0);
        accumc = vdupq_n_f32(0);
        accumd = vdupq_n_f32(0);

        pIn = in;

        blkCnt = S->vectorDimension >> 2;
        while (blkCnt > 0U)
        {

            vec1 = vld1q_f32(pIn);
            vec2a = vld1q_f32(pSupporta);
            vec2b = vld1q_f32(pSupportb);
            vec2c = vld1q_f32(pSupportc);
            vec2d = vld1q_f32(pSupportd);

            pIn += 4;
            pSupporta += 4;
            pSupportb += 4;
            pSupportc += 4;
            pSupportd += 4;

            temp = vsubq_f32(vec1, vec2a);
            accuma = vmlaq_f32(accuma, temp, temp);

            temp = vsubq_f32(vec1, vec2b);
            accumb = vmlaq_f32(accumb, temp, temp);

            temp = vsubq_f32(vec1, vec2c);
            accumc = vmlaq_f32(accumc, temp, temp);

            temp = vsubq_f32(vec1, vec2d);
            accumd = vmlaq_f32(accumd, temp, temp);

            blkCnt--;
        }
        accum2 = vpadd_f32(vget_low_f32(accuma), vget_high_f32(accuma));
        dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1), dotV, 0);

        accum2 = vpadd_f32(vget_low_f32(accumb), vget_high_f32(accumb));
        dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1), dotV, 1);

        accum2 = vpadd_f32(vget_low_f32(accumc), vget_high_f32(accumc));
        dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1), dotV, 2);

        accum2 = vpadd_f32(vget_low_f32(accumd), vget_high_f32(accumd));
        dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1), dotV, 3);


        blkCnt = S->vectorDimension & 3;
        while (blkCnt > 0U)
        {
            dotV = vsetq_lane_f32(vgetq_lane_f32(dotV, 0) + SQ(*pIn - *pSupporta), dotV, 0);
            dotV = vsetq_lane_f32(vgetq_lane_f32(dotV, 1) + SQ(*pIn - *pSupportb), dotV, 1);
            dotV = vsetq_lane_f32(vgetq_lane_f32(dotV, 2) + SQ(*pIn - *pSupportc), dotV, 2);
            dotV = vsetq_lane_f32(vgetq_lane_f32(dotV, 3) + SQ(*pIn - *pSupportd), dotV, 3);

            pSupporta++;
            pSupportb++;
            pSupportc++;
            pSupportd++;

            pIn++;

            blkCnt--;
        }

        vec1 = vld1q_f32(pDualCoefs);
        pDualCoefs += 4;

        // To vectorize later
        dotV = vmulq_n_f32(dotV, -S->gamma);
        dotV = vexpq_f32(dotV);

        accum = vmulq_f32(vec1, dotV);
        accum2 = vpadd_f32(vget_low_f32(accum), vget_high_f32(accum));
        sum += vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);

        pSupporta += 3 * S->vectorDimension;
        pSupportb += 3 * S->vectorDimension;
        pSupportc += 3 * S->vectorDimension;
        pSupportd += 3 * S->vectorDimension;

        vectorBlkCnt--;
    }

    pSupport = pSupporta;
    vectorBlkCnt = S->nbOfSupportVectors & 3;

    while (vectorBlkCnt > 0U)
    {
        accum = vdupq_n_f32(0);
        dot = 0.0f;
        pIn = in;

        blkCnt = S->vectorDimension >> 2;
        while (blkCnt > 0U)
        {

            vec1 = vld1q_f32(pIn);
            vec2 = vld1q_f32(pSupport);
            pIn += 4;
            pSupport += 4;

            temp = vsubq_f32(vec1, vec2);
            accum = vmlaq_f32(accum, temp, temp);

            blkCnt--;
        }
        accum2 = vpadd_f32(vget_low_f32(accum), vget_high_f32(accum));
        dot = vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);


        blkCnt = S->vectorDimension & 3;
        while (blkCnt > 0U)
        {

            dot = dot + SQ(*pIn - *pSupport);
            pIn++;
            pSupport++;

            blkCnt--;
        }

        sum += *pDualCoefs++ * expf(-S->gamma * dot);
        vectorBlkCnt--;
    }

    *pResult = S->classes[STEP(sum)];
}
#else
void arm_svm_rbf_predict_f32(
    const arm_svm_rbf_instance_f32 *S,
    const float32_t * in,
    int32_t * pResult)
{
    float32_t sum = S->intercept;
    float32_t dot = 0;
    uint32_t i, j;
    const float32_t *pSupport = S->supportVectors;

    for (i = 0; i < S->nbOfSupportVectors; i++)
    {
        dot = 0;
        for (j = 0; j < S->vectorDimension; j++)
        {
            dot = dot + SQ(in[j] - *pSupport);
            pSupport++;
        }
        sum += S->dualCoefficients[i] * expf(-S->gamma * dot);
    }
    *pResult = S->classes[STEP(sum)];
}
#endif

#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
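
/*
 * Example use (a minimal sketch, not part of the library sources): the
 * instance is assumed to be set up with arm_svm_rbf_init_f32() declared in
 * dsp/svm_functions.h; the coefficient, support-vector and class arrays
 * below are placeholders.
 *
 *   #define NB_SV   2
 *   #define VEC_DIM 2
 *
 *   static const float32_t dualCoefs[NB_SV]             = { 0.01f, -0.01f };
 *   static const float32_t supportVecs[NB_SV * VEC_DIM] = { 0.0f, 0.0f,
 *                                                           1.0f, 1.0f };
 *   static const int32_t   classes[2]                   = { 0, 1 };
 *
 *   arm_svm_rbf_instance_f32 svm;
 *   int32_t predicted;
 *   const float32_t sample[VEC_DIM] = { 0.1f, 0.2f };
 *
 *   arm_svm_rbf_init_f32(&svm, NB_SV, VEC_DIM,
 *                        -0.1f,                            // intercept
 *                        dualCoefs, supportVecs, classes,
 *                        0.5f);                            // gamma
 *   arm_svm_rbf_predict_f32(&svm, sample, &predicted);
 */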

/**
 * @} end of rbfsvm group
 */
