1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_svm_sigmoid_predict_f32.c
4 * Description: SVM Sigmoid Classifier
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/svm_functions.h"
30 #include <limits.h>
31 #include <math.h>
32
33 /**
34 * @addtogroup sigmoidsvm
35 * @{
36 */
37
38
39
40 /**
41 * @brief SVM sigmoid prediction
 * @param[in]  S        Pointer to an instance of the sigmoid SVM structure.
43 * @param[in] in Pointer to input vector
44 * @param[out] pResult Decision value
45 * @return none.
46 *
47 */
48
49 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
50
51 #include "arm_helium_utils.h"
52 #include "arm_vec_math.h"
53
/*
 * Helium (MVE) implementation.
 *
 * Computes
 *     sum = intercept + SUM_i dualCoefficients[i] * tanh(gamma * <in, sv_i> + coef0)
 * over all support vectors sv_i (stored row after row in S->supportVectors,
 * each row holding S->vectorDimension values) and writes
 * S->classes[STEP(sum)] to *pResult.
 *
 * Support vectors are processed 4 rows at a time, then a remainder of
 * 2 rows and/or 1 row. Each group contributes to the vector accumulator
 * vSum, which is reduced once at the end.
 */
void arm_svm_sigmoid_predict_f32(
    const arm_svm_sigmoid_instance_f32 *S,
    const float32_t * in,
    int32_t * pResult)
{
    /* inlined Matrix x Vector function interleaved with dot prod */
    uint32_t         numRows = S->nbOfSupportVectors;
    uint32_t         numCols = S->vectorDimension;
    const float32_t *pSrcA = S->supportVectors;   /* first row of the current group */
    const float32_t *pInA0;
    const float32_t *pInA1;
    uint32_t         row;
    uint32_t         blkCnt;     /* loop counters */
    const float32_t *pDualCoef = S->dualCoefficients;
    float32_t        sum = S->intercept;
    f32x4_t          vSum = vdupq_n_f32(0.0f);

    row = numRows;

    /*
     * compute 4 rows in parallel
     */
    while (row >= 4) {
        const float32_t *pInA2, *pInA3;
        float32_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
        f32x4_t          vecIn, acc0, acc1, acc2, acc3;

        /*
         * Initialize the pointers to 4 consecutive MatrixA rows
         */
        pInA0 = pSrcA;
        pInA1 = pInA0 + numCols;
        pInA2 = pInA1 + numCols;
        pInA3 = pInA2 + numCols;
        /*
         * Initialize the vector pointer
         */
        pInVec = in;
        /*
         * reset accumulators
         */
        acc0 = vdupq_n_f32(0.0f);
        acc1 = vdupq_n_f32(0.0f);
        acc2 = vdupq_n_f32(0.0f);
        acc3 = vdupq_n_f32(0.0f);

        pSrcA0Vec = pInA0;
        pSrcA1Vec = pInA1;
        pSrcA2Vec = pInA2;
        pSrcA3Vec = pInA3;

        blkCnt = numCols >> 2;
        while (blkCnt > 0U) {
            f32x4_t vecA;

            vecIn = vld1q(pInVec);
            pInVec += 4;
            vecA = vld1q(pSrcA0Vec);
            pSrcA0Vec += 4;
            acc0 = vfmaq(acc0, vecIn, vecA);
            vecA = vld1q(pSrcA1Vec);
            pSrcA1Vec += 4;
            acc1 = vfmaq(acc1, vecIn, vecA);
            vecA = vld1q(pSrcA2Vec);
            pSrcA2Vec += 4;
            acc2 = vfmaq(acc2, vecIn, vecA);
            vecA = vld1q(pSrcA3Vec);
            pSrcA3Vec += 4;
            acc3 = vfmaq(acc3, vecIn, vecA);

            blkCnt--;
        }
        /*
         * tail
         * (will be merged thru tail predication)
         */
        blkCnt = numCols & 3;
        if (blkCnt > 0U) {
            mve_pred16_t p0 = vctp32q(blkCnt);
            f32x4_t      vecA;

            /* Zeroing loads: lanes beyond the row end contribute 0 */
            vecIn = vldrwq_z_f32(pInVec, p0);
            vecA = vldrwq_z_f32(pSrcA0Vec, p0);
            acc0 = vfmaq(acc0, vecIn, vecA);
            vecA = vldrwq_z_f32(pSrcA1Vec, p0);
            acc1 = vfmaq(acc1, vecIn, vecA);
            vecA = vldrwq_z_f32(pSrcA2Vec, p0);
            acc2 = vfmaq(acc2, vecIn, vecA);
            vecA = vldrwq_z_f32(pSrcA3Vec, p0);
            acc3 = vfmaq(acc3, vecIn, vecA);
        }
        /*
         * Sum the partial parts: lane k holds the dot product of row k
         */
        f32x4_t vtmp = vuninitializedq_f32();
        vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);
        vtmp = vsetq_lane(vecAddAcrossF32Mve(acc1), vtmp, 1);
        vtmp = vsetq_lane(vecAddAcrossF32Mve(acc2), vtmp, 2);
        vtmp = vsetq_lane(vecAddAcrossF32Mve(acc3), vtmp, 3);

        /* vSum += dualCoef * tanh(gamma * dot + coef0), 4 rows at once */
        vSum =
            vfmaq_f32(vSum, vld1q(pDualCoef),
                      vtanhq_f32(vaddq_n_f32(vmulq_n_f32(vtmp, S->gamma), S->coef0)));

        pDualCoef += 4;

        pSrcA += numCols * 4;
        /*
         * Decrement the row loop counter
         */
        row -= 4;
    }

    /*
     * compute 2 rows in parallel
     */
    if (row >= 2) {
        float32_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
        f32x4_t          vecIn, acc0, acc1;
        mve_pred16_t     pRows = vctp32q(2);   /* only lanes 0 and 1 are valid */

        /*
         * Initialize the pointers to 2 consecutive MatrixA rows
         */
        pInA0 = pSrcA;
        pInA1 = pInA0 + numCols;
        /*
         * Initialize the vector pointer
         */
        pInVec = in;
        /*
         * reset accumulators
         */
        acc0 = vdupq_n_f32(0.0f);
        acc1 = vdupq_n_f32(0.0f);
        pSrcA0Vec = pInA0;
        pSrcA1Vec = pInA1;

        blkCnt = numCols >> 2;
        while (blkCnt > 0U) {
            f32x4_t vecA;

            vecIn = vld1q(pInVec);
            pInVec += 4;
            vecA = vld1q(pSrcA0Vec);
            pSrcA0Vec += 4;
            acc0 = vfmaq(acc0, vecIn, vecA);
            vecA = vld1q(pSrcA1Vec);
            pSrcA1Vec += 4;
            acc1 = vfmaq(acc1, vecIn, vecA);

            blkCnt--;
        }
        /*
         * tail
         * (will be merged thru tail predication)
         */
        blkCnt = numCols & 3;
        if (blkCnt > 0U) {
            mve_pred16_t p0 = vctp32q(blkCnt);
            f32x4_t      vecA;

            vecIn = vldrwq_z_f32(pInVec, p0);
            vecA = vldrwq_z_f32(pSrcA0Vec, p0);
            acc0 = vfmaq(acc0, vecIn, vecA);
            vecA = vldrwq_z_f32(pSrcA1Vec, p0);
            acc1 = vfmaq(acc1, vecIn, vecA);
        }
        /*
         * Sum the partial parts.
         * Unused lanes are zeroed so tanh never sees indeterminate values.
         */
        f32x4_t vtmp = vdupq_n_f32(0.0f);
        vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);
        vtmp = vsetq_lane(vecAddAcrossF32Mve(acc1), vtmp, 1);

        /*
         * Predicated load: only the 2 remaining coefficients are read,
         * so nothing past the end of the dual-coefficient array is touched.
         */
        vSum =
            vfmaq_m_f32(vSum, vldrwq_z_f32(pDualCoef, pRows),
                        vtanhq_f32(vaddq_n_f32(vmulq_n_f32(vtmp, S->gamma), S->coef0)),
                        pRows);

        /* Step past the 2 coefficients just consumed so that a possible
         * last single row picks up its own coefficient. */
        pDualCoef += 2;
        pSrcA += numCols * 2;
        row -= 2;
    }

    if (row >= 1) {
        f32x4_t          vecIn, acc0;
        float32_t const *pSrcA0Vec, *pInVec;
        mve_pred16_t     pRows = vctp32q(1);   /* only lane 0 is valid */

        /*
         * Initialize the pointers to last MatrixA row
         */
        pInA0 = pSrcA;
        /*
         * Initialize the vector pointer
         */
        pInVec = in;
        /*
         * reset accumulators
         */
        acc0 = vdupq_n_f32(0.0f);

        pSrcA0Vec = pInA0;

        blkCnt = numCols >> 2;
        while (blkCnt > 0U) {
            f32x4_t vecA;

            vecIn = vld1q(pInVec);
            pInVec += 4;
            vecA = vld1q(pSrcA0Vec);
            pSrcA0Vec += 4;
            acc0 = vfmaq(acc0, vecIn, vecA);

            blkCnt--;
        }
        /*
         * tail
         * (will be merged thru tail predication)
         */
        blkCnt = numCols & 3;
        if (blkCnt > 0U) {
            mve_pred16_t p0 = vctp32q(blkCnt);
            f32x4_t      vecA;

            vecIn = vldrwq_z_f32(pInVec, p0);
            vecA = vldrwq_z_f32(pSrcA0Vec, p0);
            acc0 = vfmaq(acc0, vecIn, vecA);
        }
        /*
         * Sum the partial parts.
         * Unused lanes are zeroed so tanh never sees indeterminate values.
         */
        f32x4_t vtmp = vdupq_n_f32(0.0f);
        vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);

        /* Predicated load: read only the single remaining coefficient */
        vSum =
            vfmaq_m_f32(vSum, vldrwq_z_f32(pDualCoef, pRows),
                        vtanhq_f32(vaddq_n_f32(vmulq_n_f32(vtmp, S->gamma), S->coef0)),
                        pRows);
    }

    /* Reduce the per-lane contributions and add them to the intercept */
    sum += vecAddAcrossF32Mve(vSum);

    *pResult = S->classes[STEP(sum)];
}
299
300 #else
301 #if defined(ARM_MATH_NEON)
302 #include "NEMath.h"
303
/*
 * NEON implementation.
 *
 * Computes
 *     sum = intercept + SUM_i dualCoefficients[i] * tanh(gamma * <in, sv_i> + coef0)
 * over all support vectors sv_i (stored row after row in S->supportVectors,
 * each row holding S->vectorDimension values) and writes
 * S->classes[STEP(sum)] to *pResult.
 *
 * Support vectors are processed 4 at a time with a vectorized tanh; the
 * remaining (nbOfSupportVectors & 3) vectors are handled one by one with
 * tanhf.
 */
void arm_svm_sigmoid_predict_f32(
    const arm_svm_sigmoid_instance_f32 *S,
    const float32_t * in,
    int32_t * pResult)
{
    float32_t sum = S->intercept;

    float32_t dot;

    float32x4_t accuma, accumb, accumc, accumd, accum;
    float32x2_t accum2;
    float32x4_t vec1;
    float32x4_t coef0 = vdupq_n_f32(S->coef0);

    float32x4_t vec2, vec2a, vec2b, vec2c, vec2d;

    uint32_t blkCnt;
    uint32_t vectorBlkCnt;

    const float32_t *pIn = in;

    /* One pointer per support vector of the current group of 4 */
    const float32_t *pSupporta = S->supportVectors;
    const float32_t *pSupportb;
    const float32_t *pSupportc;
    const float32_t *pSupportd;

    pSupportb = pSupporta + S->vectorDimension;
    pSupportc = pSupportb + S->vectorDimension;
    pSupportd = pSupportc + S->vectorDimension;

    const float32_t *pDualCoefs = S->dualCoefficients;

    vectorBlkCnt = S->nbOfSupportVectors >> 2;
    while (vectorBlkCnt > 0U)
    {
        /*
         * Lane k receives the dot product with support vector k of the group.
         * Initialized explicitly because vsetq_lane_f32 reads its vector
         * operand before replacing one lane.
         */
        float32x4_t dotV = vdupq_n_f32(0.0f);

        accuma = vdupq_n_f32(0);
        accumb = vdupq_n_f32(0);
        accumc = vdupq_n_f32(0);
        accumd = vdupq_n_f32(0);

        pIn = in;

        /* Vectorized part of the 4 dot products: 4 components per iteration */
        blkCnt = S->vectorDimension >> 2;
        while (blkCnt > 0U)
        {
            vec1 = vld1q_f32(pIn);
            vec2a = vld1q_f32(pSupporta);
            vec2b = vld1q_f32(pSupportb);
            vec2c = vld1q_f32(pSupportc);
            vec2d = vld1q_f32(pSupportd);

            pIn += 4;
            pSupporta += 4;
            pSupportb += 4;
            pSupportc += 4;
            pSupportd += 4;

            accuma = vmlaq_f32(accuma, vec1, vec2a);
            accumb = vmlaq_f32(accumb, vec1, vec2b);
            accumc = vmlaq_f32(accumc, vec1, vec2c);
            accumd = vmlaq_f32(accumd, vec1, vec2d);

            blkCnt--;
        }

        /* Horizontal sum of each accumulator into its lane of dotV */
        accum2 = vpadd_f32(vget_low_f32(accuma), vget_high_f32(accuma));
        dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1), dotV, 0);

        accum2 = vpadd_f32(vget_low_f32(accumb), vget_high_f32(accumb));
        dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1), dotV, 1);

        accum2 = vpadd_f32(vget_low_f32(accumc), vget_high_f32(accumc));
        dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1), dotV, 2);

        accum2 = vpadd_f32(vget_low_f32(accumd), vget_high_f32(accumd));
        dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1), dotV, 3);

        /* Scalar tail: remaining (vectorDimension & 3) components */
        blkCnt = S->vectorDimension & 3;
        while (blkCnt > 0U)
        {
            dotV = vsetq_lane_f32(vgetq_lane_f32(dotV, 0) + *pIn * *pSupporta++, dotV, 0);
            dotV = vsetq_lane_f32(vgetq_lane_f32(dotV, 1) + *pIn * *pSupportb++, dotV, 1);
            dotV = vsetq_lane_f32(vgetq_lane_f32(dotV, 2) + *pIn * *pSupportc++, dotV, 2);
            dotV = vsetq_lane_f32(vgetq_lane_f32(dotV, 3) + *pIn * *pSupportd++, dotV, 3);

            pIn++;

            blkCnt--;
        }

        vec1 = vld1q_f32(pDualCoefs);
        pDualCoefs += 4;

        /* Sigmoid kernel on the 4 dot products: tanh(gamma * dot + coef0) */
        dotV = vmulq_n_f32(dotV, S->gamma);
        dotV = vaddq_f32(dotV, coef0);

        dotV = vtanhq_f32(dotV);

        /* Weight by the dual coefficients and fold into the scalar sum */
        accum = vmulq_f32(vec1, dotV);
        accum2 = vpadd_f32(vget_low_f32(accum), vget_high_f32(accum));
        sum += vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);

        /*
         * Each pointer already advanced by one full row above; skip 3 more
         * rows to reach the matching row of the next group of 4.
         */
        pSupporta += 3 * S->vectorDimension;
        pSupportb += 3 * S->vectorDimension;
        pSupportc += 3 * S->vectorDimension;
        pSupportd += 3 * S->vectorDimension;

        vectorBlkCnt--;
    }

    /* Remaining support vectors (fewer than 4), processed one at a time */
    const float32_t *pSupport = pSupporta;
    vectorBlkCnt = S->nbOfSupportVectors & 3;

    while (vectorBlkCnt > 0U)
    {
        accum = vdupq_n_f32(0);
        dot = 0.0f;
        pIn = in;

        blkCnt = S->vectorDimension >> 2;
        while (blkCnt > 0U)
        {
            vec1 = vld1q_f32(pIn);
            vec2 = vld1q_f32(pSupport);
            pIn += 4;
            pSupport += 4;

            accum = vmlaq_f32(accum, vec1, vec2);

            blkCnt--;
        }
        accum2 = vpadd_f32(vget_low_f32(accum), vget_high_f32(accum));
        dot = vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);

        /* Scalar tail of the dot product */
        blkCnt = S->vectorDimension & 3;
        while (blkCnt > 0U)
        {
            dot = dot + *pIn++ * *pSupport++;

            blkCnt--;
        }

        sum += *pDualCoefs++ * tanhf(S->gamma * dot + S->coef0);
        vectorBlkCnt--;
    }

    *pResult = S->classes[STEP(sum)];
}
459 #else
/*
 * Portable scalar implementation.
 *
 * For every support vector, takes its dot product with the input vector,
 * applies the sigmoid kernel tanh(gamma * dot + coef0), weights the result
 * by the matching dual coefficient and adds it to the intercept. The final
 * value selects the output class through S->classes[STEP(sum)].
 */
void arm_svm_sigmoid_predict_f32(
    const arm_svm_sigmoid_instance_f32 *S,
    const float32_t * in,
    int32_t * pResult)
{
    /* Walks through the support vectors, which are stored back to back */
    const float32_t *pCurrent = S->supportVectors;
    float32_t decision = S->intercept;
    uint32_t svIndex = 0U;

    while (svIndex < S->nbOfSupportVectors)
    {
        float32_t kernelArg = 0.0f;
        uint32_t component = 0U;

        /* Dot product between the input and the current support vector */
        while (component < S->vectorDimension)
        {
            kernelArg = kernelArg + in[component] * *pCurrent;
            pCurrent++;
            component++;
        }

        decision += S->dualCoefficients[svIndex] * tanhf(S->gamma * kernelArg + S->coef0);
        svIndex++;
    }

    *pResult = S->classes[STEP(decision)];
}
481
482 #endif
483 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
484
485 /**
486 * @} end of sigmoidsvm group
487 */
488