1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_svm_linear_predict_f32.c
4 * Description: SVM Linear Classifier
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/svm_functions.h"
30 #include <limits.h>
31 #include <math.h>
32
33
34 /**
35 * @addtogroup linearsvm
36 * @{
37 */
38
39
40 /**
41 * @brief SVM linear prediction
42 * @param[in] S Pointer to an instance of the linear SVM structure.
43 * @param[in] in Pointer to input vector
44 * @param[out] pResult Decision value
45 * @return none.
46 *
47 */
48 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
49
50 #include "arm_helium_utils.h"
51
void arm_svm_linear_predict_f32(
    const arm_svm_linear_instance_f32 *S,
    const float32_t * in,
    int32_t * pResult)
{
    /* Decision function: sum = intercept + sum_i dualCoefficients[i] * dot(in, supportVector_i).
     * The support vectors are stored row-major (nbOfSupportVectors x vectorDimension),
     * so the whole computation is a Matrix x Vector product; rows are processed
     * 4 at a time, then 2, then 1, with the dual-coefficient weighting folded
     * into the vector accumulators before the final across-lane reduction. */
    /* inlined Matrix x Vector function interleaved with dot prod */
    uint32_t numRows = S->nbOfSupportVectors;   /* one matrix row per support vector */
    uint32_t numCols = S->vectorDimension;      /* number of features per vector */
    const float32_t *pSupport = S->supportVectors;
    const float32_t *pSrcA = pSupport;          /* walks the support-vector matrix row blocks */
    const float32_t *pInA0;
    const float32_t *pInA1;
    uint32_t row;
    uint32_t blkCnt; /* loop counters */
    const float32_t *pDualCoef = S->dualCoefficients;
    float32_t sum = S->intercept;               /* start from the bias term */
    row = numRows;

    /*
     * compute 4 rows in parallel
     */
    while (row >= 4)
    {
        const float32_t *pInA2, *pInA3;
        float32_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
        f32x4_t vecIn, acc0, acc1, acc2, acc3;
        float32_t const *pSrcVecPtr = in;

        /*
         * Initialize the pointers to 4 consecutive MatrixA rows
         */
        pInA0 = pSrcA;
        pInA1 = pInA0 + numCols;
        pInA2 = pInA1 + numCols;
        pInA3 = pInA2 + numCols;
        /*
         * Initialize the vector pointer
         */
        pInVec = pSrcVecPtr;
        /*
         * reset accumulators
         */
        acc0 = vdupq_n_f32(0.0f);
        acc1 = vdupq_n_f32(0.0f);
        acc2 = vdupq_n_f32(0.0f);
        acc3 = vdupq_n_f32(0.0f);

        pSrcA0Vec = pInA0;
        pSrcA1Vec = pInA1;
        pSrcA2Vec = pInA2;
        pSrcA3Vec = pInA3;

        /* Main loop: 4 float32 elements per iteration, the input vector
         * load is shared by the 4 row multiply-accumulates. */
        blkCnt = numCols >> 2;
        while (blkCnt > 0U) {
            f32x4_t vecA;

            vecIn = vld1q(pInVec);
            pInVec += 4;
            vecA = vld1q(pSrcA0Vec);
            pSrcA0Vec += 4;
            acc0 = vfmaq(acc0, vecIn, vecA);
            vecA = vld1q(pSrcA1Vec);
            pSrcA1Vec += 4;
            acc1 = vfmaq(acc1, vecIn, vecA);
            vecA = vld1q(pSrcA2Vec);
            pSrcA2Vec += 4;
            acc2 = vfmaq(acc2, vecIn, vecA);
            vecA = vld1q(pSrcA3Vec);
            pSrcA3Vec += 4;
            acc3 = vfmaq(acc3, vecIn, vecA);

            blkCnt--;
        }
        /*
         * tail: remaining 1..3 columns handled with predicated (zeroing)
         * loads so the inactive lanes contribute nothing to the accumulators
         * (will be merged thru tail predication)
         */
        blkCnt = numCols & 3;
        if (blkCnt > 0U) {
            mve_pred16_t p0 = vctp32q(blkCnt);
            f32x4_t vecA;

            vecIn = vldrwq_z_f32(pInVec, p0);
            vecA = vldrwq_z_f32(pSrcA0Vec, p0);
            acc0 = vfmaq(acc0, vecIn, vecA);
            vecA = vldrwq_z_f32(pSrcA1Vec, p0);
            acc1 = vfmaq(acc1, vecIn, vecA);
            vecA = vldrwq_z_f32(pSrcA2Vec, p0);
            acc2 = vfmaq(acc2, vecIn, vecA);
            vecA = vldrwq_z_f32(pSrcA3Vec, p0);
            acc3 = vfmaq(acc3, vecIn, vecA);
        }
        /*
         * Sum the partial parts:
         * weight each row accumulator by its dual coefficient, then
         * reduce the combined vector accumulator across lanes.
         */

        acc0 = vmulq_n_f32(acc0,*pDualCoef++);
        acc0 = vfmaq_n_f32(acc0,acc1,*pDualCoef++);
        acc0 = vfmaq_n_f32(acc0,acc2,*pDualCoef++);
        acc0 = vfmaq_n_f32(acc0,acc3,*pDualCoef++);

        sum += vecAddAcrossF32Mve(acc0);

        /* advance to the next block of 4 support vectors */
        pSrcA += numCols * 4;
        /*
         * Decrement the row loop counter
         */
        row -= 4;
    }

    /*
     * compute 2 rows in parallel
     */
    if (row >= 2) {
        float32_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
        f32x4_t vecIn, acc0, acc1;
        float32_t const *pSrcVecPtr = in;

        /*
         * Initialize the pointers to 2 consecutive MatrixA rows
         */
        pInA0 = pSrcA;
        pInA1 = pInA0 + numCols;
        /*
         * Initialize the vector pointer
         */
        pInVec = pSrcVecPtr;
        /*
         * reset accumulators
         */
        acc0 = vdupq_n_f32(0.0f);
        acc1 = vdupq_n_f32(0.0f);
        pSrcA0Vec = pInA0;
        pSrcA1Vec = pInA1;

        blkCnt = numCols >> 2;
        while (blkCnt > 0U) {
            f32x4_t vecA;

            vecIn = vld1q(pInVec);
            pInVec += 4;
            vecA = vld1q(pSrcA0Vec);
            pSrcA0Vec += 4;
            acc0 = vfmaq(acc0, vecIn, vecA);
            vecA = vld1q(pSrcA1Vec);
            pSrcA1Vec += 4;
            acc1 = vfmaq(acc1, vecIn, vecA);

            blkCnt--;
        }
        /*
         * tail
         * (will be merged thru tail predication)
         */
        blkCnt = numCols & 3;
        if (blkCnt > 0U) {
            mve_pred16_t p0 = vctp32q(blkCnt);
            f32x4_t vecA;

            vecIn = vldrwq_z_f32(pInVec, p0);
            vecA = vldrwq_z_f32(pSrcA0Vec, p0);
            acc0 = vfmaq(acc0, vecIn, vecA);
            vecA = vldrwq_z_f32(pSrcA1Vec, p0);
            acc1 = vfmaq(acc1, vecIn, vecA);
        }
        /*
         * Sum the partial parts (dual-coefficient weighting folded in
         * before the across-lane reduction, as in the 4-row case)
         */
        acc0 = vmulq_n_f32(acc0,*pDualCoef++);
        acc0 = vfmaq_n_f32(acc0,acc1,*pDualCoef++);

        sum += vecAddAcrossF32Mve(acc0);


        pSrcA += numCols * 2;
        row -= 2;
    }

    /* last remaining support vector, if the count was odd */
    if (row >= 1) {
        f32x4_t vecIn, acc0;
        float32_t const *pSrcA0Vec, *pInVec;
        float32_t const *pSrcVecPtr = in;
        /*
         * Initialize the pointers to last MatrixA row
         */
        pInA0 = pSrcA;
        /*
         * Initialize the vector pointer
         */
        pInVec = pSrcVecPtr;
        /*
         * reset accumulators
         */
        acc0 = vdupq_n_f32(0.0f);

        pSrcA0Vec = pInA0;

        blkCnt = numCols >> 2;
        while (blkCnt > 0U) {
            f32x4_t vecA;

            vecIn = vld1q(pInVec);
            pInVec += 4;
            vecA = vld1q(pSrcA0Vec);
            pSrcA0Vec += 4;
            acc0 = vfmaq(acc0, vecIn, vecA);

            blkCnt--;
        }
        /*
         * tail
         * (will be merged thru tail predication)
         */
        blkCnt = numCols & 3;
        if (blkCnt > 0U) {
            mve_pred16_t p0 = vctp32q(blkCnt);
            f32x4_t vecA;

            vecIn = vldrwq_z_f32(pInVec, p0);
            vecA = vldrwq_z_f32(pSrcA0Vec, p0);
            acc0 = vfmaq(acc0, vecIn, vecA);
        }
        /*
         * Sum the partial parts
         */
        sum += *pDualCoef++ * vecAddAcrossF32Mve(acc0);

    }

    /* STEP maps the sign of the decision value to index 0 or 1. */
    *pResult = S->classes[STEP(sum)];
}
283
284 #else
285 #if defined(ARM_MATH_NEON)
void arm_svm_linear_predict_f32(
    const arm_svm_linear_instance_f32 *S,
    const float32_t * in,
    int32_t * pResult)
{
    /* Decision function: sum = intercept + sum_i dualCoefficients[i] * dot(in, supportVector_i).
     * Support vectors are processed 4 at a time (sharing each input load),
     * then one at a time for the remainder. */
    float32_t sum = S->intercept;

    float32_t dot;
    /* Zero-initialize: lanes are written with vsetq_lane_f32 below, and
     * reading an uninitialized vector (even to set a lane) is undefined. */
    float32x4_t dotV = vdupq_n_f32(0.0f);

    float32x4_t accuma,accumb,accumc,accumd,accum;
    float32x2_t accum2;
    float32x4_t vec1;

    float32x4_t vec2,vec2a,vec2b,vec2c,vec2d;

    uint32_t blkCnt;
    uint32_t vectorBlkCnt;

    const float32_t *pIn = in;

    const float32_t *pSupport = S->supportVectors;

    /* Pointers to 4 consecutive support-vector rows */
    const float32_t *pSupporta = S->supportVectors;
    const float32_t *pSupportb;
    const float32_t *pSupportc;
    const float32_t *pSupportd;

    pSupportb = pSupporta + S->vectorDimension;
    pSupportc = pSupportb + S->vectorDimension;
    pSupportd = pSupportc + S->vectorDimension;

    const float32_t *pDualCoefs = S->dualCoefficients;

    /* Process support vectors in groups of 4 */
    vectorBlkCnt = S->nbOfSupportVectors >> 2;

    while (vectorBlkCnt > 0U)
    {
        accuma = vdupq_n_f32(0);
        accumb = vdupq_n_f32(0);
        accumc = vdupq_n_f32(0);
        accumd = vdupq_n_f32(0);

        pIn = in;

        /* Dot products over the feature dimension, 4 floats per iteration;
         * the input load vec1 is shared by the 4 rows. */
        blkCnt = S->vectorDimension >> 2;
        while (blkCnt > 0U)
        {

            vec1 = vld1q_f32(pIn);
            vec2a = vld1q_f32(pSupporta);
            vec2b = vld1q_f32(pSupportb);
            vec2c = vld1q_f32(pSupportc);
            vec2d = vld1q_f32(pSupportd);

            pIn += 4;
            pSupporta += 4;
            pSupportb += 4;
            pSupportc += 4;
            pSupportd += 4;

            accuma = vmlaq_f32(accuma, vec1,vec2a);
            accumb = vmlaq_f32(accumb, vec1,vec2b);
            accumc = vmlaq_f32(accumc, vec1,vec2c);
            accumd = vmlaq_f32(accumd, vec1,vec2d);

            blkCnt -- ;
        }
        /* Reduce each row accumulator across lanes; pack the 4 dot
         * products into one vector (lane i = dot with row i). */
        accum2 = vpadd_f32(vget_low_f32(accuma),vget_high_f32(accuma));
        dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,0);

        accum2 = vpadd_f32(vget_low_f32(accumb),vget_high_f32(accumb));
        dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,1);

        accum2 = vpadd_f32(vget_low_f32(accumc),vget_high_f32(accumc));
        dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,2);

        accum2 = vpadd_f32(vget_low_f32(accumd),vget_high_f32(accumd));
        dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,3);


        /* Scalar tail over the remaining 1..3 feature columns */
        blkCnt = S->vectorDimension & 3;
        while (blkCnt > 0U)
        {
            dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,0) + *pIn * *pSupporta++, dotV,0);
            dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,1) + *pIn * *pSupportb++, dotV,1);
            dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,2) + *pIn * *pSupportc++, dotV,2);
            dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,3) + *pIn * *pSupportd++, dotV,3);

            pIn++;

            blkCnt -- ;
        }

        /* Weight the 4 dot products by their dual coefficients and
         * accumulate their sum into the decision value. */
        vec1 = vld1q_f32(pDualCoefs);
        pDualCoefs += 4;

        accum = vmulq_f32(vec1,dotV);
        accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
        sum += vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);

        /* Each row pointer already advanced by one row; skip the other 3. */
        pSupporta += 3*S->vectorDimension;
        pSupportb += 3*S->vectorDimension;
        pSupportc += 3*S->vectorDimension;
        pSupportd += 3*S->vectorDimension;

        vectorBlkCnt -- ;
    }

    /* Remaining 1..3 support vectors, handled one row at a time */
    pSupport = pSupporta;
    vectorBlkCnt = S->nbOfSupportVectors & 3;
    while (vectorBlkCnt > 0U)
    {
        accum = vdupq_n_f32(0);
        dot = 0.0f;
        pIn = in;

        blkCnt = S->vectorDimension >> 2;
        while (blkCnt > 0U)
        {

            vec1 = vld1q_f32(pIn);
            vec2 = vld1q_f32(pSupport);
            pIn += 4;
            pSupport += 4;

            accum = vmlaq_f32(accum, vec1,vec2);

            blkCnt -- ;
        }
        accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
        dot = vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);


        /* Scalar tail of the dot product */
        blkCnt = S->vectorDimension & 3;
        while (blkCnt > 0U)
        {
            dot = dot + *pIn++ * *pSupport++;

            blkCnt -- ;
        }

        sum += *pDualCoefs++ * dot;
        vectorBlkCnt -- ;
    }

    /* STEP maps the sign of the decision value to index 0 or 1. */
    *pResult=S->classes[STEP(sum)];
}
434 #else
void arm_svm_linear_predict_f32(
    const arm_svm_linear_instance_f32 *S,
    const float32_t * in,
    int32_t * pResult)
{
    /* Decision value: bias term plus the dual-coefficient-weighted dot
     * products of the input with every support vector. */
    float32_t decision = S->intercept;
    const float32_t *pSV = S->supportVectors;   /* row-major support-vector matrix */
    uint32_t vecIdx, d;

    for (vecIdx = 0U; vecIdx < S->nbOfSupportVectors; vecIdx++)
    {
        /* Dot product of the input with the current support vector. */
        float32_t acc = 0.0f;

        for (d = 0U; d < S->vectorDimension; d++)
        {
            acc = acc + in[d] * pSV[d];
        }
        pSV += S->vectorDimension;

        decision += S->dualCoefficients[vecIdx] * acc;
    }

    /* STEP maps the sign of the decision value to index 0 or 1. */
    *pResult=S->classes[STEP(decision)];
}
456 #endif
457 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
458
459 /**
460 * @} end of linearsvm group
461 */
462