1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_svm_linear_predict_f32.c
4 * Description: SVM Linear Classifier
5 *
6 *
7 * Target Processor: Cortex-M and Cortex-A cores
8 * -------------------------------------------------------------------- */
9 /*
10 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
11 *
12 * SPDX-License-Identifier: Apache-2.0
13 *
14 * Licensed under the Apache License, Version 2.0 (the License); you may
15 * not use this file except in compliance with the License.
16 * You may obtain a copy of the License at
17 *
18 * www.apache.org/licenses/LICENSE-2.0
19 *
20 * Unless required by applicable law or agreed to in writing, software
21 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
22 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
23 * See the License for the specific language governing permissions and
24 * limitations under the License.
25 */
26
27 #include "arm_math.h"
28 #include <limits.h>
29 #include <math.h>
30
31
32 /**
33 * @addtogroup groupSVM
34 * @{
35 */
36
37
38 /**
39 * @brief SVM linear prediction
40 * @param[in] S Pointer to an instance of the linear SVM structure.
41 * @param[in] in Pointer to input vector
42 * @param[out] pResult Decision value
43 * @return none.
44 *
45 */
46 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
47
48 #include "arm_helium_utils.h"
49
void arm_svm_linear_predict_f32(
    const arm_svm_linear_instance_f32 *S,
    const float32_t * in,
    int32_t * pResult)
{
    /* Inlined matrix x vector product interleaved with the dual-coefficient
     * dot product: rows of the support-vector matrix are consumed 4, then 2,
     * then 1 at a time; each row is reduced against the input vector and the
     * partial sum is scaled by the corresponding dual coefficient. */
    uint32_t numRows = S->nbOfSupportVectors;
    uint32_t numCols = S->vectorDimension;
    const float32_t *pSupport = S->supportVectors;
    const float32_t *pSrcA = pSupport;        /* base of the current row group */
    const float32_t *pInA0;
    const float32_t *pInA1;
    uint32_t row;
    uint32_t blkCnt;                          /* loop counters */
    const float32_t *pDualCoef = S->dualCoefficients;
    float32_t sum = S->intercept;             /* decision value, seeded with the intercept */
    row = numRows;

    /*
     * compute 4 rows in parallel
     */
    while (row >= 4)
    {
        const float32_t *pInA2, *pInA3;
        float32_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
        f32x4_t vecIn, acc0, acc1, acc2, acc3;
        float32_t const *pSrcVecPtr = in;

        /*
         * Initialize the pointers to 4 consecutive MatrixA rows
         */
        pInA0 = pSrcA;
        pInA1 = pInA0 + numCols;
        pInA2 = pInA1 + numCols;
        pInA3 = pInA2 + numCols;
        /*
         * Initialize the vector pointer
         */
        pInVec = pSrcVecPtr;
        /*
         * reset accumulators
         */
        acc0 = vdupq_n_f32(0.0f);
        acc1 = vdupq_n_f32(0.0f);
        acc2 = vdupq_n_f32(0.0f);
        acc3 = vdupq_n_f32(0.0f);

        pSrcA0Vec = pInA0;
        pSrcA1Vec = pInA1;
        pSrcA2Vec = pInA2;
        pSrcA3Vec = pInA3;

        /* Main loop: 4 float32 elements per iteration, shared input load
         * reused across the 4 row accumulators. */
        blkCnt = numCols >> 2;
        while (blkCnt > 0U) {
            f32x4_t vecA;

            vecIn = vld1q(pInVec);
            pInVec += 4;
            vecA = vld1q(pSrcA0Vec);
            pSrcA0Vec += 4;
            acc0 = vfmaq(acc0, vecIn, vecA);
            vecA = vld1q(pSrcA1Vec);
            pSrcA1Vec += 4;
            acc1 = vfmaq(acc1, vecIn, vecA);
            vecA = vld1q(pSrcA2Vec);
            pSrcA2Vec += 4;
            acc2 = vfmaq(acc2, vecIn, vecA);
            vecA = vld1q(pSrcA3Vec);
            pSrcA3Vec += 4;
            acc3 = vfmaq(acc3, vecIn, vecA);

            blkCnt--;
        }
        /*
         * tail: remaining 1..3 columns handled with zeroing tail predication
         * (will be merged thru tail predication by the compiler)
         */
        blkCnt = numCols & 3;
        if (blkCnt > 0U) {
            mve_pred16_t p0 = vctp32q(blkCnt);
            f32x4_t vecA;

            vecIn = vldrwq_z_f32(pInVec, p0);
            vecA = vldrwq_z_f32(pSrcA0Vec, p0);
            acc0 = vfmaq(acc0, vecIn, vecA);
            vecA = vldrwq_z_f32(pSrcA1Vec, p0);
            acc1 = vfmaq(acc1, vecIn, vecA);
            vecA = vldrwq_z_f32(pSrcA2Vec, p0);
            acc2 = vfmaq(acc2, vecIn, vecA);
            vecA = vldrwq_z_f32(pSrcA3Vec, p0);
            acc3 = vfmaq(acc3, vecIn, vecA);
        }
        /*
         * Sum the partial parts: reduce each accumulator across lanes and
         * scale by its dual coefficient (one coefficient per support vector).
         */
        sum += *pDualCoef++ * vecAddAcrossF32Mve(acc0);
        sum += *pDualCoef++ * vecAddAcrossF32Mve(acc1);
        sum += *pDualCoef++ * vecAddAcrossF32Mve(acc2);
        sum += *pDualCoef++ * vecAddAcrossF32Mve(acc3);

        /* advance to the next group of 4 support vectors */
        pSrcA += numCols * 4;
        /*
         * Decrement the row loop counter
         */
        row -= 4;
    }

    /*
     * compute 2 rows in parallel (0..3 rows remain after the 4-row loop)
     */
    if (row >= 2) {
        float32_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
        f32x4_t vecIn, acc0, acc1;
        float32_t const *pSrcVecPtr = in;

        /*
         * Initialize the pointers to 2 consecutive MatrixA rows
         */
        pInA0 = pSrcA;
        pInA1 = pInA0 + numCols;
        /*
         * Initialize the vector pointer
         */
        pInVec = pSrcVecPtr;
        /*
         * reset accumulators
         */
        acc0 = vdupq_n_f32(0.0f);
        acc1 = vdupq_n_f32(0.0f);
        pSrcA0Vec = pInA0;
        pSrcA1Vec = pInA1;

        blkCnt = numCols >> 2;
        while (blkCnt > 0U) {
            f32x4_t vecA;

            vecIn = vld1q(pInVec);
            pInVec += 4;
            vecA = vld1q(pSrcA0Vec);
            pSrcA0Vec += 4;
            acc0 = vfmaq(acc0, vecIn, vecA);
            vecA = vld1q(pSrcA1Vec);
            pSrcA1Vec += 4;
            acc1 = vfmaq(acc1, vecIn, vecA);

            blkCnt--;
        }
        /*
         * tail
         * (will be merged thru tail predication)
         */
        blkCnt = numCols & 3;
        if (blkCnt > 0U) {
            mve_pred16_t p0 = vctp32q(blkCnt);
            f32x4_t vecA;

            vecIn = vldrwq_z_f32(pInVec, p0);
            vecA = vldrwq_z_f32(pSrcA0Vec, p0);
            acc0 = vfmaq(acc0, vecIn, vecA);
            vecA = vldrwq_z_f32(pSrcA1Vec, p0);
            acc1 = vfmaq(acc1, vecIn, vecA);
        }
        /*
         * Sum the partial parts
         */
        sum += *pDualCoef++ * vecAddAcrossF32Mve(acc0);
        sum += *pDualCoef++ * vecAddAcrossF32Mve(acc1);

        pSrcA += numCols * 2;
        row -= 2;
    }

    /* at most one support vector left */
    if (row >= 1) {
        f32x4_t vecIn, acc0;
        float32_t const *pSrcA0Vec, *pInVec;
        float32_t const *pSrcVecPtr = in;
        /*
         * Initialize the pointers to last MatrixA row
         */
        pInA0 = pSrcA;
        /*
         * Initialize the vector pointer
         */
        pInVec = pSrcVecPtr;
        /*
         * reset accumulators
         */
        acc0 = vdupq_n_f32(0.0f);

        pSrcA0Vec = pInA0;

        blkCnt = numCols >> 2;
        while (blkCnt > 0U) {
            f32x4_t vecA;

            vecIn = vld1q(pInVec);
            pInVec += 4;
            vecA = vld1q(pSrcA0Vec);
            pSrcA0Vec += 4;
            acc0 = vfmaq(acc0, vecIn, vecA);

            blkCnt--;
        }
        /*
         * tail
         * (will be merged thru tail predication)
         */
        blkCnt = numCols & 3;
        if (blkCnt > 0U) {
            mve_pred16_t p0 = vctp32q(blkCnt);
            f32x4_t vecA;

            vecIn = vldrwq_z_f32(pInVec, p0);
            vecA = vldrwq_z_f32(pSrcA0Vec, p0);
            acc0 = vfmaq(acc0, vecIn, vecA);
        }
        /*
         * Sum the partial parts
         */
        sum += *pDualCoef++ * vecAddAcrossF32Mve(acc0);

    }

    /* STEP(sum) maps the sign of the decision value to a class index */
    *pResult = S->classes[STEP(sum)];
}
275
276 #else
277 #if defined(ARM_MATH_NEON)
void arm_svm_linear_predict_f32(
    const arm_svm_linear_instance_f32 *S,
    const float32_t * in,
    int32_t * pResult)
{
    /* NEON implementation: support vectors are processed 4 at a time; the 4
     * row dot-products are packed into one vector (dotV) so a single vector
     * multiply applies the 4 dual coefficients at once. The remaining 0..3
     * support vectors are handled one by one. */
    float32_t sum = S->intercept;   /* decision value, seeded with the intercept */

    float32_t dot;
    /* Fix: zero-initialize dotV. The original code built dotV lane-by-lane
     * with vsetq_lane_f32 starting from an uninitialized vector, which reads
     * an indeterminate value (undefined behavior, -Wuninitialized). */
    float32x4_t dotV = vdupq_n_f32(0.0f);

    float32x4_t accuma, accumb, accumc, accumd, accum;
    float32x2_t accum2;
    float32x4_t vec1;

    float32x4_t vec2, vec2a, vec2b, vec2c, vec2d;

    uint32_t blkCnt;
    uint32_t vectorBlkCnt;

    const float32_t *pIn = in;

    const float32_t *pSupport = S->supportVectors;

    /* Pointers to 4 consecutive support vectors (matrix rows) */
    const float32_t *pSupporta = S->supportVectors;
    const float32_t *pSupportb;
    const float32_t *pSupportc;
    const float32_t *pSupportd;

    pSupportb = pSupporta + S->vectorDimension;
    pSupportc = pSupportb + S->vectorDimension;
    pSupportd = pSupportc + S->vectorDimension;

    const float32_t *pDualCoefs = S->dualCoefficients;

    vectorBlkCnt = S->nbOfSupportVectors >> 2;

    while (vectorBlkCnt > 0U)
    {
        accuma = vdupq_n_f32(0);
        accumb = vdupq_n_f32(0);
        accumc = vdupq_n_f32(0);
        accumd = vdupq_n_f32(0);

        pIn = in;

        /* Main loop: 4 input elements against 4 support vectors per pass */
        blkCnt = S->vectorDimension >> 2;
        while (blkCnt > 0U)
        {

            vec1 = vld1q_f32(pIn);
            vec2a = vld1q_f32(pSupporta);
            vec2b = vld1q_f32(pSupportb);
            vec2c = vld1q_f32(pSupportc);
            vec2d = vld1q_f32(pSupportd);

            pIn += 4;
            pSupporta += 4;
            pSupportb += 4;
            pSupportc += 4;
            pSupportd += 4;

            accuma = vmlaq_f32(accuma, vec1, vec2a);
            accumb = vmlaq_f32(accumb, vec1, vec2b);
            accumc = vmlaq_f32(accumc, vec1, vec2c);
            accumd = vmlaq_f32(accumd, vec1, vec2d);

            blkCnt--;
        }
        /* Reduce each accumulator across lanes; lane k of dotV holds the
         * dot product with support vector k of this group. */
        accum2 = vpadd_f32(vget_low_f32(accuma), vget_high_f32(accuma));
        dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1), dotV, 0);

        accum2 = vpadd_f32(vget_low_f32(accumb), vget_high_f32(accumb));
        dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1), dotV, 1);

        accum2 = vpadd_f32(vget_low_f32(accumc), vget_high_f32(accumc));
        dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1), dotV, 2);

        accum2 = vpadd_f32(vget_low_f32(accumd), vget_high_f32(accumd));
        dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1), dotV, 3);

        /* Tail: remaining 1..3 dimensions accumulated scalar-wise into dotV */
        blkCnt = S->vectorDimension & 3;
        while (blkCnt > 0U)
        {
            dotV = vsetq_lane_f32(vgetq_lane_f32(dotV, 0) + *pIn * *pSupporta++, dotV, 0);
            dotV = vsetq_lane_f32(vgetq_lane_f32(dotV, 1) + *pIn * *pSupportb++, dotV, 1);
            dotV = vsetq_lane_f32(vgetq_lane_f32(dotV, 2) + *pIn * *pSupportc++, dotV, 2);
            dotV = vsetq_lane_f32(vgetq_lane_f32(dotV, 3) + *pIn * *pSupportd++, dotV, 3);

            pIn++;

            blkCnt--;
        }

        /* Apply the 4 dual coefficients in one vector multiply and reduce */
        vec1 = vld1q_f32(pDualCoefs);
        pDualCoefs += 4;

        accum = vmulq_f32(vec1, dotV);
        accum2 = vpadd_f32(vget_low_f32(accum), vget_high_f32(accum));
        sum += vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);

        /* Each pointer already advanced one row; skip the 3 rows the other
         * pointers covered so the group moves forward by 4 rows. */
        pSupporta += 3 * S->vectorDimension;
        pSupportb += 3 * S->vectorDimension;
        pSupportc += 3 * S->vectorDimension;
        pSupportd += 3 * S->vectorDimension;

        vectorBlkCnt--;
    }

    /* Residual support vectors (0..3), one at a time */
    pSupport = pSupporta;
    vectorBlkCnt = S->nbOfSupportVectors & 3;
    while (vectorBlkCnt > 0U)
    {
        accum = vdupq_n_f32(0);
        dot = 0.0f;
        pIn = in;

        blkCnt = S->vectorDimension >> 2;
        while (blkCnt > 0U)
        {

            vec1 = vld1q_f32(pIn);
            vec2 = vld1q_f32(pSupport);
            pIn += 4;
            pSupport += 4;

            accum = vmlaq_f32(accum, vec1, vec2);

            blkCnt--;
        }
        accum2 = vpadd_f32(vget_low_f32(accum), vget_high_f32(accum));
        dot = vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);

        /* scalar tail of the dot product */
        blkCnt = S->vectorDimension & 3;
        while (blkCnt > 0U)
        {
            dot = dot + *pIn++ * *pSupport++;

            blkCnt--;
        }

        sum += *pDualCoefs++ * dot;
        vectorBlkCnt--;
    }

    /* STEP(sum) maps the sign of the decision value to a class index */
    *pResult = S->classes[STEP(sum)];
}
426 #else
void arm_svm_linear_predict_f32(
    const arm_svm_linear_instance_f32 *S,
    const float32_t * in,
    int32_t * pResult)
{
    /* Portable scalar implementation: decision value is the intercept plus
     * the dual-coefficient-weighted dot products of the input with every
     * support vector. */
    const float32_t *pSV = S->supportVectors;   /* start of current support vector */
    float32_t decision = S->intercept;

    for (uint32_t k = 0; k < S->nbOfSupportVectors; k++)
    {
        /* inner product <in, supportVector[k]> */
        float32_t innerProd = 0.0f;

        for (uint32_t d = 0; d < S->vectorDimension; d++)
        {
            innerProd += in[d] * pSV[d];
        }
        pSV += S->vectorDimension;

        decision += S->dualCoefficients[k] * innerProd;
    }

    /* STEP(decision) maps the sign of the decision value to a class index */
    *pResult = S->classes[STEP(decision)];
}
448 #endif
449 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
450
451 /**
452 * @} end of groupSVM group
453 */
454