• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_svm_linear_predict_f32.c
4  * Description:  SVM Linear Classifier
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/svm_functions.h"
30 #include <limits.h>
31 #include <math.h>
32 
33 
34 /**
35  * @addtogroup linearsvm
36  * @{
37  */
38 
39 
40 /**
41  * @brief SVM linear prediction
42  * @param[in]    S          Pointer to an instance of the linear SVM structure.
43  * @param[in]    in         Pointer to input vector
44  * @param[out]   pResult    Decision value
45  * @return none.
46  *
47  */
48 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
49 
50 #include "arm_helium_utils.h"
51 
arm_svm_linear_predict_f32(const arm_svm_linear_instance_f32 * S,const float32_t * in,int32_t * pResult)52 void arm_svm_linear_predict_f32(
53     const arm_svm_linear_instance_f32 *S,
54     const float32_t * in,
55     int32_t * pResult)
56 {
57         /* inlined Matrix x Vector function interleaved with dot prod */
58     uint32_t        numRows = S->nbOfSupportVectors;
59     uint32_t        numCols = S->vectorDimension;
60     const float32_t *pSupport = S->supportVectors;
61     const float32_t *pSrcA = pSupport;
62     const float32_t *pInA0;
63     const float32_t *pInA1;
64     uint32_t         row;
65     uint32_t         blkCnt;     /* loop counters */
66     const float32_t *pDualCoef = S->dualCoefficients;
67     float32_t       sum = S->intercept;
68     row = numRows;
69 
70     /*
71      * compute 4 rows in parrallel
72      */
73     while (row >= 4)
74     {
75         const float32_t *pInA2, *pInA3;
76         float32_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
77         f32x4_t         vecIn, acc0, acc1, acc2, acc3;
78         float32_t const *pSrcVecPtr = in;
79 
80         /*
81          * Initialize the pointers to 4 consecutive MatrixA rows
82          */
83         pInA0 = pSrcA;
84         pInA1 = pInA0 + numCols;
85         pInA2 = pInA1 + numCols;
86         pInA3 = pInA2 + numCols;
87         /*
88          * Initialize the vector pointer
89          */
90         pInVec = pSrcVecPtr;
91         /*
92          * reset accumulators
93          */
94         acc0 = vdupq_n_f32(0.0f);
95         acc1 = vdupq_n_f32(0.0f);
96         acc2 = vdupq_n_f32(0.0f);
97         acc3 = vdupq_n_f32(0.0f);
98 
99         pSrcA0Vec = pInA0;
100         pSrcA1Vec = pInA1;
101         pSrcA2Vec = pInA2;
102         pSrcA3Vec = pInA3;
103 
104         blkCnt = numCols >> 2;
105         while (blkCnt > 0U) {
106             f32x4_t         vecA;
107 
108             vecIn = vld1q(pInVec);
109             pInVec += 4;
110             vecA = vld1q(pSrcA0Vec);
111             pSrcA0Vec += 4;
112             acc0 = vfmaq(acc0, vecIn, vecA);
113             vecA = vld1q(pSrcA1Vec);
114             pSrcA1Vec += 4;
115             acc1 = vfmaq(acc1, vecIn, vecA);
116             vecA = vld1q(pSrcA2Vec);
117             pSrcA2Vec += 4;
118             acc2 = vfmaq(acc2, vecIn, vecA);
119             vecA = vld1q(pSrcA3Vec);
120             pSrcA3Vec += 4;
121             acc3 = vfmaq(acc3, vecIn, vecA);
122 
123             blkCnt--;
124         }
125         /*
126          * tail
127          * (will be merged thru tail predication)
128          */
129         blkCnt = numCols & 3;
130         if (blkCnt > 0U) {
131             mve_pred16_t    p0 = vctp32q(blkCnt);
132             f32x4_t         vecA;
133 
134             vecIn = vldrwq_z_f32(pInVec, p0);
135             vecA = vldrwq_z_f32(pSrcA0Vec, p0);
136             acc0 = vfmaq(acc0, vecIn, vecA);
137             vecA = vldrwq_z_f32(pSrcA1Vec, p0);
138             acc1 = vfmaq(acc1, vecIn, vecA);
139             vecA = vldrwq_z_f32(pSrcA2Vec, p0);
140             acc2 = vfmaq(acc2, vecIn, vecA);
141             vecA = vldrwq_z_f32(pSrcA3Vec, p0);
142             acc3 = vfmaq(acc3, vecIn, vecA);
143         }
144         /*
145          * Sum the partial parts
146          */
147 
148         acc0 = vmulq_n_f32(acc0,*pDualCoef++);
149         acc0 = vfmaq_n_f32(acc0,acc1,*pDualCoef++);
150         acc0 = vfmaq_n_f32(acc0,acc2,*pDualCoef++);
151         acc0 = vfmaq_n_f32(acc0,acc3,*pDualCoef++);
152 
153         sum += vecAddAcrossF32Mve(acc0);
154 
155         pSrcA += numCols * 4;
156         /*
157          * Decrement the row loop counter
158          */
159         row -= 4;
160     }
161 
162     /*
163      * compute 2 rows in parallel
164      */
165     if (row >= 2) {
166         float32_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
167         f32x4_t         vecIn, acc0, acc1;
168         float32_t const *pSrcVecPtr = in;
169 
170         /*
171          * Initialize the pointers to 2 consecutive MatrixA rows
172          */
173         pInA0 = pSrcA;
174         pInA1 = pInA0 + numCols;
175         /*
176          * Initialize the vector pointer
177          */
178         pInVec = pSrcVecPtr;
179         /*
180          * reset accumulators
181          */
182         acc0 = vdupq_n_f32(0.0f);
183         acc1 = vdupq_n_f32(0.0f);
184         pSrcA0Vec = pInA0;
185         pSrcA1Vec = pInA1;
186 
187         blkCnt = numCols >> 2;
188         while (blkCnt > 0U) {
189             f32x4_t         vecA;
190 
191             vecIn = vld1q(pInVec);
192             pInVec += 4;
193             vecA = vld1q(pSrcA0Vec);
194             pSrcA0Vec += 4;
195             acc0 = vfmaq(acc0, vecIn, vecA);
196             vecA = vld1q(pSrcA1Vec);
197             pSrcA1Vec += 4;
198             acc1 = vfmaq(acc1, vecIn, vecA);
199 
200             blkCnt--;
201         }
202         /*
203          * tail
204          * (will be merged thru tail predication)
205          */
206         blkCnt = numCols & 3;
207         if (blkCnt > 0U) {
208             mve_pred16_t    p0 = vctp32q(blkCnt);
209             f32x4_t         vecA;
210 
211             vecIn = vldrwq_z_f32(pInVec, p0);
212             vecA = vldrwq_z_f32(pSrcA0Vec, p0);
213             acc0 = vfmaq(acc0, vecIn, vecA);
214             vecA = vldrwq_z_f32(pSrcA1Vec, p0);
215             acc1 = vfmaq(acc1, vecIn, vecA);
216         }
217         /*
218          * Sum the partial parts
219          */
220         acc0 = vmulq_n_f32(acc0,*pDualCoef++);
221         acc0 = vfmaq_n_f32(acc0,acc1,*pDualCoef++);
222 
223         sum += vecAddAcrossF32Mve(acc0);
224 
225 
226         pSrcA += numCols * 2;
227         row -= 2;
228     }
229 
230     if (row >= 1) {
231         f32x4_t         vecIn, acc0;
232         float32_t const *pSrcA0Vec, *pInVec;
233         float32_t const *pSrcVecPtr = in;
234         /*
235          * Initialize the pointers to last MatrixA row
236          */
237         pInA0 = pSrcA;
238         /*
239          * Initialize the vector pointer
240          */
241         pInVec = pSrcVecPtr;
242         /*
243          * reset accumulators
244          */
245         acc0 = vdupq_n_f32(0.0f);
246 
247         pSrcA0Vec = pInA0;
248 
249         blkCnt = numCols >> 2;
250         while (blkCnt > 0U) {
251             f32x4_t         vecA;
252 
253             vecIn = vld1q(pInVec);
254             pInVec += 4;
255             vecA = vld1q(pSrcA0Vec);
256             pSrcA0Vec += 4;
257             acc0 = vfmaq(acc0, vecIn, vecA);
258 
259             blkCnt--;
260         }
261         /*
262          * tail
263          * (will be merged thru tail predication)
264          */
265         blkCnt = numCols & 3;
266         if (blkCnt > 0U) {
267             mve_pred16_t    p0 = vctp32q(blkCnt);
268             f32x4_t         vecA;
269 
270             vecIn = vldrwq_z_f32(pInVec, p0);
271             vecA = vldrwq_z_f32(pSrcA0Vec, p0);
272             acc0 = vfmaq(acc0, vecIn, vecA);
273         }
274         /*
275          * Sum the partial parts
276          */
277         sum += *pDualCoef++ * vecAddAcrossF32Mve(acc0);
278 
279     }
280 
281     *pResult = S->classes[STEP(sum)];
282 }
283 
284 #else
285 #if defined(ARM_MATH_NEON)
arm_svm_linear_predict_f32(const arm_svm_linear_instance_f32 * S,const float32_t * in,int32_t * pResult)286 void arm_svm_linear_predict_f32(
287     const arm_svm_linear_instance_f32 *S,
288     const float32_t * in,
289     int32_t * pResult)
290 {
291     float32_t sum = S->intercept;
292 
293     float32_t dot;
294     float32x4_t dotV;
295 
296     float32x4_t accuma,accumb,accumc,accumd,accum;
297     float32x2_t accum2;
298     float32x4_t vec1;
299 
300     float32x4_t vec2,vec2a,vec2b,vec2c,vec2d;
301 
302     uint32_t blkCnt;
303     uint32_t vectorBlkCnt;
304 
305     const float32_t *pIn = in;
306 
307     const float32_t *pSupport = S->supportVectors;
308 
309     const float32_t *pSupporta = S->supportVectors;
310     const float32_t *pSupportb;
311     const float32_t *pSupportc;
312     const float32_t *pSupportd;
313 
314     pSupportb = pSupporta + S->vectorDimension;
315     pSupportc = pSupportb + S->vectorDimension;
316     pSupportd = pSupportc + S->vectorDimension;
317 
318     const float32_t *pDualCoefs = S->dualCoefficients;
319 
320     vectorBlkCnt = S->nbOfSupportVectors >> 2;
321 
322     while (vectorBlkCnt > 0U)
323     {
324         accuma = vdupq_n_f32(0);
325         accumb = vdupq_n_f32(0);
326         accumc = vdupq_n_f32(0);
327         accumd = vdupq_n_f32(0);
328 
329         pIn = in;
330 
331         blkCnt = S->vectorDimension >> 2;
332         while (blkCnt > 0U)
333         {
334 
335             vec1 = vld1q_f32(pIn);
336             vec2a = vld1q_f32(pSupporta);
337             vec2b = vld1q_f32(pSupportb);
338             vec2c = vld1q_f32(pSupportc);
339             vec2d = vld1q_f32(pSupportd);
340 
341             pIn += 4;
342             pSupporta += 4;
343             pSupportb += 4;
344             pSupportc += 4;
345             pSupportd += 4;
346 
347             accuma = vmlaq_f32(accuma, vec1,vec2a);
348             accumb = vmlaq_f32(accumb, vec1,vec2b);
349             accumc = vmlaq_f32(accumc, vec1,vec2c);
350             accumd = vmlaq_f32(accumd, vec1,vec2d);
351 
352             blkCnt -- ;
353         }
354         accum2 = vpadd_f32(vget_low_f32(accuma),vget_high_f32(accuma));
355         dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,0);
356 
357         accum2 = vpadd_f32(vget_low_f32(accumb),vget_high_f32(accumb));
358         dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,1);
359 
360         accum2 = vpadd_f32(vget_low_f32(accumc),vget_high_f32(accumc));
361         dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,2);
362 
363         accum2 = vpadd_f32(vget_low_f32(accumd),vget_high_f32(accumd));
364         dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,3);
365 
366 
367         blkCnt = S->vectorDimension & 3;
368         while (blkCnt > 0U)
369         {
370             dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,0) + *pIn * *pSupporta++, dotV,0);
371             dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,1) + *pIn * *pSupportb++, dotV,1);
372             dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,2) + *pIn * *pSupportc++, dotV,2);
373             dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,3) + *pIn * *pSupportd++, dotV,3);
374 
375             pIn++;
376 
377             blkCnt -- ;
378         }
379 
380         vec1 = vld1q_f32(pDualCoefs);
381         pDualCoefs += 4;
382 
383         accum = vmulq_f32(vec1,dotV);
384         accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
385         sum += vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);
386 
387         pSupporta += 3*S->vectorDimension;
388         pSupportb += 3*S->vectorDimension;
389         pSupportc += 3*S->vectorDimension;
390         pSupportd += 3*S->vectorDimension;
391 
392         vectorBlkCnt -- ;
393     }
394 
395     pSupport = pSupporta;
396     vectorBlkCnt = S->nbOfSupportVectors & 3;
397     while (vectorBlkCnt > 0U)
398     {
399         accum = vdupq_n_f32(0);
400         dot = 0.0f;
401         pIn = in;
402 
403         blkCnt = S->vectorDimension >> 2;
404         while (blkCnt > 0U)
405         {
406 
407             vec1 = vld1q_f32(pIn);
408             vec2 = vld1q_f32(pSupport);
409             pIn += 4;
410             pSupport += 4;
411 
412             accum = vmlaq_f32(accum, vec1,vec2);
413 
414             blkCnt -- ;
415         }
416         accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
417         dot = vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);
418 
419 
420         blkCnt = S->vectorDimension & 3;
421         while (blkCnt > 0U)
422         {
423             dot = dot + *pIn++ * *pSupport++;
424 
425             blkCnt -- ;
426         }
427 
428         sum += *pDualCoefs++ * dot;
429         vectorBlkCnt -- ;
430     }
431 
432     *pResult=S->classes[STEP(sum)];
433 }
434 #else
arm_svm_linear_predict_f32(const arm_svm_linear_instance_f32 * S,const float32_t * in,int32_t * pResult)435 void arm_svm_linear_predict_f32(
436     const arm_svm_linear_instance_f32 *S,
437     const float32_t * in,
438     int32_t * pResult)
439 {
440     float32_t sum=S->intercept;
441     float32_t dot=0;
442     uint32_t i,j;
443     const float32_t *pSupport = S->supportVectors;
444 
445     for(i=0; i < S->nbOfSupportVectors; i++)
446     {
447         dot=0;
448         for(j=0; j < S->vectorDimension; j++)
449         {
450             dot = dot + in[j]* *pSupport++;
451         }
452         sum += S->dualCoefficients[i] * dot;
453     }
454     *pResult=S->classes[STEP(sum)];
455 }
456 #endif
457 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
458 
459 /**
460  * @} end of linearsvm group
461  */
462