• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_svm_polynomial_predict_f32.c
4  * Description:  SVM Polynomial Classifier
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/svm_functions.h"
30 #include <limits.h>
31 #include <math.h>
32 
33 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
34 #include "arm_vec_math.h"
35 #endif
36 
37 /**
38  * @addtogroup polysvm
39  * @{
40  */
41 
42 
43 /**
44  * @brief SVM polynomial prediction
45  * @param[in]    S          Pointer to an instance of the polynomial SVM structure.
46  * @param[in]    in         Pointer to input vector
47  * @param[out]   pResult    Decision value
48  * @return none.
49  *
50  */
51 
52 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
53 
54 #include "arm_helium_utils.h"
55 #include "arm_vec_math.h"
56 
arm_svm_polynomial_predict_f32(const arm_svm_polynomial_instance_f32 * S,const float32_t * in,int32_t * pResult)57 void arm_svm_polynomial_predict_f32(
58     const arm_svm_polynomial_instance_f32 *S,
59     const float32_t * in,
60     int32_t * pResult)
61 {
62         /* inlined Matrix x Vector function interleaved with dot prod */
63     uint32_t        numRows = S->nbOfSupportVectors;
64     uint32_t        numCols = S->vectorDimension;
65     const float32_t *pSupport = S->supportVectors;
66     const float32_t *pSrcA = pSupport;
67     const float32_t *pInA0;
68     const float32_t *pInA1;
69     uint32_t         row;
70     uint32_t         blkCnt;     /* loop counters */
71     const float32_t *pDualCoef = S->dualCoefficients;
72     float32_t       sum = S->intercept;
73     f32x4_t         vSum = vdupq_n_f32(0.0f);
74 
75     row = numRows;
76 
77     /*
78      * compute 4 rows in parrallel
79      */
80     while (row >= 4) {
81         const float32_t *pInA2, *pInA3;
82         float32_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
83         f32x4_t         vecIn, acc0, acc1, acc2, acc3;
84         float32_t const *pSrcVecPtr = in;
85 
86         /*
87          * Initialize the pointers to 4 consecutive MatrixA rows
88          */
89         pInA0 = pSrcA;
90         pInA1 = pInA0 + numCols;
91         pInA2 = pInA1 + numCols;
92         pInA3 = pInA2 + numCols;
93         /*
94          * Initialize the vector pointer
95          */
96         pInVec = pSrcVecPtr;
97         /*
98          * reset accumulators
99          */
100         acc0 = vdupq_n_f32(0.0f);
101         acc1 = vdupq_n_f32(0.0f);
102         acc2 = vdupq_n_f32(0.0f);
103         acc3 = vdupq_n_f32(0.0f);
104 
105         pSrcA0Vec = pInA0;
106         pSrcA1Vec = pInA1;
107         pSrcA2Vec = pInA2;
108         pSrcA3Vec = pInA3;
109 
110         blkCnt = numCols >> 2;
111         while (blkCnt > 0U) {
112             f32x4_t         vecA;
113 
114             vecIn = vld1q(pInVec);
115             pInVec += 4;
116             vecA = vld1q(pSrcA0Vec);
117             pSrcA0Vec += 4;
118             acc0 = vfmaq(acc0, vecIn, vecA);
119             vecA = vld1q(pSrcA1Vec);
120             pSrcA1Vec += 4;
121             acc1 = vfmaq(acc1, vecIn, vecA);
122             vecA = vld1q(pSrcA2Vec);
123             pSrcA2Vec += 4;
124             acc2 = vfmaq(acc2, vecIn, vecA);
125             vecA = vld1q(pSrcA3Vec);
126             pSrcA3Vec += 4;
127             acc3 = vfmaq(acc3, vecIn, vecA);
128 
129             blkCnt--;
130         }
131         /*
132          * tail
133          * (will be merged thru tail predication)
134          */
135         blkCnt = numCols & 3;
136         if (blkCnt > 0U) {
137             mve_pred16_t    p0 = vctp32q(blkCnt);
138             f32x4_t         vecA;
139 
140             vecIn = vldrwq_z_f32(pInVec, p0);
141             vecA = vldrwq_z_f32(pSrcA0Vec, p0);
142             acc0 = vfmaq(acc0, vecIn, vecA);
143             vecA = vldrwq_z_f32(pSrcA1Vec, p0);
144             acc1 = vfmaq(acc1, vecIn, vecA);
145             vecA = vldrwq_z_f32(pSrcA2Vec, p0);
146             acc2 = vfmaq(acc2, vecIn, vecA);
147             vecA = vldrwq_z_f32(pSrcA3Vec, p0);
148             acc3 = vfmaq(acc3, vecIn, vecA);
149         }
150         /*
151          * Sum the partial parts
152          */
153         f32x4_t         vtmp = vuninitializedq_f32();
154         vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);
155         vtmp = vsetq_lane(vecAddAcrossF32Mve(acc1), vtmp, 1);
156         vtmp = vsetq_lane(vecAddAcrossF32Mve(acc2), vtmp, 2);
157         vtmp = vsetq_lane(vecAddAcrossF32Mve(acc3), vtmp, 3);
158 
159         vSum = vfmaq_f32(vSum, vld1q(pDualCoef),
160                              arm_vec_exponent_f32
161                              (vaddq_n_f32(vmulq_n_f32(vtmp, S->gamma), S->coef0), S->degree));
162 
163         pDualCoef += 4;
164 
165         pSrcA += numCols * 4;
166         /*
167          * Decrement the row loop counter
168          */
169         row -= 4;
170     }
171 
172     /*
173      * compute 2 rows in parrallel
174      */
175     if (row >= 2) {
176         float32_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
177         f32x4_t         vecIn, acc0, acc1;
178         float32_t const *pSrcVecPtr = in;
179 
180         /*
181          * Initialize the pointers to 2 consecutive MatrixA rows
182          */
183         pInA0 = pSrcA;
184         pInA1 = pInA0 + numCols;
185         /*
186          * Initialize the vector pointer
187          */
188         pInVec = pSrcVecPtr;
189         /*
190          * reset accumulators
191          */
192         acc0 = vdupq_n_f32(0.0f);
193         acc1 = vdupq_n_f32(0.0f);
194         pSrcA0Vec = pInA0;
195         pSrcA1Vec = pInA1;
196 
197         blkCnt = numCols >> 2;
198         while (blkCnt > 0U) {
199             f32x4_t         vecA;
200 
201             vecIn = vld1q(pInVec);
202             pInVec += 4;
203             vecA = vld1q(pSrcA0Vec);
204             pSrcA0Vec += 4;
205             acc0 = vfmaq(acc0, vecIn, vecA);
206             vecA = vld1q(pSrcA1Vec);
207             pSrcA1Vec += 4;
208             acc1 = vfmaq(acc1, vecIn, vecA);
209 
210             blkCnt--;
211         }
212         /*
213          * tail
214          * (will be merged thru tail predication)
215          */
216         blkCnt = numCols & 3;
217         if (blkCnt > 0U) {
218             mve_pred16_t    p0 = vctp32q(blkCnt);
219             f32x4_t         vecA;
220 
221             vecIn = vldrwq_z_f32(pInVec, p0);
222             vecA = vldrwq_z_f32(pSrcA0Vec, p0);
223             acc0 = vfmaq(acc0, vecIn, vecA);
224             vecA = vldrwq_z_f32(pSrcA1Vec, p0);
225             acc1 = vfmaq(acc1, vecIn, vecA);
226         }
227         /*
228          * Sum the partial parts
229          */
230         f32x4_t         vtmp = vuninitializedq_f32();
231         vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);
232         vtmp = vsetq_lane(vecAddAcrossF32Mve(acc1), vtmp, 1);
233 
234         vSum = vfmaq_m_f32(vSum, vld1q(pDualCoef),
235                              arm_vec_exponent_f32
236                              (vaddq_n_f32(vmulq_n_f32(vtmp, S->gamma), S->coef0), S->degree),
237                              vctp32q(2));
238 
239         pDualCoef += 2;
240         pSrcA += numCols * 2;
241         row -= 2;
242     }
243 
244     if (row >= 1) {
245         f32x4_t         vecIn, acc0;
246         float32_t const *pSrcA0Vec, *pInVec;
247         float32_t const *pSrcVecPtr = in;
248         /*
249          * Initialize the pointers to last MatrixA row
250          */
251         pInA0 = pSrcA;
252         /*
253          * Initialize the vector pointer
254          */
255         pInVec = pSrcVecPtr;
256         /*
257          * reset accumulators
258          */
259         acc0 = vdupq_n_f32(0.0f);
260 
261         pSrcA0Vec = pInA0;
262 
263         blkCnt = numCols >> 2;
264         while (blkCnt > 0U) {
265             f32x4_t         vecA;
266 
267             vecIn = vld1q(pInVec);
268             pInVec += 4;
269             vecA = vld1q(pSrcA0Vec);
270             pSrcA0Vec += 4;
271             acc0 = vfmaq(acc0, vecIn, vecA);
272 
273             blkCnt--;
274         }
275         /*
276          * tail
277          * (will be merged thru tail predication)
278          */
279         blkCnt = numCols & 3;
280         if (blkCnt > 0U) {
281             mve_pred16_t    p0 = vctp32q(blkCnt);
282             f32x4_t         vecA;
283 
284             vecIn = vldrwq_z_f32(pInVec, p0);
285             vecA = vldrwq_z_f32(pSrcA0Vec, p0);
286             acc0 = vfmaq(acc0, vecIn, vecA);
287         }
288         /*
289          * Sum the partial parts
290          */
291         f32x4_t         vtmp = vuninitializedq_f32();
292         vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);
293         vSum = vfmaq_m_f32(vSum, vld1q(pDualCoef),
294                              arm_vec_exponent_f32
295                              (vaddq_n_f32(vmulq_n_f32(vtmp, S->gamma), S->coef0), S->degree),
296                              vctp32q(1));
297     }
298     sum += vecAddAcrossF32Mve(vSum);
299 
300 
301     *pResult = S->classes[STEP(sum)];
302 }
303 
304 #else
305 #if defined(ARM_MATH_NEON)
arm_svm_polynomial_predict_f32(const arm_svm_polynomial_instance_f32 * S,const float32_t * in,int32_t * pResult)306 void arm_svm_polynomial_predict_f32(
307     const arm_svm_polynomial_instance_f32 *S,
308     const float32_t * in,
309     int32_t * pResult)
310 {
311     float32_t sum = S->intercept;
312 
313     float32_t dot;
314     float32x4_t dotV;
315 
316     float32x4_t accuma,accumb,accumc,accumd,accum;
317     float32x2_t accum2;
318     float32x4_t vec1;
319     float32x4_t coef0 = vdupq_n_f32(S->coef0);
320 
321     float32x4_t vec2,vec2a,vec2b,vec2c,vec2d;
322 
323     uint32_t blkCnt;
324     uint32_t vectorBlkCnt;
325 
326     const float32_t *pIn = in;
327 
328     const float32_t *pSupport = S->supportVectors;
329 
330     const float32_t *pSupporta = S->supportVectors;
331     const float32_t *pSupportb;
332     const float32_t *pSupportc;
333     const float32_t *pSupportd;
334 
335     pSupportb = pSupporta + S->vectorDimension;
336     pSupportc = pSupportb + S->vectorDimension;
337     pSupportd = pSupportc + S->vectorDimension;
338 
339     const float32_t *pDualCoefs = S->dualCoefficients;
340 
341     vectorBlkCnt = S->nbOfSupportVectors >> 2;
342     while (vectorBlkCnt > 0U)
343     {
344         accuma = vdupq_n_f32(0);
345         accumb = vdupq_n_f32(0);
346         accumc = vdupq_n_f32(0);
347         accumd = vdupq_n_f32(0);
348 
349         pIn = in;
350 
351         blkCnt = S->vectorDimension >> 2;
352         while (blkCnt > 0U)
353         {
354 
355             vec1 = vld1q_f32(pIn);
356             vec2a = vld1q_f32(pSupporta);
357             vec2b = vld1q_f32(pSupportb);
358             vec2c = vld1q_f32(pSupportc);
359             vec2d = vld1q_f32(pSupportd);
360 
361             pIn += 4;
362             pSupporta += 4;
363             pSupportb += 4;
364             pSupportc += 4;
365             pSupportd += 4;
366 
367             accuma = vmlaq_f32(accuma, vec1,vec2a);
368             accumb = vmlaq_f32(accumb, vec1,vec2b);
369             accumc = vmlaq_f32(accumc, vec1,vec2c);
370             accumd = vmlaq_f32(accumd, vec1,vec2d);
371 
372             blkCnt -- ;
373         }
374         accum2 = vpadd_f32(vget_low_f32(accuma),vget_high_f32(accuma));
375         dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,0);
376 
377         accum2 = vpadd_f32(vget_low_f32(accumb),vget_high_f32(accumb));
378         dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,1);
379 
380         accum2 = vpadd_f32(vget_low_f32(accumc),vget_high_f32(accumc));
381         dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,2);
382 
383         accum2 = vpadd_f32(vget_low_f32(accumd),vget_high_f32(accumd));
384         dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,3);
385 
386 
387         blkCnt = S->vectorDimension & 3;
388         while (blkCnt > 0U)
389         {
390             dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,0) + *pIn * *pSupporta++, dotV,0);
391             dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,1) + *pIn * *pSupportb++, dotV,1);
392             dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,2) + *pIn * *pSupportc++, dotV,2);
393             dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,3) + *pIn * *pSupportd++, dotV,3);
394 
395             pIn++;
396 
397             blkCnt -- ;
398         }
399 
400         vec1 = vld1q_f32(pDualCoefs);
401         pDualCoefs += 4;
402 
403         // To vectorize later
404         dotV = vmulq_n_f32(dotV, S->gamma);
405         dotV = vaddq_f32(dotV, coef0);
406 
407         dotV = arm_vec_exponent_f32(dotV,S->degree);
408 
409         accum = vmulq_f32(vec1,dotV);
410         accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
411         sum += vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);
412 
413         pSupporta += 3*S->vectorDimension;
414         pSupportb += 3*S->vectorDimension;
415         pSupportc += 3*S->vectorDimension;
416         pSupportd += 3*S->vectorDimension;
417 
418         vectorBlkCnt -- ;
419     }
420 
421     pSupport = pSupporta;
422     vectorBlkCnt = S->nbOfSupportVectors & 3;
423 
424     while (vectorBlkCnt > 0U)
425     {
426         accum = vdupq_n_f32(0);
427         dot = 0.0f;
428         pIn = in;
429 
430         blkCnt = S->vectorDimension >> 2;
431         while (blkCnt > 0U)
432         {
433 
434             vec1 = vld1q_f32(pIn);
435             vec2 = vld1q_f32(pSupport);
436             pIn += 4;
437             pSupport += 4;
438 
439             accum = vmlaq_f32(accum, vec1,vec2);
440 
441             blkCnt -- ;
442         }
443         accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
444         dot = vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);
445 
446 
447         blkCnt = S->vectorDimension & 3;
448         while (blkCnt > 0U)
449         {
450             dot = dot + *pIn++ * *pSupport++;
451 
452             blkCnt -- ;
453         }
454 
455         sum += *pDualCoefs++ * arm_exponent_f32(S->gamma * dot + S->coef0, S->degree);
456         vectorBlkCnt -- ;
457     }
458 
459     *pResult=S->classes[STEP(sum)];
460 }
461 #else
arm_svm_polynomial_predict_f32(const arm_svm_polynomial_instance_f32 * S,const float32_t * in,int32_t * pResult)462 void arm_svm_polynomial_predict_f32(
463     const arm_svm_polynomial_instance_f32 *S,
464     const float32_t * in,
465     int32_t * pResult)
466 {
467     float32_t sum=S->intercept;
468     float32_t dot=0;
469     uint32_t i,j;
470     const float32_t *pSupport = S->supportVectors;
471 
472     for(i=0; i < S->nbOfSupportVectors; i++)
473     {
474         dot=0;
475         for(j=0; j < S->vectorDimension; j++)
476         {
477             dot = dot + in[j]* *pSupport++;
478         }
479         sum += S->dualCoefficients[i] * arm_exponent_f32(S->gamma * dot + S->coef0, S->degree);
480     }
481 
482     *pResult=S->classes[STEP(sum)];
483 }
484 #endif
485 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
486 
487 
488 /**
489  * @} end of polysvm group
490  */
491