• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_svm_sigmoid_predict_f32.c
4  * Description:  SVM Sigmoid Classifier
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/svm_functions.h"
30 #include <limits.h>
31 #include <math.h>
32 
33 /**
34  * @addtogroup sigmoidsvm
35  * @{
36  */
37 
38 
39 
40 /**
41  * @brief SVM sigmoid prediction
42  * @param[in]    S        Pointer to an instance of the sigmoid SVM structure.
43  * @param[in]    in       Pointer to input vector
44  * @param[out]   pResult  Decision value
45  * @return none.
46  *
47  */
48 
49 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
50 
51 #include "arm_helium_utils.h"
52 #include "arm_vec_math.h"
53 
arm_svm_sigmoid_predict_f32(const arm_svm_sigmoid_instance_f32 * S,const float32_t * in,int32_t * pResult)54 void arm_svm_sigmoid_predict_f32(
55     const arm_svm_sigmoid_instance_f32 *S,
56     const float32_t * in,
57     int32_t * pResult)
58 {
59         /* inlined Matrix x Vector function interleaved with dot prod */
60     uint32_t        numRows = S->nbOfSupportVectors;
61     uint32_t        numCols = S->vectorDimension;
62     const float32_t *pSupport = S->supportVectors;
63     const float32_t *pSrcA = pSupport;
64     const float32_t *pInA0;
65     const float32_t *pInA1;
66     uint32_t         row;
67     uint32_t         blkCnt;     /* loop counters */
68     const float32_t *pDualCoef = S->dualCoefficients;
69     float32_t       sum = S->intercept;
70     f32x4_t         vSum = vdupq_n_f32(0.0f);
71 
72     row = numRows;
73 
74     /*
75      * compute 4 rows in parrallel
76      */
77     while (row >= 4) {
78         const float32_t *pInA2, *pInA3;
79         float32_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
80         f32x4_t         vecIn, acc0, acc1, acc2, acc3;
81         float32_t const *pSrcVecPtr = in;
82 
83         /*
84          * Initialize the pointers to 4 consecutive MatrixA rows
85          */
86         pInA0 = pSrcA;
87         pInA1 = pInA0 + numCols;
88         pInA2 = pInA1 + numCols;
89         pInA3 = pInA2 + numCols;
90         /*
91          * Initialize the vector pointer
92          */
93         pInVec = pSrcVecPtr;
94         /*
95          * reset accumulators
96          */
97         acc0 = vdupq_n_f32(0.0f);
98         acc1 = vdupq_n_f32(0.0f);
99         acc2 = vdupq_n_f32(0.0f);
100         acc3 = vdupq_n_f32(0.0f);
101 
102         pSrcA0Vec = pInA0;
103         pSrcA1Vec = pInA1;
104         pSrcA2Vec = pInA2;
105         pSrcA3Vec = pInA3;
106 
107         blkCnt = numCols >> 2;
108         while (blkCnt > 0U) {
109             f32x4_t         vecA;
110 
111             vecIn = vld1q(pInVec);
112             pInVec += 4;
113             vecA = vld1q(pSrcA0Vec);
114             pSrcA0Vec += 4;
115             acc0 = vfmaq(acc0, vecIn, vecA);
116             vecA = vld1q(pSrcA1Vec);
117             pSrcA1Vec += 4;
118             acc1 = vfmaq(acc1, vecIn, vecA);
119             vecA = vld1q(pSrcA2Vec);
120             pSrcA2Vec += 4;
121             acc2 = vfmaq(acc2, vecIn, vecA);
122             vecA = vld1q(pSrcA3Vec);
123             pSrcA3Vec += 4;
124             acc3 = vfmaq(acc3, vecIn, vecA);
125 
126             blkCnt--;
127         }
128         /*
129          * tail
130          * (will be merged thru tail predication)
131          */
132         blkCnt = numCols & 3;
133         if (blkCnt > 0U) {
134             mve_pred16_t    p0 = vctp32q(blkCnt);
135             f32x4_t         vecA;
136 
137             vecIn = vldrwq_z_f32(pInVec, p0);
138             vecA = vldrwq_z_f32(pSrcA0Vec, p0);
139             acc0 = vfmaq(acc0, vecIn, vecA);
140             vecA = vldrwq_z_f32(pSrcA1Vec, p0);
141             acc1 = vfmaq(acc1, vecIn, vecA);
142             vecA = vldrwq_z_f32(pSrcA2Vec, p0);
143             acc2 = vfmaq(acc2, vecIn, vecA);
144             vecA = vldrwq_z_f32(pSrcA3Vec, p0);
145             acc3 = vfmaq(acc3, vecIn, vecA);
146         }
147         /*
148          * Sum the partial parts
149          */
150         f32x4_t         vtmp = vuninitializedq_f32();
151         vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);
152         vtmp = vsetq_lane(vecAddAcrossF32Mve(acc1), vtmp, 1);
153         vtmp = vsetq_lane(vecAddAcrossF32Mve(acc2), vtmp, 2);
154         vtmp = vsetq_lane(vecAddAcrossF32Mve(acc3), vtmp, 3);
155 
156         vSum =
157             vfmaq_f32(vSum, vld1q(pDualCoef),
158                       vtanhq_f32(vaddq_n_f32(vmulq_n_f32(vtmp, S->gamma), S->coef0)));
159 
160         pDualCoef += 4;
161 
162         pSrcA += numCols * 4;
163         /*
164          * Decrement the row loop counter
165          */
166         row -= 4;
167     }
168 
169     /*
170      * compute 2 rows in parrallel
171      */
172     if (row >= 2) {
173         float32_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
174         f32x4_t         vecIn, acc0, acc1;
175         float32_t const *pSrcVecPtr = in;
176 
177         /*
178          * Initialize the pointers to 2 consecutive MatrixA rows
179          */
180         pInA0 = pSrcA;
181         pInA1 = pInA0 + numCols;
182         /*
183          * Initialize the vector pointer
184          */
185         pInVec = pSrcVecPtr;
186         /*
187          * reset accumulators
188          */
189         acc0 = vdupq_n_f32(0.0f);
190         acc1 = vdupq_n_f32(0.0f);
191         pSrcA0Vec = pInA0;
192         pSrcA1Vec = pInA1;
193 
194         blkCnt = numCols >> 2;
195         while (blkCnt > 0U) {
196             f32x4_t         vecA;
197 
198             vecIn = vld1q(pInVec);
199             pInVec += 4;
200             vecA = vld1q(pSrcA0Vec);
201             pSrcA0Vec += 4;
202             acc0 = vfmaq(acc0, vecIn, vecA);
203             vecA = vld1q(pSrcA1Vec);
204             pSrcA1Vec += 4;
205             acc1 = vfmaq(acc1, vecIn, vecA);
206 
207             blkCnt--;
208         }
209         /*
210          * tail
211          * (will be merged thru tail predication)
212          */
213         blkCnt = numCols & 3;
214         if (blkCnt > 0U) {
215             mve_pred16_t    p0 = vctp32q(blkCnt);
216             f32x4_t         vecA;
217 
218             vecIn = vldrwq_z_f32(pInVec, p0);
219             vecA = vldrwq_z_f32(pSrcA0Vec, p0);
220             acc0 = vfmaq(acc0, vecIn, vecA);
221             vecA = vldrwq_z_f32(pSrcA1Vec, p0);
222             acc1 = vfmaq(acc1, vecIn, vecA);
223         }
224         /*
225          * Sum the partial parts
226          */
227         f32x4_t         vtmp = vuninitializedq_f32();
228         vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);
229         vtmp = vsetq_lane(vecAddAcrossF32Mve(acc1), vtmp, 1);
230 
231         vSum =
232             vfmaq_m_f32(vSum, vld1q(pDualCoef),
233                         vtanhq_f32(vaddq_n_f32(vmulq_n_f32(vtmp, S->gamma), S->coef0)),
234                         vctp32q(2));
235 
236         pSrcA += numCols * 2;
237         row -= 2;
238     }
239 
240     if (row >= 1) {
241         f32x4_t         vecIn, acc0;
242         float32_t const *pSrcA0Vec, *pInVec;
243         float32_t const *pSrcVecPtr = in;
244         /*
245          * Initialize the pointers to last MatrixA row
246          */
247         pInA0 = pSrcA;
248         /*
249          * Initialize the vector pointer
250          */
251         pInVec = pSrcVecPtr;
252         /*
253          * reset accumulators
254          */
255         acc0 = vdupq_n_f32(0.0f);
256 
257         pSrcA0Vec = pInA0;
258 
259         blkCnt = numCols >> 2;
260         while (blkCnt > 0U) {
261             f32x4_t         vecA;
262 
263             vecIn = vld1q(pInVec);
264             pInVec += 4;
265             vecA = vld1q(pSrcA0Vec);
266             pSrcA0Vec += 4;
267             acc0 = vfmaq(acc0, vecIn, vecA);
268 
269             blkCnt--;
270         }
271         /*
272          * tail
273          * (will be merged thru tail predication)
274          */
275         blkCnt = numCols & 3;
276         if (blkCnt > 0U) {
277             mve_pred16_t    p0 = vctp32q(blkCnt);
278             f32x4_t         vecA;
279 
280             vecIn = vldrwq_z_f32(pInVec, p0);
281             vecA = vldrwq_z_f32(pSrcA0Vec, p0);
282             acc0 = vfmaq(acc0, vecIn, vecA);
283         }
284         /*
285          * Sum the partial parts
286          */
287         f32x4_t         vtmp = vuninitializedq_f32();
288         vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);
289 
290         vSum =
291             vfmaq_m_f32(vSum, vld1q(pDualCoef),
292                         vtanhq_f32(vaddq_n_f32(vmulq_n_f32(vtmp, S->gamma), S->coef0)),
293                         vctp32q(1));
294     }
295     sum += vecAddAcrossF32Mve(vSum);
296 
297     *pResult = S->classes[STEP(sum)];
298 }
299 
300 #else
301 #if defined(ARM_MATH_NEON)
302 #include "NEMath.h"
303 
arm_svm_sigmoid_predict_f32(const arm_svm_sigmoid_instance_f32 * S,const float32_t * in,int32_t * pResult)304 void arm_svm_sigmoid_predict_f32(
305     const arm_svm_sigmoid_instance_f32 *S,
306     const float32_t * in,
307     int32_t * pResult)
308 {
309     float32_t sum = S->intercept;
310 
311     float32_t dot;
312     float32x4_t dotV;
313 
314     float32x4_t accuma,accumb,accumc,accumd,accum;
315     float32x2_t accum2;
316     float32x4_t vec1;
317     float32x4_t coef0 = vdupq_n_f32(S->coef0);
318 
319     float32x4_t vec2,vec2a,vec2b,vec2c,vec2d;
320 
321     uint32_t blkCnt;
322     uint32_t vectorBlkCnt;
323 
324     const float32_t *pIn = in;
325 
326     const float32_t *pSupport = S->supportVectors;
327 
328     const float32_t *pSupporta = S->supportVectors;
329     const float32_t *pSupportb;
330     const float32_t *pSupportc;
331     const float32_t *pSupportd;
332 
333     pSupportb = pSupporta + S->vectorDimension;
334     pSupportc = pSupportb + S->vectorDimension;
335     pSupportd = pSupportc + S->vectorDimension;
336 
337     const float32_t *pDualCoefs = S->dualCoefficients;
338 
339     vectorBlkCnt = S->nbOfSupportVectors >> 2;
340     while (vectorBlkCnt > 0U)
341     {
342         accuma = vdupq_n_f32(0);
343         accumb = vdupq_n_f32(0);
344         accumc = vdupq_n_f32(0);
345         accumd = vdupq_n_f32(0);
346 
347         pIn = in;
348 
349         blkCnt = S->vectorDimension >> 2;
350         while (blkCnt > 0U)
351         {
352 
353             vec1 = vld1q_f32(pIn);
354             vec2a = vld1q_f32(pSupporta);
355             vec2b = vld1q_f32(pSupportb);
356             vec2c = vld1q_f32(pSupportc);
357             vec2d = vld1q_f32(pSupportd);
358 
359             pIn += 4;
360             pSupporta += 4;
361             pSupportb += 4;
362             pSupportc += 4;
363             pSupportd += 4;
364 
365             accuma = vmlaq_f32(accuma, vec1,vec2a);
366             accumb = vmlaq_f32(accumb, vec1,vec2b);
367             accumc = vmlaq_f32(accumc, vec1,vec2c);
368             accumd = vmlaq_f32(accumd, vec1,vec2d);
369 
370             blkCnt -- ;
371         }
372         accum2 = vpadd_f32(vget_low_f32(accuma),vget_high_f32(accuma));
373         dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,0);
374 
375         accum2 = vpadd_f32(vget_low_f32(accumb),vget_high_f32(accumb));
376         dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,1);
377 
378         accum2 = vpadd_f32(vget_low_f32(accumc),vget_high_f32(accumc));
379         dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,2);
380 
381         accum2 = vpadd_f32(vget_low_f32(accumd),vget_high_f32(accumd));
382         dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,3);
383 
384 
385         blkCnt = S->vectorDimension & 3;
386         while (blkCnt > 0U)
387         {
388             dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,0) + *pIn * *pSupporta++, dotV,0);
389             dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,1) + *pIn * *pSupportb++, dotV,1);
390             dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,2) + *pIn * *pSupportc++, dotV,2);
391             dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,3) + *pIn * *pSupportd++, dotV,3);
392 
393             pIn++;
394 
395             blkCnt -- ;
396         }
397 
398         vec1 = vld1q_f32(pDualCoefs);
399         pDualCoefs += 4;
400 
401         // To vectorize later
402         dotV = vmulq_n_f32(dotV, S->gamma);
403         dotV = vaddq_f32(dotV, coef0);
404 
405         dotV = vtanhq_f32(dotV);
406 
407         accum = vmulq_f32(vec1,dotV);
408         accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
409         sum += vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);
410 
411         pSupporta += 3*S->vectorDimension;
412         pSupportb += 3*S->vectorDimension;
413         pSupportc += 3*S->vectorDimension;
414         pSupportd += 3*S->vectorDimension;
415 
416         vectorBlkCnt -- ;
417     }
418 
419     pSupport = pSupporta;
420     vectorBlkCnt = S->nbOfSupportVectors & 3;
421 
422     while (vectorBlkCnt > 0U)
423     {
424         accum = vdupq_n_f32(0);
425         dot = 0.0f;
426         pIn = in;
427 
428         blkCnt = S->vectorDimension >> 2;
429         while (blkCnt > 0U)
430         {
431 
432             vec1 = vld1q_f32(pIn);
433             vec2 = vld1q_f32(pSupport);
434             pIn += 4;
435             pSupport += 4;
436 
437             accum = vmlaq_f32(accum, vec1,vec2);
438 
439             blkCnt -- ;
440         }
441         accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
442         dot = vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);
443 
444 
445         blkCnt = S->vectorDimension & 3;
446         while (blkCnt > 0U)
447         {
448             dot = dot + *pIn++ * *pSupport++;
449 
450             blkCnt -- ;
451         }
452 
453         sum += *pDualCoefs++ * tanhf(S->gamma * dot + S->coef0);
454         vectorBlkCnt -- ;
455     }
456 
457     *pResult=S->classes[STEP(sum)];
458 }
459 #else
arm_svm_sigmoid_predict_f32(const arm_svm_sigmoid_instance_f32 * S,const float32_t * in,int32_t * pResult)460 void arm_svm_sigmoid_predict_f32(
461     const arm_svm_sigmoid_instance_f32 *S,
462     const float32_t * in,
463     int32_t * pResult)
464 {
465     float32_t sum=S->intercept;
466     float32_t dot=0;
467     uint32_t i,j;
468     const float32_t *pSupport = S->supportVectors;
469 
470     for(i=0; i < S->nbOfSupportVectors; i++)
471     {
472         dot=0;
473         for(j=0; j < S->vectorDimension; j++)
474         {
475             dot = dot + in[j]* *pSupport++;
476         }
477         sum += S->dualCoefficients[i] * tanhf(S->gamma * dot + S->coef0);
478     }
479     *pResult=S->classes[STEP(sum)];
480 }
481 
482 #endif
483 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
484 
485 /**
486  * @} end of sigmoidsvm group
487  */
488