• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_svm_linear_predict_f32.c
4  * Description:  SVM Linear Classifier
5  *
6  *
7  * Target Processor: Cortex-M and Cortex-A cores
8  * -------------------------------------------------------------------- */
9 /*
10  * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
11  *
12  * SPDX-License-Identifier: Apache-2.0
13  *
14  * Licensed under the Apache License, Version 2.0 (the License); you may
15  * not use this file except in compliance with the License.
16  * You may obtain a copy of the License at
17  *
18  * www.apache.org/licenses/LICENSE-2.0
19  *
20  * Unless required by applicable law or agreed to in writing, software
21  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
22  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
23  * See the License for the specific language governing permissions and
24  * limitations under the License.
25  */
26 
27 #include "arm_math.h"
28 #include <limits.h>
29 #include <math.h>
30 
31 
32 /**
33  * @addtogroup groupSVM
34  * @{
35  */
36 
37 
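/**
 * @brief SVM linear prediction
 * @param[in]    S          Pointer to an instance of the linear SVM structure.
 * @param[in]    in         Pointer to the input vector (S->vectorDimension values are read).
 * @param[out]   pResult    Predicted class label, taken from S->classes according to
 *                          STEP() of the decision value
 *                          (intercept + sum of dualCoefficients[i] * dot(in, supportVector i)).
 * @return none.
 *
 */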
46 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
47 
48 #include "arm_helium_utils.h"
49 
arm_svm_linear_predict_f32(const arm_svm_linear_instance_f32 * S,const float32_t * in,int32_t * pResult)50 void arm_svm_linear_predict_f32(
51     const arm_svm_linear_instance_f32 *S,
52     const float32_t * in,
53     int32_t * pResult)
54 {
55         /* inlined Matrix x Vector function interleaved with dot prod */
56     uint32_t        numRows = S->nbOfSupportVectors;
57     uint32_t        numCols = S->vectorDimension;
58     const float32_t *pSupport = S->supportVectors;
59     const float32_t *pSrcA = pSupport;
60     const float32_t *pInA0;
61     const float32_t *pInA1;
62     uint32_t         row;
63     uint32_t         blkCnt;     /* loop counters */
64     const float32_t *pDualCoef = S->dualCoefficients;
65     float32_t       sum = S->intercept;
66     row = numRows;
67 
68     /*
69      * compute 4 rows in parrallel
70      */
71     while (row >= 4)
72     {
73         const float32_t *pInA2, *pInA3;
74         float32_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
75         f32x4_t         vecIn, acc0, acc1, acc2, acc3;
76         float32_t const *pSrcVecPtr = in;
77 
78         /*
79          * Initialize the pointers to 4 consecutive MatrixA rows
80          */
81         pInA0 = pSrcA;
82         pInA1 = pInA0 + numCols;
83         pInA2 = pInA1 + numCols;
84         pInA3 = pInA2 + numCols;
85         /*
86          * Initialize the vector pointer
87          */
88         pInVec = pSrcVecPtr;
89         /*
90          * reset accumulators
91          */
92         acc0 = vdupq_n_f32(0.0f);
93         acc1 = vdupq_n_f32(0.0f);
94         acc2 = vdupq_n_f32(0.0f);
95         acc3 = vdupq_n_f32(0.0f);
96 
97         pSrcA0Vec = pInA0;
98         pSrcA1Vec = pInA1;
99         pSrcA2Vec = pInA2;
100         pSrcA3Vec = pInA3;
101 
102         blkCnt = numCols >> 2;
103         while (blkCnt > 0U) {
104             f32x4_t         vecA;
105 
106             vecIn = vld1q(pInVec);
107             pInVec += 4;
108             vecA = vld1q(pSrcA0Vec);
109             pSrcA0Vec += 4;
110             acc0 = vfmaq(acc0, vecIn, vecA);
111             vecA = vld1q(pSrcA1Vec);
112             pSrcA1Vec += 4;
113             acc1 = vfmaq(acc1, vecIn, vecA);
114             vecA = vld1q(pSrcA2Vec);
115             pSrcA2Vec += 4;
116             acc2 = vfmaq(acc2, vecIn, vecA);
117             vecA = vld1q(pSrcA3Vec);
118             pSrcA3Vec += 4;
119             acc3 = vfmaq(acc3, vecIn, vecA);
120 
121             blkCnt--;
122         }
123         /*
124          * tail
125          * (will be merged thru tail predication)
126          */
127         blkCnt = numCols & 3;
128         if (blkCnt > 0U) {
129             mve_pred16_t    p0 = vctp32q(blkCnt);
130             f32x4_t         vecA;
131 
132             vecIn = vldrwq_z_f32(pInVec, p0);
133             vecA = vldrwq_z_f32(pSrcA0Vec, p0);
134             acc0 = vfmaq(acc0, vecIn, vecA);
135             vecA = vldrwq_z_f32(pSrcA1Vec, p0);
136             acc1 = vfmaq(acc1, vecIn, vecA);
137             vecA = vldrwq_z_f32(pSrcA2Vec, p0);
138             acc2 = vfmaq(acc2, vecIn, vecA);
139             vecA = vldrwq_z_f32(pSrcA3Vec, p0);
140             acc3 = vfmaq(acc3, vecIn, vecA);
141         }
142         /*
143          * Sum the partial parts
144          */
145         sum += *pDualCoef++ * vecAddAcrossF32Mve(acc0);
146         sum += *pDualCoef++ * vecAddAcrossF32Mve(acc1);
147         sum += *pDualCoef++ * vecAddAcrossF32Mve(acc2);
148         sum += *pDualCoef++ * vecAddAcrossF32Mve(acc3);
149 
150         pSrcA += numCols * 4;
151         /*
152          * Decrement the row loop counter
153          */
154         row -= 4;
155     }
156 
157     /*
158      * compute 2 rows in parallel
159      */
160     if (row >= 2) {
161         float32_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
162         f32x4_t         vecIn, acc0, acc1;
163         float32_t const *pSrcVecPtr = in;
164 
165         /*
166          * Initialize the pointers to 2 consecutive MatrixA rows
167          */
168         pInA0 = pSrcA;
169         pInA1 = pInA0 + numCols;
170         /*
171          * Initialize the vector pointer
172          */
173         pInVec = pSrcVecPtr;
174         /*
175          * reset accumulators
176          */
177         acc0 = vdupq_n_f32(0.0f);
178         acc1 = vdupq_n_f32(0.0f);
179         pSrcA0Vec = pInA0;
180         pSrcA1Vec = pInA1;
181 
182         blkCnt = numCols >> 2;
183         while (blkCnt > 0U) {
184             f32x4_t         vecA;
185 
186             vecIn = vld1q(pInVec);
187             pInVec += 4;
188             vecA = vld1q(pSrcA0Vec);
189             pSrcA0Vec += 4;
190             acc0 = vfmaq(acc0, vecIn, vecA);
191             vecA = vld1q(pSrcA1Vec);
192             pSrcA1Vec += 4;
193             acc1 = vfmaq(acc1, vecIn, vecA);
194 
195             blkCnt--;
196         }
197         /*
198          * tail
199          * (will be merged thru tail predication)
200          */
201         blkCnt = numCols & 3;
202         if (blkCnt > 0U) {
203             mve_pred16_t    p0 = vctp32q(blkCnt);
204             f32x4_t         vecA;
205 
206             vecIn = vldrwq_z_f32(pInVec, p0);
207             vecA = vldrwq_z_f32(pSrcA0Vec, p0);
208             acc0 = vfmaq(acc0, vecIn, vecA);
209             vecA = vldrwq_z_f32(pSrcA1Vec, p0);
210             acc1 = vfmaq(acc1, vecIn, vecA);
211         }
212         /*
213          * Sum the partial parts
214          */
215         sum += *pDualCoef++ * vecAddAcrossF32Mve(acc0);
216         sum += *pDualCoef++ * vecAddAcrossF32Mve(acc1);
217 
218         pSrcA += numCols * 2;
219         row -= 2;
220     }
221 
222     if (row >= 1) {
223         f32x4_t         vecIn, acc0;
224         float32_t const *pSrcA0Vec, *pInVec;
225         float32_t const *pSrcVecPtr = in;
226         /*
227          * Initialize the pointers to last MatrixA row
228          */
229         pInA0 = pSrcA;
230         /*
231          * Initialize the vector pointer
232          */
233         pInVec = pSrcVecPtr;
234         /*
235          * reset accumulators
236          */
237         acc0 = vdupq_n_f32(0.0f);
238 
239         pSrcA0Vec = pInA0;
240 
241         blkCnt = numCols >> 2;
242         while (blkCnt > 0U) {
243             f32x4_t         vecA;
244 
245             vecIn = vld1q(pInVec);
246             pInVec += 4;
247             vecA = vld1q(pSrcA0Vec);
248             pSrcA0Vec += 4;
249             acc0 = vfmaq(acc0, vecIn, vecA);
250 
251             blkCnt--;
252         }
253         /*
254          * tail
255          * (will be merged thru tail predication)
256          */
257         blkCnt = numCols & 3;
258         if (blkCnt > 0U) {
259             mve_pred16_t    p0 = vctp32q(blkCnt);
260             f32x4_t         vecA;
261 
262             vecIn = vldrwq_z_f32(pInVec, p0);
263             vecA = vldrwq_z_f32(pSrcA0Vec, p0);
264             acc0 = vfmaq(acc0, vecIn, vecA);
265         }
266         /*
267          * Sum the partial parts
268          */
269         sum += *pDualCoef++ * vecAddAcrossF32Mve(acc0);
270 
271     }
272 
273     *pResult = S->classes[STEP(sum)];
274 }
275 
276 #else
277 #if defined(ARM_MATH_NEON)
arm_svm_linear_predict_f32(const arm_svm_linear_instance_f32 * S,const float32_t * in,int32_t * pResult)278 void arm_svm_linear_predict_f32(
279     const arm_svm_linear_instance_f32 *S,
280     const float32_t * in,
281     int32_t * pResult)
282 {
283     float32_t sum = S->intercept;
284 
285     float32_t dot;
286     float32x4_t dotV;
287 
288     float32x4_t accuma,accumb,accumc,accumd,accum;
289     float32x2_t accum2;
290     float32x4_t vec1;
291 
292     float32x4_t vec2,vec2a,vec2b,vec2c,vec2d;
293 
294     uint32_t blkCnt;
295     uint32_t vectorBlkCnt;
296 
297     const float32_t *pIn = in;
298 
299     const float32_t *pSupport = S->supportVectors;
300 
301     const float32_t *pSupporta = S->supportVectors;
302     const float32_t *pSupportb;
303     const float32_t *pSupportc;
304     const float32_t *pSupportd;
305 
306     pSupportb = pSupporta + S->vectorDimension;
307     pSupportc = pSupportb + S->vectorDimension;
308     pSupportd = pSupportc + S->vectorDimension;
309 
310     const float32_t *pDualCoefs = S->dualCoefficients;
311 
312     vectorBlkCnt = S->nbOfSupportVectors >> 2;
313 
314     while (vectorBlkCnt > 0U)
315     {
316         accuma = vdupq_n_f32(0);
317         accumb = vdupq_n_f32(0);
318         accumc = vdupq_n_f32(0);
319         accumd = vdupq_n_f32(0);
320 
321         pIn = in;
322 
323         blkCnt = S->vectorDimension >> 2;
324         while (blkCnt > 0U)
325         {
326 
327             vec1 = vld1q_f32(pIn);
328             vec2a = vld1q_f32(pSupporta);
329             vec2b = vld1q_f32(pSupportb);
330             vec2c = vld1q_f32(pSupportc);
331             vec2d = vld1q_f32(pSupportd);
332 
333             pIn += 4;
334             pSupporta += 4;
335             pSupportb += 4;
336             pSupportc += 4;
337             pSupportd += 4;
338 
339             accuma = vmlaq_f32(accuma, vec1,vec2a);
340             accumb = vmlaq_f32(accumb, vec1,vec2b);
341             accumc = vmlaq_f32(accumc, vec1,vec2c);
342             accumd = vmlaq_f32(accumd, vec1,vec2d);
343 
344             blkCnt -- ;
345         }
346         accum2 = vpadd_f32(vget_low_f32(accuma),vget_high_f32(accuma));
347         dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,0);
348 
349         accum2 = vpadd_f32(vget_low_f32(accumb),vget_high_f32(accumb));
350         dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,1);
351 
352         accum2 = vpadd_f32(vget_low_f32(accumc),vget_high_f32(accumc));
353         dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,2);
354 
355         accum2 = vpadd_f32(vget_low_f32(accumd),vget_high_f32(accumd));
356         dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,3);
357 
358 
359         blkCnt = S->vectorDimension & 3;
360         while (blkCnt > 0U)
361         {
362             dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,0) + *pIn * *pSupporta++, dotV,0);
363             dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,1) + *pIn * *pSupportb++, dotV,1);
364             dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,2) + *pIn * *pSupportc++, dotV,2);
365             dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,3) + *pIn * *pSupportd++, dotV,3);
366 
367             pIn++;
368 
369             blkCnt -- ;
370         }
371 
372         vec1 = vld1q_f32(pDualCoefs);
373         pDualCoefs += 4;
374 
375         accum = vmulq_f32(vec1,dotV);
376         accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
377         sum += vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);
378 
379         pSupporta += 3*S->vectorDimension;
380         pSupportb += 3*S->vectorDimension;
381         pSupportc += 3*S->vectorDimension;
382         pSupportd += 3*S->vectorDimension;
383 
384         vectorBlkCnt -- ;
385     }
386 
387     pSupport = pSupporta;
388     vectorBlkCnt = S->nbOfSupportVectors & 3;
389     while (vectorBlkCnt > 0U)
390     {
391         accum = vdupq_n_f32(0);
392         dot = 0.0f;
393         pIn = in;
394 
395         blkCnt = S->vectorDimension >> 2;
396         while (blkCnt > 0U)
397         {
398 
399             vec1 = vld1q_f32(pIn);
400             vec2 = vld1q_f32(pSupport);
401             pIn += 4;
402             pSupport += 4;
403 
404             accum = vmlaq_f32(accum, vec1,vec2);
405 
406             blkCnt -- ;
407         }
408         accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
409         dot = vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);
410 
411 
412         blkCnt = S->vectorDimension & 3;
413         while (blkCnt > 0U)
414         {
415             dot = dot + *pIn++ * *pSupport++;
416 
417             blkCnt -- ;
418         }
419 
420         sum += *pDualCoefs++ * dot;
421         vectorBlkCnt -- ;
422     }
423 
424     *pResult=S->classes[STEP(sum)];
425 }
426 #else
arm_svm_linear_predict_f32(const arm_svm_linear_instance_f32 * S,const float32_t * in,int32_t * pResult)427 void arm_svm_linear_predict_f32(
428     const arm_svm_linear_instance_f32 *S,
429     const float32_t * in,
430     int32_t * pResult)
431 {
432     float32_t sum=S->intercept;
433     float32_t dot=0;
434     uint32_t i,j;
435     const float32_t *pSupport = S->supportVectors;
436 
437     for(i=0; i < S->nbOfSupportVectors; i++)
438     {
439         dot=0;
440         for(j=0; j < S->vectorDimension; j++)
441         {
442             dot = dot + in[j]* *pSupport++;
443         }
444         sum += S->dualCoefficients[i] * dot;
445     }
446     *pResult=S->classes[STEP(sum)];
447 }
448 #endif
449 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
450 
451 /**
452  * @} end of groupSVM group
453  */
454