• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_svm_linear_predict_f16.c
4  * Description:  SVM Linear Classifier
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/svm_functions_f16.h"
30 
31 #if defined(ARM_FLOAT16_SUPPORTED)
32 
33 #include <limits.h>
34 #include <math.h>
35 
36 
37 /**
38  * @addtogroup linearsvm
39  * @{
40  */
41 
42 
43 /**
44  * @brief SVM linear prediction
45  * @param[in]    S          Pointer to an instance of the linear SVM structure.
46  * @param[in]    in         Pointer to input vector
47  * @param[out]   pResult    Decision value
48  * @return none.
49  *
50  */
51 #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
52 
53 #include "arm_helium_utils.h"
54 
arm_svm_linear_predict_f16(const arm_svm_linear_instance_f16 * S,const float16_t * in,int32_t * pResult)55 void arm_svm_linear_predict_f16(
56     const arm_svm_linear_instance_f16 *S,
57     const float16_t * in,
58     int32_t * pResult)
59 {
60         /* inlined Matrix x Vector function interleaved with dot prod */
61     uint32_t        numRows = S->nbOfSupportVectors;
62     uint32_t        numCols = S->vectorDimension;
63     const float16_t *pSupport = S->supportVectors;
64     const float16_t *pSrcA = pSupport;
65     const float16_t *pInA0;
66     const float16_t *pInA1;
67     uint32_t         row;
68     uint32_t         blkCnt;     /* loop counters */
69     const float16_t *pDualCoef = S->dualCoefficients;
70     _Float16       sum = S->intercept;
71     row = numRows;
72 
73     /*
74      * compute 4 rows in parrallel
75      */
76     while (row >= 4)
77     {
78         const float16_t *pInA2, *pInA3;
79         float16_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
80         f16x8_t         vecIn, acc0, acc1, acc2, acc3;
81         float16_t const *pSrcVecPtr = in;
82 
83         /*
84          * Initialize the pointers to 4 consecutive MatrixA rows
85          */
86         pInA0 = pSrcA;
87         pInA1 = pInA0 + numCols;
88         pInA2 = pInA1 + numCols;
89         pInA3 = pInA2 + numCols;
90         /*
91          * Initialize the vector pointer
92          */
93         pInVec = pSrcVecPtr;
94         /*
95          * reset accumulators
96          */
97         acc0 = vdupq_n_f16(0.0f);
98         acc1 = vdupq_n_f16(0.0f);
99         acc2 = vdupq_n_f16(0.0f);
100         acc3 = vdupq_n_f16(0.0f);
101 
102         pSrcA0Vec = pInA0;
103         pSrcA1Vec = pInA1;
104         pSrcA2Vec = pInA2;
105         pSrcA3Vec = pInA3;
106 
107         blkCnt = numCols >> 3;
108         while (blkCnt > 0U) {
109             f16x8_t         vecA;
110 
111             vecIn = vld1q(pInVec);
112             pInVec += 8;
113             vecA = vld1q(pSrcA0Vec);
114             pSrcA0Vec += 8;
115             acc0 = vfmaq(acc0, vecIn, vecA);
116             vecA = vld1q(pSrcA1Vec);
117             pSrcA1Vec += 8;
118             acc1 = vfmaq(acc1, vecIn, vecA);
119             vecA = vld1q(pSrcA2Vec);
120             pSrcA2Vec += 8;
121             acc2 = vfmaq(acc2, vecIn, vecA);
122             vecA = vld1q(pSrcA3Vec);
123             pSrcA3Vec += 8;
124             acc3 = vfmaq(acc3, vecIn, vecA);
125 
126             blkCnt--;
127         }
128         /*
129          * tail
130          * (will be merged thru tail predication)
131          */
132         blkCnt = numCols & 7;
133         if (blkCnt > 0U) {
134             mve_pred16_t    p0 = vctp16q(blkCnt);
135             f16x8_t         vecA;
136 
137             vecIn = vldrhq_z_f16(pInVec, p0);
138             vecA = vldrhq_z_f16(pSrcA0Vec, p0);
139             acc0 = vfmaq(acc0, vecIn, vecA);
140             vecA = vldrhq_z_f16(pSrcA1Vec, p0);
141             acc1 = vfmaq(acc1, vecIn, vecA);
142             vecA = vldrhq_z_f16(pSrcA2Vec, p0);
143             acc2 = vfmaq(acc2, vecIn, vecA);
144             vecA = vldrhq_z_f16(pSrcA3Vec, p0);
145             acc3 = vfmaq(acc3, vecIn, vecA);
146         }
147         /*
148          * Sum the partial parts
149          */
150         acc0 = vmulq_n_f16(acc0,*pDualCoef++);
151         acc0 = vfmaq_n_f16(acc0,acc1,*pDualCoef++);
152         acc0 = vfmaq_n_f16(acc0,acc2,*pDualCoef++);
153         acc0 = vfmaq_n_f16(acc0,acc3,*pDualCoef++);
154 
155         sum += (_Float16)vecAddAcrossF16Mve(acc0);
156 
157         pSrcA += numCols * 4;
158         /*
159          * Decrement the row loop counter
160          */
161         row -= 4;
162     }
163 
164     /*
165      * compute 2 rows in parallel
166      */
167     if (row >= 2) {
168         float16_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
169         f16x8_t         vecIn, acc0, acc1;
170         float16_t const *pSrcVecPtr = in;
171 
172         /*
173          * Initialize the pointers to 2 consecutive MatrixA rows
174          */
175         pInA0 = pSrcA;
176         pInA1 = pInA0 + numCols;
177         /*
178          * Initialize the vector pointer
179          */
180         pInVec = pSrcVecPtr;
181         /*
182          * reset accumulators
183          */
184         acc0 = vdupq_n_f16(0.0f);
185         acc1 = vdupq_n_f16(0.0f);
186         pSrcA0Vec = pInA0;
187         pSrcA1Vec = pInA1;
188 
189         blkCnt = numCols >> 3;
190         while (blkCnt > 0U) {
191             f16x8_t         vecA;
192 
193             vecIn = vld1q(pInVec);
194             pInVec += 8;
195             vecA = vld1q(pSrcA0Vec);
196             pSrcA0Vec += 8;
197             acc0 = vfmaq(acc0, vecIn, vecA);
198             vecA = vld1q(pSrcA1Vec);
199             pSrcA1Vec += 8;
200             acc1 = vfmaq(acc1, vecIn, vecA);
201 
202             blkCnt--;
203         }
204         /*
205          * tail
206          * (will be merged thru tail predication)
207          */
208         blkCnt = numCols & 7;
209         if (blkCnt > 0U) {
210             mve_pred16_t    p0 = vctp16q(blkCnt);
211             f16x8_t         vecA;
212 
213             vecIn = vldrhq_z_f16(pInVec, p0);
214             vecA = vldrhq_z_f16(pSrcA0Vec, p0);
215             acc0 = vfmaq(acc0, vecIn, vecA);
216             vecA = vldrhq_z_f16(pSrcA1Vec, p0);
217             acc1 = vfmaq(acc1, vecIn, vecA);
218         }
219         /*
220          * Sum the partial parts
221          */
222         acc0 = vmulq_n_f16(acc0,*pDualCoef++);
223         acc0 = vfmaq_n_f16(acc0,acc1,*pDualCoef++);
224 
225         sum += (_Float16)vecAddAcrossF16Mve(acc0);
226 
227         pSrcA += numCols * 2;
228         row -= 2;
229     }
230 
231     if (row >= 1) {
232         f16x8_t         vecIn, acc0;
233         float16_t const *pSrcA0Vec, *pInVec;
234         float16_t const *pSrcVecPtr = in;
235         /*
236          * Initialize the pointers to last MatrixA row
237          */
238         pInA0 = pSrcA;
239         /*
240          * Initialize the vector pointer
241          */
242         pInVec = pSrcVecPtr;
243         /*
244          * reset accumulators
245          */
246         acc0 = vdupq_n_f16(0.0f);
247 
248         pSrcA0Vec = pInA0;
249 
250         blkCnt = numCols >> 3;
251         while (blkCnt > 0U) {
252             f16x8_t         vecA;
253 
254             vecIn = vld1q(pInVec);
255             pInVec += 8;
256             vecA = vld1q(pSrcA0Vec);
257             pSrcA0Vec += 8;
258             acc0 = vfmaq(acc0, vecIn, vecA);
259 
260             blkCnt--;
261         }
262         /*
263          * tail
264          * (will be merged thru tail predication)
265          */
266         blkCnt = numCols & 7;
267         if (blkCnt > 0U) {
268             mve_pred16_t    p0 = vctp16q(blkCnt);
269             f16x8_t         vecA;
270 
271             vecIn = vldrhq_z_f16(pInVec, p0);
272             vecA = vldrhq_z_f16(pSrcA0Vec, p0);
273             acc0 = vfmaq(acc0, vecIn, vecA);
274         }
275         /*
276          * Sum the partial parts
277          */
278         sum += (_Float16)*pDualCoef++ * (_Float16)vecAddAcrossF16Mve(acc0);
279 
280     }
281 
282     *pResult = S->classes[STEP(sum)];
283 }
284 
285 #else
arm_svm_linear_predict_f16(const arm_svm_linear_instance_f16 * S,const float16_t * in,int32_t * pResult)286 void arm_svm_linear_predict_f16(
287     const arm_svm_linear_instance_f16 *S,
288     const float16_t * in,
289     int32_t * pResult)
290 {
291     _Float16 sum=S->intercept;
292     _Float16 dot=0;
293     uint32_t i,j;
294     const float16_t *pSupport = S->supportVectors;
295 
296     for(i=0; i < S->nbOfSupportVectors; i++)
297     {
298         dot=0;
299         for(j=0; j < S->vectorDimension; j++)
300         {
301             dot = dot + in[j]* *pSupport++;
302         }
303         sum += S->dualCoefficients[i] * dot;
304     }
305     *pResult=S->classes[STEP(sum)];
306 }
307 #endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
308 
309 /**
310  * @} end of linearsvm group
311  */
312 
313 #endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
314 
315