• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_svm_polynomial_predict_f16.c
4  * Description:  SVM Polynomial Classifier
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/svm_functions_f16.h"
30 
31 #if defined(ARM_FLOAT16_SUPPORTED)
32 
33 #include <limits.h>
34 #include <math.h>
35 
36 
37 /**
38  * @addtogroup polysvm
39  * @{
40  */
41 
42 
43 /**
44  * @brief SVM polynomial prediction
45  * @param[in]    S          Pointer to an instance of the polynomial SVM structure.
46  * @param[in]    in         Pointer to input vector
47  * @param[out]   pResult    Decision value
48  * @return none.
49  *
50  */
51 
52 #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
53 
54 #include "arm_helium_utils.h"
55 #include "arm_vec_math_f16.h"
56 
arm_svm_polynomial_predict_f16(const arm_svm_polynomial_instance_f16 * S,const float16_t * in,int32_t * pResult)57 void arm_svm_polynomial_predict_f16(
58     const arm_svm_polynomial_instance_f16 *S,
59     const float16_t * in,
60     int32_t * pResult)
61 {
62         /* inlined Matrix x Vector function interleaved with dot prod */
63     uint32_t        numRows = S->nbOfSupportVectors;
64     uint32_t        numCols = S->vectorDimension;
65     const float16_t *pSupport = S->supportVectors;
66     const float16_t *pSrcA = pSupport;
67     const float16_t *pInA0;
68     const float16_t *pInA1;
69     uint32_t         row;
70     uint32_t         blkCnt;     /* loop counters */
71     const float16_t *pDualCoef = S->dualCoefficients;
72     _Float16       sum = S->intercept;
73     f16x8_t         vSum = vdupq_n_f16(0.0f);
74 
75     row = numRows;
76 
77     /*
78      * compute 4 rows in parrallel
79      */
80     while (row >= 4) {
81         const float16_t *pInA2, *pInA3;
82         float16_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
83         f16x8_t         vecIn, acc0, acc1, acc2, acc3;
84         float16_t const *pSrcVecPtr = in;
85 
86         /*
87          * Initialize the pointers to 4 consecutive MatrixA rows
88          */
89         pInA0 = pSrcA;
90         pInA1 = pInA0 + numCols;
91         pInA2 = pInA1 + numCols;
92         pInA3 = pInA2 + numCols;
93         /*
94          * Initialize the vector pointer
95          */
96         pInVec = pSrcVecPtr;
97         /*
98          * reset accumulators
99          */
100         acc0 = vdupq_n_f16(0.0f);
101         acc1 = vdupq_n_f16(0.0f);
102         acc2 = vdupq_n_f16(0.0f);
103         acc3 = vdupq_n_f16(0.0f);
104 
105         pSrcA0Vec = pInA0;
106         pSrcA1Vec = pInA1;
107         pSrcA2Vec = pInA2;
108         pSrcA3Vec = pInA3;
109 
110         blkCnt = numCols >> 3;
111         while (blkCnt > 0U) {
112             f16x8_t         vecA;
113 
114             vecIn = vld1q(pInVec);
115             pInVec += 8;
116             vecA = vld1q(pSrcA0Vec);
117             pSrcA0Vec += 8;
118             acc0 = vfmaq(acc0, vecIn, vecA);
119             vecA = vld1q(pSrcA1Vec);
120             pSrcA1Vec += 8;
121             acc1 = vfmaq(acc1, vecIn, vecA);
122             vecA = vld1q(pSrcA2Vec);
123             pSrcA2Vec += 8;
124             acc2 = vfmaq(acc2, vecIn, vecA);
125             vecA = vld1q(pSrcA3Vec);
126             pSrcA3Vec += 8;
127             acc3 = vfmaq(acc3, vecIn, vecA);
128 
129             blkCnt--;
130         }
131         /*
132          * tail
133          * (will be merged thru tail predication)
134          */
135         blkCnt = numCols & 7;
136         if (blkCnt > 0U) {
137             mve_pred16_t    p0 = vctp16q(blkCnt);
138             f16x8_t         vecA;
139 
140             vecIn = vldrhq_z_f16(pInVec, p0);
141             vecA = vldrhq_z_f16(pSrcA0Vec, p0);
142             acc0 = vfmaq(acc0, vecIn, vecA);
143             vecA = vldrhq_z_f16(pSrcA1Vec, p0);
144             acc1 = vfmaq(acc1, vecIn, vecA);
145             vecA = vldrhq_z_f16(pSrcA2Vec, p0);
146             acc2 = vfmaq(acc2, vecIn, vecA);
147             vecA = vldrhq_z_f16(pSrcA3Vec, p0);
148             acc3 = vfmaq(acc3, vecIn, vecA);
149         }
150         /*
151          * Sum the partial parts
152          */
153         f16x8_t         vtmp = vuninitializedq_f16();
154         vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
155         vtmp = vsetq_lane(vecAddAcrossF16Mve(acc1), vtmp, 1);
156         vtmp = vsetq_lane(vecAddAcrossF16Mve(acc2), vtmp, 2);
157         vtmp = vsetq_lane(vecAddAcrossF16Mve(acc3), vtmp, 3);
158 
159         vSum = vfmaq_m_f16(vSum, vld1q(pDualCoef),
160                              arm_vec_exponent_f16
161                              (vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0),
162                                 S->degree),vctp16q(4));
163 
164         pDualCoef += 4;
165 
166         pSrcA += numCols * 4;
167         /*
168          * Decrement the row loop counter
169          */
170         row -= 4;
171     }
172 
173     /*
174      * compute 2 rows in parrallel
175      */
176     if (row >= 2) {
177         float16_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
178         f16x8_t         vecIn, acc0, acc1;
179         float16_t const *pSrcVecPtr = in;
180 
181         /*
182          * Initialize the pointers to 2 consecutive MatrixA rows
183          */
184         pInA0 = pSrcA;
185         pInA1 = pInA0 + numCols;
186         /*
187          * Initialize the vector pointer
188          */
189         pInVec = pSrcVecPtr;
190         /*
191          * reset accumulators
192          */
193         acc0 = vdupq_n_f16(0.0f);
194         acc1 = vdupq_n_f16(0.0f);
195         pSrcA0Vec = pInA0;
196         pSrcA1Vec = pInA1;
197 
198         blkCnt = numCols >> 3;
199         while (blkCnt > 0U) {
200             f16x8_t         vecA;
201 
202             vecIn = vld1q(pInVec);
203             pInVec += 8;
204             vecA = vld1q(pSrcA0Vec);
205             pSrcA0Vec += 8;
206             acc0 = vfmaq(acc0, vecIn, vecA);
207             vecA = vld1q(pSrcA1Vec);
208             pSrcA1Vec += 8;
209             acc1 = vfmaq(acc1, vecIn, vecA);
210 
211             blkCnt--;
212         }
213         /*
214          * tail
215          * (will be merged thru tail predication)
216          */
217         blkCnt = numCols & 7;
218         if (blkCnt > 0U) {
219             mve_pred16_t    p0 = vctp16q(blkCnt);
220             f16x8_t         vecA;
221 
222             vecIn = vldrhq_z_f16(pInVec, p0);
223             vecA = vldrhq_z_f16(pSrcA0Vec, p0);
224             acc0 = vfmaq(acc0, vecIn, vecA);
225             vecA = vldrhq_z_f16(pSrcA1Vec, p0);
226             acc1 = vfmaq(acc1, vecIn, vecA);
227         }
228         /*
229          * Sum the partial parts
230          */
231         f16x8_t         vtmp = vuninitializedq_f16();
232         vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
233         vtmp = vsetq_lane(vecAddAcrossF16Mve(acc1), vtmp, 1);
234 
235         vSum = vfmaq_m_f16(vSum, vld1q(pDualCoef),
236                              arm_vec_exponent_f16
237                              (vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0), S->degree),
238                              vctp16q(2));
239 
240         pDualCoef += 2;
241         pSrcA += numCols * 2;
242         row -= 2;
243     }
244 
245     if (row >= 1) {
246         f16x8_t         vecIn, acc0;
247         float16_t const *pSrcA0Vec, *pInVec;
248         float16_t const *pSrcVecPtr = in;
249         /*
250          * Initialize the pointers to last MatrixA row
251          */
252         pInA0 = pSrcA;
253         /*
254          * Initialize the vector pointer
255          */
256         pInVec = pSrcVecPtr;
257         /*
258          * reset accumulators
259          */
260         acc0 = vdupq_n_f16(0.0f);
261 
262         pSrcA0Vec = pInA0;
263 
264         blkCnt = numCols >> 3;
265         while (blkCnt > 0U) {
266             f16x8_t         vecA;
267 
268             vecIn = vld1q(pInVec);
269             pInVec += 8;
270             vecA = vld1q(pSrcA0Vec);
271             pSrcA0Vec += 8;
272             acc0 = vfmaq(acc0, vecIn, vecA);
273 
274             blkCnt--;
275         }
276         /*
277          * tail
278          * (will be merged thru tail predication)
279          */
280         blkCnt = numCols & 7;
281         if (blkCnt > 0U) {
282             mve_pred16_t    p0 = vctp16q(blkCnt);
283             f16x8_t         vecA;
284 
285             vecIn = vldrhq_z_f16(pInVec, p0);
286             vecA = vldrhq_z_f16(pSrcA0Vec, p0);
287             acc0 = vfmaq(acc0, vecIn, vecA);
288         }
289         /*
290          * Sum the partial parts
291          */
292         f16x8_t         vtmp = vuninitializedq_f16();
293         vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
294         vSum = vfmaq_m_f16(vSum, vld1q(pDualCoef),
295                              arm_vec_exponent_f16
296                              (vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0), S->degree),
297                              vctp16q(1));
298     }
299     sum += (_Float16)vecAddAcrossF16Mve(vSum);
300 
301 
302     *pResult = S->classes[STEP(sum)];
303 }
304 
305 #else
arm_svm_polynomial_predict_f16(const arm_svm_polynomial_instance_f16 * S,const float16_t * in,int32_t * pResult)306 void arm_svm_polynomial_predict_f16(
307     const arm_svm_polynomial_instance_f16 *S,
308     const float16_t * in,
309     int32_t * pResult)
310 {
311     _Float16 sum=S->intercept;
312     _Float16 dot=0;
313     uint32_t i,j;
314     const float16_t *pSupport = S->supportVectors;
315 
316     for(i=0; i < S->nbOfSupportVectors; i++)
317     {
318         dot=0;
319         for(j=0; j < S->vectorDimension; j++)
320         {
321             dot = dot + (_Float16)in[j]* (_Float16)*pSupport++;
322         }
323         sum += S->dualCoefficients[i] * (_Float16)arm_exponent_f16(S->gamma * dot + S->coef0, S->degree);
324     }
325 
326     *pResult=S->classes[STEP(sum)];
327 }
328 #endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
329 
330 
331 /**
332  * @} end of polysvm group
333  */
334 
335 #endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
336 
337