1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_svm_polynomial_predict_f16.c
4 * Description: SVM Polynomial Classifier
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/svm_functions_f16.h"
30
31 #if defined(ARM_FLOAT16_SUPPORTED)
32
33 #include <limits.h>
34 #include <math.h>
35
36
37 /**
38 * @addtogroup polysvm
39 * @{
40 */
41
42
43 /**
44 * @brief SVM polynomial prediction
45 * @param[in] S Pointer to an instance of the polynomial SVM structure.
46 * @param[in] in Pointer to input vector
47 * @param[out] pResult Decision value
48 * @return none.
49 *
50 */
51
52 #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
53
54 #include "arm_helium_utils.h"
55 #include "arm_vec_math_f16.h"
56
arm_svm_polynomial_predict_f16(const arm_svm_polynomial_instance_f16 * S,const float16_t * in,int32_t * pResult)57 void arm_svm_polynomial_predict_f16(
58 const arm_svm_polynomial_instance_f16 *S,
59 const float16_t * in,
60 int32_t * pResult)
61 {
62 /* inlined Matrix x Vector function interleaved with dot prod */
63 uint32_t numRows = S->nbOfSupportVectors;
64 uint32_t numCols = S->vectorDimension;
65 const float16_t *pSupport = S->supportVectors;
66 const float16_t *pSrcA = pSupport;
67 const float16_t *pInA0;
68 const float16_t *pInA1;
69 uint32_t row;
70 uint32_t blkCnt; /* loop counters */
71 const float16_t *pDualCoef = S->dualCoefficients;
72 _Float16 sum = S->intercept;
73 f16x8_t vSum = vdupq_n_f16(0.0f);
74
75 row = numRows;
76
77 /*
78 * compute 4 rows in parrallel
79 */
80 while (row >= 4) {
81 const float16_t *pInA2, *pInA3;
82 float16_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
83 f16x8_t vecIn, acc0, acc1, acc2, acc3;
84 float16_t const *pSrcVecPtr = in;
85
86 /*
87 * Initialize the pointers to 4 consecutive MatrixA rows
88 */
89 pInA0 = pSrcA;
90 pInA1 = pInA0 + numCols;
91 pInA2 = pInA1 + numCols;
92 pInA3 = pInA2 + numCols;
93 /*
94 * Initialize the vector pointer
95 */
96 pInVec = pSrcVecPtr;
97 /*
98 * reset accumulators
99 */
100 acc0 = vdupq_n_f16(0.0f);
101 acc1 = vdupq_n_f16(0.0f);
102 acc2 = vdupq_n_f16(0.0f);
103 acc3 = vdupq_n_f16(0.0f);
104
105 pSrcA0Vec = pInA0;
106 pSrcA1Vec = pInA1;
107 pSrcA2Vec = pInA2;
108 pSrcA3Vec = pInA3;
109
110 blkCnt = numCols >> 3;
111 while (blkCnt > 0U) {
112 f16x8_t vecA;
113
114 vecIn = vld1q(pInVec);
115 pInVec += 8;
116 vecA = vld1q(pSrcA0Vec);
117 pSrcA0Vec += 8;
118 acc0 = vfmaq(acc0, vecIn, vecA);
119 vecA = vld1q(pSrcA1Vec);
120 pSrcA1Vec += 8;
121 acc1 = vfmaq(acc1, vecIn, vecA);
122 vecA = vld1q(pSrcA2Vec);
123 pSrcA2Vec += 8;
124 acc2 = vfmaq(acc2, vecIn, vecA);
125 vecA = vld1q(pSrcA3Vec);
126 pSrcA3Vec += 8;
127 acc3 = vfmaq(acc3, vecIn, vecA);
128
129 blkCnt--;
130 }
131 /*
132 * tail
133 * (will be merged thru tail predication)
134 */
135 blkCnt = numCols & 7;
136 if (blkCnt > 0U) {
137 mve_pred16_t p0 = vctp16q(blkCnt);
138 f16x8_t vecA;
139
140 vecIn = vldrhq_z_f16(pInVec, p0);
141 vecA = vldrhq_z_f16(pSrcA0Vec, p0);
142 acc0 = vfmaq(acc0, vecIn, vecA);
143 vecA = vldrhq_z_f16(pSrcA1Vec, p0);
144 acc1 = vfmaq(acc1, vecIn, vecA);
145 vecA = vldrhq_z_f16(pSrcA2Vec, p0);
146 acc2 = vfmaq(acc2, vecIn, vecA);
147 vecA = vldrhq_z_f16(pSrcA3Vec, p0);
148 acc3 = vfmaq(acc3, vecIn, vecA);
149 }
150 /*
151 * Sum the partial parts
152 */
153 f16x8_t vtmp = vuninitializedq_f16();
154 vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
155 vtmp = vsetq_lane(vecAddAcrossF16Mve(acc1), vtmp, 1);
156 vtmp = vsetq_lane(vecAddAcrossF16Mve(acc2), vtmp, 2);
157 vtmp = vsetq_lane(vecAddAcrossF16Mve(acc3), vtmp, 3);
158
159 vSum = vfmaq_m_f16(vSum, vld1q(pDualCoef),
160 arm_vec_exponent_f16
161 (vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0),
162 S->degree),vctp16q(4));
163
164 pDualCoef += 4;
165
166 pSrcA += numCols * 4;
167 /*
168 * Decrement the row loop counter
169 */
170 row -= 4;
171 }
172
173 /*
174 * compute 2 rows in parrallel
175 */
176 if (row >= 2) {
177 float16_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
178 f16x8_t vecIn, acc0, acc1;
179 float16_t const *pSrcVecPtr = in;
180
181 /*
182 * Initialize the pointers to 2 consecutive MatrixA rows
183 */
184 pInA0 = pSrcA;
185 pInA1 = pInA0 + numCols;
186 /*
187 * Initialize the vector pointer
188 */
189 pInVec = pSrcVecPtr;
190 /*
191 * reset accumulators
192 */
193 acc0 = vdupq_n_f16(0.0f);
194 acc1 = vdupq_n_f16(0.0f);
195 pSrcA0Vec = pInA0;
196 pSrcA1Vec = pInA1;
197
198 blkCnt = numCols >> 3;
199 while (blkCnt > 0U) {
200 f16x8_t vecA;
201
202 vecIn = vld1q(pInVec);
203 pInVec += 8;
204 vecA = vld1q(pSrcA0Vec);
205 pSrcA0Vec += 8;
206 acc0 = vfmaq(acc0, vecIn, vecA);
207 vecA = vld1q(pSrcA1Vec);
208 pSrcA1Vec += 8;
209 acc1 = vfmaq(acc1, vecIn, vecA);
210
211 blkCnt--;
212 }
213 /*
214 * tail
215 * (will be merged thru tail predication)
216 */
217 blkCnt = numCols & 7;
218 if (blkCnt > 0U) {
219 mve_pred16_t p0 = vctp16q(blkCnt);
220 f16x8_t vecA;
221
222 vecIn = vldrhq_z_f16(pInVec, p0);
223 vecA = vldrhq_z_f16(pSrcA0Vec, p0);
224 acc0 = vfmaq(acc0, vecIn, vecA);
225 vecA = vldrhq_z_f16(pSrcA1Vec, p0);
226 acc1 = vfmaq(acc1, vecIn, vecA);
227 }
228 /*
229 * Sum the partial parts
230 */
231 f16x8_t vtmp = vuninitializedq_f16();
232 vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
233 vtmp = vsetq_lane(vecAddAcrossF16Mve(acc1), vtmp, 1);
234
235 vSum = vfmaq_m_f16(vSum, vld1q(pDualCoef),
236 arm_vec_exponent_f16
237 (vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0), S->degree),
238 vctp16q(2));
239
240 pDualCoef += 2;
241 pSrcA += numCols * 2;
242 row -= 2;
243 }
244
245 if (row >= 1) {
246 f16x8_t vecIn, acc0;
247 float16_t const *pSrcA0Vec, *pInVec;
248 float16_t const *pSrcVecPtr = in;
249 /*
250 * Initialize the pointers to last MatrixA row
251 */
252 pInA0 = pSrcA;
253 /*
254 * Initialize the vector pointer
255 */
256 pInVec = pSrcVecPtr;
257 /*
258 * reset accumulators
259 */
260 acc0 = vdupq_n_f16(0.0f);
261
262 pSrcA0Vec = pInA0;
263
264 blkCnt = numCols >> 3;
265 while (blkCnt > 0U) {
266 f16x8_t vecA;
267
268 vecIn = vld1q(pInVec);
269 pInVec += 8;
270 vecA = vld1q(pSrcA0Vec);
271 pSrcA0Vec += 8;
272 acc0 = vfmaq(acc0, vecIn, vecA);
273
274 blkCnt--;
275 }
276 /*
277 * tail
278 * (will be merged thru tail predication)
279 */
280 blkCnt = numCols & 7;
281 if (blkCnt > 0U) {
282 mve_pred16_t p0 = vctp16q(blkCnt);
283 f16x8_t vecA;
284
285 vecIn = vldrhq_z_f16(pInVec, p0);
286 vecA = vldrhq_z_f16(pSrcA0Vec, p0);
287 acc0 = vfmaq(acc0, vecIn, vecA);
288 }
289 /*
290 * Sum the partial parts
291 */
292 f16x8_t vtmp = vuninitializedq_f16();
293 vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
294 vSum = vfmaq_m_f16(vSum, vld1q(pDualCoef),
295 arm_vec_exponent_f16
296 (vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0), S->degree),
297 vctp16q(1));
298 }
299 sum += (_Float16)vecAddAcrossF16Mve(vSum);
300
301
302 *pResult = S->classes[STEP(sum)];
303 }
304
305 #else
arm_svm_polynomial_predict_f16(const arm_svm_polynomial_instance_f16 * S,const float16_t * in,int32_t * pResult)306 void arm_svm_polynomial_predict_f16(
307 const arm_svm_polynomial_instance_f16 *S,
308 const float16_t * in,
309 int32_t * pResult)
310 {
311 _Float16 sum=S->intercept;
312 _Float16 dot=0;
313 uint32_t i,j;
314 const float16_t *pSupport = S->supportVectors;
315
316 for(i=0; i < S->nbOfSupportVectors; i++)
317 {
318 dot=0;
319 for(j=0; j < S->vectorDimension; j++)
320 {
321 dot = dot + (_Float16)in[j]* (_Float16)*pSupport++;
322 }
323 sum += S->dualCoefficients[i] * (_Float16)arm_exponent_f16(S->gamma * dot + S->coef0, S->degree);
324 }
325
326 *pResult=S->classes[STEP(sum)];
327 }
#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
329
330
331 /**
332 * @} end of polysvm group
333 */
334
335 #endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
336
337