1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_svm_linear_predict_f16.c
4 * Description: SVM Linear Classifier
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/svm_functions_f16.h"
30
31 #if defined(ARM_FLOAT16_SUPPORTED)
32
33 #include <limits.h>
34 #include <math.h>
35
36
37 /**
38 * @addtogroup linearsvm
39 * @{
40 */
41
42
43 /**
44 * @brief SVM linear prediction
45 * @param[in] S Pointer to an instance of the linear SVM structure.
46 * @param[in] in Pointer to input vector
47 * @param[out] pResult Decision value
48 * @return none.
49 *
50 */
51 #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
52
53 #include "arm_helium_utils.h"
54
void arm_svm_linear_predict_f16(
    const arm_svm_linear_instance_f16 *S,
    const float16_t * in,
    int32_t * pResult)
{
    /* Inlined matrix x vector product (support vectors x input) interleaved
       with the dual-coefficient dot product:
         decision = intercept + sum_i dualCoef[i] * <supportVector_i, in>
       The predicted class is classes[STEP(decision)]. */
    uint32_t numRows = S->nbOfSupportVectors;
    uint32_t numCols = S->vectorDimension;
    const float16_t *pSupport = S->supportVectors;
    const float16_t *pSrcA = pSupport;
    const float16_t *pInA0;
    const float16_t *pInA1;
    uint32_t row;
    uint32_t blkCnt;     /* loop counters */
    const float16_t *pDualCoef = S->dualCoefficients;
    _Float16 sum = S->intercept;
    row = numRows;

    /*
     * compute 4 rows (support vectors) in parallel
     */
    while (row >= 4)
    {
        const float16_t *pInA2, *pInA3;
        float16_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
        f16x8_t vecIn, acc0, acc1, acc2, acc3;
        float16_t const *pSrcVecPtr = in;

        /*
         * Initialize the pointers to 4 consecutive MatrixA rows
         */
        pInA0 = pSrcA;
        pInA1 = pInA0 + numCols;
        pInA2 = pInA1 + numCols;
        pInA3 = pInA2 + numCols;
        /*
         * Initialize the vector pointer
         */
        pInVec = pSrcVecPtr;
        /*
         * reset accumulators
         */
        acc0 = vdupq_n_f16(0.0f);
        acc1 = vdupq_n_f16(0.0f);
        acc2 = vdupq_n_f16(0.0f);
        acc3 = vdupq_n_f16(0.0f);

        pSrcA0Vec = pInA0;
        pSrcA1Vec = pInA1;
        pSrcA2Vec = pInA2;
        pSrcA3Vec = pInA3;

        /* main loop: 8 half-precision lanes per iteration */
        blkCnt = numCols >> 3;
        while (blkCnt > 0U) {
            f16x8_t vecA;

            vecIn = vld1q(pInVec);
            pInVec += 8;
            vecA = vld1q(pSrcA0Vec);
            pSrcA0Vec += 8;
            acc0 = vfmaq(acc0, vecIn, vecA);
            vecA = vld1q(pSrcA1Vec);
            pSrcA1Vec += 8;
            acc1 = vfmaq(acc1, vecIn, vecA);
            vecA = vld1q(pSrcA2Vec);
            pSrcA2Vec += 8;
            acc2 = vfmaq(acc2, vecIn, vecA);
            vecA = vld1q(pSrcA3Vec);
            pSrcA3Vec += 8;
            acc3 = vfmaq(acc3, vecIn, vecA);

            blkCnt--;
        }
        /*
         * tail
         * (will be merged thru tail predication)
         */
        blkCnt = numCols & 7;
        if (blkCnt > 0U) {
            mve_pred16_t p0 = vctp16q(blkCnt);
            f16x8_t vecA;

            vecIn = vldrhq_z_f16(pInVec, p0);
            vecA = vldrhq_z_f16(pSrcA0Vec, p0);
            acc0 = vfmaq(acc0, vecIn, vecA);
            vecA = vldrhq_z_f16(pSrcA1Vec, p0);
            acc1 = vfmaq(acc1, vecIn, vecA);
            vecA = vldrhq_z_f16(pSrcA2Vec, p0);
            acc2 = vfmaq(acc2, vecIn, vecA);
            vecA = vldrhq_z_f16(pSrcA3Vec, p0);
            acc3 = vfmaq(acc3, vecIn, vecA);
        }
        /*
         * Scale each row's dot product by its dual coefficient and
         * fold all four into acc0 before the horizontal reduction
         */
        acc0 = vmulq_n_f16(acc0, *pDualCoef++);
        acc0 = vfmaq_n_f16(acc0, acc1, *pDualCoef++);
        acc0 = vfmaq_n_f16(acc0, acc2, *pDualCoef++);
        acc0 = vfmaq_n_f16(acc0, acc3, *pDualCoef++);

        sum += (_Float16)vecAddAcrossF16Mve(acc0);

        pSrcA += numCols * 4;
        /*
         * Decrement the row loop counter
         */
        row -= 4;
    }

    /*
     * compute 2 rows in parallel
     */
    if (row >= 2) {
        float16_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
        f16x8_t vecIn, acc0, acc1;
        float16_t const *pSrcVecPtr = in;

        /*
         * Initialize the pointers to 2 consecutive MatrixA rows
         */
        pInA0 = pSrcA;
        pInA1 = pInA0 + numCols;
        /*
         * Initialize the vector pointer
         */
        pInVec = pSrcVecPtr;
        /*
         * reset accumulators
         */
        acc0 = vdupq_n_f16(0.0f);
        acc1 = vdupq_n_f16(0.0f);
        pSrcA0Vec = pInA0;
        pSrcA1Vec = pInA1;

        blkCnt = numCols >> 3;
        while (blkCnt > 0U) {
            f16x8_t vecA;

            vecIn = vld1q(pInVec);
            pInVec += 8;
            vecA = vld1q(pSrcA0Vec);
            pSrcA0Vec += 8;
            acc0 = vfmaq(acc0, vecIn, vecA);
            vecA = vld1q(pSrcA1Vec);
            pSrcA1Vec += 8;
            acc1 = vfmaq(acc1, vecIn, vecA);

            blkCnt--;
        }
        /*
         * tail
         * (will be merged thru tail predication)
         */
        blkCnt = numCols & 7;
        if (blkCnt > 0U) {
            mve_pred16_t p0 = vctp16q(blkCnt);
            f16x8_t vecA;

            vecIn = vldrhq_z_f16(pInVec, p0);
            vecA = vldrhq_z_f16(pSrcA0Vec, p0);
            acc0 = vfmaq(acc0, vecIn, vecA);
            vecA = vldrhq_z_f16(pSrcA1Vec, p0);
            acc1 = vfmaq(acc1, vecIn, vecA);
        }
        /*
         * Scale by the dual coefficients and reduce
         */
        acc0 = vmulq_n_f16(acc0, *pDualCoef++);
        acc0 = vfmaq_n_f16(acc0, acc1, *pDualCoef++);

        sum += (_Float16)vecAddAcrossF16Mve(acc0);

        pSrcA += numCols * 2;
        row -= 2;
    }

    /*
     * remaining single row
     */
    if (row >= 1) {
        f16x8_t vecIn, acc0;
        float16_t const *pSrcA0Vec, *pInVec;
        float16_t const *pSrcVecPtr = in;
        /*
         * Initialize the pointers to last MatrixA row
         */
        pInA0 = pSrcA;
        /*
         * Initialize the vector pointer
         */
        pInVec = pSrcVecPtr;
        /*
         * reset accumulators
         */
        acc0 = vdupq_n_f16(0.0f);

        pSrcA0Vec = pInA0;

        blkCnt = numCols >> 3;
        while (blkCnt > 0U) {
            f16x8_t vecA;

            vecIn = vld1q(pInVec);
            pInVec += 8;
            vecA = vld1q(pSrcA0Vec);
            pSrcA0Vec += 8;
            acc0 = vfmaq(acc0, vecIn, vecA);

            blkCnt--;
        }
        /*
         * tail
         * (will be merged thru tail predication)
         */
        blkCnt = numCols & 7;
        if (blkCnt > 0U) {
            mve_pred16_t p0 = vctp16q(blkCnt);
            f16x8_t vecA;

            vecIn = vldrhq_z_f16(pInVec, p0);
            vecA = vldrhq_z_f16(pSrcA0Vec, p0);
            acc0 = vfmaq(acc0, vecIn, vecA);
        }
        /*
         * Scale by the dual coefficient and reduce
         */
        sum += (_Float16)*pDualCoef++ * (_Float16)vecAddAcrossF16Mve(acc0);

    }

    /* Map the sign of the decision value to a class label */
    *pResult = S->classes[STEP(sum)];
}
284
285 #else
arm_svm_linear_predict_f16(const arm_svm_linear_instance_f16 * S,const float16_t * in,int32_t * pResult)286 void arm_svm_linear_predict_f16(
287 const arm_svm_linear_instance_f16 *S,
288 const float16_t * in,
289 int32_t * pResult)
290 {
291 _Float16 sum=S->intercept;
292 _Float16 dot=0;
293 uint32_t i,j;
294 const float16_t *pSupport = S->supportVectors;
295
296 for(i=0; i < S->nbOfSupportVectors; i++)
297 {
298 dot=0;
299 for(j=0; j < S->vectorDimension; j++)
300 {
301 dot = dot + in[j]* *pSupport++;
302 }
303 sum += S->dualCoefficients[i] * dot;
304 }
305 *pResult=S->classes[STEP(sum)];
306 }
#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
308
309 /**
310 * @} end of linearsvm group
311 */
312
313 #endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
314
315