• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1R"(
2
3/*
4 * Copyright (c) 2018 Arm Limited.
5 *
6 * SPDX-License-Identifier: MIT
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a copy
9 * of this software and associated documentation files (the "Software"), to
10 * deal in the Software without restriction, including without limitation the
11 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
12 * sell copies of the Software, and to permit persons to whom the Software is
13 * furnished to do so, subject to the following conditions:
14 *
15 * The above copyright notice and this permission notice shall be included in all
16 * copies or substantial portions of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24 * SOFTWARE.
25 */
26/*
27 * Copyright (c) 2017-2020 Arm Limited.
28 *
29 * SPDX-License-Identifier: MIT
30 *
31 * Permission is hereby granted, free of charge, to any person obtaining a copy
32 * of this software and associated documentation files (the "Software"), to
33 * deal in the Software without restriction, including without limitation the
34 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
35 * sell copies of the Software, and to permit persons to whom the Software is
36 * furnished to do so, subject to the following conditions:
37 *
38 * The above copyright notice and this permission notice shall be included in all
39 * copies or substantial portions of the Software.
40 *
41 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
42 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
43 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
44 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
45 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
46 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
47 * SOFTWARE.
48 */
49#ifndef ARM_COMPUTE_HELPERS_ASYMM_H
50#define ARM_COMPUTE_HELPERS_ASYMM_H
51
52/*
53 * Copyright (c) 2016-2020 Arm Limited.
54 *
55 * SPDX-License-Identifier: MIT
56 *
57 * Permission is hereby granted, free of charge, to any person obtaining a copy
58 * of this software and associated documentation files (the "Software"), to
59 * deal in the Software without restriction, including without limitation the
60 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
61 * sell copies of the Software, and to permit persons to whom the Software is
62 * furnished to do so, subject to the following conditions:
63 *
64 * The above copyright notice and this permission notice shall be included in all
65 * copies or substantial portions of the Software.
66 *
67 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
68 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
69 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
70 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
71 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
72 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
73 * SOFTWARE.
74 */
75#ifndef ARM_COMPUTE_HELPER_H
76#define ARM_COMPUTE_HELPER_H
77
78/*
79 * Copyright (c) 2020 Arm Limited.
80 *
81 * SPDX-License-Identifier: MIT
82 *
83 * Permission is hereby granted, free of charge, to any person obtaining a copy
84 * of this software and associated documentation files (the "Software"), to
85 * deal in the Software without restriction, including without limitation the
86 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
87 * sell copies of the Software, and to permit persons to whom the Software is
88 * furnished to do so, subject to the following conditions:
89 *
90 * The above copyright notice and this permission notice shall be included in all
91 * copies or substantial portions of the Software.
92 *
93 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
94 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
95 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
96 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
97 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
98 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
99 * SOFTWARE.
100 */
101
/** Store the 0 to (n-1)th rows of the given variables
 * @name STORE_ROW_n
 *
 * @param[in] N0        The width of the passed in vector. Supported: 1, 2, 3, 4, 8, 16
 * @param[in] DATA_TYPE The data type of the vectors
 * @param[in] BASENAME  The basename of the variables
 * @param[in] PTR       The base pointer
 * @param[in] STRIDE_Y  The stride value in y-axis direction
 * @param[in] Z         The offset in z-axis direction
 * @{
 */
// Each STORE_ROW_n first expands STORE_ROW_(n-1) (rows 0..n-2) and then stores row n-1.
// Rows 10-15 use the hex digits A-F as variable/offset suffixes (BASENAME##A, Z##A, ...).
#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)                                                 \
    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
    VSTORE(N0)                                                  \
    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
/** @} */ // end of group STORE_ROW_n
192
/** Convert and store the 0th to (n-1)th rows of the given variables
 * @name CONVERT_STORE_ROW_n
 *
 * @param[in] N0        The size of the vectors
 * @param[in] DATA_TYPE The data type of the vectors
 * @param[in] BASENAME  The basename of the variables
 * @param[in] PTR       The base pointer
 * @param[in] STRIDE_Y  The stride value in y-axis direction
 * @param[in] Z         The offset in z-axis direction
 * @{
 */
// Each CONVERT_STORE_ROW_n saturating-converts row n-1 to a vector of DATA_TYPE before
// storing it, and recurses into CONVERT_STORE_ROW_(n-1) for the preceding rows.
#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

// Fix: the second parameter was previously named DATA while the body refers to
// DATA_TYPE, leaving DATA_TYPE unexpanded in every use of CONVERT_STORE_ROW_10..16.
#define CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));

/** @} */ // end of group CONVERT_STORE_ROW_n
284
/** Store a block of the given size M0xN0
 * @name STORE_BLOCK
 *
 * Supported cases are M0=1,2,3,...,16 and N0=2,3,4,8,16.
 * The data to store is expected to have consecutive names for each row.
 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
 * The Z offset is expected to have consecutive names.
 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
 *
 * @param[in] M0        The number of rows to store
 * @param[in] N0        The size of each vector
 * @param[in] BASENAME  The basename of the variables
 * @param[in] DATA_TYPE The data type of the vectors
 * @param[in] PTR       The base pointer
 * @param[in] STRIDE_Y  The stride value in y-axis direction
 * @param[in] Z         The offset in z-axis direction
 * @{
 */
// Two-level indirection: STORE_BLOCK forwards to STORE_BLOCK_STR so that M0 is fully
// macro-expanded before being pasted onto STORE_ROW_##M0.
#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
/** @} */ // end of group STORE_BLOCK
306
/** Convert and store a block of the given size M0xN0
 * @name CONVERT_STORE_BLOCK
 *
 * Supported cases are M0=1,2,3,...,16 and N0=2,3,4,8,16.
 * The data to store is expected to have consecutive names for each row.
 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
 * The Z offset is expected to have consecutive names.
 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
 *
 * @param[in] M0        The number of rows to store
 * @param[in] N0        The size of each vector
 * @param[in] DATA_TYPE The data type of the vectors
 * @param[in] BASENAME  The basename of the variables
 * @param[in] PTR       The base pointer
 * @param[in] STRIDE_Y  The stride value in y-axis direction
 * @param[in] Z         The offset in z-axis direction
 * @{
 */
// Two-level indirection: expand M0 first, then paste onto CONVERT_STORE_ROW_##M0.
#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
/** @} */ // end of group CONVERT_STORE_BLOCK
328
/** Partially store the 0 to (n-1)th rows of the given variables
 * @name STORE_ROW_PARTIAL_n
 * Within each row, store the lower @p STORE_N0 elements of vectors of width @p N0
 *
 * @note in case @p STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
 *
 * @param[in] N0        The width of the passed in vector. Supported: 1, 2, 3, 4, 8, 16
 * @param[in] STORE_N0  The **lower** size of the vectors to store. Supported: [1-16] and <= @p N0
 * @param[in] DATA_TYPE The data type of the vectors
 * @param[in] BASENAME  The basename of the variables
 * @param[in] PTR       The base pointer
 * @param[in] STRIDE_Y  The stride value in y-axis direction
 * @param[in] Z         The offset in z-axis direction
 * @{
 */
// Same recursive ladder as STORE_ROW_n, but each row goes through VSTORE_PARTIAL so
// that only the lower STORE_N0 elements of the N0-wide vector are written.
#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
/** @} */ // end of group STORE_ROW_PARTIAL_n
423
/** Partially store a block of the given size STORE_M0xSTORE_N0
 * @name STORE_BLOCK_PARTIAL
 *
 * @note The vector width @p N0 is also required for correct partial storing behaviour.
 * @note in case @p STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
 *
 * The data to store is expected to have consecutive names for each row.
 * E.g., for STORE_M0=3 and basename=c, the expected names are c0, c1 and c2.
 * The Z offset is expected to have consecutive names.
 * E.g., for STORE_M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
 *
 * @param[in] STORE_M0  The number of rows to store. Supported: 1-16
 * @param[in] STORE_N0  The lower number of elements of vectors to store. Supported: 1-16 and <= @p N0
 * @param[in] N0        The size of each vector. Supported: 1, 2, 3, 4, 8, 16
 * @param[in] DATA_TYPE The data type of the vectors
 * @param[in] BASENAME  The basename of the variables
 * @param[in] PTR       The base pointer
 * @param[in] STRIDE_Y  The stride value in y-axis direction
 * @param[in] Z         The offset in z-axis direction
 * @{
 */
// Two-level indirection: expand STORE_M0 first, then paste onto STORE_ROW_PARTIAL_##STORE_M0.
#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
/** Store a block that can be partial in both x and y dimensions
 *
 * @note in cases @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
 *
 * The data to store is expected to have consecutive names for each row.
 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
 * The Z offset is expected to have consecutive names.
 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
 *
 * @param[in] M0               The number of rows to store, for non-partial blocks. Supported: 1-16
 * @param[in] N0               The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
 * @param[in] DATA_TYPE        The data type of the vectors
 * @param[in] BASENAME         The basename of the variables
 * @param[in] PTR              The base pointer
 * @param[in] STRIDE_Y         The stride value in y-axis direction
 * @param[in] Z                The offset in z-axis direction
 * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported range: [1, @p M0)
 * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported range: [1, @p N0)
 * @param[in] PARTIAL_COND_Y   Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0.
 * @param[in] PARTIAL_COND_X   Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_M0 rather than N0.
 */
// Runtime dispatch over the four corner cases: full block, partial rows only,
// partial columns only, or partial in both dimensions.
#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y))                                                                                                            \
    {                                                                                                                                                     \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                                           \
    }                                                                                                                                                     \
    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X))                                                                                                        \
    {                                                                                                                                                     \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
    }                                                                                                                                                     \
    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X))                                                                                                        \
    {                                                                                                                                                     \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
    }                                                                                                                                                     \
    else                                                                                                                                                  \
    {                                                                                                                                                     \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                               \
    }
/** Store a block that can only be partial in x but not y.
 *
 * @note in case @p N0 or @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
 *
 * The data to store is expected to have consecutive names for each row.
 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
 * The Z offset is expected to have consecutive names.
 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
 *
 * @param[in] M0               The number of rows to store, for non-partial blocks. Supported: 1-16
 * @param[in] N0               The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
 * @param[in] DATA_TYPE        The data type of the vectors
 * @param[in] BASENAME         The basename of the variables
 * @param[in] PTR              The base pointer
 * @param[in] STRIDE_Y         The stride value in y-axis direction
 * @param[in] Z                The offset in z-axis direction
 * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported range: [1, @p N0)
 * @param[in] PARTIAL_COND_X   Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0.
 */
// Always stores M0 full rows; only the row width switches between N0 and PARTIAL_STORE_N0.
#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X))                                                                                         \
    {                                                                                                             \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
    }                                                                                                             \
    else                                                                                                          \
    {                                                                                                             \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
    }
/** Store a block that can only be partial in y but not x.
 *
 * @note in case @p N0 or @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
 * @note The expansion is a bare if/else statement (not wrapped in do-while), so this macro must be used as a full statement.
 *
 * The data to store is expected to have consecutive names for each row.
 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
 * The Z offset is expected to have consecutive names.
 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
 *
 * @param[in] M0               The number of rows to store, for non-partial blocks. Supported: 1-16
 * @param[in] N0               The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
 * @param[in] DATA_TYPE        The data type of the vectors
 * @param[in] BASENAME         The basename of the variables
 * @param[in] PTR              The base pointer
 * @param[in] STRIDE_Y         The stride value in y-axis direction
 * @param[in] Z                The offset in z-axis direction
 * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported range: [1, @p M0)
 * @param[in] PARTIAL_COND_Y   Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0.
 */
#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
    if(!(PARTIAL_COND_Y))                                                                                         \
    {                                                                                                             \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
    }                                                                                                             \
    else                                                                                                          \
    {                                                                                                             \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
    }
541/** @} */ // end of group STORE_BLOCK_PARTIAL
542
#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)

/** Boundary-aware GEMM block store
 * @name STORE_BLOCK_BOUNDARY_AWARE
 * This macro assumes the following schemes to achieve boundary-awareness:
 *  - Overlapping load in Y axis from lhs tensor. This implies lhs has no padding along y dim.
 *  - Non-Overlapping(normal) load from rhs tensor. This implies rhs can have paddings.
 *  - Overlapping load in Y axis from bias tensor. This implies bias has no padding along y dim.
 * The macro then ensures that the dst tensor can be stored without any paddings in both x and y dim.
 *
 * In the y dimension, we place the partial blocks **at the beginning** while in the x dimension, we place the partial
 * blocks **at the end**.
 * Say, the dst tensor is of shape MxN and we have M0 and N0 as the block size, this is how we define "partial blocks"/
 * "boundary block" (we use the 2 terms "partial blocks" and "boundary blocks" interchangeably) and its various parameters:
 *
 *  *--x-->                         x == 0                        x == 1
 *  |                  |<------------------------------N-------------------------->|
 *  y                  |<--------------N0------------->|<----PARTIAL_STORE_N0----->|
 *  |     -------------#############################################################
 *  *     |          | |...............................|...........................|
 * y == 0 | PAR_..._M0 |......Boundary block in y......|.Boundary block in x and y.|
 *        |          | |...............................|...........................|
 *        M          --#############################################################
 *        |          | |                               |...........................|
 * y == 1 |         M0 |      Non-boundary block       |....Boundary block in x....|
 *        |          | |                               |...........................|
 *        |------------#############################################################
 *
 * Then @p PARTIAL_STORE_M0 = M % M0      and @p PARTIAL_STORE_N0 = N % N0
 *
 * @note in cases @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
 *
 * It automatically detects if a given M,N,M0,N0 combination can yield partial blocks in either X and Y dimension,
 * and select corresponding store methods such that the boundary detection logic is only added when needed.
 *
 * The data to store is expected to have consecutive names for each row.
 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
 * The Z offset is expected to have consecutive names.
 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
 *
 * @param[in] M0               The number of rows to store, for non-partial blocks. Supported: 1-16
 * @param[in] N0               The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
 * @param[in] DATA_TYPE        The data type of the vectors
 * @param[in] BASENAME         The basename of the variables
 * @param[in] PTR              The base pointer
 * @param[in] STRIDE_Y         The stride value in y-axis direction
 * @param[in] Z                The offset in z-axis direction
 * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported: [0, @p M0)
 * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported: [0, @p N0)
 * @param[in] PARTIAL_COND_Y   Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0.
 * @param[in] PARTIAL_COND_X   Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0.
 * @{
 */
// The four cases below are resolved at compile time from PARTIAL_STORE_M0/N0, so the runtime
// boundary checks (PARTIAL_COND_*) are only emitted in the kernels that actually need them.
#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
// Case1: No partial blocks in either x or y
#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
// Case2: Partial blocks in y
#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)

#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
// Case3: Partial blocks in x
#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)

#else // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
// Case4: Partial blocks in both x and y
#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)

#endif // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0

#endif    // defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
/** @} */ // end of group STORE_BLOCK_BOUNDARY_AWARE
620
#if defined(PARTIAL_STORE_M0)
/** Compute the start m0 row (LHS, BIAS and DST) in a boundary-aware way so as to avoid padding
 * @name COMPUTE_M0_START_ROW
 * If there're any partial blocks in y dimension, they are placed at the beginning of the rows.
 * This shift amount is added to all rows such that the partial block (at the beginning) overlaps with the subsequent
 * blocks in the y dimension to avoid any padding.
 * EG: M0=4, PARTIAL_STORE_M0=1:
 *                  | Non-overlapping | +M0_ROW_SHIFT (Overlapping)
 * block 0 (partial)| start row = 0   | start row = 0
 * block 1 (full)   | start row = 4   | start row = 1
 * block 2 (full)   | start row = 8   | start row = 5
 *
 * @param[in] y                Global id of current block in y.
 * @param[in] M0               The number of rows to store, for non-partial blocks. Supported: 1-16
 * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported: [0, @p M0)
 * @{
 */
// Shift = (M0 - PARTIAL_STORE_M0) % M0: the "% M0" folds the PARTIAL_STORE_M0 == 0 case
// (no partial block) into a zero shift, and max(0, ...) clamps the first block to row 0.
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0))))
#else // defined(PARTIAL_STORE_M0)
// No boundary-awareness requested: every block starts at its natural, non-overlapping row.
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(y * M0))
#endif    // defined(PARTIAL_STORE_M0)
/** @} */ // end of group COMPUTE_M0_START_ROW
645
/** Store a vector that can only be partial in x.
 *
 * @note in case @p vec_size or @p leftover != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
 *
 * The data to store is expected to end in a 0.
 * E.g., for basename=c, the expected name is c0.
 *
 * @param[in] basename  The name of the variable without trailing 0
 * @param[in] data_type The data type of the vector
 * @param[in] ptr       The base pointer
 * @param[in] vec_size  The vector size if cond = false. Supported: 1, 2, 3, 4, 8, 16
 * @param[in] leftover  The vector size if cond = true. Supported range: [1, @p vec_size)
 * @param[in] cond      Condition to select either @p vec_size or @p leftover
 * @{
 */
// A single vector is a 1-row block: reuse the block store with M0 = 1 and no y stride/offset.
#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)
/** @} */ // end of group STORE_VECTOR_SELECT
663/** @} */ // end of group STORE_VECTOR_SELECT
664
// Enable optional OpenCL extensions only when both the build-time flag and the
// device-reported capability macro are present.
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)

#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)

#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)

#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
#pragma OPENCL EXTENSION cl_arm_printf : enable
#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)

// GPU architecture identifiers used for compile-time arch-specific code paths.
#define GPU_ARCH_MIDGARD 0x100
#define GPU_ARCH_BIFROST 0x200
683
/** Concatenate two inputs.
 *
 * @param[in] a The first input to be concatenated
 * @param[in] b The second input to be concatenated
 *
 * @return The concatenated output
 */
#define CONCAT(a, b) a##b

/** Expand the given vector
 *
 * @param[in] x The vector to be expanded
 *
 * @return The expanded output
 */
#define EXPAND(x) x

/** Clamp the given value between an upper and lower bound.
 *
 * @note Function-like macro: @p x, @p min_val and @p max_val may each be evaluated more than
 *       once, so avoid passing expressions with side effects.
 *
 * @param[in] x       The value to be clamped
 * @param[in] min_val The lower bound
 * @param[in] max_val The upper bound
 *
 * @return The clamped value.
 */
#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
710
/** REVn reverses the given vector whose size is n.
 * @name REVn
 *
 * Implemented with OpenCL swizzles, e.g. REV4 maps (s0,s1,s2,s3) -> (s3,s2,s1,s0).
 *
 * @param[in] x The vector to be reversed
 *
 * @return The reversed vector
 * @{
 */
#define REV1(x) ((x))
#define REV2(x) ((x).s10)
#define REV3(x) ((x).s210)
#define REV4(x) ((x).s3210)
#define REV8(x) ((x).s76543210)
#define REV16(x) ((x).sFEDCBA9876543210)
/** @} */ // end of group REVn

/** Reverse the given vector.
 * @name REVERSE
 *
 * The two-level expansion (REVERSE -> REVERSE_STR) lets @p s be a macro itself:
 * it is fully expanded before being pasted onto REV.
 *
 * @param[in] x The vector to be reversed
 * @param[in] s The size of the vector
 *
 * @return The reversed vector
 * @{
 */
#define REVERSE_STR(x, s) REV##s((x))
#define REVERSE(x, s) REVERSE_STR(x, s)
/** @} */ // end of group REVERSE
739
/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
 * @name ROTs_n
 *
 * ROTs_n rotates an s-element vector right by n positions via a swizzle,
 * e.g. ROT4_1 maps (s0,s1,s2,s3) -> (s3,s0,s1,s2).
 *
 * @param[in] x The vector to be shifted
 *
 * @return The shifted vector
 * @{
 */
#define ROT1_0(x) ((x))

#define ROT2_0(x) ((x))
#define ROT2_1(x) ((x).s10)

#define ROT3_0(x) ((x))
#define ROT3_1(x) ((x).s201)
#define ROT3_2(x) ((x).s120)

#define ROT4_0(x) ((x))
#define ROT4_1(x) ((x).s3012)
#define ROT4_2(x) ((x).s2301)
#define ROT4_3(x) ((x).s1230)

#define ROT8_0(x) ((x))
#define ROT8_1(x) ((x).s70123456)
#define ROT8_2(x) ((x).s67012345)
#define ROT8_3(x) ((x).s56701234)
#define ROT8_4(x) ((x).s45670123)
#define ROT8_5(x) ((x).s34567012)
#define ROT8_6(x) ((x).s23456701)
#define ROT8_7(x) ((x).s12345670)

#define ROT16_0(x) ((x))
#define ROT16_1(x) ((x).sF0123456789ABCDE)
#define ROT16_2(x) ((x).sEF0123456789ABCD)
#define ROT16_3(x) ((x).sDEF0123456789ABC)
#define ROT16_4(x) ((x).sCDEF0123456789AB)
#define ROT16_5(x) ((x).sBCDEF0123456789A)
#define ROT16_6(x) ((x).sABCDEF0123456789)
#define ROT16_7(x) ((x).s9ABCDEF012345678)
#define ROT16_8(x) ((x).s89ABCDEF01234567)
#define ROT16_9(x) ((x).s789ABCDEF0123456)
#define ROT16_10(x) ((x).s6789ABCDEF012345)
#define ROT16_11(x) ((x).s56789ABCDEF01234)
#define ROT16_12(x) ((x).s456789ABCDEF0123)
#define ROT16_13(x) ((x).s3456789ABCDEF012)
#define ROT16_14(x) ((x).s23456789ABCDEF01)
#define ROT16_15(x) ((x).s123456789ABCDEF0)
/** @} */ // end of group ROTs_n

/** Circular-right-shift (rotate-right) the given vector by the given amount.
 * @name ROTATE
 *
 * Two-level expansion so @p s and @p n are macro-expanded before token pasting.
 *
 * @param[in] x The vector to be shifted
 * @param[in] s The size of the vector. Supported: 1, 2, 3, 4, 8, 16
 * @param[in] n The amount to be shifted. Supported range: [0, @p s)
 *
 * @return The shifted vector
 * @{
 */
#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
/** @} */ // end of group ROTATE
802
/** Creates a vector of size n filled with offset values corresponding to the location of each element.
 * @name V_OFFSn
 *
 * @param[in] dt The data type of the output vector
 *
 * @return The vector filled with offset values
 * @{
 */
#define V_OFFS1(dt) (dt##1)(0)
#define V_OFFS2(dt) (dt##2)(0, 1)
#define V_OFFS3(dt) (dt##3)(0, 1, 2)
#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3)
#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
/** @} */ // end of group V_OFFSn

/** Create a vector filled with offset values corresponding to the location of each element.
 * @name VEC_OFFS
 *
 * Two-level expansion so @p s can itself be a macro (e.g. a -D build definition).
 *
 * @param[in] dt The data type of the output vector
 * @param[in] s  The size of the output vector. Supported: 1, 2, 3, 4, 8, 16
 *
 * @return The vector filled with offset values
 * @{
 */
#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
/** @} */ // end of group VEC_OFFS

// Size-parameterized vload, e.g. VLOAD(4) -> vload4.
#define VLOAD_STR(size) vload##size
#define VLOAD(size) VLOAD_STR(size)

// Number of float4/half4 texels covered by a vector of 4/8/16 elements.
#define PIXEL_UNIT4 1
#define PIXEL_UNIT8 2
#define PIXEL_UNIT16 4

/** Utility macro to convert a vector size in pixel unit.
 *
 * @name CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT
 *
 * @param[in] vec_size Vector size. Only 4,8 and 16 is supported
 *
 * @return The pixel unit (number of pixels)
 * @{
 */
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)
/** @} */ // end of group CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT
851
// Read 1/2/4 consecutive float4 texels starting at (x_coord, y_coord) and widen them into a
// single float4/float8/float16 value.
// NOTE(review): the trailing ';' is part of each expansion, so the macro forms a complete
// statement by itself — callers appear to rely on this and must not be used inside expressions.
#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));

// Half-precision variants; only available when the fp16 extension is enabled above.
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
861
/** Utility macro to read a 2D OpenCL image object.
 *
 * @note Coordinates are not normalized
 * @note The expansion ends with a ';' (inherited from read_image2d_*), so this macro forms a
 *       complete statement and must not be embedded inside a larger expression.
 *
 * @param[in] data_type Data type
 * @param[in] n0        Number of pixel to read. Only 1,2 and 4 is supported
 * @param[in] img       OpenCL image object
 * @param[in] x_coord   The x coordinate for the top-left pixel
 * @param[in] y_coord   The y coordinate for the top-left pixel
 *
 * @return Pixels from the 2D OpenCL image object
 * @{
 */
#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)

// Size-parameterized vstore, e.g. VSTORE(4) -> vstore4.
#define VSTORE_STR(size) vstore##size
#define VSTORE(size) VSTORE_STR(size)
880
// Map the pseudo vector-of-1 type names (e.g. "float1") onto their scalar types so that
// VEC_DATA_TYPE(type, 1) works uniformly with the real vector widths.
#define float1 float
#define half1 half
#define char1 char
#define uchar1 uchar
#define short1 short
#define ushort1 ushort
#define int1 int
#define uint1 uint
#define long1 long
#define ulong1 ulong
#define double1 double

// Scalar counterparts of vloadN/vstoreN: OFFSET is an element (not byte) offset, matching
// the OpenCL vloadN/vstoreN addressing convention.
#define vload1(OFFSET, PTR) *(OFFSET + PTR)
#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
895
/** Extended partial vstore that correctly handles scalar values as well.
 * Store the **lower** 0 to (n-1)th elements of the given vector while minimising the amount of vstore ops
 * @name VSTORE_PARTIAL
 *
 * @note With this macro, the passed data can be both a vector and a scalar
 * @note @p store_size needs to be <= @p size
 * eg 1: Valid
 * VSTORE_PARTIAL(16, 15) ...;
 * eg 2: Invalid
 * VSTORE_PARTIAL(4, 7) ...;
 *
 * @param[in] size       The width of @p DATA. Supported values: 1(scalar), 2, 3, 4, 8, 16
 * @param[in] store_size The number of lower elements to store. Supported values: 1-16, but has to be <= @p size
 * @{
 */
#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)

// No-op store used as the dispatch target for invalid (store_size > size) and
// zero-sized combinations in the tables below; all arguments are ignored.
#define NO_STORE(data, offs, ptr) \
    {                             \
    }
917
// Dispatch table mapping (size, store_size) pairs onto a concrete partial-store
// implementation. Entries with store_size > size are invalid and resolve to NO_STORE.
// Size == 1 (scalar)
#define vstore_partial_1_0 NO_STORE
#define vstore_partial_1_1 vstore1
#define vstore_partial_1_2 NO_STORE
#define vstore_partial_1_3 NO_STORE
#define vstore_partial_1_4 NO_STORE
#define vstore_partial_1_5 NO_STORE
#define vstore_partial_1_6 NO_STORE
#define vstore_partial_1_7 NO_STORE
#define vstore_partial_1_8 NO_STORE
#define vstore_partial_1_9 NO_STORE
#define vstore_partial_1_10 NO_STORE
#define vstore_partial_1_11 NO_STORE
#define vstore_partial_1_12 NO_STORE
#define vstore_partial_1_13 NO_STORE
#define vstore_partial_1_14 NO_STORE
#define vstore_partial_1_15 NO_STORE
#define vstore_partial_1_16 NO_STORE
// Size == 2
#define vstore_partial_2_0 NO_STORE
#define vstore_partial_2_1 vstore_partial_1
#define vstore_partial_2_2 vstore_partial_2
#define vstore_partial_2_3 NO_STORE
#define vstore_partial_2_4 NO_STORE
#define vstore_partial_2_5 NO_STORE
#define vstore_partial_2_6 NO_STORE
#define vstore_partial_2_7 NO_STORE
#define vstore_partial_2_8 NO_STORE
#define vstore_partial_2_9 NO_STORE
#define vstore_partial_2_10 NO_STORE
#define vstore_partial_2_11 NO_STORE
#define vstore_partial_2_12 NO_STORE
#define vstore_partial_2_13 NO_STORE
#define vstore_partial_2_14 NO_STORE
#define vstore_partial_2_15 NO_STORE
#define vstore_partial_2_16 NO_STORE
// Size == 3
#define vstore_partial_3_0 NO_STORE
#define vstore_partial_3_1 vstore_partial_1
#define vstore_partial_3_2 vstore_partial_2
#define vstore_partial_3_3 vstore_partial_3
#define vstore_partial_3_4 NO_STORE
#define vstore_partial_3_5 NO_STORE
#define vstore_partial_3_6 NO_STORE
#define vstore_partial_3_7 NO_STORE
#define vstore_partial_3_8 NO_STORE
#define vstore_partial_3_9 NO_STORE
#define vstore_partial_3_10 NO_STORE
#define vstore_partial_3_11 NO_STORE
#define vstore_partial_3_12 NO_STORE
#define vstore_partial_3_13 NO_STORE
#define vstore_partial_3_14 NO_STORE
#define vstore_partial_3_15 NO_STORE
#define vstore_partial_3_16 NO_STORE
// Size == 4
#define vstore_partial_4_0 NO_STORE
#define vstore_partial_4_1 vstore_partial_1
#define vstore_partial_4_2 vstore_partial_2
#define vstore_partial_4_3 vstore_partial_3
#define vstore_partial_4_4 vstore_partial_4
#define vstore_partial_4_5 NO_STORE
#define vstore_partial_4_6 NO_STORE
#define vstore_partial_4_7 NO_STORE
#define vstore_partial_4_8 NO_STORE
#define vstore_partial_4_9 NO_STORE
#define vstore_partial_4_10 NO_STORE
#define vstore_partial_4_11 NO_STORE
#define vstore_partial_4_12 NO_STORE
#define vstore_partial_4_13 NO_STORE
#define vstore_partial_4_14 NO_STORE
#define vstore_partial_4_15 NO_STORE
#define vstore_partial_4_16 NO_STORE
// Size == 8
#define vstore_partial_8_0 NO_STORE
#define vstore_partial_8_1 vstore_partial_1
#define vstore_partial_8_2 vstore_partial_2
#define vstore_partial_8_3 vstore_partial_3
#define vstore_partial_8_4 vstore_partial_4
#define vstore_partial_8_5 vstore_partial_5
#define vstore_partial_8_6 vstore_partial_6
#define vstore_partial_8_7 vstore_partial_7
#define vstore_partial_8_8 vstore_partial_8
#define vstore_partial_8_9 NO_STORE
#define vstore_partial_8_10 NO_STORE
#define vstore_partial_8_11 NO_STORE
#define vstore_partial_8_12 NO_STORE
#define vstore_partial_8_13 NO_STORE
#define vstore_partial_8_14 NO_STORE
#define vstore_partial_8_15 NO_STORE
#define vstore_partial_8_16 NO_STORE
// Size == 16
#define vstore_partial_16_0 NO_STORE
#define vstore_partial_16_1 vstore_partial_1
#define vstore_partial_16_2 vstore_partial_2
#define vstore_partial_16_3 vstore_partial_3
#define vstore_partial_16_4 vstore_partial_4
#define vstore_partial_16_5 vstore_partial_5
#define vstore_partial_16_6 vstore_partial_6
#define vstore_partial_16_7 vstore_partial_7
#define vstore_partial_16_8 vstore_partial_8
#define vstore_partial_16_9 vstore_partial_9
#define vstore_partial_16_10 vstore_partial_10
#define vstore_partial_16_11 vstore_partial_11
#define vstore_partial_16_12 vstore_partial_12
#define vstore_partial_16_13 vstore_partial_13
#define vstore_partial_16_14 vstore_partial_14
#define vstore_partial_16_15 vstore_partial_15
#define vstore_partial_16_16 vstore_partial_16
1026
/** Partial vstore. Store the **lower** 0 to (n-1)th elements of the given vector while minimising the amount of vstore ops
 * @name vstore_partial_n
 *
 * @note @p DATA needs to be a vector not a scalar
 * @note n needs to be <= the vector width of the input variable @p DATA
 * eg 1: Valid
 * vstore_partial_15(var:float16, 0, 0xabcd);
 * eg 2: Invalid
 * vstore_partial_7(var:float4, 0, 0xabcd);
 *
 * @note in cases n == 1, 2, 3, 4, 8, 16, no extra vstore is invoked, thus there's no performance penalty.
 *
 * @param[in] DATA   The name of the variable
 * @param[in] OFFSET Offset in n
 * @param[in] PTR    The base pointer
 * @{
 */
#define vstore_partial_1(DATA, OFFSET, PTR) \
    vstore1(DATA.s0, OFFSET, PTR);

#define vstore_partial_2(DATA, OFFSET, PTR) \
    vstore2(DATA.s01, OFFSET, PTR);

#define vstore_partial_3(DATA, OFFSET, PTR) \
    vstore3(DATA.s012, OFFSET, PTR);

#define vstore_partial_4(DATA, OFFSET, PTR) \
    vstore4(DATA.s0123, OFFSET, PTR);

// Sizes 5-7 decompose into a vstore4 plus a 1-3 element tail at PTR + 4.
#define vstore_partial_5(DATA, OFFSET, PTR)    \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore1(DATA.s4, OFFSET, PTR + 4);

#define vstore_partial_6(DATA, OFFSET, PTR)    \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore_partial_2(DATA.s45, OFFSET, PTR + 4);

#define vstore_partial_7(DATA, OFFSET, PTR)    \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore_partial_3(DATA.s456, OFFSET, PTR + 4);

#define vstore_partial_8(DATA, OFFSET, PTR) \
    vstore8(DATA.s01234567, OFFSET, PTR);

// Sizes 9-15 decompose into a vstore8 plus a 1-7 element tail at PTR + 8.
#define vstore_partial_9(DATA, OFFSET, PTR)        \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore1(DATA.s8, OFFSET, PTR + 8);

#define vstore_partial_10(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_2(DATA.s89, OFFSET, PTR + 8);

#define vstore_partial_11(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_3(DATA.s89a, OFFSET, PTR + 8);

#define vstore_partial_12(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8);

// 13-15 pass the full 8-wide upper swizzle .s89abcdef to vstore_partial_5/6/7,
// which only touch its lowest 5/6/7 elements.
#define vstore_partial_13(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_14(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_15(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_16(DATA, OFFSET, PTR) \
    vstore16(DATA, OFFSET, PTR);
/** @} */ // end of group vstore_partial_n
/** @} */ // end of group VSTORE_PARTIAL
1103
// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
// without _sat to overcome this issue
#define convert_float_sat convert_float
#define convert_float1_sat convert_float
#define convert_float2_sat convert_float2
#define convert_float3_sat convert_float3
#define convert_float4_sat convert_float4
#define convert_float8_sat convert_float8
#define convert_float16_sat convert_float16
// Fix: this previously mapped to convert_float, which yields a float (not half) result for a
// scalar CONVERT_SAT(x, half). Map to convert_half for consistency with convert_half1_sat and
// the vector variants below.
#define convert_half_sat convert_half
#define convert_half1_sat convert_half
#define convert_half2_sat convert_half2
#define convert_half3_sat convert_half3
#define convert_half4_sat convert_half4
#define convert_half8_sat convert_half8
#define convert_half16_sat convert_half16
1120
// Map the pseudo "convert to 1-element vector" names onto the scalar convert built-ins,
// so CONVERT(x, VEC_DATA_TYPE(type, 1)) works uniformly with the real vector widths.
#define convert_float1 convert_float
#define convert_half1 convert_half
#define convert_char1 convert_char
#define convert_uchar1 convert_uchar
#define convert_short1 convert_short
#define convert_ushort1 convert_ushort
#define convert_int1 convert_int
#define convert_uint1 convert_uint
#define convert_long1 convert_long
#define convert_ulong1 convert_ulong
#define convert_double1 convert_double

// Saturating scalar aliases for the integer types (the _sat built-ins exist for these,
// unlike the floating-point cases handled above).
#define convert_char1_sat convert_char_sat
#define convert_uchar1_sat convert_uchar_sat
#define convert_short1_sat convert_short_sat
#define convert_ushort1_sat convert_ushort_sat
#define convert_int1_sat convert_int_sat
#define convert_uint1_sat convert_uint_sat
#define convert_long1_sat convert_long_sat
#define convert_ulong1_sat convert_ulong_sat
#define convert_double1_sat convert_double_sat
1142
// Build a vector type name from an element type and a width, e.g. (float, 4) -> float4.
// Two-level expansion so both arguments are macro-expanded before token pasting.
#define VEC_DATA_TYPE_STR(type, size) type##size
#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)

// Non-saturating conversion: CONVERT(x, int4) -> convert_int4(x).
#define CONVERT_STR(x, type) (convert_##type((x)))
#define CONVERT(x, type) CONVERT_STR(x, type)

// Saturating conversion; relies on the convert_*_sat aliases defined above for float/half.
#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)

// Saturating conversion with an explicit rounding mode, e.g. round = rte, rtz.
#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
1154
// Comparison/mask type used with select() for each element type: an integer type of the
// same element width. Hence half -> short and float -> int (same size, integer class).
#define select_vec_dt_uchar(size) uchar##size
#define select_vec_dt_char(size) char##size
#define select_vec_dt_ushort(size) ushort##size
#define select_vec_dt_short(size) short##size
#define select_vec_dt_half(size) short##size
#define select_vec_dt_uint(size) uint##size
#define select_vec_dt_int(size) int##size
#define select_vec_dt_float(size) int##size
#define select_vec_dt_ulong(size) ulong##size
#define select_vec_dt_long(size) long##size

#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)
1169
// Horizontal sum of an n-element vector, built recursively by pairwise halving.
// Note the fixed association order: changing it could alter floating-point results.
#define sum_reduce_1(x) (x)
#define sum_reduce_2(x) ((x).s0) + ((x).s1)
#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2)
#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567)
#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF)

#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x)
#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)

// Horizontal maximum of an n-element vector, same pairwise-halving structure.
#define max_reduce_1(x) (x)
#define max_reduce_2(x) max(((x).s0), ((x).s1))
#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))

#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)
1189
// Expand to the kernel-argument list describing an N-dimensional tensor named 'name':
// a base byte pointer, per-dimension stride/step pairs (in bytes), and the byte offset
// of the first addressable element.
#define VECTOR_DECLARATION(name)     \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_offset_first_element_in_bytes

#define IMAGE_DECLARATION(name)      \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_stride_y, \
    uint        name##_step_y,   \
    uint        name##_offset_first_element_in_bytes

#define TENSOR3D_DECLARATION(name)   \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_stride_y, \
    uint        name##_step_y,   \
    uint        name##_stride_z, \
    uint        name##_step_z,   \
    uint        name##_offset_first_element_in_bytes

#define TENSOR4D_DECLARATION(name)   \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_stride_y, \
    uint        name##_step_y,   \
    uint        name##_stride_z, \
    uint        name##_step_z,   \
    uint        name##_stride_w, \
    uint        name##_step_w,   \
    uint        name##_offset_first_element_in_bytes
1225
/* Glue between the *_DECLARATION argument lists and the update_* helpers:
 * each CONVERT_* macro forwards the name##_* kernel arguments to the matching
 * wrapper function. The _NO_STEP variants pass 0 for the step arguments, so
 * the returned pointer is not advanced per work-item along those dimensions.
 *
 * Fix applied: CONVERT_TENSOR3D_TO_IMAGE_STRUCT was previously defined twice
 * with an identical body; the redundant duplicate has been removed. */
#define CONVERT_TO_VECTOR_STRUCT(name) \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)

#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)

#define CONVERT_TO_IMAGE_STRUCT(name) \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)

#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)

#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)

/* NOTE(review): unlike the other _NO_STEP variants, this one keeps
 * name##_step_z (instead of passing 0) for the Z dimension — looks
 * intentional so the Z displacement is still applied; confirm with callers. */
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)

#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z)

#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)

#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)

#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)

#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name)                                                                                       \
    tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                           name##_stride_z, name##_step_z)
1264
/** Structure to hold Vector information */
typedef struct Vector
{
    __global uchar *ptr;                           /**< Pointer to the starting position of the buffer */
    int             offset_first_element_in_bytes; /**< The offset of the first element in the source vector */
    int             stride_x;                      /**< Stride of the vector in X dimension (in bytes) */
} Vector;
1272
/** Structure to hold Image information */
typedef struct Image
{
    __global uchar *ptr;                           /**< Pointer to the starting position of the buffer */
    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
} Image;
1281
/** Structure to hold 3D tensor information */
typedef struct Tensor3D
{
    __global uchar *ptr;                           /**< Pointer to the starting position of the buffer */
    int             offset_first_element_in_bytes; /**< The offset of the first element in the source tensor */
    int             stride_x;                      /**< Stride of the tensor in X dimension (in bytes) */
    int             stride_y;                      /**< Stride of the tensor in Y dimension (in bytes) */
    int             stride_z;                      /**< Stride of the tensor in Z dimension (in bytes) */
} Tensor3D;
1291
/** Structure to hold 4D tensor information */
typedef struct Tensor4D
{
    __global uchar *ptr;                           /**< Pointer to the starting position of the buffer */
    int             offset_first_element_in_bytes; /**< The offset of the first element in the source tensor */
    int             stride_x;                      /**< Stride of the tensor in X dimension (in bytes) */
    int             stride_y;                      /**< Stride of the tensor in Y dimension (in bytes) */
    int             stride_z;                      /**< Stride of the tensor in Z dimension (in bytes) */
    int             stride_w;                      /**< Stride of the tensor in W dimension (in bytes) */
} Tensor4D;
1302
/** Wrap vector information into a Vector structure and advance its pointer to this work-item's data.
 *
 * @param[in] ptr                           Pointer to the starting position of the buffer
 * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
 * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
 * @param[in] step_x                        stride_x * number of elements along X processed per workitem (in bytes)
 *
 * @return A Vector object positioned for the current work-item
 */
inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
{
    Vector v;
    v.ptr                           = ptr;
    v.offset_first_element_in_bytes = offset_first_element_in_bytes;
    v.stride_x                      = stride_x;
    // Skip the header offset, then move to this work-item's slice along X.
    v.ptr += v.offset_first_element_in_bytes + get_global_id(0) * step_x;
    return v;
}
1323
/** Wrap image information into an Image structure and advance its pointer to this work-item's data.
 *
 * @param[in] ptr                           Pointer to the starting position of the buffer
 * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
 * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
 * @param[in] step_x                        stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
 * @param[in] step_y                        stride_y * number of elements along Y processed per workitem (in bytes)
 *
 * @return An Image object positioned for the current work-item
 */
inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
{
    Image image;
    image.ptr                           = ptr;
    image.offset_first_element_in_bytes = offset_first_element_in_bytes;
    image.stride_x                      = stride_x;
    image.stride_y                      = stride_y;
    // Skip the header offset, then jump to this work-item's (X, Y) position.
    image.ptr += image.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
    return image;
}
1347
/** Collapse 3D tensor arguments into an Image structure whose pointer is positioned for this work-item.
 *
 * The Z dimension contributes only to the pointer displacement; the returned
 * Image keeps strides for X and Y alone.
 *
 * @param[in] ptr                           Pointer to the starting position of the buffer
 * @param[in] offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in] stride_x                      Stride of the tensor in X dimension (in bytes)
 * @param[in] step_x                        stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] stride_y                      Stride of the tensor in Y dimension (in bytes)
 * @param[in] step_y                        stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] stride_z                      Stride of the tensor in Z dimension (in bytes)
 * @param[in] step_z                        stride_z * number of elements along Z processed per workitem (in bytes)
 *
 * @return An Image object positioned for the current work-item
 */
inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Image image;
    image.ptr                           = ptr;
    image.offset_first_element_in_bytes = offset_first_element_in_bytes;
    image.stride_x                      = stride_x;
    image.stride_y                      = stride_y;
    // Displace by X, Y and Z steps; Z is folded into the base pointer.
    image.ptr += image.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
    return image;
}
1373
/** Wrap 3D tensor information into a Tensor3D structure and advance its pointer to this work-item's data.
 *
 * @param[in] ptr                           Pointer to the starting position of the buffer
 * @param[in] offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in] stride_x                      Stride of the tensor in X dimension (in bytes)
 * @param[in] step_x                        stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] stride_y                      Stride of the tensor in Y dimension (in bytes)
 * @param[in] step_y                        stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] stride_z                      Stride of the tensor in Z dimension (in bytes)
 * @param[in] step_z                        stride_z * number of elements along Z processed per workitem (in bytes)
 *
 * @return A Tensor3D object positioned for the current work-item
 */
inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Tensor3D t;
    t.ptr                           = ptr;
    t.offset_first_element_in_bytes = offset_first_element_in_bytes;
    t.stride_x                      = stride_x;
    t.stride_y                      = stride_y;
    t.stride_z                      = stride_z;
    // Skip the header offset, then jump to this work-item's (X, Y, Z) position.
    t.ptr += t.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
    return t;
}
1400
/** Wrap 3D tensor information into a Tensor3D structure WITHOUT advancing the pointer.
 *
 * The step_* parameters are accepted but deliberately unused: the signature
 * mirrors update_tensor3D_workitem_ptr() so the CONVERT_TO_TENSOR3D_STRUCT_*
 * macros can forward the same argument list to either helper.
 *
 * @param[in] ptr                           Pointer to the starting position of the buffer
 * @param[in] offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in] stride_x                      Stride of the tensor in X dimension (in bytes)
 * @param[in] step_x                        Unused
 * @param[in] stride_y                      Stride of the tensor in Y dimension (in bytes)
 * @param[in] step_y                        Unused
 * @param[in] stride_z                      Stride of the tensor in Z dimension (in bytes)
 * @param[in] step_z                        Unused
 *
 * @return A Tensor3D object whose ptr still points at the buffer start
 */
inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Tensor3D t;
    t.ptr                           = ptr;
    t.offset_first_element_in_bytes = offset_first_element_in_bytes;
    t.stride_x                      = stride_x;
    t.stride_y                      = stride_y;
    t.stride_z                      = stride_z;
    return t;
}
1426
/** Wrap 4D tensor information into a Tensor4D structure and advance its pointer to this work-item's data.
 *
 * @param[in] ptr                           Pointer to the starting position of the buffer
 * @param[in] offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in] stride_x                      Stride of the tensor in X dimension (in bytes)
 * @param[in] step_x                        stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] stride_y                      Stride of the tensor in Y dimension (in bytes)
 * @param[in] step_y                        stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] stride_z                      Stride of the tensor in Z dimension (in bytes)
 * @param[in] step_z                        stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in] stride_w                      Stride of the tensor in W dimension (in bytes)
 * @param[in] step_w                        stride_w * number of elements along W processed per workitem (in bytes)
 * @param[in] mod_size                      Divisor splitting get_global_id(2) into Z (id % mod_size) and W (id / mod_size)
 *                                          indices — presumably the tensor depth; confirm with callers
 *
 * @return A Tensor4D object positioned for the current work-item
 */
inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
                                             uint step_w,
                                             uint mod_size)
{
    Tensor4D t;
    t.ptr                           = ptr;
    t.offset_first_element_in_bytes = offset_first_element_in_bytes;
    t.stride_x                      = stride_x;
    t.stride_y                      = stride_y;
    t.stride_z                      = stride_z;
    t.stride_w                      = stride_w;
    // The third global id encodes both Z and W: z = id2 % mod_size, w = id2 / mod_size.
    t.ptr += t.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
    return t;
}
1444
/** Compute the address of the element at relative position @p x in a Vector.
 *
 * @param[in] vec Pointer to the vector structure
 * @param[in] x   Relative X position (in elements)
 */
inline __global const uchar *vector_offset(const Vector *vec, int x)
{
    const int displacement = x * vec->stride_x;
    return vec->ptr + displacement;
}
1454
/** Compute the address of the element at relative position (@p x, @p y) in an Image.
 *
 * @param[in] img Pointer to the image structure
 * @param[in] x   Relative X position (in elements)
 * @param[in] y   Relative Y position (in elements)
 */
inline __global uchar *offset(const Image *img, int x, int y)
{
    const int displacement = x * img->stride_x + y * img->stride_y;
    return img->ptr + displacement;
}
1465
/** Compute the address of the element at relative position (@p x, @p y, @p z) in a Tensor3D.
 *
 * @param[in] tensor Pointer to the tensor structure
 * @param[in] x      Relative X position (in elements)
 * @param[in] y      Relative Y position (in elements)
 * @param[in] z      Relative Z position (in elements)
 */
inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
{
    int displacement = x * tensor->stride_x;
    displacement += y * tensor->stride_y;
    displacement += z * tensor->stride_z;
    return tensor->ptr + displacement;
}
1477
/** Compute the address of the element at relative position (@p x, @p y, @p z, @p w) in a Tensor4D.
 *
 * @param[in] tensor Pointer to the tensor structure
 * @param[in] x      Relative X position (in elements)
 * @param[in] y      Relative Y position (in elements)
 * @param[in] z      Relative Z position (in elements)
 * @param[in] w      Relative W position (in elements)
 */
inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
{
    int displacement = x * tensor->stride_x;
    displacement += y * tensor->stride_y;
    displacement += z * tensor->stride_z;
    displacement += w * tensor->stride_w;
    return tensor->ptr + displacement;
}
1490
/** Translate a linear element index into a pointer inside a Tensor3D.
 *
 * The linear index is decomposed row-major as index = (z * height + y) * width + x.
 *
 * @param[in] tensor Pointer to the tensor structure
 * @param[in] width  Width of the input tensor
 * @param[in] height Height of the input tensor
 * @param[in] depth  Depth of the input tensor (unused here — presumably kept
 *                   for signature symmetry; confirm with callers)
 * @param[in] index  Linear element index
 */
inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
{
    const uint plane_elems = width * height;

    const uint z   = index / plane_elems;
    const uint rem = index % plane_elems;
    const uint y   = rem / width;
    const uint x   = rem % width;

    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes;
}
1515
1516#endif // _HELPER_H
1517
/** Convert the given vector with round to nearest even rounding mode.
 *
 * Expands to the OpenCL builtin convert_<type>_rte(x), so `type` must be a
 * valid destination (vector) type name.
 *
 * @param[in] x    The target to be converted
 * @param[in] type The target type
 *
 * @return The converted vector
 */
#define CONVERT_DOWN_RTE_STR(x, type) (convert_##type##_rte((x)))
#define CONVERT_DOWN_RTE(x, type) CONVERT_DOWN_RTE_STR(x, type)
1527
/** Quantize a floating-point scalar value to 8-bit asymmetric.
 *
 * Maps the value into the quantized domain (input / scale + offset), rounds
 * to nearest-even and saturates into the uchar range.
 *
 * @param[in] input  Input value to quantize
 * @param[in] offset Quantization offset
 * @param[in] scale  Quantization scale
 *
 * @return quantized value
 */
inline uchar quantize_qasymm8(float input, float offset, float scale)
{
    const float quantized_f32 = input / scale + offset;
    return CONVERT_SAT(CONVERT_DOWN_RTE(quantized_f32, int), uchar);
}
1542
/** Dequantize a scalar value from 8-bit asymmetric to floating-point.
 *
 * @param[in] input  Input value to dequantize
 * @param[in] offset Quantization offset
 * @param[in] scale  Quantization scale
 *
 * @return dequantized value
 */
inline float dequantize_qasymm8(uchar input, float offset, float scale)
{
    const float in_f32 = (float)input;
    return (in_f32 - offset) * scale;
}
1555
/** Dequantize a scalar value from signed 8-bit asymmetric to floating-point.
 *
 * @param[in] input  Input value to dequantize
 * @param[in] offset Quantization offset
 * @param[in] scale  Quantization scale
 *
 * @return dequantized value
 */
inline float dequantize_qasymm8_signed(char input, float offset, float scale)
{
    const float in_f32 = (float)input;
    return (in_f32 - offset) * scale;
}
1568
/** Quantize a vector of values from floating-point.
 *
 * Defines quantize_<type><size>(input, offset, scale): maps the input into
 * the quantized domain (input / scale + offset), rounds to nearest-even via
 * CONVERT_DOWN_RTE and saturates into the destination type via CONVERT_SAT.
 *
 * @param[in] type Output data type.
 * @param[in] size Size of vector.
 *
 * @return quantized values
 */
#define QUANTIZE_IMPL(type, size)                                                                                       \
    inline VEC_DATA_TYPE(type, size) quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \
    {                                                                                                                   \
        VEC_DATA_TYPE(float, size)                                                                                      \
        out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset);                   \
        VEC_DATA_TYPE(type, size)                                                                                       \
        res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), VEC_DATA_TYPE(type, size));              \
        return res;                                                                                                     \
    }
1585
/** Dequantize a vector of values to floating-point.
 *
 * Defines dequantize_<type><size>(input, offset, scale) = (float(input) - offset) * scale.
 *
 * @param[in] type Input data type.
 * @param[in] size Size of vector.
 *
 * @return dequantized values in floating point
 */
#define DEQUANTIZE_IMPL(type, size)                                                                                       \
    inline VEC_DATA_TYPE(float, size) dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \
    {                                                                                                                     \
        return (CONVERT(input, VEC_DATA_TYPE(type, size)) - offset) * scale;                                             \
    }
1598
/** Correctly-rounded-to-nearest division by a power-of-two.
 *
 * Defines asymm_rounding_divide_by_POW2_<size>(x, exponent): computes
 * x >> exponent and adds one when the discarded low bits (x & mask) exceed
 * the rounding threshold; the threshold is raised by one for negative x so
 * rounding stays symmetric around zero.
 *
 * @param[in] size Size of vector.
 *
 * @return Correctly-rounded-to-nearest division by a power-of-two.
 */
#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size)                                                                                        \
    inline VEC_DATA_TYPE(int, size) asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \
    {                                                                                                                                   \
        const VEC_DATA_TYPE(int, size)                                                                                                  \
        zero = (VEC_DATA_TYPE(int, size))0;                                                                                         \
        const VEC_DATA_TYPE(int, size)                                                                                                  \
        one = (VEC_DATA_TYPE(int, size))1;                                                                                          \
        VEC_DATA_TYPE(int, size)                                                                                                        \
        mask = (one << exponent) - one;                                                                                                 \
        VEC_DATA_TYPE(int, size)                                                                                                        \
        threshold = (mask >> 1) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))(x < 0));                                          \
        return (x >> exponent) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))((x & mask) > threshold));                          \
    }
1618
/** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1),
 * rounding to the nearest value, and saturating -1 * -1 to the maximum value.
 *
 * Defines asymm_mult<size>(a, b): the 64-bit product a*b is nudged towards
 * the nearest representable value (by +2^30 when non-negative, 1-2^30
 * otherwise) before dividing by 2^31; the single overflowing combination
 * a == b == INT_MIN is clamped to INT_MAX.
 *
 * @param[in] size Size of vector.
 *
 * @return Product of two fixed-point numbers.
 */
#define ASYMM_MULT_IMPL(size)                                                                                \
    inline VEC_DATA_TYPE(int, size) asymm_mult##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
    {                                                                                                        \
        VEC_DATA_TYPE(int, size)                                                                             \
        overflow = a == b && a == INT_MIN;                                                                   \
        VEC_DATA_TYPE(long, size)                                                                            \
        a_64 = convert_long##size(a);                                                                        \
        VEC_DATA_TYPE(long, size)                                                                            \
        b_64 = convert_long##size(b);                                                                        \
        VEC_DATA_TYPE(long, size)                                                                            \
        ab_64 = a_64 * b_64;                                                                                 \
        /* Revert COMPMID-907 */                                                                             \
        VEC_DATA_TYPE(long, size)                                                                            \
        mask1 = 1 << 30;                                                                                     \
        VEC_DATA_TYPE(long, size)                                                                            \
        mask2 = 1 - (1 << 30);                                                                               \
        VEC_DATA_TYPE(long, size)                                                                            \
        is_positive_or_zero = ab_64 >= 0;                                                                    \
        VEC_DATA_TYPE(long, size)                                                                            \
        nudge = select(mask2, mask1, (SELECT_VEC_DATA_TYPE(long, size))(is_positive_or_zero));               \
        VEC_DATA_TYPE(long, size)                                                                            \
        mask = 1ll << 31;                                                                                    \
        VEC_DATA_TYPE(int, size)                                                                             \
        ab_x2_high32 = convert_int##size((ab_64 + nudge) / mask);                                            \
        return select(ab_x2_high32, INT_MAX, (SELECT_VEC_DATA_TYPE(int, size))(overflow));                   \
    }
1652
/** Calculates \f$ exp(x) \f$ for x in [-1/4, 0).
 *
 * Defines asymm_exp_on_interval_between_negative_one_quarter_and_0_excl<size>(a):
 * evaluates a degree-4 Taylor-style polynomial in Q0.31 fixed point around the
 * interval midpoint (a is first re-centred by adding 2^(31-3) = 1/8 in Q0.31).
 * constant_term is exp(-1/8) and constant_1_over_3 is 1/3, both in Q0.31.
 *
 * @param[in] size Size of vector.
 *
 * @return Result in fixed-point format Q0.
 */
#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size)                                                    \
    inline VEC_DATA_TYPE(int, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) a) \
    {                                                                                                                               \
        const VEC_DATA_TYPE(int, size) constant_term     = 1895147668;                                                              \
        const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883;                                                               \
        const int k_fractional_bits = 31;                                                                                           \
        VEC_DATA_TYPE(int, size)                                                                                                    \
        x = a + (1 << (k_fractional_bits - 3));                                                                                     \
        VEC_DATA_TYPE(int, size)                                                                                                    \
        x2 = ASYMM_MULT(x, x, size);                                                                                                \
        VEC_DATA_TYPE(int, size)                                                                                                    \
        x3 = ASYMM_MULT(x2, x, size);                                                                                               \
        VEC_DATA_TYPE(int, size)                                                                                                    \
        x4 = ASYMM_MULT(x2, x2, size);                                                                                              \
        VEC_DATA_TYPE(int, size)                                                                                                    \
        x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size);                                                                     \
        VEC_DATA_TYPE(int, size)                                                                                                    \
        x4_over_24_plus_x3_over_6_plus_x2 = ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2;                             \
        VEC_DATA_TYPE(int, size)                                                                                                    \
        x4_over_24_plus_x3_over_6_plus_x2_over_2 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size);       \
        return constant_term + ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size);                       \
    }
1681
/** Each bit of the result is set to the corresponding bit of either then_val or
 * else_val depending on whether the corresponding bit of if_mask is set.
 * Equivalent to the VBSL instruction in ARM NEON.
 *
 * @param[in] size Size of vector.
 *
 * @returns Result containing bits from @p then_val or from @p else_val depending on corresponding bit in @p if_mask is set or not.
 */
#define ASYMM_SELECT_USING_MASK_IMPL(size)                                                                                                                                \
    inline VEC_DATA_TYPE(int, size) asymm_select_using_mask##size(VEC_DATA_TYPE(int, size) if_mask, VEC_DATA_TYPE(int, size) then_val, VEC_DATA_TYPE(int, size) else_val) \
    {                                                                                                                                                                     \
        return (if_mask & then_val) ^ (~if_mask & else_val);                                                                                                              \
    }
1695
/** For each element of input vector, the corresponding bits of the result item are set
 * if the input item is zero (all-ones where a == 0, all-zeros elsewhere —
 * suitable as an if_mask for ASYMM_SELECT_USING_MASK).
 *
 * @param[in] size Size of vector.
 *
 * @returns Output vector with bits set when corresponding bit in @p a is zero.
 */
#define ASYMM_MASK_IF_ZERO_IMPL(size)                                                    \
    inline VEC_DATA_TYPE(int, size) asymm_mask_if_zero##size(VEC_DATA_TYPE(int, size) a) \
    {                                                                                    \
        const VEC_DATA_TYPE(int, size) all_zeros = 0;                                    \
        const VEC_DATA_TYPE(int, size) all_ones  = ~0;                                   \
        return select(all_zeros, all_ones, (SELECT_VEC_DATA_TYPE(int, size))(a == 0));   \
    }
1710
/** For each element of input vector, the corresponding bits of the result item are set
 * if the input item is non-zero (all-ones where a != 0, all-zeros elsewhere —
 * the complement of ASYMM_MASK_IF_ZERO).
 *
 * @param[in] size Size of vector.
 *
 * @returns Output vector with bits set when corresponding bit in @p a is non zero.
 */
#define ASYMM_MASK_IF_NON_ZERO_IMPL(size)                                                    \
    inline VEC_DATA_TYPE(int, size) asymm_mask_if_non_zero##size(VEC_DATA_TYPE(int, size) a) \
    {                                                                                        \
        const VEC_DATA_TYPE(int, size) all_zeros = 0;                                        \
        const VEC_DATA_TYPE(int, size) all_ones  = ~0;                                       \
        return select(all_zeros, all_ones, (SELECT_VEC_DATA_TYPE(int, size))(a != 0));       \
    }
1725
/** One step of the barrel-shifter exponentiation used by asymm_exp_on_negative_values:
 * in lanes where bit (k_fractional_bits + exponent) of @p remainder is set, multiply
 * @p result by the fixed-point constant @p fp_multiplier; other lanes pass through.
 * When exponent >= k_integer_bits that bit cannot be part of the input range and the
 * step is a no-op.
 *
 * @param[in] size Size of vector.
 *
 * @returns Vector with the selected lanes scaled by @p fp_multiplier.
 */
#define EXP_BARREL_SHIFTER_IMPL(size)                                                                                                                                                                         \
    inline VEC_DATA_TYPE(int, size) exp_barrel_shifter##size(VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \
    {                                                                                                                                                                                                         \
        if(k_integer_bits > exponent)                                                                                                                                                                         \
        {                                                                                                                                                                                                     \
            /* The branch above guarantees k_integer_bits > exponent, so the shift amount needs no further guard. */                                                                                          \
            const int k_shift_amount = k_fractional_bits + exponent;                                                                                                                                          \
            return ASYMM_SELECT_USING_MASK(                                                                                                                                                                   \
                    ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size),                                                                                                                          \
                    ASYMM_MULT(result, fp_multiplier, size), result, size);                                                                                                                                   \
        }                                                                                                                                                                                                     \
        \
        return result;                                                                                                                                                                                        \
    }
1739
/** Calculates \f$ exp(x) \f$ for x < 0, in fixed point.
 *
 * Decomposes @p a into (a mod 1/4) - 1/4 plus a remainder, evaluates exp on the
 * fractional interval via ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL,
 * then folds in one precomputed fixed-point factor per set remainder bit using
 * EXP_BARREL_SHIFTER (the integer constants are presumably exp(-2^exponent) in Q0 —
 * from the gemmlowp fixed-point reference; confirm against that source).
 * Inputs below -2^5 saturate to 0 (when representable, i.e. k_integer_bits > 5),
 * and a == 0 maps to Q0 one (INT_MAX).
 *
 * @param[in] size Size of vector.
 *
 * @return Result in fixed-point format Q0.
 */
#define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size)                                                                               \
    inline VEC_DATA_TYPE(int, size) asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits)        \
    {                                                                                                                         \
        const int k_fractional_bits = 31 - k_integer_bits;                                                                    \
        VEC_DATA_TYPE(int, size)                                                                                              \
        k_one_quarter = 1 << (k_fractional_bits - 2);                                                                         \
        VEC_DATA_TYPE(int, size)                                                                                              \
        mask = k_one_quarter - 1;                                                                                             \
        VEC_DATA_TYPE(int, size)                                                                                              \
        a_mod_quarter_minus_one_quarter = (a & mask) - k_one_quarter;                                                         \
        VEC_DATA_TYPE(int, size)                                                                                              \
        a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits;                           \
        VEC_DATA_TYPE(int, size)                                                                                              \
        result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a_mod_quarter_minus_one_quarter_scaled, size); \
        VEC_DATA_TYPE(int, size)                                                                                              \
        remainder = a_mod_quarter_minus_one_quarter - a;                                                                      \
        \
        result = EXP_BARREL_SHIFTER(result, -2, 1672461947, k_integer_bits, k_fractional_bits, remainder, size);              \
        result = EXP_BARREL_SHIFTER(result, -1, 1302514674, k_integer_bits, k_fractional_bits, remainder, size);              \
        result = EXP_BARREL_SHIFTER(result, +0, 790015084, k_integer_bits, k_fractional_bits, remainder, size);               \
        result = EXP_BARREL_SHIFTER(result, +1, 290630308, k_integer_bits, k_fractional_bits, remainder, size);               \
        result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, remainder, size);                \
        result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, size);                  \
        result = EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size);                     \
        \
        if(k_integer_bits > 5)                                                                                                \
        {                                                                                                                     \
            const VEC_DATA_TYPE(int, size) clamp = -(1 << (k_fractional_bits + 5));                                           \
            result = ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(a < clamp, size), 0, result, size);                       \
        }                                                                                                                     \
        \
        const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX;                                                                      \
        return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_ZERO(a, size), Q0_one, result, size);                                    \
    }
1780
/** Calculates the product of an integer value by a power of two, with either a positive
 * exponent (equivalent to an arithmetic left shift, saturating to INT_MIN/INT_MAX) or a
 * negative exponent (equivalent to an arithmetic right shift, rounding to nearest via
 * ASYMM_ROUNDING_DIVIDE_BY_POW2).
 *
 * NOTE(review): for exponent == 0 the threshold computation evaluates (1 << 31) - 1;
 * signed left-shift overflow is implementation-territory in C but this mirrors the
 * upstream gemmlowp formulation — confirm OpenCL C shift semantics if porting.
 *
 * @param[in] size Size of vector.
 *
 * @return Arithmetic left or right shift.
 */
#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size)                                                                  \
    inline VEC_DATA_TYPE(int, size) asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \
    {                                                                                                                      \
        if(exponent < 0)                                                                                                   \
        {                                                                                                                  \
            return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size);                                                      \
        }                                                                                                                  \
        \
        const VEC_DATA_TYPE(int, size) min = INT_MIN;                                                                      \
        const VEC_DATA_TYPE(int, size) max = INT_MAX;                                                                      \
        int threshold = ((1 << (31 - exponent)) - 1);                                                                      \
        VEC_DATA_TYPE(int, size)                                                                                           \
        positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size);                                                       \
        VEC_DATA_TYPE(int, size)                                                                                           \
        negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size);                                                      \
        VEC_DATA_TYPE(int, size)                                                                                           \
        result = x << exponent;                                                                                            \
        result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size);                                                \
        result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size);                                                \
        return result;                                                                                                     \
    }
1810
/** Calculates (a+b)/2, rounded to the nearest integer (away from zero on ties).
 * Equivalent to VRHADD in the Arm NEON instruction set.
 *
 * The operands are widened to 64-bit before summing so the intermediate a+b
 * cannot overflow, and the sign of the sum is added before halving to round.
 *
 * @param[in] size Size of vector.
 *
 * @return (a+b)/2, rounded to the nearest integer.
 */
#define ASYMM_ROUNDING_HALF_SUM_IMPL(size)                                                                                \
    inline VEC_DATA_TYPE(int, size) asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
    {                                                                                                                     \
        VEC_DATA_TYPE(long, size)                                                                                         \
        a64 = convert_long##size(a);                                                                                      \
        VEC_DATA_TYPE(long, size)                                                                                         \
        b64 = convert_long##size(b);                                                                                      \
        VEC_DATA_TYPE(long, size)                                                                                         \
        sum = a64 + b64;                                                                                                  \
        const VEC_DATA_TYPE(long, size) one       = 1;                                                                    \
        const VEC_DATA_TYPE(long, size) minus_one = -1;                                                                   \
        VEC_DATA_TYPE(long, size)                                                                                         \
        sign = select(minus_one, one, (SELECT_VEC_DATA_TYPE(long, size))(sum >= 0));                                      \
        return convert_int##size((sum + sign) / 2);                                                                       \
    }
1833
/** Calculates \f$ 1 / (1 + x) \f$ for x in (0, 1), in fixed point.
 *
 * Computes half_denominator = (a + 1)/2 (rounded), seeds x with the affine
 * estimate 48/17 - 32/17 * half_denominator (the Q2 constants 1515870810 and
 * -1010580540), then runs three Newton-Raphson refinement steps
 * x <- x + x*(1 - half_denominator*x) * 4 before doubling the result back
 * from the halved denominator.
 *
 * @param[in] size Size of vector.
 *
 * @return Result in fixed-point format Q0.
 */
#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(size)                                                    \
    inline VEC_DATA_TYPE(int, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(VEC_DATA_TYPE(int, size) a) \
    {                                                                                                        \
        const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX;                                                     \
        const VEC_DATA_TYPE(int, size) Q2_one = 1 << (31 - 2);                                               \
        VEC_DATA_TYPE(int, size)                                                                             \
        half_denominator = ASYMM_ROUNDING_HALF_SUM(a, Q0_one, size);                                         \
        const VEC_DATA_TYPE(int, size) Q2_48_over_17     = 1515870810;                                       \
        const VEC_DATA_TYPE(int, size) Q2_neg_32_over_17 = -1010580540;                                      \
        VEC_DATA_TYPE(int, size)                                                                             \
        x = Q2_48_over_17 + ASYMM_MULT(half_denominator, Q2_neg_32_over_17, size);                           \
        for(int i = 0; i < 3; i++)                                                                           \
        {                                                                                                    \
            VEC_DATA_TYPE(int, size)                                                                         \
            half_denominator_times_x = ASYMM_MULT(half_denominator, x, size);                                \
            VEC_DATA_TYPE(int, size)                                                                         \
            one_minus_half_denominator_times_x = Q2_one - half_denominator_times_x;                          \
            VEC_DATA_TYPE(int, size)                                                                         \
            tmp = ASYMM_MULT(x, one_minus_half_denominator_times_x, size);                                   \
            x   = x + ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(tmp, 2, size);                                  \
        }                                                                                                    \
        return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, 1, size);                                           \
    }
1863
/** Considering the integer value as fixed-point, change the number of integer bits
 * and update the value accordingly.
 *
 * A positive exponent (more source than destination integer bits) saturating-left-shifts;
 * a negative one rounding-right-shifts, via ASYMM_SATURATING_ROUNDING_MULT_BY_POW2.
 *
 * @param[in] size Size of vector.
 *
 * @return Rescaled value.
 */
#define ASYMM_RESCALE_IMPL(size)                                                                                                    \
    inline VEC_DATA_TYPE(int, size) asymm_rescale##size(VEC_DATA_TYPE(int, size) value, int src_integer_bits, int dst_integer_bits) \
    {                                                                                                                               \
        int exponent = src_integer_bits - dst_integer_bits;                                                                         \
        return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size);                                                       \
    }
1876
/** Dispatch macros: each ..._STR macro pastes the vector size (and element type,
 * where applicable) onto the function name generated by the matching ..._IMPL
 * macro; the non-_STR wrappers exist to force macro-expansion of their
 * arguments before token pasting. */
#define QUANTIZE_STR(input, offset, scale, type, size) quantize_##type##size(input, offset, scale)
#define QUANTIZE(input, offset, scale, type, size) QUANTIZE_STR(input, offset, scale, type, size)
#define DEQUANTIZE_STR(input, offset, scale, type, size) dequantize_##type##size(input, offset, scale)
#define DEQUANTIZE(input, offset, scale, type, size) DEQUANTIZE_STR(input, offset, scale, type, size)

/* NOTE(review): the mixed-case asymm_rounding_divide_by_POW2_ name must match the
 * function generated by ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL elsewhere in the file. */
#define ASYMM_ROUNDING_DIVIDE_BY_POW2_STR(x, exponent, size) asymm_rounding_divide_by_POW2_##size(x, exponent)
#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) ASYMM_ROUNDING_DIVIDE_BY_POW2_STR(x, exponent, size)
#define ASYMM_MULT_STR(a, b, size) asymm_mult##size(a, b)
#define ASYMM_MULT(a, b, size) ASYMM_MULT_STR(a, b, size)
/* NOTE(review): left_shift is negated before shifting, so callers are expected to
 * pass a non-positive value — confirm against call sites. */
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size) \
    ASYMM_MULT(x *((VEC_DATA_TYPE(int, size))(1) << (-left_shift)), quantized_multiplier, size)
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \
    ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size)
#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a)
#define ASYMM_SELECT_USING_MASK(if_mask, then_val, else_val, size) asymm_select_using_mask##size(if_mask, then_val, else_val)
#define ASYMM_MASK_IF_ZERO(a, size) asymm_mask_if_zero##size(a)
#define ASYMM_MASK_IF_NON_ZERO(a, size) asymm_mask_if_non_zero##size(a)
#define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder, size) exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder)
#define ASYMM_EXP_ON_NEGATIVE_VALUES_STR(a, k_integer_bits, size) asymm_exp_on_negative_values##size(a, k_integer_bits)
#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) ASYMM_EXP_ON_NEGATIVE_VALUES_STR(a, k_integer_bits, size)
#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(a)
#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size)
#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) asymm_saturating_rounding_mult_by_pow2##size(x, exponent)
#define ASYMM_ROUNDING_HALF_SUM(a, b, size) asymm_rounding_half_sum##size(a, b)
#define ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size) asymm_rescale##size(value, src_integer_bits, dst_integer_bits)
#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size)
1903
/** Multiplies @p input by the fixed-point multiplier @p qmul, left-shifting first
 * by @p shift when it is positive, or rounding-right-shifting the product by
 * -@p shift when it is negative or zero.
 *
 * @param[in] size Size of vector.
 *
 * @return The rescaled product.
 */
#define MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size)                                                                             \
    inline VEC_DATA_TYPE(int, size) multiply_by_quantized_multiplier##size(VEC_DATA_TYPE(int, size) input, int qmul, int shift) \
    {                                                                                                                           \
        const int left_shift  = shift > 0 ? shift : 0;                                                                          \
        const int right_shift = shift > 0 ? 0 : -shift;                                                                         \
        return ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(input * (1 << left_shift), qmul, size), right_shift, size);             \
    }
/** Dispatch macro resolving to the size-specialised implementation above. */
#define MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, size) multiply_by_quantized_multiplier##size(input, qmul, shift)
1912
/* Instantiate the quantize helpers for the (element type, vector width)
 * combinations used by the kernels. NOTE(review): not every combination is
 * generated (e.g. no char/4) — presumably only those required by callers. */
QUANTIZE_IMPL(uchar, 1)
QUANTIZE_IMPL(char, 1)
QUANTIZE_IMPL(uint, 1)
QUANTIZE_IMPL(int, 1)
QUANTIZE_IMPL(uchar, 4)
QUANTIZE_IMPL(ushort, 4)
QUANTIZE_IMPL(short, 4)
QUANTIZE_IMPL(uchar, 16)
QUANTIZE_IMPL(char, 16)
QUANTIZE_IMPL(ushort, 16)
QUANTIZE_IMPL(short, 16)
QUANTIZE_IMPL(uint, 16)
QUANTIZE_IMPL(int, 16)

/* Matching dequantize instantiations. */
DEQUANTIZE_IMPL(uchar, 1)
DEQUANTIZE_IMPL(char, 1)
DEQUANTIZE_IMPL(uint, 1)
DEQUANTIZE_IMPL(int, 1)
DEQUANTIZE_IMPL(uchar, 4)
DEQUANTIZE_IMPL(ushort, 4)
DEQUANTIZE_IMPL(short, 4)
DEQUANTIZE_IMPL(uchar, 16)
DEQUANTIZE_IMPL(char, 16)
DEQUANTIZE_IMPL(ushort, 16)
DEQUANTIZE_IMPL(short, 16)
DEQUANTIZE_IMPL(uint, 16)
DEQUANTIZE_IMPL(int, 16)

/* Fixed-point (asymm) helper instantiations for vector widths 1, 2, 3, 4, 8 and 16. */
ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(1)
ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(2)
ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(3)
ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(4)
ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(8)
ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(16)

ASYMM_MULT_IMPL(1)
ASYMM_MULT_IMPL(2)
ASYMM_MULT_IMPL(3)
ASYMM_MULT_IMPL(4)
ASYMM_MULT_IMPL(8)
ASYMM_MULT_IMPL(16)

ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(1)
ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(2)
ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(3)
ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(4)
ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(8)
ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(16)

ASYMM_SELECT_USING_MASK_IMPL(1)
ASYMM_SELECT_USING_MASK_IMPL(2)
ASYMM_SELECT_USING_MASK_IMPL(3)
ASYMM_SELECT_USING_MASK_IMPL(4)
ASYMM_SELECT_USING_MASK_IMPL(8)
ASYMM_SELECT_USING_MASK_IMPL(16)

ASYMM_MASK_IF_ZERO_IMPL(1)
ASYMM_MASK_IF_ZERO_IMPL(2)
ASYMM_MASK_IF_ZERO_IMPL(3)
ASYMM_MASK_IF_ZERO_IMPL(4)
ASYMM_MASK_IF_ZERO_IMPL(8)
ASYMM_MASK_IF_ZERO_IMPL(16)

ASYMM_MASK_IF_NON_ZERO_IMPL(1)
ASYMM_MASK_IF_NON_ZERO_IMPL(2)
ASYMM_MASK_IF_NON_ZERO_IMPL(3)
ASYMM_MASK_IF_NON_ZERO_IMPL(4)
ASYMM_MASK_IF_NON_ZERO_IMPL(8)
ASYMM_MASK_IF_NON_ZERO_IMPL(16)

EXP_BARREL_SHIFTER_IMPL(1)
EXP_BARREL_SHIFTER_IMPL(2)
EXP_BARREL_SHIFTER_IMPL(3)
EXP_BARREL_SHIFTER_IMPL(4)
EXP_BARREL_SHIFTER_IMPL(8)
EXP_BARREL_SHIFTER_IMPL(16)

ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(1)
ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(2)
ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(3)
ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(4)
ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(8)
ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(16)

ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(1)
ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(2)
ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(3)
ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(4)
ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(8)
ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(16)

ASYMM_ROUNDING_HALF_SUM_IMPL(1)
ASYMM_ROUNDING_HALF_SUM_IMPL(2)
ASYMM_ROUNDING_HALF_SUM_IMPL(3)
ASYMM_ROUNDING_HALF_SUM_IMPL(4)
ASYMM_ROUNDING_HALF_SUM_IMPL(8)
ASYMM_ROUNDING_HALF_SUM_IMPL(16)

ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(1)
ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(2)
ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(3)
ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(4)
ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(8)
ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(16)

ASYMM_RESCALE_IMPL(1)
ASYMM_RESCALE_IMPL(2)
ASYMM_RESCALE_IMPL(3)
ASYMM_RESCALE_IMPL(4)
ASYMM_RESCALE_IMPL(8)
ASYMM_RESCALE_IMPL(16)

MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(1)
MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(2)
MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(3)
MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(4)
MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(8)
MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(16)
2031
2032#endif // ARM_COMPUTE_HELPERS_ASYMM_H
2033/*
2034 * Copyright (c) 2018-2020 Arm Limited.
2035 *
2036 * SPDX-License-Identifier: MIT
2037 *
2038 * Permission is hereby granted, free of charge, to any person obtaining a copy
2039 * of this software and associated documentation files (the "Software"), to
2040 * deal in the Software without restriction, including without limitation the
2041 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
2042 * sell copies of the Software, and to permit persons to whom the Software is
2043 * furnished to do so, subject to the following conditions:
2044 *
2045 * The above copyright notice and this permission notice shall be included in all
2046 * copies or substantial portions of the Software.
2047 *
2048 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
2049 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
2050 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
2051 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
2052 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
2053 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
2054 * SOFTWARE.
2055 */
2056/*
2057 * Copyright (c) 2017-2020 Arm Limited.
2058 *
2059 * SPDX-License-Identifier: MIT
2060 *
2061 * Permission is hereby granted, free of charge, to any person obtaining a copy
2062 * of this software and associated documentation files (the "Software"), to
2063 * deal in the Software without restriction, including without limitation the
2064 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
2065 * sell copies of the Software, and to permit persons to whom the Software is
2066 * furnished to do so, subject to the following conditions:
2067 *
2068 * The above copyright notice and this permission notice shall be included in all
2069 * copies or substantial portions of the Software.
2070 *
2071 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
2072 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
2073 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
2074 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
2075 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
2076 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
2077 * SOFTWARE.
2078 */
2079#ifndef ARM_COMPUTE_HELPERS_ASYMM_H
2080#define ARM_COMPUTE_HELPERS_ASYMM_H
2081
2082/*
2083 * Copyright (c) 2016-2020 Arm Limited.
2084 *
2085 * SPDX-License-Identifier: MIT
2086 *
2087 * Permission is hereby granted, free of charge, to any person obtaining a copy
2088 * of this software and associated documentation files (the "Software"), to
2089 * deal in the Software without restriction, including without limitation the
2090 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
2091 * sell copies of the Software, and to permit persons to whom the Software is
2092 * furnished to do so, subject to the following conditions:
2093 *
2094 * The above copyright notice and this permission notice shall be included in all
2095 * copies or substantial portions of the Software.
2096 *
2097 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
2098 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
2099 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
2100 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
2101 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
2102 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
2103 * SOFTWARE.
2104 */
2105#ifndef ARM_COMPUTE_HELPER_H
2106#define ARM_COMPUTE_HELPER_H
2107
2108/*
2109 * Copyright (c) 2020 Arm Limited.
2110 *
2111 * SPDX-License-Identifier: MIT
2112 *
2113 * Permission is hereby granted, free of charge, to any person obtaining a copy
2114 * of this software and associated documentation files (the "Software"), to
2115 * deal in the Software without restriction, including without limitation the
2116 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
2117 * sell copies of the Software, and to permit persons to whom the Software is
2118 * furnished to do so, subject to the following conditions:
2119 *
2120 * The above copyright notice and this permission notice shall be included in all
2121 * copies or substantial portions of the Software.
2122 *
2123 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
2124 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
2125 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
2126 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
2127 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
2128 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
2129 * SOFTWARE.
2130 */
2131
2132/** Store the 0 to (n-1)th rows of the given variables
2133 * @name STORE_ROW_n
2134 *
2135 * @param[in] N0        The width of the passed in vector. Supported: 1, 2, 3, 4, 8, 16
2136 * @param[in] DATA_TYPE The data type of the vectors
2137 * @param[in] BASENAME  The basename of the variables
2138 * @param[in] PTR       The base pointer
2139 * @param[in] STRIDE_Y  The stride value in y-axis direction
2140 * @param[in] Z         The offset in z-axis direction
2141 * @{
2142 */
/* Each STORE_ROW_n expands to STORE_ROW_(n-1) followed by a vstore of row n-1,
 * so STORE_ROW_16 emits sixteen stores. Row suffixes above 9 use the hex
 * digits A..F (rows 10..15), matching the BASENAME##<digit> variable names. */
#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)                                                 \
    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
    VSTORE(N0)                                                  \
    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
/** @} */ // end of group STORE_ROW_n
2222
2223/** Convert and store the 0th to (n-1)th rows of the given variables
2224 * @name CONVERT_STORE_ROW_n
2225 *
2226 * @param[in] N0        The size of the vectors
2227 * @param[in] DATA_TYPE The data type of the vectors
2228 * @param[in] BASENAME  The basename of the variables
2229 * @param[in] PTR       The base pointer
2230 * @param[in] STRIDE_Y  The stride value in y-axis direction
2231 * @param[in] Z         The offset in z-axis direction
2232 * @{
2233 */
// CONVERT_STORE_ROW_n: same recursive structure as STORE_ROW_n, but each row
// is saturating-converted to a DATA_TYPE vector of width N0 before the store.
#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
2277
// Convert-and-store rows 0..9.
// Fix: the second parameter was misnamed DATA (and left unused) while the
// body referenced DATA_TYPE, so the macro silently depended on an external
// DATA_TYPE definition instead of its own argument. Renamed to DATA_TYPE to
// match every sibling CONVERT_STORE_ROW_n macro; expansion for callers that
// pass a type name is unchanged.
#define CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
2282
// CONVERT_STORE_ROW_n (continued): rows 10..15 use the hexadecimal suffixes
// A..F on BASENAME and Z, matching the STORE_ROW_n naming scheme.
#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
2312
/** @} */ // end of group CONVERT_STORE_ROW_n
2314
2315/** Store a block of the given size M0xN0
2316 * @name STORE_BLOCK
2317 *
2318 * Supported cases are M0=1,2,3,...,16 and N0=2,3,4,8,16.
2319 * The data to store is expected to have consecutive names for each row.
2320 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
2321 * The Z offset is expected to have consecutive names.
2322 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
2323 *
2324 * @param[in] M0        The number of rows to store
2325 * @param[in] N0        The size of each vector
2326 * @param[in] DATA_TYPE The data type of the vectors
2327 * @param[in] BASENAME  The basename of the variables
2328 * @param[in] PTR       The base pointer
2329 * @param[in] STRIDE_Y  The stride value in y-axis direction
2330 * @param[in] Z         The offset in z-axis direction
2331 * @{
2332 */
// Two-level indirection so that M0 is fully macro-expanded before being
// token-pasted onto STORE_ROW_ (pasting happens before argument expansion).
#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
2335/** @} */ // end of group STORE_BLOCK
2336
2337/** Convert and store a block of the given size M0xN0
2338 * @name CONVERT_STORE_BLOCK
2339 *
2340 * Supported cases are M0=1,2,3,...,16 and N0=2,3,4,8,16.
2341 * The data to store is expected to have consecutive names for each row.
2342 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
2343 * The Z offset is expected to have consecutive names.
2344 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
2345 *
2346 * @param[in] M0        The number of rows to store
2347 * @param[in] N0        The size of each vector
2348 * @param[in] DATA_TYPE The data type of the vectors
2349 * @param[in] BASENAME  The basename of the variables
2350 * @param[in] PTR       The base pointer
2351 * @param[in] STRIDE_Y  The stride value in y-axis direction
2352 * @param[in] Z         The offset in z-axis direction
2353 * @{
2354 */
// Two-level indirection so that M0 is fully macro-expanded before being
// token-pasted onto CONVERT_STORE_ROW_ (pasting happens before expansion).
#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
2357/** @} */ // end of group CONVERT_STORE_BLOCK
2358
2359/** Partially store the 0 to (n-1)th rows of the given variables
2360 * @name STORE_ROW_PARTIAL_n
2361 * Within each row, store the lower @p STORE_N0 elements of vectors of width @p N0
2362 *
2363 * @note in case @p STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
2364 *
2365 * @param[in] N0        The width of the passed in vector. Supported: 1, 2, 3, 4, 8, 16
 * @param[in] STORE_N0  The **lower** size of the vectors to store. Supported: 1-16 and <= @p N0
2367 * @param[in] DATA_TYPE The data type of the vectors
2368 * @param[in] BASENAME  The basename of the variables
2369 * @param[in] PTR       The base pointer
2370 * @param[in] STRIDE_Y  The stride value in y-axis direction
2371 * @param[in] Z         The offset in z-axis direction
2372 * @{
2373 */
// STORE_ROW_PARTIAL_n: like STORE_ROW_n, but each row writes only the lower
// STORE_N0 elements of an N0-wide vector via VSTORE_PARTIAL. Rows 10..15 use
// the hexadecimal suffixes A..F on BASENAME and Z.
#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
/** @} */ // end of group STORE_ROW_PARTIAL_n
2453
2454/** Partially store a block of the given size STORE_M0xSTORE_N0
2455 * @name STORE_BLOCK_PARTIAL
2456 *
2457 * @note The vector width @p N0 is also required for correct partial storing behaviour.
2458 * @note in case @p STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
2459 *
2460 * The data to store is expected to have consecutive names for each row.
2461 * E.g., for STORE_M0=3 and basename=c, the expected names are c0, c1 and c2.
2462 * The Z offset is expected to have consecutive names.
2463 * E.g., for STORE_M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
2464 *
2465 * @param[in] STORE_M0  The number of rows to store. Supported: 1-16
2466 * @param[in] STORE_N0  The lower number of elements of vectors to store. Supported: 1-16 and <= @p N0
2467 * @param[in] N0        The size of each vector. Supported: 1, 2, 3, 4, 8, 16
2468 * @param[in] DATA_TYPE The data type of the vectors
2469 * @param[in] BASENAME  The basename of the variables
2470 * @param[in] PTR       The base pointer
2471 * @param[in] STRIDE_Y  The stride value in y-axis direction
2472 * @param[in] Z         The offset in z-axis direction
2473 * @{
2474 */
// Two-level indirection so that STORE_M0 is fully macro-expanded before being
// token-pasted onto STORE_ROW_PARTIAL_ (pasting happens before expansion).
#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
2477/** Store a block that can be partial in both x and y dimensions
2478 *
2479 * @note in cases @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
2480 *
2481 * The data to store is expected to have consecutive names for each row.
2482 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
2483 * The Z offset is expected to have consecutive names.
2484 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
2485 *
2486 * @param[in] M0               The number of rows to store, for non-partial blocks. Supported: 1-16
2487 * @param[in] N0               The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
2488 * @param[in] DATA_TYPE        The data type of the vectors
2489 * @param[in] BASENAME         The basename of the variables
2490 * @param[in] PTR              The base pointer
2491 * @param[in] STRIDE_Y         The stride value in y-axis direction
2492 * @param[in] Z                The offset in z-axis direction
2493 * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported range: [1, @p M0)
2494 * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported range: [1, @p N0)
2495 * @param[in] PARTIAL_COND_Y   Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0.
2496 * @param[in] PARTIAL_COND_X   Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0.
2497 */
// Runtime dispatch on the two partial-store conditions: full M0xN0 block,
// partial in y only, partial in x only, or partial in both dimensions.
#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y))                                                                                                            \
    {                                                                                                                                                     \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                                           \
    }                                                                                                                                                     \
    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X))                                                                                                        \
    {                                                                                                                                                     \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
    }                                                                                                                                                     \
    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X))                                                                                                        \
    {                                                                                                                                                     \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
    }                                                                                                                                                     \
    else                                                                                                                                                  \
    {                                                                                                                                                     \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                               \
    }
2515/** Store a block that can only be partial in x but not y.
2516 *
2517 * @note in case @p N0 or @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
2518 *
2519 * The data to store is expected to have consecutive names for each row.
2520 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
2521 * The Z offset is expected to have consecutive names.
2522 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
2523 *
2524 * @param[in] M0               The number of rows to store, for non-partial blocks. Supported: 1-16
2525 * @param[in] N0               The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
2526 * @param[in] DATA_TYPE        The data type of the vectors
2527 * @param[in] BASENAME         The basename of the variables
2528 * @param[in] PTR              The base pointer
2529 * @param[in] STRIDE_Y         The stride value in y-axis direction
2530 * @param[in] Z                The offset in z-axis direction
2531 * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported range: [1, @p N0)
2532 * @param[in] PARTIAL_COND_X   Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0.
2533 */
// Runtime dispatch when only the x dimension can be partial: store the full
// N0-wide block unless PARTIAL_COND_X selects the PARTIAL_STORE_N0-wide one.
#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X))                                                                                         \
    {                                                                                                             \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
    }                                                                                                             \
    else                                                                                                          \
    {                                                                                                             \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
    }
2543/** Store a block that can only be partial in y but not x.
2544 *
2545 * @note in case @p N0 or @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
2546 *
2547 * The data to store is expected to have consecutive names for each row.
2548 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
2549 * The Z offset is expected to have consecutive names.
2550 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
2551 *
2552 * @param[in] M0               The number of rows to store, for non-partial blocks. Supported: 1-16
2553 * @param[in] N0               The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
2554 * @param[in] DATA_TYPE        The data type of the vectors
2555 * @param[in] BASENAME         The basename of the variables
2556 * @param[in] PTR              The base pointer
2557 * @param[in] STRIDE_Y         The stride value in y-axis direction
2558 * @param[in] Z                The offset in z-axis direction
2559 * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported range: [1, @p M0)
2560 * @param[in] PARTIAL_COND_Y   Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0.
2561 */
// Runtime dispatch when only the y dimension can be partial: store M0 rows
// unless PARTIAL_COND_Y selects the PARTIAL_STORE_M0-row block.
#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
    if(!(PARTIAL_COND_Y))                                                                                         \
    {                                                                                                             \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
    }                                                                                                             \
    else                                                                                                          \
    {                                                                                                             \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
    }
2571/** @} */ // end of group STORE_BLOCK_PARTIAL
2572
2573#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
2574
2575/** Boundary-aware GEMM block store
2576 * @name STORE_BLOCK_BOUNDARY_AWARE
2577 * This macro assumes the following schemes to achieve boundary-awareness:
2578 *  - Overlapping load in Y axis from lhs tensor. This implies lhs has no padding along y dim.
 *  - Non-Overlapping(normal) load from rhs tensor. This implies rhs can have paddings.
 *  - Overlapping load in Y axis from bias tensor. This implies bias has no padding along y dim.
2581 * The macro then ensures that the dst tensor can be stored without any paddings in both x and y dim.
2582 *
2583 * In the y dimension, we place the partial blocks **at the beginning** while in the x dimension, we place the partial
2584 * blocks **at the end**.
2585 * Say, the dst tensor is of shape MxN and we have M0 and N0 as the block size, this is how we define "partial blocks"/
2586 * "boundary block" (we use the 2 terms "partial blocks" and "boundary blocks" interchangeably) and its various parameters:
2587 *
2588 *  *--x-->                         x == 0                        x == 1
2589 *  |                  |<------------------------------N-------------------------->|
2590 *  y                  |<--------------N0------------->|<----PARTIAL_STORE_N0----->|
2591 *  |     -------------#############################################################
2592 *  *     |          | |...............................|...........................|
2593 * y == 0 | PAR_..._M0 |......Boundary block in y......|.Boundary block in x and y.|
2594 *        |          | |...............................|...........................|
2595 *        M          --#############################################################
2596 *        |          | |                               |...........................|
2597 * y == 1 |         M0 |      Non-boundary block       |....Boundary block in x....|
2598 *        |          | |                               |...........................|
2599 *        |------------#############################################################
2600 *
2601 * Then @p PARTIAL_STORE_M0 = M % M0      and @p PARTIAL_STORE_N0 = N % N0
2602 *
2603 * @note in cases @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
2604 *
 * It automatically detects if a given M,N,M0,N0 combination can yield partial blocks in either the X or Y dimension,
 * and selects the corresponding store method so that the boundary detection logic is only added when needed.
2607 *
2608 * The data to store is expected to have consecutive names for each row.
2609 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
2610 * The Z offset is expected to have consecutive names.
2611 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
2612 *
2613 * @param[in] M0               The number of rows to store, for non-partial blocks. Supported: 1-16
2614 * @param[in] N0               The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
2615 * @param[in] DATA_TYPE        The data type of the vectors
2616 * @param[in] BASENAME         The basename of the variables
2617 * @param[in] PTR              The base pointer
2618 * @param[in] STRIDE_Y         The stride value in y-axis direction
2619 * @param[in] Z                The offset in z-axis direction
2620 * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported: [0, @p M0)
2621 * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported: [0, @p N0)
2622 * @param[in] PARTIAL_COND_Y   Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0.
2623 * @param[in] PARTIAL_COND_X   Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0.
2624 * @{
2625 */
// Compile-time selection of the cheapest store: the runtime partial-store
// branches are emitted only for the dimensions that can actually be partial.
#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
// Case1: No partial blocks in either x or y
#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
// Case2: Partial blocks in y
#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)

#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
// Case3: Partial blocks in x
#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)

#else // PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 > 0
// Case4: Partial blocks in both x and y
#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)

#endif // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
2647
2648#endif    // defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
2649/** @} */ // end of group STORE_BLOCK_BOUNDARY_AWARE
2650
#if defined(PARTIAL_STORE_M0)
/** Compute the start m0 row (LHS, BIAS and DST) in a boundary-aware way so as to avoid padding
 * @name COMPUTE_M0_START_ROW
 * If there're any partial blocks in y dimension, they are placed at the beginning of the rows.
 * This shift amount is added to all rows such that the partial block (at the beginning) overlaps with the subsequent
 * blocks in the y dimension to avoid any padding.
 * EG: M0=4, PARTIAL_STORE_M0=1:
 *                  | Non-overlapping | +M0_ROW_SHIFT (Overlapping)
 * block 0 (partial)| start row = 0   | start row = 0
 * block 1 (full)   | start row = 4   | start row = 1
 * block 2 (full)   | start row = 8   | start row = 5
 *
 * @param[in] y                Global id of current block in y.
 * @param[in] M0               The number of rows to store, for non-partial blocks. Supported: 1-16
 * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported: [0, @p M0)
 * @{
 */
// The shift is (M0 - PARTIAL_STORE_M0) % M0: the "% M0" collapses the shift to 0 when
// PARTIAL_STORE_M0 == 0 (no partial block), and max(0, ...) pins block 0 at row 0.
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0))))
#else // defined(PARTIAL_STORE_M0)
// PARTIAL_STORE_M0 undefined: every block is full, use the plain non-overlapping start row.
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(y * M0))
#endif    // defined(PARTIAL_STORE_M0)
/** @} */ // end of group COMPUTE_M0_START_ROW
2675
/** Store a vector that can only be partial in x.
 *
 * @note in case @p vec_size or @p leftover != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
 *
 * The data to store is expected to end in a 0.
 * E.g., for basename=c, the expected name is c0.
 *
 * @param[in] basename  The name of the variable without trailing 0
 * @param[in] data_type The data type of the vector
 * @param[in] ptr       The base pointer
 * @param[in] vec_size  The vector size if cond = false. Supported: 1, 2, 3, 4, 8, 16
 * @param[in] leftover  The vector size if cond = true. Supported range: [1, @p vec_size)
 * @param[in] cond      Condition to select either @p vec_size (false) or @p leftover (true)
 * @{
 */
// Implemented as a 1-row block store that is only ever partial along x.
#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)
/** @} */ // end of group STORE_VECTOR_SELECT
2694
// Enable optional OpenCL extensions only when both the build option requests them and
// the device advertises support (the cl_* feature macro is defined by the compiler).
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)

#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)

#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)

#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
#pragma OPENCL EXTENSION cl_arm_printf : enable
#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)

// GPU architecture identifiers. NOTE(review): presumably compared against a GPU_ARCH
// build option by kernels that specialise per architecture — confirm at call sites.
#define GPU_ARCH_MIDGARD 0x100
#define GPU_ARCH_BIFROST 0x200
2713
/** Concatenate two inputs.
 *
 * @note Token-pasting concatenation: arguments are NOT macro-expanded before pasting.
 *
 * @param[in] a The first input to be concatenated
 * @param[in] b The second input to be concatenated
 *
 * @return The concatenated output
 */
#define CONCAT(a, b) a##b

/** Expand the given vector
 *
 * @note Forces an extra round of macro expansion on its argument.
 *
 * @param[in] x The vector to be expanded
 *
 * @return The expanded output
 */
#define EXPAND(x) x

/** Clamp the given value between an upper and lower bound.
 *
 * @param[in] x       The value to be clamped
 * @param[in] min_val The lower bound
 * @param[in] max_val The upper bound
 *
 * @return The clamped value.
 */
#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
2740
/** REVn reverses the given vector whose size is n.
 * @name REVn
 *
 * @note Defined only for n in {1, 2, 3, 4, 8, 16}; implemented via component swizzles.
 *
 * @param[in] x The vector to be reversed
 *
 * @return The reversed vector
 * @{
 */
#define REV1(x) ((x))
#define REV2(x) ((x).s10)
#define REV3(x) ((x).s210)
#define REV4(x) ((x).s3210)
#define REV8(x) ((x).s76543210)
#define REV16(x) ((x).sFEDCBA9876543210)
/** @} */ // end of group REVn

/** Reverse the given vector.
 * @name REVERSE
 *
 * @param[in] x The vector to be reversed
 * @param[in] s The size of the vector
 *
 * @return The reversed vector
 * @{
 */
// Two-level expansion so that a macro passed as s is expanded before pasting.
#define REVERSE_STR(x, s) REV##s((x))
#define REVERSE(x, s) REVERSE_STR(x, s)
/** @} */ // end of group REVERSE
2769
/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
 * @name ROTs_n
 *
 * @note Defined for s in {1, 2, 3, 4, 8, 16} and 0 <= n < s; implemented via swizzles.
 *
 * @param[in] x The vector to be shifted
 *
 * @return The shifted vector
 * @{
 */
#define ROT1_0(x) ((x))

#define ROT2_0(x) ((x))
#define ROT2_1(x) ((x).s10)

#define ROT3_0(x) ((x))
#define ROT3_1(x) ((x).s201)
#define ROT3_2(x) ((x).s120)

#define ROT4_0(x) ((x))
#define ROT4_1(x) ((x).s3012)
#define ROT4_2(x) ((x).s2301)
#define ROT4_3(x) ((x).s1230)

#define ROT8_0(x) ((x))
#define ROT8_1(x) ((x).s70123456)
#define ROT8_2(x) ((x).s67012345)
#define ROT8_3(x) ((x).s56701234)
#define ROT8_4(x) ((x).s45670123)
#define ROT8_5(x) ((x).s34567012)
#define ROT8_6(x) ((x).s23456701)
#define ROT8_7(x) ((x).s12345670)

#define ROT16_0(x) ((x))
#define ROT16_1(x) ((x).sF0123456789ABCDE)
#define ROT16_2(x) ((x).sEF0123456789ABCD)
#define ROT16_3(x) ((x).sDEF0123456789ABC)
#define ROT16_4(x) ((x).sCDEF0123456789AB)
#define ROT16_5(x) ((x).sBCDEF0123456789A)
#define ROT16_6(x) ((x).sABCDEF0123456789)
#define ROT16_7(x) ((x).s9ABCDEF012345678)
#define ROT16_8(x) ((x).s89ABCDEF01234567)
#define ROT16_9(x) ((x).s789ABCDEF0123456)
#define ROT16_10(x) ((x).s6789ABCDEF012345)
#define ROT16_11(x) ((x).s56789ABCDEF01234)
#define ROT16_12(x) ((x).s456789ABCDEF0123)
#define ROT16_13(x) ((x).s3456789ABCDEF012)
#define ROT16_14(x) ((x).s23456789ABCDEF01)
#define ROT16_15(x) ((x).s123456789ABCDEF0)
/** @} */ // end of group ROTs_n

/** Circular-right-shift (rotate-right) the given vector by the given amount.
 * @name ROTATE
 *
 * @param[in] x The vector to be shifted
 * @param[in] s The size of the vector
 * @param[in] n The amount to be shifted
 *
 * @return The shifted vector
 * @{
 */
// Two-level expansion so that macros passed as s or n are expanded before pasting.
#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
/** @} */ // end of group ROTATE
2832
/** Creates a vector of size n filled with offset values corresponding to the location of each element.
 * @name V_OFFSn
 *
 * @note Defined only for n in {1, 2, 3, 4, 8, 16}.
 *
 * @param[in] dt The data type of the output vector
 *
 * @return The vector filled with offset values (0, 1, ..., n-1)
 * @{
 */
#define V_OFFS1(dt) (dt##1)(0)
#define V_OFFS2(dt) (dt##2)(0, 1)
#define V_OFFS3(dt) (dt##3)(0, 1, 2)
#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3)
#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
/** @} */ // end of group V_OFFSn

/** Create a vector filled with offset values corresponding to the location of each element.
 * @name VEC_OFFS
 *
 * @param[in] dt The data type of the output vector
 * @param[in] s  The size of the output vector
 *
 * @return The vector filled with offset values
 * @{
 */
// Two-level expansion so that a macro passed as s is expanded before pasting.
#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
/** @} */ // end of group VEC_OFFS
2861
// VLOAD(size) expands to the OpenCL vload<size> built-in (two-level expansion so a
// macro passed as size is expanded before pasting).
#define VLOAD_STR(size) vload##size
#define VLOAD(size) VLOAD_STR(size)

// One image pixel carries 4 channel values (read_imagef/read_imageh return a 4-element
// vector), hence pixel unit = vector size / 4.
#define PIXEL_UNIT4 1
#define PIXEL_UNIT8 2
#define PIXEL_UNIT16 4

/** Utility macro to convert a vector size in pixel unit.
 *
 * @name CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT
 *
 * @param[in] vec_size Vector size. Only 4,8 and 16 is supported
 *
 * @return The pixel unit (number of pixels)
 * @{
 */
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)
/** @} */ // end of group CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT
2881
// Read 1, 2 or 4 horizontally-adjacent pixels (4 channels each) from a 2D image and
// pack them into a single float4/float8/float16 vector.
// NOTE(review): the expansions end with ';' — these macros are meant to be used as a
// full statement/initializer tail; confirm call sites before removing the semicolons.
#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
// Half-precision variants, only available when fp16 support is enabled.
#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)

/** Utility macro to read a 2D OpenCL image object.
 *
 * @note Coordinates are not normalized
 *
 * @param[in] data_type Data type
 * @param[in] n0        Number of pixel to read. Only 1,2 and 4 is supported
 * @param[in] img       OpenCL image object
 * @param[in] x_coord   The x coordinate for the top-left pixel
 * @param[in] y_coord   The y coordinate for the top-left pixel
 *
 * @return Pixels from the 2D OpenCL image object
 * @{
 */
#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)

// VSTORE(size) expands to the OpenCL vstore<size> built-in.
#define VSTORE_STR(size) vstore##size
#define VSTORE(size) VSTORE_STR(size)
2910
// Scalar aliases so that VEC_DATA_TYPE(type, 1) (type##1) expands to the plain scalar
// type, letting size-generic code degenerate cleanly to the scalar case.
#define float1 float
#define half1 half
#define char1 char
#define uchar1 uchar
#define short1 short
#define ushort1 ushort
#define int1 int
#define uint1 uint
#define long1 long
#define ulong1 ulong
#define double1 double

// Scalar counterparts of the OpenCL vloadn/vstoren built-ins so VLOAD(1)/VSTORE(1)
// expand to valid code. Arguments and the whole expansion are parenthesised so the
// macros stay well-formed when passed compound expressions (e.g. OFFSET = a + b).
#define vload1(OFFSET, PTR) (*((PTR) + (OFFSET)))
#define vstore1(DATA, OFFSET, PTR) (*((PTR) + (OFFSET)) = (DATA))
2925
/** Extended partial vstore that correctly handles scalar values as well.
 * Store the **lower** 0 to (n-1)th elements of the given vector while minimising the amount of vstore ops
 * @name VSTORE_PARTIAL
 *
 * @note With this macro, the passed data can be both a vector and a scalar
 * @note @p store_size needs to be <= @p size
 * eg 1: Valid
 * VSTORE_PARTIAL(16, 15) ...;
 * eg 2: Invalid
 * VSTORE_PARTIAL(4, 7) ...;
 *
 * @param[in] size       The width of @p DATA. Supported values: 1(scalar), 2, 3, 4, 8, 16
 * @param[in] store_size The number of lower elements to store. Supported values: 1-16, but has to be <= @p size
 * @{
 */
// Two-level expansion resolves to one entry of the vstore_partial_<size>_<store_size>
// dispatch table below.
#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)

// Deliberate no-op with the same call shape as a vstore; used for store_size == 0 and
// for invalid (store_size > size) table entries.
#define NO_STORE(data, offs, ptr) \
    {                             \
    }
2947
// Dispatch tables mapping (size, store_size) to the concrete store routine.
// store_size == 0 and invalid combinations (store_size > size) resolve to NO_STORE.
// Size == 1 (scalar)
#define vstore_partial_1_0 NO_STORE
#define vstore_partial_1_1 vstore1
#define vstore_partial_1_2 NO_STORE
#define vstore_partial_1_3 NO_STORE
#define vstore_partial_1_4 NO_STORE
#define vstore_partial_1_5 NO_STORE
#define vstore_partial_1_6 NO_STORE
#define vstore_partial_1_7 NO_STORE
#define vstore_partial_1_8 NO_STORE
#define vstore_partial_1_9 NO_STORE
#define vstore_partial_1_10 NO_STORE
#define vstore_partial_1_11 NO_STORE
#define vstore_partial_1_12 NO_STORE
#define vstore_partial_1_13 NO_STORE
#define vstore_partial_1_14 NO_STORE
#define vstore_partial_1_15 NO_STORE
#define vstore_partial_1_16 NO_STORE
// Size == 2
#define vstore_partial_2_0 NO_STORE
#define vstore_partial_2_1 vstore_partial_1
#define vstore_partial_2_2 vstore_partial_2
#define vstore_partial_2_3 NO_STORE
#define vstore_partial_2_4 NO_STORE
#define vstore_partial_2_5 NO_STORE
#define vstore_partial_2_6 NO_STORE
#define vstore_partial_2_7 NO_STORE
#define vstore_partial_2_8 NO_STORE
#define vstore_partial_2_9 NO_STORE
#define vstore_partial_2_10 NO_STORE
#define vstore_partial_2_11 NO_STORE
#define vstore_partial_2_12 NO_STORE
#define vstore_partial_2_13 NO_STORE
#define vstore_partial_2_14 NO_STORE
#define vstore_partial_2_15 NO_STORE
#define vstore_partial_2_16 NO_STORE
// Size == 3
#define vstore_partial_3_0 NO_STORE
#define vstore_partial_3_1 vstore_partial_1
#define vstore_partial_3_2 vstore_partial_2
#define vstore_partial_3_3 vstore_partial_3
#define vstore_partial_3_4 NO_STORE
#define vstore_partial_3_5 NO_STORE
#define vstore_partial_3_6 NO_STORE
#define vstore_partial_3_7 NO_STORE
#define vstore_partial_3_8 NO_STORE
#define vstore_partial_3_9 NO_STORE
#define vstore_partial_3_10 NO_STORE
#define vstore_partial_3_11 NO_STORE
#define vstore_partial_3_12 NO_STORE
#define vstore_partial_3_13 NO_STORE
#define vstore_partial_3_14 NO_STORE
#define vstore_partial_3_15 NO_STORE
#define vstore_partial_3_16 NO_STORE
// Size == 4
#define vstore_partial_4_0 NO_STORE
#define vstore_partial_4_1 vstore_partial_1
#define vstore_partial_4_2 vstore_partial_2
#define vstore_partial_4_3 vstore_partial_3
#define vstore_partial_4_4 vstore_partial_4
#define vstore_partial_4_5 NO_STORE
#define vstore_partial_4_6 NO_STORE
#define vstore_partial_4_7 NO_STORE
#define vstore_partial_4_8 NO_STORE
#define vstore_partial_4_9 NO_STORE
#define vstore_partial_4_10 NO_STORE
#define vstore_partial_4_11 NO_STORE
#define vstore_partial_4_12 NO_STORE
#define vstore_partial_4_13 NO_STORE
#define vstore_partial_4_14 NO_STORE
#define vstore_partial_4_15 NO_STORE
#define vstore_partial_4_16 NO_STORE
// Size == 8
#define vstore_partial_8_0 NO_STORE
#define vstore_partial_8_1 vstore_partial_1
#define vstore_partial_8_2 vstore_partial_2
#define vstore_partial_8_3 vstore_partial_3
#define vstore_partial_8_4 vstore_partial_4
#define vstore_partial_8_5 vstore_partial_5
#define vstore_partial_8_6 vstore_partial_6
#define vstore_partial_8_7 vstore_partial_7
#define vstore_partial_8_8 vstore_partial_8
#define vstore_partial_8_9 NO_STORE
#define vstore_partial_8_10 NO_STORE
#define vstore_partial_8_11 NO_STORE
#define vstore_partial_8_12 NO_STORE
#define vstore_partial_8_13 NO_STORE
#define vstore_partial_8_14 NO_STORE
#define vstore_partial_8_15 NO_STORE
#define vstore_partial_8_16 NO_STORE
// Size == 16
#define vstore_partial_16_0 NO_STORE
#define vstore_partial_16_1 vstore_partial_1
#define vstore_partial_16_2 vstore_partial_2
#define vstore_partial_16_3 vstore_partial_3
#define vstore_partial_16_4 vstore_partial_4
#define vstore_partial_16_5 vstore_partial_5
#define vstore_partial_16_6 vstore_partial_6
#define vstore_partial_16_7 vstore_partial_7
#define vstore_partial_16_8 vstore_partial_8
#define vstore_partial_16_9 vstore_partial_9
#define vstore_partial_16_10 vstore_partial_10
#define vstore_partial_16_11 vstore_partial_11
#define vstore_partial_16_12 vstore_partial_12
#define vstore_partial_16_13 vstore_partial_13
#define vstore_partial_16_14 vstore_partial_14
#define vstore_partial_16_15 vstore_partial_15
#define vstore_partial_16_16 vstore_partial_16
3056
/** Partial vstore. Store the **lower** 0 to (n-1)th elements of the given vector while minimising the amount of vstore ops
 * @name vstore_partial_n
 *
 * @note @p DATA needs to be a vector not a scalar
 * @note n needs to be <= the vector width of the input variable @p DATA
 * eg 1: Valid
 * vstore_partial_15(var:float16, 0, 0xabcd);
 * eg 2: Invalid
 * vstore_partial_7(var:float4, 0, 0xabcd);
 *
 * @note in cases n == 1, 2, 3, 4, 8, 16, no extra vstore is invoked, thus there's no performance penalty.
 *
 * Composition strategy: n in {1,2,3,4,8,16} maps directly to one native vstore;
 * 5-7 decompose as 4 + (n-4); 9-15 decompose as 8 + (n-8), reusing the smaller
 * vstore_partial_* macros on the upper half's swizzle with the pointer advanced.
 *
 * @param[in] DATA   The name of the variable
 * @param[in] OFFSET Offset in n
 * @param[in] PTR    The base pointer
 * @{
 */
#define vstore_partial_1(DATA, OFFSET, PTR) \
    vstore1(DATA.s0, OFFSET, PTR);

#define vstore_partial_2(DATA, OFFSET, PTR) \
    vstore2(DATA.s01, OFFSET, PTR);

#define vstore_partial_3(DATA, OFFSET, PTR) \
    vstore3(DATA.s012, OFFSET, PTR);

#define vstore_partial_4(DATA, OFFSET, PTR) \
    vstore4(DATA.s0123, OFFSET, PTR);

#define vstore_partial_5(DATA, OFFSET, PTR)    \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore1(DATA.s4, OFFSET, PTR + 4);

#define vstore_partial_6(DATA, OFFSET, PTR)    \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore_partial_2(DATA.s45, OFFSET, PTR + 4);

#define vstore_partial_7(DATA, OFFSET, PTR)    \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore_partial_3(DATA.s456, OFFSET, PTR + 4);

#define vstore_partial_8(DATA, OFFSET, PTR) \
    vstore8(DATA.s01234567, OFFSET, PTR);

#define vstore_partial_9(DATA, OFFSET, PTR)        \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore1(DATA.s8, OFFSET, PTR + 8);

#define vstore_partial_10(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_2(DATA.s89, OFFSET, PTR + 8);

#define vstore_partial_11(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_3(DATA.s89a, OFFSET, PTR + 8);

#define vstore_partial_12(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8);

#define vstore_partial_13(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_14(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_15(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_16(DATA, OFFSET, PTR) \
    vstore16(DATA, OFFSET, PTR);
/** @} */ // end of group vstore_partial_n
/** @} */ // end of group VSTORE_PARTIAL
3133
// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
// without _sat to overcome this issue
#define convert_float_sat convert_float
#define convert_float1_sat convert_float
#define convert_float2_sat convert_float2
#define convert_float3_sat convert_float3
#define convert_float4_sat convert_float4
#define convert_float8_sat convert_float8
#define convert_float16_sat convert_float16
// NOTE(review): maps to convert_float, not convert_half — inconsistent with every
// other entry in this table; matches upstream, but confirm this is intentional.
#define convert_half_sat convert_float
#define convert_half1_sat convert_half
#define convert_half2_sat convert_half2
#define convert_half3_sat convert_half3
#define convert_half4_sat convert_half4
#define convert_half8_sat convert_half8
#define convert_half16_sat convert_half16

// Scalar (size 1) aliases so CONVERT(x, type##1) degenerates to the scalar convert.
#define convert_float1 convert_float
#define convert_half1 convert_half
#define convert_char1 convert_char
#define convert_uchar1 convert_uchar
#define convert_short1 convert_short
#define convert_ushort1 convert_ushort
#define convert_int1 convert_int
#define convert_uint1 convert_uint
#define convert_long1 convert_long
#define convert_ulong1 convert_ulong
#define convert_double1 convert_double

// Scalar saturating-convert aliases for the integer types.
#define convert_char1_sat convert_char_sat
#define convert_uchar1_sat convert_uchar_sat
#define convert_short1_sat convert_short_sat
#define convert_ushort1_sat convert_ushort_sat
#define convert_int1_sat convert_int_sat
#define convert_uint1_sat convert_uint_sat
#define convert_long1_sat convert_long_sat
#define convert_ulong1_sat convert_ulong_sat
#define convert_double1_sat convert_double_sat
3172
// VEC_DATA_TYPE(float, 4) -> float4; two-level expansion so macro arguments are
// expanded before pasting (same pattern for CONVERT/CONVERT_SAT below).
#define VEC_DATA_TYPE_STR(type, size) type##size
#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)

#define CONVERT_STR(x, type) (convert_##type((x)))
#define CONVERT(x, type) CONVERT_STR(x, type)

#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)

#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)

// Map each data type to an integer vector type of the same element width
// (half -> short, float -> int); presumably used as the predicate/result type
// for OpenCL select() — confirm at call sites.
#define select_vec_dt_uchar(size) uchar##size
#define select_vec_dt_char(size) char##size
#define select_vec_dt_ushort(size) ushort##size
#define select_vec_dt_short(size) short##size
#define select_vec_dt_half(size) short##size
#define select_vec_dt_uint(size) uint##size
#define select_vec_dt_int(size) int##size
#define select_vec_dt_float(size) int##size
#define select_vec_dt_ulong(size) ulong##size
#define select_vec_dt_long(size) long##size

#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)
3199
// Horizontal sum reduction of a vector of size n, as a tree of pairwise additions over
// the vector's halves. Each expansion is fully parenthesised so the macros compose
// safely inside larger expressions (e.g. SUM_REDUCE(x, 4) * y no longer misparses).
#define sum_reduce_1(x) (x)
#define sum_reduce_2(x) (((x).s0) + ((x).s1))
#define sum_reduce_3(x) (sum_reduce_2((x).s01) + ((x).s2))
#define sum_reduce_4(x) (sum_reduce_2((x).s01) + sum_reduce_2((x).s23))
#define sum_reduce_8(x) (sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567))
#define sum_reduce_16(x) (sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF))

// SUM_REDUCE(x, size) -> sum_reduce_<size>(x); two-level expansion so a macro passed
// as size is expanded before pasting.
#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x)
#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)
3209
// Horizontal max reduction of a vector of size n, as a tree of pairwise max() calls
// over the vector's halves. Each expansion is a call expression, so it composes safely.
#define max_reduce_1(x) (x)
#define max_reduce_2(x) max(((x).s0), ((x).s1))
#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))

// MAX_REDUCE(x, size) -> max_reduce_<size>(x); two-level expansion for macro arguments.
#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)
3219
// Expand to the full kernel-argument list for a tensor of the given rank: the buffer
// pointer, a (stride, step) pair per dimension, and the byte offset of the first
// element. The name## prefix keeps arguments unique per tensor.
#define VECTOR_DECLARATION(name)     \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_offset_first_element_in_bytes

#define IMAGE_DECLARATION(name)      \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_stride_y, \
    uint        name##_step_y,   \
    uint        name##_offset_first_element_in_bytes

#define TENSOR3D_DECLARATION(name)   \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_stride_y, \
    uint        name##_step_y,   \
    uint        name##_stride_z, \
    uint        name##_step_z,   \
    uint        name##_offset_first_element_in_bytes

#define TENSOR4D_DECLARATION(name)   \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_stride_y, \
    uint        name##_step_y,   \
    uint        name##_stride_z, \
    uint        name##_step_z,   \
    uint        name##_stride_w, \
    uint        name##_step_w,   \
    uint        name##_offset_first_element_in_bytes
3255
// Build the corresponding *_workitem_ptr call from the kernel arguments declared by the
// *_DECLARATION macros above. The NO_STEP variants pass 0 for the step(s) so the
// returned pointer is NOT advanced per workitem in those dimensions.
#define CONVERT_TO_VECTOR_STRUCT(name) \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)

#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)

#define CONVERT_TO_IMAGE_STRUCT(name) \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)

#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)

#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)

// NOTE(review): unlike the other NO_STEP variants, step_z is preserved (not 0) here —
// matches upstream, but confirm this asymmetry is intentional.
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)

// NOTE(review): identical redefinition of CONVERT_TENSOR3D_TO_IMAGE_STRUCT above —
// benign (same token sequence), likely an artifact of concatenating helper headers.
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)

#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z)

#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)

#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)

#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)

#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name)                                                                                       \
    tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                           name##_stride_z, name##_step_z)
3294
/** Structure to hold Vector information */
typedef struct Vector
{
    __global uchar *ptr;                           /**< Pointer to the starting position of the buffer */
    int             offset_first_element_in_bytes; /**< The offset of the first element in the source vector */
    int             stride_x;                      /**< Stride of the vector in X dimension (in bytes) */
} Vector;
3302
/** Structure to hold Image information */
typedef struct Image
{
    __global uchar *ptr;                           /**< Pointer to the starting position of the buffer */
    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
} Image;
3311
/** Structure to hold 3D tensor information */
typedef struct Tensor3D
{
    __global uchar *ptr;                           /**< Pointer to the starting position of the buffer */
    int             offset_first_element_in_bytes; /**< The offset of the first element in the source tensor */
    int             stride_x;                      /**< Stride of the tensor in X dimension (in bytes) */
    int             stride_y;                      /**< Stride of the tensor in Y dimension (in bytes) */
    int             stride_z;                      /**< Stride of the tensor in Z dimension (in bytes) */
} Tensor3D;
3321
/** Structure to hold 4D tensor information */
typedef struct Tensor4D
{
    __global uchar *ptr;                           /**< Pointer to the starting position of the buffer */
    int             offset_first_element_in_bytes; /**< The offset of the first element in the source tensor */
    int             stride_x;                      /**< Stride of the tensor in X dimension (in bytes) */
    int             stride_y;                      /**< Stride of the tensor in Y dimension (in bytes) */
    int             stride_z;                      /**< Stride of the tensor in Z dimension (in bytes) */
    int             stride_w;                      /**< Stride of the tensor in W dimension (in bytes) */
} Tensor4D;
3332
3333/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
3334 *
3335 * @param[in] ptr                           Pointer to the starting postion of the buffer
3336 * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
3337 * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
3338 * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
3339 *
3340 * @return An image object
3341 */
3342inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
3343{
3344    Vector vector =
3345    {
3346        .ptr                           = ptr,
3347        .offset_first_element_in_bytes = offset_first_element_in_bytes,
3348        .stride_x                      = stride_x,
3349    };
3350    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
3351    return vector;
3352}
3353
/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
 *
 * @param[in] ptr                           Pointer to the starting position of the buffer
 * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
 * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
 * @param[in] step_x                        stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
 * @param[in] step_y                        stride_y * number of elements along Y processed per workitem (in bytes)
 *
 * @return An image object whose pointer addresses this workitem's first element
 */
inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
{
    // Displacement (in bytes) of this workitem's first element from the buffer start.
    const size_t workitem_offset = offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
    Image image =
    {
        .ptr                           = ptr + workitem_offset,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
        .stride_y                      = stride_y
    };
    return image;
}
3377
/** Wrap 3D tensor information into an Image structure, and make the pointer point at this workitem's data.
 *
 * Note: the returned Image only stores the X/Y strides; the Z dimension contributes to
 * the pointer displacement (via step_z) but is not kept in the structure.
 *
 * @param[in] ptr                           Pointer to the starting position of the buffer
 * @param[in] offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in] stride_x                      Stride of the tensor in X dimension (in bytes)
 * @param[in] step_x                        stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] stride_y                      Stride of the tensor in Y dimension (in bytes)
 * @param[in] step_y                        stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] stride_z                      Stride of the tensor in Z dimension (in bytes)
 * @param[in] step_z                        stride_z * number of elements along Z processed per workitem (in bytes)
 *
 * @return An image object whose pointer addresses this workitem's first element
 */
inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    // Displacement (in bytes) of this workitem's first element from the buffer start.
    const size_t workitem_offset = offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
    Image image =
    {
        .ptr                           = ptr + workitem_offset,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
        .stride_y                      = stride_y
    };
    return image;
}
3403
/** Wrap 3D tensor information into a Tensor3D structure, and make the pointer point at this workitem's data.
 *
 * @param[in] ptr                           Pointer to the starting position of the buffer
 * @param[in] offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in] stride_x                      Stride of the tensor in X dimension (in bytes)
 * @param[in] step_x                        stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] stride_y                      Stride of the tensor in Y dimension (in bytes)
 * @param[in] step_y                        stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] stride_z                      Stride of the tensor in Z dimension (in bytes)
 * @param[in] step_z                        stride_z * number of elements along Z processed per workitem (in bytes)
 *
 * @return A 3D tensor object whose pointer addresses this workitem's first element
 */
inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    // Displacement (in bytes) of this workitem's first element from the buffer start.
    const size_t workitem_offset = offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
    Tensor3D t =
    {
        .ptr                           = ptr + workitem_offset,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
        .stride_y                      = stride_y,
        .stride_z                      = stride_z
    };
    return t;
}
3430
/** Wrap 3D tensor information into a Tensor3D structure without advancing the pointer.
 *
 * Unlike update_tensor3D_workitem_ptr(), the pointer is left at the start of the buffer.
 * The step arguments are not used; they are accepted only so both functions can be
 * called with the same argument list.
 *
 * @param[in] ptr                           Pointer to the starting position of the buffer
 * @param[in] offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in] stride_x                      Stride of the tensor in X dimension (in bytes)
 * @param[in] step_x                        Unused (kept for signature compatibility)
 * @param[in] stride_y                      Stride of the tensor in Y dimension (in bytes)
 * @param[in] step_y                        Unused (kept for signature compatibility)
 * @param[in] stride_z                      Stride of the tensor in Z dimension (in bytes)
 * @param[in] step_z                        Unused (kept for signature compatibility)
 *
 * @return A 3D tensor object
 */
inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Tensor3D t =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
        .stride_y                      = stride_y,
        .stride_z                      = stride_z
    };
    return t;
}
3456
/** Wrap 4D tensor information into a Tensor4D structure, and make the pointer point at this workitem's data.
 *
 * The Z and W dimensions share get_global_id(2): the Z coordinate is obtained as
 * get_global_id(2) % mod_size and the W coordinate as get_global_id(2) / mod_size.
 *
 * @param[in] ptr                           Pointer to the starting position of the buffer
 * @param[in] offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in] stride_x                      Stride of the tensor in X dimension (in bytes)
 * @param[in] step_x                        stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] stride_y                      Stride of the tensor in Y dimension (in bytes)
 * @param[in] step_y                        stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] stride_z                      Stride of the tensor in Z dimension (in bytes)
 * @param[in] step_z                        stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in] stride_w                      Stride of the tensor in W dimension (in bytes)
 * @param[in] step_w                        stride_w * number of elements along W processed per workitem (in bytes)
 * @param[in] mod_size                      Divisor used to split get_global_id(2) into Z and W coordinates
 *                                          (presumably the number of workitems along Z — confirm with callers)
 *
 * @return A 4D tensor object whose pointer addresses this workitem's first element
 */
inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
                                             uint step_w,
                                             uint mod_size)
{
    Tensor4D tensor =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
        .stride_y                      = stride_y,
        .stride_z                      = stride_z,
        .stride_w                      = stride_w
    };

    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
    return tensor;
}
3474
/** Get a pointer to the element of a Vector at the given X coordinate.
 *
 * Note: the stored offset_first_element_in_bytes is NOT applied here; it is assumed to
 * have been folded into vec->ptr already (as done by update_vector_workitem_ptr()).
 *
 * @param[in] vec Pointer to the vector descriptor
 * @param[in] x   Relative X position (in elements)
 */
inline __global const uchar *vector_offset(const Vector *vec, int x)
{
    const int byte_displacement = x * vec->stride_x;
    return vec->ptr + byte_displacement;
}
3484
/** Get a pointer to the element of an Image at the given (X, Y) coordinate.
 *
 * Note: the stored offset_first_element_in_bytes is NOT applied here; it is assumed to
 * have been folded into img->ptr already (as done by update_image_workitem_ptr()).
 *
 * @param[in] img Pointer to the image descriptor
 * @param[in] x   Relative X position (in elements)
 * @param[in] y   Relative Y position (in elements)
 */
inline __global uchar *offset(const Image *img, int x, int y)
{
    // Row displacement first, then column displacement within the row.
    return img->ptr + y * img->stride_y + x * img->stride_x;
}
3495
/** Get a pointer to the element of a Tensor3D at the given (X, Y, Z) coordinate.
 *
 * Note: the stored offset_first_element_in_bytes is NOT applied here (unlike
 * tensor3D_index2ptr()); it is assumed to have been folded into tensor->ptr already.
 *
 * @param[in] tensor Pointer to the tensor descriptor
 * @param[in] x      Relative X position (in elements)
 * @param[in] y      Relative Y position (in elements)
 * @param[in] z      Relative Z position (in elements)
 */
inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
{
    // Slice displacement, then row, then column.
    return tensor->ptr + z * tensor->stride_z + y * tensor->stride_y + x * tensor->stride_x;
}
3507
/** Get a pointer to the element of a Tensor4D at the given (X, Y, Z, W) coordinate.
 *
 * Note: the stored offset_first_element_in_bytes is NOT applied here; it is assumed to
 * have been folded into tensor->ptr already (as done by update_tensor4D_workitem_ptr()).
 *
 * @param[in] tensor Pointer to the tensor descriptor
 * @param[in] x      Relative X position (in elements)
 * @param[in] y      Relative Y position (in elements)
 * @param[in] z      Relative Z position (in elements)
 * @param[in] w      Relative W position (in elements)
 */
inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
{
    // Batch displacement, then slice, then row, then column.
    return tensor->ptr + w * tensor->stride_w + z * tensor->stride_z + y * tensor->stride_y + x * tensor->stride_x;
}
3520
/** Get the pointer for a given linear index of a Tensor3D.
 *
 * The linear index is decomposed as index = (z * height + y) * width + x. Unlike
 * tensor3D_offset(), this function DOES add offset_first_element_in_bytes to the
 * returned pointer.
 *
 * @param[in] tensor Pointer to the tensor descriptor
 * @param[in] width  Width of the input tensor (in elements)
 * @param[in] height Height of the input tensor (in elements)
 * @param[in] depth  Depth of the input tensor (unused; kept for interface uniformity)
 * @param[in] index  Linear element index
 */
inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
{
    const uint slice_elements = width * height;
    // Split the linear index into (x, y, z) coordinates.
    const uint z        = index / slice_elements;
    const uint in_slice = index % slice_elements;
    const uint y        = in_slice / width;
    const uint x        = in_slice % width;

    return tensor->ptr + tensor->offset_first_element_in_bytes + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
}
3545
3546#endif // _HELPER_H
3547
/** Convert the given vector to the given type using the round-to-nearest-even (RTE) rounding mode.
 *
 * Expands to the OpenCL built-in convert_<type>_rte(). The extra _STR level of
 * indirection lets a macro argument passed as 'type' be expanded before token pasting.
 *
 * @param[in] x    The target to be converted
 * @param[in] type The target type
 *
 * @return The converted vector
 */
#define CONVERT_DOWN_RTE_STR(x, type) (convert_##type##_rte((x)))
#define CONVERT_DOWN_RTE(x, type) CONVERT_DOWN_RTE_STR(x, type)
3557
/** Quantize a floating-point scalar value to 8-bit asymmetric.
 *
 * Computes round_rte(input / scale + offset), saturated to the uchar range [0, 255].
 *
 * @param[in] input  Input value to quantize
 * @param[in] offset Quantization offset (zero point)
 * @param[in] scale  Quantization scale
 *
 * @return quantized value
 */
inline uchar quantize_qasymm8(float input, float offset, float scale)
{
    // Map into the quantized space, then round-to-nearest-even and saturate to uchar.
    const float transformed = input / scale + offset;
    return CONVERT_SAT(CONVERT_DOWN_RTE(transformed, int), uchar);
}
3572
/** Dequantize a scalar value from 8-bit asymmetric to floating-point.
 *
 * @param[in] input  Input value to dequantize
 * @param[in] offset Quantization offset (zero point)
 * @param[in] scale  Quantization scale
 *
 * @return dequantized value
 */
inline float dequantize_qasymm8(uchar input, float offset, float scale)
{
    const float in_f32 = (float)input;
    return (in_f32 - offset) * scale;
}
3585
/** Dequantize a scalar value from signed 8-bit asymmetric to floating-point.
 *
 * @param[in] input  Input value to dequantize
 * @param[in] offset Quantization offset (zero point)
 * @param[in] scale  Quantization scale
 *
 * @return dequantized value
 */
inline float dequantize_qasymm8_signed(char input, float offset, float scale)
{
    const float in_f32 = (float)input;
    return (in_f32 - offset) * scale;
}
3598
/** Define quantize_<type><size>(): quantize a vector of values from floating-point.
 *
 * The generated function computes round_rte(input / scale + offset) per lane,
 * saturated to the output type's range.
 *
 * @param[in] type Output data type.
 * @param[in] size Size of vector.
 *
 * @return quantized values
 */
#define QUANTIZE_IMPL(type, size)                                                                                       \
    inline VEC_DATA_TYPE(type, size) quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \
    {                                                                                                                   \
        VEC_DATA_TYPE(float, size)                                                                                      \
        out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset);                   \
        VEC_DATA_TYPE(type, size)                                                                                       \
        res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), VEC_DATA_TYPE(type, size));              \
        return res;                                                                                                     \
    }
3615
/** Define dequantize_<type><size>(): dequantize a vector of values to floating-point.
 *
 * The generated function computes (convert_to_float(input) - offset) * scale per lane.
 *
 * @param[in] type Input data type.
 * @param[in] size Size of vector.
 *
 * @return dequantized values in floating point
 */
#define DEQUANTIZE_IMPL(type, size)                                                                                       \
    inline VEC_DATA_TYPE(float, size) dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \
    {                                                                                                                     \
        return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale;                                             \
    }
3628
/** Define asymm_rounding_divide_by_POW2_<size>(): correctly-rounded-to-nearest division by a power-of-two.
 *
 * Computes x / 2^exponent per lane with rounding to nearest: the arithmetic right shift
 * is corrected by +1 when the discarded bits (x & mask) exceed the rounding threshold.
 * The threshold is raised by one for negative x so ties round consistently
 * (appears to match gemmlowp's RoundingDivideByPOT — confirm against that reference).
 *
 * @param[in] size Size of vector.
 *
 * @return Correctly-rounded-to-nearest division by a power-of-two.
 */
#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size)                                                                                        \
    inline VEC_DATA_TYPE(int, size) asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \
    {                                                                                                                                   \
        const VEC_DATA_TYPE(int, size)                                                                                                  \
        zero = (VEC_DATA_TYPE(int, size))0;                                                                                         \
        const VEC_DATA_TYPE(int, size)                                                                                                  \
        one = (VEC_DATA_TYPE(int, size))1;                                                                                          \
        VEC_DATA_TYPE(int, size)                                                                                                        \
        mask = (one << exponent) - one;                                                                                                 \
        VEC_DATA_TYPE(int, size)                                                                                                        \
        threshold = (mask >> 1) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))(x < 0));                                          \
        return (x >> exponent) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))((x & mask) > threshold));                          \
    }
3648
/** Define asymm_mult<size>(): product of two numbers, interpreting them as fixed-point
 * values in the interval [-1, 1), rounding to the nearest value, and saturating
 * -1 * -1 to the maximum value.
 *
 * Implementation: a saturating rounding doubling-high multiply — the 64-bit product is
 * nudged by +/- 2^30 (sign-dependent rounding) and divided by 2^31 to keep the high
 * half; the only overflow case, INT_MIN * INT_MIN, is detected up front and saturated
 * to INT_MAX (appears to match gemmlowp's SaturatingRoundingDoublingHighMul — confirm).
 *
 * @param[in] size Size of vector.
 *
 * @return Product of two fixed-point numbers.
 */
#define ASYMM_MULT_IMPL(size)                                                                                \
    inline VEC_DATA_TYPE(int, size) asymm_mult##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
    {                                                                                                        \
        VEC_DATA_TYPE(int, size)                                                                             \
        overflow = a == b && a == INT_MIN;                                                                   \
        VEC_DATA_TYPE(long, size)                                                                            \
        a_64 = convert_long##size(a);                                                                        \
        VEC_DATA_TYPE(long, size)                                                                            \
        b_64 = convert_long##size(b);                                                                        \
        VEC_DATA_TYPE(long, size)                                                                            \
        ab_64 = a_64 * b_64;                                                                                 \
        /* Revert COMPMID-907 */                                                                             \
        VEC_DATA_TYPE(long, size)                                                                            \
        mask1 = 1 << 30;                                                                                     \
        VEC_DATA_TYPE(long, size)                                                                            \
        mask2 = 1 - (1 << 30);                                                                               \
        VEC_DATA_TYPE(long, size)                                                                            \
        is_positive_or_zero = ab_64 >= 0;                                                                    \
        VEC_DATA_TYPE(long, size)                                                                            \
        nudge = select(mask2, mask1, (SELECT_VEC_DATA_TYPE(long, size))(is_positive_or_zero));               \
        VEC_DATA_TYPE(long, size)                                                                            \
        mask = 1ll << 31;                                                                                    \
        VEC_DATA_TYPE(int, size)                                                                             \
        ab_x2_high32 = convert_int##size((ab_64 + nudge) / mask);                                            \
        return select(ab_x2_high32, INT_MAX, (SELECT_VEC_DATA_TYPE(int, size))(overflow));                   \
    }
3682
/** Define asymm_exp_on_interval_between_negative_one_quarter_and_0_excl<size>():
 * calculates \f$ exp(x) \f$ for x in [-1/4, 0).
 *
 * Uses a polynomial expansion around the interval midpoint (x is re-centered by
 * adding 1/8 in Q0.31). The constants appear to be Q0.31 fixed-point values
 * (constant_1_over_3 = 2^31/3; constant_term presumably exp(-1/8) in Q0.31 —
 * confirm against the gemmlowp reference implementation).
 *
 * @param[in] size Size of vector.
 *
 * @return Result in fixed-point format Q0.
 */
#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size)                                                    \
    inline VEC_DATA_TYPE(int, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) a) \
    {                                                                                                                               \
        const VEC_DATA_TYPE(int, size) constant_term     = 1895147668;                                                              \
        const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883;                                                               \
        const int k_fractional_bits = 31;                                                                                           \
        VEC_DATA_TYPE(int, size)                                                                                                    \
        x = a + (1 << (k_fractional_bits - 3));                                                                                     \
        VEC_DATA_TYPE(int, size)                                                                                                    \
        x2 = ASYMM_MULT(x, x, size);                                                                                                \
        VEC_DATA_TYPE(int, size)                                                                                                    \
        x3 = ASYMM_MULT(x2, x, size);                                                                                               \
        VEC_DATA_TYPE(int, size)                                                                                                    \
        x4 = ASYMM_MULT(x2, x2, size);                                                                                              \
        VEC_DATA_TYPE(int, size)                                                                                                    \
        x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size);                                                                     \
        VEC_DATA_TYPE(int, size)                                                                                                    \
        x4_over_24_plus_x3_over_6_plus_x2 = ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2;                             \
        VEC_DATA_TYPE(int, size)                                                                                                    \
        x4_over_24_plus_x3_over_6_plus_x2_over_2 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size);       \
        return constant_term + ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size);                       \
    }
3711
/** Define asymm_select_using_mask<size>(): each bit of the result is set to the
 * corresponding bit of either then_val or else_val depending on whether the
 * corresponding bit of if_mask is set. Equivalent to the VBSL instruction in ARM NEON.
 *
 * Intended for use with all-ones/all-zeros lane masks such as those produced by
 * asymm_mask_if_zero<size>() / asymm_mask_if_non_zero<size>().
 *
 * @param[in] size Size of vector.
 *
 * @returns Result contaning bits from @p then_val or from @p else_val depending on corresponding bit in @p if_mask is set or not.
 */
#define ASYMM_SELECT_USING_MASK_IMPL(size)                                                                                                                                \
    inline VEC_DATA_TYPE(int, size) asymm_select_using_mask##size(VEC_DATA_TYPE(int, size) if_mask, VEC_DATA_TYPE(int, size) then_val, VEC_DATA_TYPE(int, size) else_val) \
    {                                                                                                                                                                     \
        return (if_mask & then_val) ^ (~if_mask & else_val);                                                                                                              \
    }
3725
/** Define asymm_mask_if_zero<size>(): for each element of the input vector, all bits of
 * the corresponding result element are set if the input element is zero.
 *
 * Produces all-ones/all-zeros lane masks suitable for asymm_select_using_mask<size>().
 *
 * @param[in] size Size of vector.
 *
 * @returns Output vector with bits set when corresponding bit in @p a is zero.
 */
#define ASYMM_MASK_IF_ZERO_IMPL(size)                                                    \
    inline VEC_DATA_TYPE(int, size) asymm_mask_if_zero##size(VEC_DATA_TYPE(int, size) a) \
    {                                                                                    \
        const VEC_DATA_TYPE(int, size) all_zeros = 0;                                    \
        const VEC_DATA_TYPE(int, size) all_ones  = ~0;                                   \
        return select(all_zeros, all_ones, (SELECT_VEC_DATA_TYPE(int, size))(a == 0));   \
    }
3740
/** Define asymm_mask_if_non_zero<size>(): for each element of the input vector, all bits
 * of the corresponding result element are set if the input element is non-zero.
 *
 * Produces all-ones/all-zeros lane masks suitable for asymm_select_using_mask<size>().
 *
 * @param[in] size Size of vector.
 *
 * @returns Output vector with bits set when corresponding bit in @p a is non zero.
 */
#define ASYMM_MASK_IF_NON_ZERO_IMPL(size)                                                    \
    inline VEC_DATA_TYPE(int, size) asymm_mask_if_non_zero##size(VEC_DATA_TYPE(int, size) a) \
    {                                                                                        \
        const VEC_DATA_TYPE(int, size) all_zeros = 0;                                        \
        const VEC_DATA_TYPE(int, size) all_ones  = ~0;                                       \
        return select(all_zeros, all_ones, (SELECT_VEC_DATA_TYPE(int, size))(a != 0));       \
    }
3755
/** Define exp_barrel_shifter<size>(): conditionally scale 'result' by a fixed-point
 * multiplier based on one bit of 'remainder'.
 *
 * Helper for asymm_exp_on_negative_values<size>(): when k_integer_bits > exponent, it
 * tests the remainder bit at position (k_fractional_bits + exponent) and, for lanes
 * where that bit is set, returns result * fp_multiplier; other lanes (and the case
 * where the bit position does not exist) return result unchanged. fp_multiplier is
 * presumably a fixed-point constant for exp(-2^exponent) — confirm with the caller's
 * constant table.
 *
 * @param[in] size Size of vector.
 */
#define EXP_BARREL_SHIFTER_IMPL(size)                                                                                                                                                                         \
    inline VEC_DATA_TYPE(int, size) exp_barrel_shifter##size(VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \
    {                                                                                                                                                                                                         \
        if(k_integer_bits > exponent)                                                                                                                                                                         \
        {                                                                                                                                                                                                     \
            const int k_shift_amount = k_integer_bits > exponent ? k_fractional_bits + exponent : 0;                                                                                                          \
            return ASYMM_SELECT_USING_MASK(                                                                                                                                                                   \
                    ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size),                                                                                                                              \
                    ASYMM_MULT(result, fp_multiplier, size), result, size);                                                                                                                                       \
        }                                                                                                                                                                                                     \
        \
        return result;                                                                                                                                                                                        \
    }
3769
/** Define asymm_exp_on_negative_values<size>(): calculates \f$ exp(x) \f$ for x < 0.
 *
 * Strategy: decompose x into a multiple of 1/4 plus a remainder in [-1/4, 0), evaluate
 * exp on that small interval polynomially, then fold in the remaining factors with
 * exp_barrel_shifter<size>() using one precomputed fixed-point constant per power of
 * two. Inputs far below the representable range clamp to 0, and exp(0) returns Q0 one
 * (INT_MAX).
 *
 * @param[in] size Size of vector.
 *
 * @return Result in fixed-point format Q0.
 */
#define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size)                                                                               \
    inline VEC_DATA_TYPE(int, size) asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits)        \
    {                                                                                                                         \
        const int k_fractional_bits = 31 - k_integer_bits;                                                                    \
        VEC_DATA_TYPE(int, size)                                                                                              \
        k_one_quarter = 1 << (k_fractional_bits - 2);                                                                         \
        VEC_DATA_TYPE(int, size)                                                                                              \
        mask = k_one_quarter - 1;                                                                                             \
        VEC_DATA_TYPE(int, size)                                                                                              \
        a_mod_quarter_minus_one_quarter = (a & mask) - k_one_quarter;                                                         \
        VEC_DATA_TYPE(int, size)                                                                                              \
        a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits;                           \
        VEC_DATA_TYPE(int, size)                                                                                              \
        result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a_mod_quarter_minus_one_quarter_scaled, size); \
        VEC_DATA_TYPE(int, size)                                                                                              \
        remainder = a_mod_quarter_minus_one_quarter - a;                                                                      \
        \
        result = EXP_BARREL_SHIFTER(result, -2, 1672461947, k_integer_bits, k_fractional_bits, remainder, size);              \
        result = EXP_BARREL_SHIFTER(result, -1, 1302514674, k_integer_bits, k_fractional_bits, remainder, size);              \
        result = EXP_BARREL_SHIFTER(result, +0, 790015084, k_integer_bits, k_fractional_bits, remainder, size);               \
        result = EXP_BARREL_SHIFTER(result, +1, 290630308, k_integer_bits, k_fractional_bits, remainder, size);               \
        result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, remainder, size);                \
        result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, size);                  \
        result = EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size);                     \
        \
        if(k_integer_bits > 5)                                                                                                \
        {                                                                                                                     \
            const VEC_DATA_TYPE(int, size) clamp = -(1 << (k_fractional_bits + 5));                                           \
            result = ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(a < clamp, size), 0, result, size);                       \
        }                                                                                                                     \
        \
        const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX;                                                                      \
        return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_ZERO(a, size), Q0_one, result, size);                                    \
    }
3810
/** Define asymm_saturating_rounding_mult_by_pow2<size>(): calculates the product of an
 * integer value by a power of two, with either a positive exponent (equivalent to an
 * arithmetic left shift, saturating) or a negative exponent (equivalent to an
 * arithmetic right shift, rounding to nearest).
 *
 * For positive exponents, any input whose magnitude exceeds (2^(31-exponent) - 1) would
 * overflow the shift, so those lanes saturate to INT_MAX / INT_MIN instead.
 *
 * @param[in] size Size of vector.
 *
 * @return Arithmetic left or right shift.
 */
#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size)                                                                  \
    inline VEC_DATA_TYPE(int, size) asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \
    {                                                                                                                      \
        if(exponent < 0)                                                                                                   \
        {                                                                                                                  \
            return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size);                                                      \
        }                                                                                                                  \
        \
        const VEC_DATA_TYPE(int, size) min = INT_MIN;                                                                      \
        const VEC_DATA_TYPE(int, size) max = INT_MAX;                                                                      \
        int threshold = ((1 << (31 - exponent)) - 1);                                                                      \
        VEC_DATA_TYPE(int, size)                                                                                           \
        positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size);                                                       \
        VEC_DATA_TYPE(int, size)                                                                                           \
        negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size);                                                      \
        VEC_DATA_TYPE(int, size)                                                                                           \
        result = x << exponent;                                                                                            \
        result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size);                                                \
        result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size);                                                \
        return result;                                                                                                     \
    }
3840
/** Calculates (a+b)/2, rounded to the nearest integer.
 * Equivalent to VRHADD in the ARM NEON instruction set.
 *
 * The sum is widened to 64 bits so it cannot overflow; halving adds +1 for a
 * non-negative sum and -1 for a negative one before the truncating division,
 * which rounds exact halves away from zero.
 *
 * @param[in] size Size of vector.
 *
 * @return (a+b)/2, rounded to the nearest integer.
 */
#define ASYMM_ROUNDING_HALF_SUM_IMPL(size)                                                                                \
    inline VEC_DATA_TYPE(int, size) asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
    {                                                                                                                     \
        VEC_DATA_TYPE(long, size)                                                                                         \
        a64 = convert_long##size(a);                                                                                      \
        VEC_DATA_TYPE(long, size)                                                                                         \
        b64 = convert_long##size(b);                                                                                      \
        VEC_DATA_TYPE(long, size)                                                                                         \
        sum = a64 + b64;                                                                                                  \
        const VEC_DATA_TYPE(long, size) one       = 1;                                                                    \
        const VEC_DATA_TYPE(long, size) minus_one = -1;                                                                   \
        VEC_DATA_TYPE(long, size)                                                                                         \
        sign = select(minus_one, one, (SELECT_VEC_DATA_TYPE(long, size))(sum >= 0));                                      \
        return convert_int##size((sum + sign) / 2);                                                                       \
    }
3863
/** Calculates \f$ 1 / (1 + x) \f$ for x in (0, 1).
 *
 * Works on the half denominator d = (1 + x) / 2 and refines an initial linear
 * estimate 48/17 - 32/17 * d (constants stored in Q2.29 fixed point) with three
 * Newton-Raphson steps of the form x <- x + x * (1 - d * x).
 *
 * @param[in] size Size of vector.
 *
 * @return Result in fixed-point format Q0.
 */
#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(size)                                                    \
    inline VEC_DATA_TYPE(int, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(VEC_DATA_TYPE(int, size) a) \
    {                                                                                                        \
        const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX;                                                     \
        const VEC_DATA_TYPE(int, size) Q2_one = 1 << (31 - 2);                                               \
        /* half_denominator = (1 + a) / 2, rounded to nearest. */                                            \
        VEC_DATA_TYPE(int, size)                                                                             \
        half_denominator = ASYMM_ROUNDING_HALF_SUM(a, Q0_one, size);                                         \
        /* 48/17 and -32/17 in Q2.29 fixed point. */                                                         \
        const VEC_DATA_TYPE(int, size) Q2_48_over_17     = 1515870810;                                       \
        const VEC_DATA_TYPE(int, size) Q2_neg_32_over_17 = -1010580540;                                      \
        VEC_DATA_TYPE(int, size)                                                                             \
        x = Q2_48_over_17 + ASYMM_MULT(half_denominator, Q2_neg_32_over_17, size);                           \
        for(int i = 0; i < 3; i++)                                                                           \
        {                                                                                                    \
            VEC_DATA_TYPE(int, size)                                                                         \
            half_denominator_times_x = ASYMM_MULT(half_denominator, x, size);                                \
            VEC_DATA_TYPE(int, size)                                                                         \
            one_minus_half_denominator_times_x = Q2_one - half_denominator_times_x;                          \
            VEC_DATA_TYPE(int, size)                                                                         \
            tmp = ASYMM_MULT(x, one_minus_half_denominator_times_x, size);                                   \
            /* One Newton-Raphson refinement step; << 2 realigns the Q-format. */                            \
            x   = x + ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(tmp, 2, size);                                  \
        }                                                                                                    \
        /* Final doubling converts the half-denominator reciprocal to 1/(1+x). */                            \
        return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, 1, size);                                           \
    }
3893
/** Considering the integer value as fixed-point, change the number of integer bits and update value accordingly.
 *
 * Shifts left (saturating) when the source has more integer bits than the
 * destination, or right (rounding) when it has fewer.
 *
 * @param[in] size Size of vector.
 *
 * @return Rescaled value.
 */
#define ASYMM_RESCALE_IMPL(size)                                                                                                    \
    inline VEC_DATA_TYPE(int, size) asymm_rescale##size(VEC_DATA_TYPE(int, size) value, int src_integer_bits, int dst_integer_bits) \
    {                                                                                                                               \
        int exponent = src_integer_bits - dst_integer_bits;                                                                         \
        return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size);                                                       \
    }
3906
// Dispatch macros: forward (args..., size) calls to the size-suffixed helper
// generated by the matching *_IMPL macro. The *_STR indirection forces the
// size argument to be macro-expanded before token pasting.
#define QUANTIZE_STR(input, offset, scale, type, size) quantize_##type##size(input, offset, scale)
#define QUANTIZE(input, offset, scale, type, size) QUANTIZE_STR(input, offset, scale, type, size)
#define DEQUANTIZE_STR(input, offset, scale, type, size) dequantize_##type##size(input, offset, scale)
#define DEQUANTIZE(input, offset, scale, type, size) DEQUANTIZE_STR(input, offset, scale, type, size)

#define ASYMM_ROUNDING_DIVIDE_BY_POW2_STR(x, exponent, size) asymm_rounding_divide_by_POW2_##size(x, exponent)
#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) ASYMM_ROUNDING_DIVIDE_BY_POW2_STR(x, exponent, size)
#define ASYMM_MULT_STR(a, b, size) asymm_mult##size(a, b)
#define ASYMM_MULT(a, b, size) ASYMM_MULT_STR(a, b, size)
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size) \
    ASYMM_MULT(x *((VEC_DATA_TYPE(int, size))(1) << (-left_shift)), quantized_multiplier, size)
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \
    ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size)
#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a)
#define ASYMM_SELECT_USING_MASK(if_mask, then_val, else_val, size) asymm_select_using_mask##size(if_mask, then_val, else_val)
#define ASYMM_MASK_IF_ZERO(a, size) asymm_mask_if_zero##size(a)
#define ASYMM_MASK_IF_NON_ZERO(a, size) asymm_mask_if_non_zero##size(a)
#define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder, size) exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder)
#define ASYMM_EXP_ON_NEGATIVE_VALUES_STR(a, k_integer_bits, size) asymm_exp_on_negative_values##size(a, k_integer_bits)
#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) ASYMM_EXP_ON_NEGATIVE_VALUES_STR(a, k_integer_bits, size)
#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(a)
#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size)
#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) asymm_saturating_rounding_mult_by_pow2##size(x, exponent)
#define ASYMM_ROUNDING_HALF_SUM(a, b, size) asymm_rounding_half_sum##size(a, b)
#define ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size) asymm_rescale##size(value, src_integer_bits, dst_integer_bits)
#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size)
3933
/** Multiplies a value by a quantized multiplier, applying a plain left shift
 * before the fixed-point multiplication when shift > 0, or a rounding right
 * shift after it when shift <= 0.
 *
 * @param[in] size Size of vector.
 *
 * @return The rescaled product.
 */
#define MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size)                                                                             \
    inline VEC_DATA_TYPE(int, size) multiply_by_quantized_multiplier##size(VEC_DATA_TYPE(int, size) input, int qmul, int shift) \
    {                                                                                                                           \
        /* Exactly one of the two shifts is non-zero. */                                                                        \
        const int left_shift  = shift > 0 ? shift : 0;                                                                          \
        const int right_shift = shift > 0 ? 0 : -shift;                                                                         \
        return ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(input * (1 << left_shift), qmul, size), right_shift, size);             \
    }
#define MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, size) multiply_by_quantized_multiplier##size(input, qmul, shift)
3942
// Instantiate the (de)quantization helpers for every data type / vector size
// combination referenced by the kernels in this file.
QUANTIZE_IMPL(uchar, 1)
QUANTIZE_IMPL(char, 1)
QUANTIZE_IMPL(uint, 1)
QUANTIZE_IMPL(int, 1)
QUANTIZE_IMPL(uchar, 4)
QUANTIZE_IMPL(ushort, 4)
QUANTIZE_IMPL(short, 4)
QUANTIZE_IMPL(uchar, 16)
QUANTIZE_IMPL(char, 16)
QUANTIZE_IMPL(ushort, 16)
QUANTIZE_IMPL(short, 16)
QUANTIZE_IMPL(uint, 16)
QUANTIZE_IMPL(int, 16)

DEQUANTIZE_IMPL(uchar, 1)
DEQUANTIZE_IMPL(char, 1)
DEQUANTIZE_IMPL(uint, 1)
DEQUANTIZE_IMPL(int, 1)
DEQUANTIZE_IMPL(uchar, 4)
DEQUANTIZE_IMPL(ushort, 4)
DEQUANTIZE_IMPL(short, 4)
DEQUANTIZE_IMPL(uchar, 16)
DEQUANTIZE_IMPL(char, 16)
DEQUANTIZE_IMPL(ushort, 16)
DEQUANTIZE_IMPL(short, 16)
DEQUANTIZE_IMPL(uint, 16)
DEQUANTIZE_IMPL(int, 16)

// Instantiate the fixed-point arithmetic helpers for all supported
// int-vector widths (1, 2, 3, 4, 8, 16).
ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(1)
ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(2)
ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(3)
ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(4)
ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(8)
ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(16)

ASYMM_MULT_IMPL(1)
ASYMM_MULT_IMPL(2)
ASYMM_MULT_IMPL(3)
ASYMM_MULT_IMPL(4)
ASYMM_MULT_IMPL(8)
ASYMM_MULT_IMPL(16)

ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(1)
ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(2)
ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(3)
ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(4)
ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(8)
ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(16)

ASYMM_SELECT_USING_MASK_IMPL(1)
ASYMM_SELECT_USING_MASK_IMPL(2)
ASYMM_SELECT_USING_MASK_IMPL(3)
ASYMM_SELECT_USING_MASK_IMPL(4)
ASYMM_SELECT_USING_MASK_IMPL(8)
ASYMM_SELECT_USING_MASK_IMPL(16)

ASYMM_MASK_IF_ZERO_IMPL(1)
ASYMM_MASK_IF_ZERO_IMPL(2)
ASYMM_MASK_IF_ZERO_IMPL(3)
ASYMM_MASK_IF_ZERO_IMPL(4)
ASYMM_MASK_IF_ZERO_IMPL(8)
ASYMM_MASK_IF_ZERO_IMPL(16)

ASYMM_MASK_IF_NON_ZERO_IMPL(1)
ASYMM_MASK_IF_NON_ZERO_IMPL(2)
ASYMM_MASK_IF_NON_ZERO_IMPL(3)
ASYMM_MASK_IF_NON_ZERO_IMPL(4)
ASYMM_MASK_IF_NON_ZERO_IMPL(8)
ASYMM_MASK_IF_NON_ZERO_IMPL(16)

EXP_BARREL_SHIFTER_IMPL(1)
EXP_BARREL_SHIFTER_IMPL(2)
EXP_BARREL_SHIFTER_IMPL(3)
EXP_BARREL_SHIFTER_IMPL(4)
EXP_BARREL_SHIFTER_IMPL(8)
EXP_BARREL_SHIFTER_IMPL(16)

ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(1)
ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(2)
ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(3)
ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(4)
ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(8)
ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(16)

ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(1)
ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(2)
ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(3)
ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(4)
ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(8)
ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(16)

ASYMM_ROUNDING_HALF_SUM_IMPL(1)
ASYMM_ROUNDING_HALF_SUM_IMPL(2)
ASYMM_ROUNDING_HALF_SUM_IMPL(3)
ASYMM_ROUNDING_HALF_SUM_IMPL(4)
ASYMM_ROUNDING_HALF_SUM_IMPL(8)
ASYMM_ROUNDING_HALF_SUM_IMPL(16)

ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(1)
ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(2)
ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(3)
ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(4)
ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(8)
ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(16)

ASYMM_RESCALE_IMPL(1)
ASYMM_RESCALE_IMPL(2)
ASYMM_RESCALE_IMPL(3)
ASYMM_RESCALE_IMPL(4)
ASYMM_RESCALE_IMPL(8)
ASYMM_RESCALE_IMPL(16)

MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(1)
MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(2)
MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(3)
MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(4)
MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(8)
MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(16)
4061
4062#endif // ARM_COMPUTE_HELPERS_ASYMM_H
4063
/** Clamps four 2D coordinates to the image area extended by the border.
 *
 * @param[in] coords      Vector of 2D coordinates to clamp. Even positions are X coords, odd positions are Y coords.
 * @param[in] width       Width of the image
 * @param[in] height      Height of the image
 * @param[in] border_size Border size of the image
 *
 */
inline const float8 clamp_to_border_with_size_quantized(float8 coords, const float width, const float height, const float border_size)
{
    // Valid range is [-border_size, dim - 1 + border_size] on each axis.
    const float  low_bound = 0.0f - border_size;
    const float4 xs        = clamp(coords.even, low_bound, width - 1 + border_size);
    const float4 ys        = clamp(coords.odd, low_bound, height - 1 + border_size);
    // Re-interleave the clamped components back into (x, y) pairs.
    float8 clamped;
    clamped.even = xs;
    clamped.odd  = ys;
    return clamped;
}
4078
/* FIXME(COMPMID-682): Clamp border properly in UNDEFINED border mode in Warp, Scale, Remap */
/** Clamps the given coordinates to the image area extended by a one-pixel border.
 *
 * @param[in] coords Vector of 2D coordinates to clamp. Even positions are X coords, odd positions are Y coords.
 * @param[in] width  Width of the image
 * @param[in] height Height of the image
 *
 */
inline const float8 clamp_to_border_quantized(float8 coords, const float width, const float height)
{
    // Delegate to the generic helper with the default border of one pixel.
    const float default_border = 1.0f;
    return clamp_to_border_with_size_quantized(coords, width, height, default_border);
}
4091
/** Given a texel coordinate, returns the 2x2 neighbourhood used for bilinear sampling:
 * [ P, right neighbour, below neighbour, below-right neighbour ].
 *
 * @note No checks to see if the coordinates are out of the image are done here.
 *
 * @param[in] coord Input coordinates
 *
 * @return vector of 8 floats with the coordinates, even positions are x and odd y.
 */
inline const float8 get_neighbour_coords_quantized(const float2 coord)
{
    const float x = coord.s0;
    const float y = coord.s1;
    // Order: top-left, top-right, bottom-left, bottom-right (x, y interleaved).
    return (float8)(x, y, x + 1, y, x, y + 1, x + 1, y + 1);
}
4105
/** Returns the output coordinates handled by the current work-item.
 * Each work-item along X produces four consecutive pixels.
 */
inline const float2 get_current_coords_quantized()
{
    const float x0 = (float)(get_global_id(0) * 4);
    const float y0 = (float)get_global_id(1);
    return (float2)(x0, y0);
}
4111
/** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values
 *
 * Each call produces four output pixels: @p coords holds four (x, y) pairs
 * (even lanes are x, odd lanes are y). Texels are dequantized with
 * (value - offset_qasymm) * scale, blended, then requantized.
 *
 * @param[in] in            Pointer to the source image.
 * @param[in] coords        Vector of four 2D coordinates. Even pos is x and odd y.
 * @param[in] width         Width of the image
 * @param[in] height        Height of the image
 * @param[in] border_size   Border size
 * @param[in] scale         Scale value
 * @param[in] offset_qasymm Offset value
 */
inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate_with_border_quantized(const Image *in, const float8 coords, const float width, const float height, const float border_size,
                                                                                    const float scale, const int offset_qasymm)
{
    // If any of the 4 texels is out of the image's boundaries we use the border value (REPLICATE or CONSTANT) for any texel out of the image.

    // Sets the 4x4 coordinates for each of the four input texels
    const float8  fc = floor(coords);
    // c1: tl/tr/bl/br neighbour coordinates (x, y interleaved) for output pixels 0 and 1,
    // c2: the same for output pixels 2 and 3, all clamped to the bordered valid area.
    const float16 c1 = (float16)(
                           clamp_to_border_with_size_quantized(get_neighbour_coords_quantized((float2)(fc.s0, fc.s1)), width, height, border_size),
                           clamp_to_border_with_size_quantized(get_neighbour_coords_quantized((float2)(fc.s2, fc.s3)), width, height, border_size));
    const float16 c2 = (float16)(
                           clamp_to_border_with_size_quantized(get_neighbour_coords_quantized((float2)(fc.s4, fc.s5)), width, height, border_size),
                           clamp_to_border_with_size_quantized(get_neighbour_coords_quantized((float2)(fc.s6, fc.s7)), width, height, border_size));

    // Loads the values from the input image
    const int16 t = (int16)(
                        /* tl, tr, bl, br */
                        * ((__global DATA_TYPE *)offset(in, c1.s0, c1.s1)), *((__global DATA_TYPE *)offset(in, c1.s2, c1.s3)),
                        *((__global DATA_TYPE *)offset(in, c1.s4, c1.s5)), *((__global DATA_TYPE *)offset(in, c1.s6, c1.s7)),
                        *((__global DATA_TYPE *)offset(in, c1.s8, c1.s9)), *((__global DATA_TYPE *)offset(in, c1.sa, c1.sb)),
                        *((__global DATA_TYPE *)offset(in, c1.sc, c1.sd)), *((__global DATA_TYPE *)offset(in, c1.se, c1.sf)),
                        *((__global DATA_TYPE *)offset(in, c2.s0, c2.s1)), *((__global DATA_TYPE *)offset(in, c2.s2, c2.s3)),
                        *((__global DATA_TYPE *)offset(in, c2.s4, c2.s5)), *((__global DATA_TYPE *)offset(in, c2.s6, c2.s7)),
                        *((__global DATA_TYPE *)offset(in, c2.s8, c2.s9)), *((__global DATA_TYPE *)offset(in, c2.sa, c2.sb)),
                        *((__global DATA_TYPE *)offset(in, c2.sc, c2.sd)), *((__global DATA_TYPE *)offset(in, c2.se, c2.sf)));

    // Dequantize the 16 texels: (q - offset) * scale.
    const float16 inf32 = convert_float16(t - (int16)offset_qasymm) * (float16)scale;

    // a = fractional part of the coordinates, b = 1 - a: the bilinear weights.
    const float8 a  = coords - fc;
    const float8 b  = ((float8)(1.f)) - a;
    // Weighted blend of the four texels of each output pixel (tl, tr, bl, br order).
    const float4 fr = (float4)(
                          ((inf32.s0 * b.s0 * b.s1) + (inf32.s1 * a.s0 * b.s1) + (inf32.s2 * b.s0 * a.s1) + (inf32.s3 * a.s0 * a.s1)),
                          ((inf32.s4 * b.s2 * b.s3) + (inf32.s5 * a.s2 * b.s3) + (inf32.s6 * b.s2 * a.s3) + (inf32.s7 * a.s2 * a.s3)),
                          ((inf32.s8 * b.s4 * b.s5) + (inf32.s9 * a.s4 * b.s5) + (inf32.sa * b.s4 * a.s5) + (inf32.sb * a.s4 * a.s5)),
                          ((inf32.sc * b.s6 * b.s7) + (inf32.sd * a.s6 * b.s7) + (inf32.se * b.s6 * a.s7) + (inf32.sf * a.s6 * a.s7)));

    // Requantize: _rtp rounds toward +infinity, then re-add the offset and saturate to DATA_TYPE.
    const VEC_DATA_TYPE(DATA_TYPE, 4) res = CONVERT_SAT(convert_int4_sat_rtp(fr / scale) + offset_qasymm, VEC_DATA_TYPE(DATA_TYPE, 4));

    return res;
}
4162
/* FIXME(COMPMID-682): Clamp border properly in UNDEFINED border mode in Warp, Scale, Remap */
/** Computes the bilinear interpolation for each set of coordinates in the vector coords using a one-pixel border.
 *
 * @param[in] in            Pointer to the source image.
 * @param[in] coords        Vector of four 2D coordinates. Even pos is x and odd y.
 * @param[in] width         Width of the image
 * @param[in] height        Height of the image
 * @param[in] scale         Scale value
 * @param[in] offset_qasymm Offset value
 */
inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate_quantized(const Image *in, const float8 coords, const float width, const float height, const float scale, const int offset_qasymm)
{
    // Same as the bordered variant with the default one-pixel border.
    return bilinear_interpolate_with_border_quantized(in, coords, width, height, 1.0f, scale, offset_qasymm);
}
4177
/** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates.
 *
 * @param[in] coord 2D coordinates to transform.
 * @param[in] scale input/output scale ratio
 *
 * @return a float8 containing 4 2D transformed values in the input image.
 */
inline const float8 transform_bilinear_quantized(const float2 coord, const float2 scale)
{
    // Four consecutive output x coordinates handled by this work-item.
    const float4 xs = (float4)(coord.s0, coord.s0 + 1, coord.s0 + 2, coord.s0 + 3);
#ifdef SAMPLING_POLICY_TOP_LEFT
    const float4 mapped_x = xs * (float4)(scale.s0);
    const float4 mapped_y = (float4)(coord.s1 * scale.s1);
#elif SAMPLING_POLICY_CENTER
    const float4 mapped_x = (xs + ((float4)(0.5f))) * (float4)(scale.s0) - (float4)(0.5f);
    const float4 mapped_y = (float4)((coord.s1 + 0.5f) * scale.s1 - 0.5f);
#else /* SAMPLING_POLICY */
#error("Unsupported sampling policy");
#endif /* SAMPLING_POLICY */
    // Interleave into (x, y) pairs: even lanes x, odd lanes y.
    float8 transformed;
    transformed.even = mapped_x;
    transformed.odd  = mapped_y;
    return transformed;
}
4200
/** Scales a quantized NCHW image using BILINEAR interpolation.
 *
 * @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
 * @note Scale value for QASYMM8 data type to be used is passed as -DSCALE=<VALUE> e.g. -DSCALE=0.5
 * @note Offset value for QASYMM8 data type to be used is passed as -DOFFSET=<VALUE> e.g. -DOFFSET=1
 *
 * @param[in]  in_ptr                            Pointer to the source image. Supported data types: QASYMM8.
 * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
 * @param[in]  in_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  in_stride_y                       Stride of the source image in Y dimension (in bytes)
 * @param[in]  in_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  in_offset_first_element_in_bytes  The offset of the first element in the source image
 * @param[out] out_ptr                           Pointer to the destination image. Supported data types: U8, S16. (Must be the same as the input)
 * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  out_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  out_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
 * @param[in]  input_width                       Input image width
 * @param[in]  input_height                      Input image height
 * @param[in]  scale_x                           The scale factor along x dimension
 * @param[in]  scale_y                           The scale factor along y dimension
 */
__kernel void scale_bilinear_quantized_nchw(
    IMAGE_DECLARATION(in),
    IMAGE_DECLARATION(out),
    const float input_width,
    const float input_height,
    const float scale_x,
    const float scale_y)
{
    Image src_img = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
    Image dst_img = CONVERT_TO_IMAGE_STRUCT(out);

    // Map the four output pixels of this work-item back into the input image,
    // interpolate with the configured border, and store the quantized results.
    const float2 scale_ratio = (float2)(scale_x, scale_y);
    const float8 src_coords  = transform_bilinear_quantized(get_current_coords_quantized(), scale_ratio);
    vstore4(bilinear_interpolate_with_border_quantized(&src_img, src_coords, input_width, input_height, BORDER_SIZE, SCALE, OFFSET), 0, (__global DATA_TYPE *)dst_img.ptr);
}
4238
#if defined(DEPTH_OUT)
/** Performs scale on an image interpolating with the BILINEAR method. (NHWC)
 *
 * @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
 * @note Scale value for QASYMM8 data type to be used is passed as -DSCALE=<VALUE> e.g. -DSCALE=0.5
 * @note Offset value for QASYMM8 data type to be used is passed as -DOFFSET=<VALUE> e.g. -DOFFSET=1
 * @note If border mode replicate is used, it should be passed as -DBORDER_MODE_REPLICATE
 * @note Output tensor's depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH=16
 *
 * @param[in]  in_ptr                            Pointer to the source image. Supported data types: QASYMM8.
 * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
 * @param[in]  in_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  in_stride_y                       Stride of the source image in Y dimension (in bytes)
 * @param[in]  in_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  in_stride_z                       Stride of the source image in Z dimension (in bytes)
 * @param[in]  in_step_z                         src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  in_offset_first_element_in_bytes  The offset of the first element in the source image
 * @param[out] out_ptr                           Pointer to the destination image. Supported data types: same as @p in_ptr
 * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  out_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  out_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  out_stride_z                      Stride of the destination image in Z dimension (in bytes)
 * @param[in]  out_step_z                        dst_stride_y * number of elements along Z processed per workitem(in bytes)
 * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
 * @param[in]  input_width                       Input image width
 * @param[in]  input_height                      Input image height
 * @param[in]  scale_x                           The scale factor along x dimension
 * @param[in]  scale_y                           The scale factor along y dimension
 */
__kernel void scale_bilinear_quantized_nhwc(
    TENSOR4D_DECLARATION(in),
    TENSOR4D_DECLARATION(out),
    const float input_width,
    const float input_height,
    const float scale_x,
    const float scale_y)
{
    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(in, 0);
    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT);

    // Map the output pixel (spatial dims live on global ids 1 and 2 in NHWC) to input space.
#ifdef SAMPLING_POLICY_TOP_LEFT
    const float new_x = get_global_id(1) * scale_x;
    const float new_y = (get_global_id(2) % DEPTH_OUT) * scale_y;
#elif SAMPLING_POLICY_CENTER
    const float new_x = (get_global_id(1) + 0.5f) * scale_x - 0.5f;
    const float new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * scale_y - 0.5f;
#else /* SAMPLING_POLICY */
#error("Unsupported sampling policy");
#endif /* SAMPLING_POLICY */

    // Texel x coordinates: tl = (clamped_x, clamped_y), tr = (clamped_x1_, clamped_y),
    // bl = (clamped_x_, clamped_y1), br = (clamped_x1, clamped_y1).
    const float new_xf      = floor(new_x);
    const float new_yf      = floor(new_y);
    float       clamped_x   = clamp(new_xf, 0.0f, input_width - 1);
    float       clamped_x1  = clamp(new_xf + 1, 0.0f, input_width - 1);
    float       clamped_x_  = clamped_x;
    float       clamped_x1_ = clamped_x1;
    const float clamped_y   = clamp(new_yf, 0.0f, input_height - 1);
    const float clamped_y1  = clamp(new_yf + 1, 0.0f, input_height - 1);

#ifndef BORDER_MODE_REPLICATE
    // CONSTANT border mode: redirect the x coordinate of any texel whose un-clamped
    // position falls outside the image so that the border value is read instead.
    // Fix: the bl texel (clamped_x_) must also be treated as border when its row
    // (new_yf + 1) is above the image, matching the br texel's check.
    clamped_x1  = select(clamped_x1, 0.0f - BORDER_SIZE, new_yf + 1 < 0.f || new_yf + 1 > input_height - 1 || new_xf + 1 < 0.f || new_xf + 1 > input_width - 1);
    clamped_x_  = select(clamped_x_, 0.0f - BORDER_SIZE, new_yf + 1 < 0.f || new_yf + 1 > input_height - 1 || new_xf < 0.f || new_xf > input_width - 1);
    clamped_x   = select(clamped_x, 0.0f - BORDER_SIZE, new_yf < 0.f || new_yf > input_height - 1 || new_xf < 0.f || new_xf > input_width - 1);
    clamped_x1_ = select(clamped_x1_, 0.0f - BORDER_SIZE, new_xf + 1 < 0.f || new_xf + 1 > input_width - 1 || new_yf < 0.f || new_yf > input_height - 1);
#endif /* BORDER_MODE_REPLICATE */

    // Load the four neighbouring texels (tl, tr, bl, br).
    int4 ins = (int4)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
                      *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1_), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
                      *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x_), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))),
                      *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))));

    // Bilinear weights from the fractional part of the mapped coordinate.
    const float  a      = new_x - new_xf;
    const float  b      = 1.f - a;
    const float  a1     = new_y - new_yf;
    const float  b1     = 1.f - a1;
    // Dequantize: (q - OFFSET) * SCALE.
    const float4 insf32 = convert_float4(ins - (int4)OFFSET) * (float4)SCALE;

    const float fr = ((insf32.s0 * b * b1) + (insf32.s1 * a * b1) + (insf32.s2 * b * a1) + (insf32.s3 * a * a1));

    // Requantize: round toward +infinity, re-add the offset, saturate to DATA_TYPE.
    DATA_TYPE res = CONVERT_SAT(convert_int_sat_rtp(fr / SCALE) + OFFSET, DATA_TYPE);

    *((__global DATA_TYPE *)out.ptr) = res;
}
#endif /* defined(DEPTH_OUT) */
4324
4325)"