• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1R"(
2
3/*
4 * Copyright (c) 2018-2020 Arm Limited.
5 *
6 * SPDX-License-Identifier: MIT
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a copy
9 * of this software and associated documentation files (the "Software"), to
10 * deal in the Software without restriction, including without limitation the
11 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
12 * sell copies of the Software, and to permit persons to whom the Software is
13 * furnished to do so, subject to the following conditions:
14 *
15 * The above copyright notice and this permission notice shall be included in all
16 * copies or substantial portions of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24 * SOFTWARE.
25 */
26/*
27 * Copyright (c) 2016-2020 Arm Limited.
28 *
29 * SPDX-License-Identifier: MIT
30 *
31 * Permission is hereby granted, free of charge, to any person obtaining a copy
32 * of this software and associated documentation files (the "Software"), to
33 * deal in the Software without restriction, including without limitation the
34 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
35 * sell copies of the Software, and to permit persons to whom the Software is
36 * furnished to do so, subject to the following conditions:
37 *
38 * The above copyright notice and this permission notice shall be included in all
39 * copies or substantial portions of the Software.
40 *
41 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
42 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
43 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
44 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
45 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
46 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
47 * SOFTWARE.
48 */
49#ifndef ARM_COMPUTE_HELPER_H
50#define ARM_COMPUTE_HELPER_H
51
52/*
53 * Copyright (c) 2020 Arm Limited.
54 *
55 * SPDX-License-Identifier: MIT
56 *
57 * Permission is hereby granted, free of charge, to any person obtaining a copy
58 * of this software and associated documentation files (the "Software"), to
59 * deal in the Software without restriction, including without limitation the
60 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
61 * sell copies of the Software, and to permit persons to whom the Software is
62 * furnished to do so, subject to the following conditions:
63 *
64 * The above copyright notice and this permission notice shall be included in all
65 * copies or substantial portions of the Software.
66 *
67 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
68 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
69 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
70 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
71 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
72 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
73 * SOFTWARE.
74 */
75
76/** Store the 0 to (n-1)th rows of the given variables
77 * @name STORE_ROW_n
78 *
79 * @param[in] N0        The width of the passed in vector. Supported: 1, 2, 3, 4, 8, 16
80 * @param[in] DATA_TYPE The data type of the vectors
81 * @param[in] BASENAME  The basename of the variables
82 * @param[in] PTR       The base pointer
83 * @param[in] STRIDE_Y  The stride value in y-axis direction
84 * @param[in] Z         The offset in z-axis direction
85 * @{
86 */
87#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
88    VSTORE(N0)                                                 \
89    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
90
91#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
92    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
93    VSTORE(N0)                                                 \
94    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
95
96#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
97    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
98    VSTORE(N0)                                                 \
99    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
100
101#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
102    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
103    VSTORE(N0)                                                 \
104    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
105
106#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
107    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
108    VSTORE(N0)                                                 \
109    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
110
111#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
112    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
113    VSTORE(N0)                                                 \
114    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
115
116#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
117    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
118    VSTORE(N0)                                                 \
119    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
120
121#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
122    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
123    VSTORE(N0)                                                 \
124    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
125
126#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
127    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
128    VSTORE(N0)                                                 \
129    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
130
131#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
132    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
133    VSTORE(N0)                                                  \
134    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
135
136#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
137    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
138    VSTORE(N0)                                                  \
139    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
140
141#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
142    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
143    VSTORE(N0)                                                  \
144    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
145
146#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
147    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
148    VSTORE(N0)                                                  \
149    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
150
151#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
152    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
153    VSTORE(N0)                                                  \
154    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
155
156#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
157    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
158    VSTORE(N0)                                                  \
159    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
160
161#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
162    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
163    VSTORE(N0)                                                  \
164    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
165/** @} */ // end of groupd STORE_ROW_n
166
167/** Convert and store the 0th to (n-1)th rows of the given variables
168 * @name CONVERT_STORE_ROW_n
169 *
170 * @param[in] N0        The size of the vectors
171 * @param[in] DATA_TYPE The data type of the vectors
172 * @param[in] BASENAME  The basename of the variables
173 * @param[in] PTR       The base pointer
174 * @param[in] STRIDE_Y  The stride value in y-axis direction
175 * @param[in] Z         The offset in z-axis direction
176 * @{
177 */
178#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
179    VSTORE(N0)                                                         \
180    (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
181
182#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
183    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
184    VSTORE(N0)                                                         \
185    (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
186
187#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
188    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
189    VSTORE(N0)                                                         \
190    (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
191
192#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
193    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
194    VSTORE(N0)                                                         \
195    (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
196
197#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
198    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
199    VSTORE(N0)                                                         \
200    (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
201
202#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
203    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
204    VSTORE(N0)                                                         \
205    (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
206
207#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
208    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
209    VSTORE(N0)                                                         \
210    (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
211
212#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
213    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
214    VSTORE(N0)                                                         \
215    (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
216
217#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
218    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
219    VSTORE(N0)                                                         \
220    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
221
222#define CONVERT_STORE_ROW_10(N0, DATA, BASENAME, PTR, STRIDE_Y, Z) \
223    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
224    VSTORE(N0)                                                     \
225    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
226
227#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
228    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
229    VSTORE(N0)                                                          \
230    (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
231
232#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
233    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
234    VSTORE(N0)                                                          \
235    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
236
237#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
238    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
239    VSTORE(N0)                                                          \
240    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
241
242#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
243    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
244    VSTORE(N0)                                                          \
245    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
246
247#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
248    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
249    VSTORE(N0)                                                          \
250    (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
251
252#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
253    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
254    VSTORE(N0)                                                          \
255    (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
256
257/** @} */ // end of groupd CONVERT_STORE_ROW_n
258
259/** Store a block of the given size M0xN0
260 * @name STORE_BLOCK
261 *
262 * Supported cases are M0=1,2,3,...,16 and N0=2,3,4,8,16.
263 * The data to store is expected to have consecutive names for each row.
264 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
265 * The Z offset is expected to have consecutive names.
266 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
267 *
268 * @param[in] M0        The number of rows to store
269 * @param[in] N0        The size of each vector
270 * @param[in] DATA_TYPE The data type of the vectors
271 * @param[in] BASENAME  The basename of the variables
272 * @param[in] PTR       The base pointer
273 * @param[in] STRIDE_Y  The stride value in y-axis direction
274 * @param[in] Z         The offset in z-axis direction
275 * @{
276 */
277#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
278#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
279/** @} */ // end of group STORE_BLOCK
280
281/** Convert and store a block of the given size M0xN0
282 * @name CONVERT_STORE_BLOCK
283 *
284 * Supported cases are M0=1,2,3,...,16 and N0=2,3,4,8,16.
285 * The data to store is expected to have consecutive names for each row.
286 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
287 * The Z offset is expected to have consecutive names.
288 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
289 *
290 * @param[in] M0        The number of rows to store
291 * @param[in] N0        The size of each vector
292 * @param[in] DATA_TYPE The data type of the vectors
293 * @param[in] BASENAME  The basename of the variables
294 * @param[in] PTR       The base pointer
295 * @param[in] STRIDE_Y  The stride value in y-axis direction
296 * @param[in] Z         The offset in z-axis direction
297 * @{
298 */
299#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
300#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
301/** @} */ // end of group CONVERT_STORE_BLOCK
302
303/** Partially store the 0 to (n-1)th rows of the given variables
304 * @name STORE_ROW_PARTIAL_n
305 * Within each row, store the lower @p STORE_N0 elements of vectors of width @p N0
306 *
307 * @note in case @p STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
308 *
309 * @param[in] N0        The width of the passed in vector. Supported: 1, 2, 3, 4, 8, 16
310 * @param[in] STORE_N0  The **lower** size of the vectors to store. Supported: [1-16 and <= @p N0
311 * @param[in] DATA_TYPE The data type of the vectors
312 * @param[in] BASENAME  The basename of the variables
313 * @param[in] PTR       The base pointer
314 * @param[in] STRIDE_Y  The stride value in y-axis direction
315 * @param[in] Z         The offset in z-axis direction
316 * @{
317 */
318#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
319    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
320    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
321
322#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
323    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
324    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
325    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
326
327#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
328    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
329    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
330    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
331
332#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
333    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
334    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
335    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
336
337#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
338    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
339    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
340    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
341
342#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
343    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
344    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
345    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
346
347#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
348    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
349    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
350    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
351
352#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
353    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
354    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
355    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
356
357#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
358    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
359    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
360    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
361
362#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
363    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
364    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
365    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
366
367#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
368    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
369    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
370    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
371
372#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
373    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
374    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
375    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
376
377#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
378    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
379    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
380    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
381
382#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
383    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
384    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
385    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
386
387#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
388    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
389    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
390    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
391
392#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
393    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
394    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
395    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
396/** @} */ // end of groupd STORE_ROW_PARTIAL_n
397
398/** Partially store a block of the given size STORE_M0xSTORE_N0
399 * @name STORE_BLOCK_PARTIAL
400 *
401 * @note The vector width @p N0 is also required for correct partial storing behaviour.
402 * @note in case @p STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
403 *
404 * The data to store is expected to have consecutive names for each row.
405 * E.g., for STORE_M0=3 and basename=c, the expected names are c0, c1 and c2.
406 * The Z offset is expected to have consecutive names.
407 * E.g., for STORE_M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
408 *
409 * @param[in] STORE_M0  The number of rows to store. Supported: 1-16
410 * @param[in] STORE_N0  The lower number of elements of vectors to store. Supported: 1-16 and <= @p N0
411 * @param[in] N0        The size of each vector. Supported: 1, 2, 3, 4, 8, 16
412 * @param[in] DATA_TYPE The data type of the vectors
413 * @param[in] BASENAME  The basename of the variables
414 * @param[in] PTR       The base pointer
415 * @param[in] STRIDE_Y  The stride value in y-axis direction
416 * @param[in] Z         The offset in z-axis direction
417 * @{
418 */
419#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
420#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
421/** Store a block that can be partial in both x and y dimensions
422 *
423 * @note in cases @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
424 *
425 * The data to store is expected to have consecutive names for each row.
426 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
427 * The Z offset is expected to have consecutive names.
428 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
429 *
430 * @param[in] M0               The number of rows to store, for non-partial blocks. Supported: 1-16
431 * @param[in] N0               The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
432 * @param[in] DATA_TYPE        The data type of the vectors
433 * @param[in] BASENAME         The basename of the variables
434 * @param[in] PTR              The base pointer
435 * @param[in] STRIDE_Y         The stride value in y-axis direction
436 * @param[in] Z                The offset in z-axis direction
437 * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported range: [1, @p M0)
438 * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported range: [1, @p N0)
439 * @param[in] PARTIAL_COND_Y   Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0.
440 * @param[in] PARTIAL_COND_X   Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0.
441 */
442#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
443    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y))                                                                                                            \
444    {                                                                                                                                                     \
445        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                                           \
446    }                                                                                                                                                     \
447    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X))                                                                                                        \
448    {                                                                                                                                                     \
449        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
450    }                                                                                                                                                     \
451    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X))                                                                                                        \
452    {                                                                                                                                                     \
453        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
454    }                                                                                                                                                     \
455    else                                                                                                                                                  \
456    {                                                                                                                                                     \
457        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                               \
458    }
459/** Store a block that can only be partial in x but not y.
460 *
461 * @note in case @p N0 or @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
462 *
463 * The data to store is expected to have consecutive names for each row.
464 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
465 * The Z offset is expected to have consecutive names.
466 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
467 *
468 * @param[in] M0               The number of rows to store, for non-partial blocks. Supported: 1-16
469 * @param[in] N0               The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
470 * @param[in] DATA_TYPE        The data type of the vectors
471 * @param[in] BASENAME         The basename of the variables
472 * @param[in] PTR              The base pointer
473 * @param[in] STRIDE_Y         The stride value in y-axis direction
474 * @param[in] Z                The offset in z-axis direction
475 * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported range: [1, @p N0)
476 * @param[in] PARTIAL_COND_X   Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0.
477 */
478#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
479    if(!(PARTIAL_COND_X))                                                                                         \
480    {                                                                                                             \
481        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
482    }                                                                                                             \
483    else                                                                                                          \
484    {                                                                                                             \
485        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
486    }
487/** Store a block that can only be partial in y but not x.
488 *
489 * @note in case @p N0 or @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
490 *
491 * The data to store is expected to have consecutive names for each row.
492 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
493 * The Z offset is expected to have consecutive names.
494 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
495 *
496 * @param[in] M0               The number of rows to store, for non-partial blocks. Supported: 1-16
497 * @param[in] N0               The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
498 * @param[in] DATA_TYPE        The data type of the vectors
499 * @param[in] BASENAME         The basename of the variables
500 * @param[in] PTR              The base pointer
501 * @param[in] STRIDE_Y         The stride value in y-axis direction
502 * @param[in] Z                The offset in z-axis direction
503 * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported range: [1, @p M0)
504 * @param[in] PARTIAL_COND_Y   Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0.
505 */
506#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
507    if(!(PARTIAL_COND_Y))                                                                                         \
508    {                                                                                                             \
509        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
510    }                                                                                                             \
511    else                                                                                                          \
512    {                                                                                                             \
513        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
514    }
515/** @} */ // end of group STORE_BLOCK_PARTIAL
516
517#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
518
519/** Boundary-aware GEMM block store
520 * @name STORE_BLOCK_BOUNDARY_AWARE
521 * This macro assumes the following schemes to achieve boundary-awareness:
522 *  - Overlapping load in Y axis from lhs tensor. This implies lhs has no padding along y dim.
523 *  - Non-Overlapping(normal) load from rhs tensor. This imples rhs can have paddings.
524 *  - Overlapping load in Y axis from bias tensor. This implies rhs has no padding along y dim.
525 * The macro then ensures that the dst tensor can be stored without any paddings in both x and y dim.
526 *
527 * In the y dimension, we place the partial blocks **at the beginning** while in the x dimension, we place the partial
528 * blocks **at the end**.
529 * Say, the dst tensor is of shape MxN and we have M0 and N0 as the block size, this is how we define "partial blocks"/
530 * "boundary block" (we use the 2 terms "partial blocks" and "boundary blocks" interchangeably) and its various parameters:
531 *
532 *  *--x-->                         x == 0                        x == 1
533 *  |                  |<------------------------------N-------------------------->|
534 *  y                  |<--------------N0------------->|<----PARTIAL_STORE_N0----->|
535 *  |     -------------#############################################################
536 *  *     |          | |...............................|...........................|
537 * y == 0 | PAR_..._M0 |......Boundary block in y......|.Boundary block in x and y.|
538 *        |          | |...............................|...........................|
539 *        M          --#############################################################
540 *        |          | |                               |...........................|
541 * y == 1 |         M0 |      Non-boundary block       |....Boundary block in x....|
542 *        |          | |                               |...........................|
543 *        |------------#############################################################
544 *
545 * Then @p PARTIAL_STORE_M0 = M % M0      and @p PARTIAL_STORE_N0 = N % N0
546 *
547 * @note in cases @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
548 *
549 * It automatically detects if a giving M,N,M0,N0 combination can yield partial blocks in either X and Y dimension,
550 * and select corresponding store methods such that the boundary detection logic is only added when needed.
551 *
552 * The data to store is expected to have consecutive names for each row.
553 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
554 * The Z offset is expected to have consecutive names.
555 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
556 *
557 * @param[in] M0               The number of rows to store, for non-partial blocks. Supported: 1-16
558 * @param[in] N0               The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
559 * @param[in] DATA_TYPE        The data type of the vectors
560 * @param[in] BASENAME         The basename of the variables
561 * @param[in] PTR              The base pointer
562 * @param[in] STRIDE_Y         The stride value in y-axis direction
563 * @param[in] Z                The offset in z-axis direction
564 * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported: [0, @p M0)
565 * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported: [0, @p N0)
566 * @param[in] PARTIAL_COND_Y   Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0.
567 * @param[in] PARTIAL_COND_X   Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0.
568 * @{
569 */
570#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
571// Case1: No partial blocks in either x or y
572#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
573    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
574
575#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
576// Case2: Partial blocks in y
577#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
578    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)
579
580#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
581// Case3: Partial blocks in x
582#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
583    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)
584
585#else // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
586// Case4: Partial blocks in both x and y
587#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
588    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)
589
590#endif // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
591
592#endif    // defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
593/** @} */ // end of group STORE_BLOCK_BOUNDARY_AWARE
594
595#if defined(PARTIAL_STORE_M0)
596/** Compute the start m0 row (LHS, BIAS and DST) in a boundary-aware way so as to avoid padding
597 * @name COMPUTE_M0_START_ROW
598 * If there're any partial blocks in y dimension, they are placed at the beginning of the rows.
599 * This shift amount is added to all rows such that the partial block (at the beginning) overlaps with the subsequent
600 * blocks in the y dimension to avoid any padding.
601 * EG: M0=4, PARTIAL_STORE_M0=1:
602 *                  | Non-overlapping | +M0_ROW_SHIFT (Overlapping)
603 * block 0 (partial)| start row = 0   | start row = 0
604 * block 1 (full)   | start row = 4   | start row = 1
605 * block 2 (full)   | start row = 8   | start row = 5
606 *
607 * @param[in] y                Global id of current block in y.
608 * @param[in] M0               The number of rows to store, for non-partial blocks. Supported: 1-16
609 * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported: [0, @p M0)
610 * @{
611 */
612#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
613    ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0))))
614#else // defined(PARTIAL_STORE_M0)
615#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
616    ((uint)(y * M0))
617#endif    // defined(PARTIAL_STORE_M0)
618/** @} */ // end of group COMPUTE_M0_START_ROW
619
620/** Store a vector that can only be partial in x.
621 *
622 * @note in case @p vec_size or @p leftover != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
623 *
624 * The data to store is expected to end in a 0.
625 * E.g., for basename=c, the expected name is c0.
626 *
627 * @param[in] basename  The name of the variable without trailing 0
628 * @param[in] data_type The data type of the vector
629 * @param[in] ptr       The base pointer
630 * @param[in] vec_size  The vector size if cond = false. Supported: 1, 2, 3, 4, 8, 16
631 * @param[in] leftover  The vector size if cond = true. Supported range: [1, @p vec_size0)
632 * @param[in] cond      Condition to select either vec_size0 or vec_size1
633 * @{
634 */
635#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
636    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)
637/** @} */ // end of group STORE_VECTOR_SELECT
638
639#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
640#pragma OPENCL EXTENSION cl_khr_fp16 : enable
641#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
642
643#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
644#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
645#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
646
647#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
648#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
649#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
650
651#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
652#pragma OPENCL EXTENSION cl_arm_printf : enable
653#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
654
655#define GPU_ARCH_MIDGARD 0x100
656#define GPU_ARCH_BIFROST 0x200
657
658/** Concatenate two inputs.
659 *
660 * @param[in] a The first input to be concatenated
661 * @param[in] b The second input to be concatenated
662 *
663 * @return The concatenated output
664 */
665#define CONCAT(a, b) a##b
666
667/** Expand the given vector
668 *
669 * @param[in] x The vector to be expanded
670 *
671 * @return The expanded output
672 */
673#define EXPAND(x) x
674
675/** Clamp the given value between an upper and lower bound.
676 *
677 * @param[in] x       The value to be clamped
678 * @param[in] min_val The lower bound
679 * @param[in] max_val The upper bound
680 *
681 * @return The clamped value.
682 */
683#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
684
685/** REVn reverses the given vector whose size is n.
686 * @name REVn
687 *
688 * @param[in] x The vector to be reversed
689 *
690 * @return The reversed vector
691 * @{
692 */
693#define REV1(x) ((x))
694#define REV2(x) ((x).s10)
695#define REV3(x) ((x).s210)
696#define REV4(x) ((x).s3210)
697#define REV8(x) ((x).s76543210)
698#define REV16(x) ((x).sFEDCBA9876543210)
699/** @} */ // end of group REVn
700
701/** Reverse the given vector.
702 * @name REVERSE
703 *
704 * @param[in] x The vector to be reversed
705 * @param[in] s The size of the vector
706 *
707 * @return The reversed vector
708 * @{
709 */
710#define REVERSE_STR(x, s) REV##s((x))
711#define REVERSE(x, s) REVERSE_STR(x, s)
712/** @} */ // end of group REVERSE
713
714/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
715 * @name ROTs_n
716 *
717 * @param[in] x The vector to be shifted
718 *
719 * @return The shifted vector
720 * @{
721 */
722#define ROT1_0(x) ((x))
723
724#define ROT2_0(x) ((x))
725#define ROT2_1(x) ((x).s10)
726
727#define ROT3_0(x) ((x))
728#define ROT3_1(x) ((x).s201)
729#define ROT3_2(x) ((x).s120)
730
731#define ROT4_0(x) ((x))
732#define ROT4_1(x) ((x).s3012)
733#define ROT4_2(x) ((x).s2301)
734#define ROT4_3(x) ((x).s1230)
735
736#define ROT8_0(x) ((x))
737#define ROT8_1(x) ((x).s70123456)
738#define ROT8_2(x) ((x).s67012345)
739#define ROT8_3(x) ((x).s56701234)
740#define ROT8_4(x) ((x).s45670123)
741#define ROT8_5(x) ((x).s34567012)
742#define ROT8_6(x) ((x).s23456701)
743#define ROT8_7(x) ((x).s12345670)
744
745#define ROT16_0(x) ((x))
746#define ROT16_1(x) ((x).sF0123456789ABCDE)
747#define ROT16_2(x) ((x).sEF0123456789ABCD)
748#define ROT16_3(x) ((x).sDEF0123456789ABC)
749#define ROT16_4(x) ((x).sCDEF0123456789AB)
750#define ROT16_5(x) ((x).sBCDEF0123456789A)
751#define ROT16_6(x) ((x).sABCDEF0123456789)
752#define ROT16_7(x) ((x).s9ABCDEF012345678)
753#define ROT16_8(x) ((x).s89ABCDEF01234567)
754#define ROT16_9(x) ((x).s789ABCDEF0123456)
755#define ROT16_10(x) ((x).s6789ABCDEF012345)
756#define ROT16_11(x) ((x).s56789ABCDEF01234)
757#define ROT16_12(x) ((x).s456789ABCDEF0123)
758#define ROT16_13(x) ((x).s3456789ABCDEF012)
759#define ROT16_14(x) ((x).s23456789ABCDEF01)
760#define ROT16_15(x) ((x).s123456789ABCDEF0)
761/** @} */ // end of group ROTs_n
762
763/** Circular-right-shift (rotate-right) the given vector by the given amount.
764 * @name ROTATE
765 *
766 * @param[in] x The vector to be shifted
767 * @param[in] s The size of the vector
768 * @param[in] n The amount to be shifted
769 *
770 * @return The shifted vector
771 * @{
772 */
773#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
774#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
775/** @} */ // end of group ROTATE
776
777/** Creates a vector of size n filled with offset values corresponding to the location of each element.
778 * @name V_OFFSn
779 *
780 * @param[in] dt The data type of the output vector
781 *
782 * @return The vector filled with offset values
783 * @{
784 */
785#define V_OFFS1(dt) (dt##1)(0)
786#define V_OFFS2(dt) (dt##2)(0, 1)
787#define V_OFFS3(dt) (dt##3)(0, 1, 2)
788#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3)
789#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
790#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
791/** @} */ // end of group V_OFFSn
792
793/** Create a vector filled with offset values corresponding to the location of each element.
794 * @name VEC_OFFS
795 *
796 * @param[in] dt The data type of the output vector
797 * @param[in] s  The size of the output vector
798 *
799 * @return The vector filled with offset values
800 * @{
801 */
802#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
803#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
804/** @} */ // end of group VEC_OFFS
805
806#define VLOAD_STR(size) vload##size
807#define VLOAD(size) VLOAD_STR(size)
808
809#define PIXEL_UNIT4 1
810#define PIXEL_UNIT8 2
811#define PIXEL_UNIT16 4
812
813/** Utility macro to convert a vector size in pixel unit.
814 *
815 * @name CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT
816 *
817 * @param[in] vec_size Vector size. Only 4,8 and 16 is supported
818 *
819 * @return The pixel unit (number of pixels)
820 * @{
821 */
822#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
823#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)
824/** @} */ // end of group CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT
825
826#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
827#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
828#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));
829
830#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
831#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
832#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
833#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
834#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
835
836/** Utility macro to read a 2D OpenCL image object.
837 *
838 * @note Coordinates are not normalized
839 *
840 * @param[in] data_type Data type
841 * @param[in] n0        Number of pixel to read. Only 1,2 and 4 is supported
842 * @param[in] img       OpenCL image object
843 * @param[in] x_coord   The x coordinate for the top-left pixel
844 * @param[in] y_coord   The y coordinate for the top-left pixel
845 *
846 * @return Pixels from the 2D OpenCL image object
847 * @{
848 */
849#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
850#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)
851
852#define VSTORE_STR(size) vstore##size
853#define VSTORE(size) VSTORE_STR(size)
854
855#define float1 float
856#define half1 half
857#define char1 char
858#define uchar1 uchar
859#define short1 short
860#define ushort1 ushort
861#define int1 int
862#define uint1 uint
863#define long1 long
864#define ulong1 ulong
865#define double1 double
866
867#define vload1(OFFSET, PTR) *(OFFSET + PTR)
868#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
869
870/** Extended partial vstore that correctly handles scalar values as well.
871 * Store the **lower** 0 to (n-1)th elements of the given vector while minimising the amount of vstore ops
872 * @name VSTORE_PARTIAL
873 *
874 * @note With this macro, the passed data can be both a vector and a scalar
875 * @note @p store_size needs to be <= @p size
876 * eg 1: Valid
877 * VSTORE_PARTIAL(16, 15) ...;
878 * eg 2: Invalid
879 * VSTORE_PARTIAL(4, 7) ...;
880 *
881 * @param[in] size       The width of @p DATA. Supported values: 1(scalar), 2, 3, 4, 8, 16
882 * @param[in] store_size The number of lower elements to store. Supported values: 1-16, but has to be <= @p size
883 * @{
884 */
885#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
886#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)
887
888#define NO_STORE(data, offs, ptr) \
889    {                             \
890    }
891
892// Size == 1 (scalar)
893#define vstore_partial_1_0 NO_STORE
894#define vstore_partial_1_1 vstore1
895#define vstore_partial_1_2 NO_STORE
896#define vstore_partial_1_3 NO_STORE
897#define vstore_partial_1_4 NO_STORE
898#define vstore_partial_1_5 NO_STORE
899#define vstore_partial_1_6 NO_STORE
900#define vstore_partial_1_7 NO_STORE
901#define vstore_partial_1_8 NO_STORE
902#define vstore_partial_1_9 NO_STORE
903#define vstore_partial_1_10 NO_STORE
904#define vstore_partial_1_11 NO_STORE
905#define vstore_partial_1_12 NO_STORE
906#define vstore_partial_1_13 NO_STORE
907#define vstore_partial_1_14 NO_STORE
908#define vstore_partial_1_15 NO_STORE
909#define vstore_partial_1_16 NO_STORE
910// Size == 2
911#define vstore_partial_2_0 NO_STORE
912#define vstore_partial_2_1 vstore_partial_1
913#define vstore_partial_2_2 vstore_partial_2
914#define vstore_partial_2_3 NO_STORE
915#define vstore_partial_2_4 NO_STORE
916#define vstore_partial_2_5 NO_STORE
917#define vstore_partial_2_6 NO_STORE
918#define vstore_partial_2_7 NO_STORE
919#define vstore_partial_2_8 NO_STORE
920#define vstore_partial_2_9 NO_STORE
921#define vstore_partial_2_10 NO_STORE
922#define vstore_partial_2_11 NO_STORE
923#define vstore_partial_2_12 NO_STORE
924#define vstore_partial_2_13 NO_STORE
925#define vstore_partial_2_14 NO_STORE
926#define vstore_partial_2_15 NO_STORE
927#define vstore_partial_2_16 NO_STORE
928// Size == 3
929#define vstore_partial_3_0 NO_STORE
930#define vstore_partial_3_1 vstore_partial_1
931#define vstore_partial_3_2 vstore_partial_2
932#define vstore_partial_3_3 vstore_partial_3
933#define vstore_partial_3_4 NO_STORE
934#define vstore_partial_3_5 NO_STORE
935#define vstore_partial_3_6 NO_STORE
936#define vstore_partial_3_7 NO_STORE
937#define vstore_partial_3_8 NO_STORE
938#define vstore_partial_3_9 NO_STORE
939#define vstore_partial_3_10 NO_STORE
940#define vstore_partial_3_11 NO_STORE
941#define vstore_partial_3_12 NO_STORE
942#define vstore_partial_3_13 NO_STORE
943#define vstore_partial_3_14 NO_STORE
944#define vstore_partial_3_15 NO_STORE
945#define vstore_partial_3_16 NO_STORE
946// Size == 4
947#define vstore_partial_4_0 NO_STORE
948#define vstore_partial_4_1 vstore_partial_1
949#define vstore_partial_4_2 vstore_partial_2
950#define vstore_partial_4_3 vstore_partial_3
951#define vstore_partial_4_4 vstore_partial_4
952#define vstore_partial_4_5 NO_STORE
953#define vstore_partial_4_6 NO_STORE
954#define vstore_partial_4_7 NO_STORE
955#define vstore_partial_4_8 NO_STORE
956#define vstore_partial_4_9 NO_STORE
957#define vstore_partial_4_10 NO_STORE
958#define vstore_partial_4_11 NO_STORE
959#define vstore_partial_4_12 NO_STORE
960#define vstore_partial_4_13 NO_STORE
961#define vstore_partial_4_14 NO_STORE
962#define vstore_partial_4_15 NO_STORE
963#define vstore_partial_4_16 NO_STORE
964// Size == 8
965#define vstore_partial_8_0 NO_STORE
966#define vstore_partial_8_1 vstore_partial_1
967#define vstore_partial_8_2 vstore_partial_2
968#define vstore_partial_8_3 vstore_partial_3
969#define vstore_partial_8_4 vstore_partial_4
970#define vstore_partial_8_5 vstore_partial_5
971#define vstore_partial_8_6 vstore_partial_6
972#define vstore_partial_8_7 vstore_partial_7
973#define vstore_partial_8_8 vstore_partial_8
974#define vstore_partial_8_9 NO_STORE
975#define vstore_partial_8_10 NO_STORE
976#define vstore_partial_8_11 NO_STORE
977#define vstore_partial_8_12 NO_STORE
978#define vstore_partial_8_13 NO_STORE
979#define vstore_partial_8_14 NO_STORE
980#define vstore_partial_8_15 NO_STORE
981#define vstore_partial_8_16 NO_STORE
982// Size == 16
983#define vstore_partial_16_0 NO_STORE
984#define vstore_partial_16_1 vstore_partial_1
985#define vstore_partial_16_2 vstore_partial_2
986#define vstore_partial_16_3 vstore_partial_3
987#define vstore_partial_16_4 vstore_partial_4
988#define vstore_partial_16_5 vstore_partial_5
989#define vstore_partial_16_6 vstore_partial_6
990#define vstore_partial_16_7 vstore_partial_7
991#define vstore_partial_16_8 vstore_partial_8
992#define vstore_partial_16_9 vstore_partial_9
993#define vstore_partial_16_10 vstore_partial_10
994#define vstore_partial_16_11 vstore_partial_11
995#define vstore_partial_16_12 vstore_partial_12
996#define vstore_partial_16_13 vstore_partial_13
997#define vstore_partial_16_14 vstore_partial_14
998#define vstore_partial_16_15 vstore_partial_15
999#define vstore_partial_16_16 vstore_partial_16
1000
1001/** Partial vstore. Store the **lower** 0 to (n-1)th elements of the given vector while minimising the amount of vstore ops
1002 * @name vstore_partial_n
1003 *
1004 * @note @p DATA needs to be a vector not a scalar
1005 * @note n needs to be <= the vector width of the input variable @p DATA
1006 * eg 1: Valid
1007 * vstore_partial_15(var:float16, 0, 0xabcd);
1008 * eg 2: Invalid
1009 * vstore_partial_7(var:float4, 0, 0xabcd);
1010 *
1011 * @note in cases n == 1, 2, 3, 4, 8, 16, no extra vstore is invoked, thus there's no performance penalty.
1012 *
1013 * @param[in] DATA   The name of the variable
1014 * @param[in] OFFSET Offset in n
1015 * @param[in] PTR    The base pointer
1016 * @{
1017 */
1018#define vstore_partial_1(DATA, OFFSET, PTR) \
1019    vstore1(DATA.s0, OFFSET, PTR);
1020
1021#define vstore_partial_2(DATA, OFFSET, PTR) \
1022    vstore2(DATA.s01, OFFSET, PTR);
1023
1024#define vstore_partial_3(DATA, OFFSET, PTR) \
1025    vstore3(DATA.s012, OFFSET, PTR);
1026
1027#define vstore_partial_4(DATA, OFFSET, PTR) \
1028    vstore4(DATA.s0123, OFFSET, PTR);
1029
1030#define vstore_partial_5(DATA, OFFSET, PTR)    \
1031    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
1032    vstore1(DATA.s4, OFFSET, PTR + 4);
1033
1034#define vstore_partial_6(DATA, OFFSET, PTR)    \
1035    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
1036    vstore_partial_2(DATA.s45, OFFSET, PTR + 4);
1037
1038#define vstore_partial_7(DATA, OFFSET, PTR)    \
1039    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
1040    vstore_partial_3(DATA.s456, OFFSET, PTR + 4);
1041
1042#define vstore_partial_8(DATA, OFFSET, PTR) \
1043    vstore8(DATA.s01234567, OFFSET, PTR);
1044
1045#define vstore_partial_9(DATA, OFFSET, PTR)        \
1046    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
1047    vstore1(DATA.s8, OFFSET, PTR + 8);
1048
1049#define vstore_partial_10(DATA, OFFSET, PTR)       \
1050    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
1051    vstore_partial_2(DATA.s89, OFFSET, PTR + 8);
1052
1053#define vstore_partial_11(DATA, OFFSET, PTR)       \
1054    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
1055    vstore_partial_3(DATA.s89a, OFFSET, PTR + 8);
1056
1057#define vstore_partial_12(DATA, OFFSET, PTR)       \
1058    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
1059    vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8);
1060
1061#define vstore_partial_13(DATA, OFFSET, PTR)       \
1062    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
1063    vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8);
1064
1065#define vstore_partial_14(DATA, OFFSET, PTR)       \
1066    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
1067    vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8);
1068
1069#define vstore_partial_15(DATA, OFFSET, PTR)       \
1070    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
1071    vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8);
1072
1073#define vstore_partial_16(DATA, OFFSET, PTR) \
1074    vstore16(DATA, OFFSET, PTR);
1075/** @} */ // end of group vstore_partial_n
1076/** @} */ // end of group VSTORE_PARTIAL
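// Illustrative usage sketch (added for clarity; "acc" and "dst" are hypothetical names, and this assumes the
// VSTORE_PARTIAL(size, store_size) dispatch macro defined earlier in this header). Storing only the first 3
// elements of a float8 accumulator to a __global float *dst could look like:
//
//     float8 acc = (float8)(0.0f);
//     VSTORE_PARTIAL(8, 3)(acc, 0, dst); // resolves to vstore_partial_3, i.e. a single vstore3(acc.s012, 0, dst)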
1077
1078// The convert_* built-in functions with the _sat modifier are not supported for floating-point destination types, so we create defines
1079// without _sat to work around this limitation
1080#define convert_float_sat convert_float
1081#define convert_float1_sat convert_float
1082#define convert_float2_sat convert_float2
1083#define convert_float3_sat convert_float3
1084#define convert_float4_sat convert_float4
1085#define convert_float8_sat convert_float8
1086#define convert_float16_sat convert_float16
1087#define convert_half_sat convert_half
1088#define convert_half1_sat convert_half
1089#define convert_half2_sat convert_half2
1090#define convert_half3_sat convert_half3
1091#define convert_half4_sat convert_half4
1092#define convert_half8_sat convert_half8
1093#define convert_half16_sat convert_half16
1094
1095#define convert_float1 convert_float
1096#define convert_half1 convert_half
1097#define convert_char1 convert_char
1098#define convert_uchar1 convert_uchar
1099#define convert_short1 convert_short
1100#define convert_ushort1 convert_ushort
1101#define convert_int1 convert_int
1102#define convert_uint1 convert_uint
1103#define convert_long1 convert_long
1104#define convert_ulong1 convert_ulong
1105#define convert_double1 convert_double
1106
1107#define convert_char1_sat convert_char_sat
1108#define convert_uchar1_sat convert_uchar_sat
1109#define convert_short1_sat convert_short_sat
1110#define convert_ushort1_sat convert_ushort_sat
1111#define convert_int1_sat convert_int_sat
1112#define convert_uint1_sat convert_uint_sat
1113#define convert_long1_sat convert_long_sat
1114#define convert_ulong1_sat convert_ulong_sat
1115#define convert_double1_sat convert_double_sat
1116
1117#define VEC_DATA_TYPE_STR(type, size) type##size
1118#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
1119
1120#define CONVERT_STR(x, type) (convert_##type((x)))
1121#define CONVERT(x, type) CONVERT_STR(x, type)
1122
1123#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
1124#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
1125
1126#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
1127#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
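// Illustrative sketch (hypothetical variable names): CONVERT and CONVERT_SAT simply paste the destination
// type into the corresponding convert_* builtin (or one of the aliases defined above), e.g.
//
//     float4 in_f32  = (float4)(300.0f, -7.5f, 0.25f, 1.0f);
//     uchar4 out_u8  = CONVERT_SAT(in_f32, uchar4); // expands to convert_uchar4_sat(in_f32)
//     int4   out_s32 = CONVERT(in_f32, int4);       // expands to convert_int4(in_f32)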
1128
1129#define select_vec_dt_uchar(size) uchar##size
1130#define select_vec_dt_char(size) char##size
1131#define select_vec_dt_ushort(size) ushort##size
1132#define select_vec_dt_short(size) short##size
1133#define select_vec_dt_half(size) short##size
1134#define select_vec_dt_uint(size) uint##size
1135#define select_vec_dt_int(size) int##size
1136#define select_vec_dt_float(size) int##size
1137#define select_vec_dt_ulong(size) ulong##size
1138#define select_vec_dt_long(size) long##size
1139
1140#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
1141#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
1142#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)
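// Illustrative sketch (hypothetical variable names): SELECT_VEC_DATA_TYPE maps a data type to the signed
// integer vector type of matching element size, as expected by the select() builtin, e.g.
//
//     float4 a = (float4)(1.0f, 4.0f, 2.0f, 8.0f);
//     float4 b = (float4)(3.0f, 3.0f, 3.0f, 3.0f);
//     SELECT_VEC_DATA_TYPE(float, 4) cond = (a > b); // expands to int4
//     float4 c = select(b, a, cond);                 // per element: a where a > b, else b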
1143
1144#define sum_reduce_1(x) (x)
1145#define sum_reduce_2(x) ((x).s0) + ((x).s1)
1146#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2)
1147#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
1148#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567)
1149#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF)
1150
1151#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x)
1152#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)
1153
1154#define max_reduce_1(x) (x)
1155#define max_reduce_2(x) max(((x).s0), ((x).s1))
1156#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
1157#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
1158#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
1159#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))
1160
1161#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
1162#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)
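// Illustrative sketch (hypothetical values): the reductions recurse over sub-vectors of the input, e.g.
//
//     float4 v   = (float4)(1.0f, 5.0f, 2.0f, 4.0f);
//     float  sum = SUM_REDUCE(v, 4); // (v.s0 + v.s1) + (v.s2 + v.s3) = 12.0f
//     float  vmx = MAX_REDUCE(v, 4); // max(max(v.s0, v.s1), max(v.s2, v.s3)) = 5.0f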
1163
1164#define VECTOR_DECLARATION(name)     \
1165    __global uchar *name##_ptr,      \
1166    uint        name##_stride_x, \
1167    uint        name##_step_x,   \
1168    uint        name##_offset_first_element_in_bytes
1169
1170#define IMAGE_DECLARATION(name)      \
1171    __global uchar *name##_ptr,      \
1172    uint        name##_stride_x, \
1173    uint        name##_step_x,   \
1174    uint        name##_stride_y, \
1175    uint        name##_step_y,   \
1176    uint        name##_offset_first_element_in_bytes
1177
1178#define TENSOR3D_DECLARATION(name)   \
1179    __global uchar *name##_ptr,      \
1180    uint        name##_stride_x, \
1181    uint        name##_step_x,   \
1182    uint        name##_stride_y, \
1183    uint        name##_step_y,   \
1184    uint        name##_stride_z, \
1185    uint        name##_step_z,   \
1186    uint        name##_offset_first_element_in_bytes
1187
1188#define TENSOR4D_DECLARATION(name)   \
1189    __global uchar *name##_ptr,      \
1190    uint        name##_stride_x, \
1191    uint        name##_step_x,   \
1192    uint        name##_stride_y, \
1193    uint        name##_step_y,   \
1194    uint        name##_stride_z, \
1195    uint        name##_step_z,   \
1196    uint        name##_stride_w, \
1197    uint        name##_step_w,   \
1198    uint        name##_offset_first_element_in_bytes
1199
1200#define CONVERT_TO_VECTOR_STRUCT(name) \
1201    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
1202
1203#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
1204    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
1205
1206#define CONVERT_TO_IMAGE_STRUCT(name) \
1207    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
1208
1209#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
1210    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
1211
1212#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
1213    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
1214
1215#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
1216    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
1217
1221#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
1222    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
1223                                 name##_stride_z, name##_step_z)
1224
1225#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
1226    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
1227
1228#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
1229    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
1230                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
1231
1232#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
1233    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
1234
1235#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name)                                                                                       \
1236    tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
1237                           name##_stride_z, name##_step_z)
1238
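// Illustrative sketch (hypothetical kernel and tensor name "src"): the *_DECLARATION macros expand to the
// buffer pointer plus its strides, steps and first-element offset in a kernel signature, and the
// CONVERT_TO_*_STRUCT macros wrap those arguments into the structures defined below, e.g.
//
//     __kernel void example_copy(IMAGE_DECLARATION(src))
//     {
//         Image src = CONVERT_TO_IMAGE_STRUCT(src);
//         // src.ptr now points at this work-item's first element of the source image
//     }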
1239/** Structure to hold Vector information */
1240typedef struct Vector
1241{
1242    __global uchar *ptr;                           /**< Pointer to the starting position of the buffer */
1243    int             offset_first_element_in_bytes; /**< The offset of the first element in the source vector */
1244    int             stride_x;                      /**< Stride of the vector in X dimension (in bytes) */
1245} Vector;
1246
1247/** Structure to hold Image information */
1248typedef struct Image
1249{
1250    __global uchar *ptr;                           /**< Pointer to the starting position of the buffer */
1251    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
1252    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
1253    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
1254} Image;
1255
1256/** Structure to hold 3D tensor information */
1257typedef struct Tensor3D
1258{
1259    __global uchar *ptr;                           /**< Pointer to the starting position of the buffer */
1260    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
1261    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
1262    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
1263    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
1264} Tensor3D;
1265
1266/** Structure to hold 4D tensor information */
1267typedef struct Tensor4D
1268{
1269    __global uchar *ptr;                           /**< Pointer to the starting position of the buffer */
1270    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
1271    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
1272    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
1273    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
1274    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
1275} Tensor4D;
1276
1277/** Wrap vector information into a Vector structure, and make the pointer point at this workitem's data.
1278 *
1279 * @param[in] ptr                           Pointer to the starting position of the buffer
1280 * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
1281 * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
1282 * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
1283 *
1284 * @return A vector object
1285 */
1286inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
1287{
1288    Vector vector =
1289    {
1290        .ptr                           = ptr,
1291        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1292        .stride_x                      = stride_x,
1293    };
1294    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
1295    return vector;
1296}
1297
1298/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
1299 *
1300 * @param[in] ptr                           Pointer to the starting position of the buffer
1301 * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
1302 * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
1303 * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
1304 * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
1305 * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
1306 *
1307 * @return An image object
1308 */
1309inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
1310{
1311    Image img =
1312    {
1313        .ptr                           = ptr,
1314        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1315        .stride_x                      = stride_x,
1316        .stride_y                      = stride_y
1317    };
1318    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
1319    return img;
1320}
1321
1322/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
1323 *
1324 * @param[in] ptr                           Pointer to the starting position of the buffer
1325 * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
1326 * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
1327 * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
1328 * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
1329 * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
1330 * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
1331 * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
1332 *
1333 * @return An image object
1334 */
1335inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
1336{
1337    Image img =
1338    {
1339        .ptr                           = ptr,
1340        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1341        .stride_x                      = stride_x,
1342        .stride_y                      = stride_y
1343    };
1344    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
1345    return img;
1346}
1347
1348/** Wrap 3D tensor information into a tensor structure, and make the pointer point at this workitem's data.
1349 *
1350 * @param[in] ptr                           Pointer to the starting position of the buffer
1351 * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
1352 * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
1353 * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
1354 * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
1355 * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
1356 * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
1357 * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
1358 *
1359 * @return A 3D tensor object
1360 */
1361inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
1362{
1363    Tensor3D tensor =
1364    {
1365        .ptr                           = ptr,
1366        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1367        .stride_x                      = stride_x,
1368        .stride_y                      = stride_y,
1369        .stride_z                      = stride_z
1370    };
1371    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
1372    return tensor;
1373}
1374
1375/** Wrap 3D tensor information into a tensor structure.
1376 *
1377 * @param[in] ptr                           Pointer to the starting position of the buffer
1378 * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
1379 * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
1380 * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
1381 * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
1382 * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
1383 * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
1384 * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
1385 *
1386 * @return A 3D tensor object
1387 */
1388inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
1389{
1390    Tensor3D tensor =
1391    {
1392        .ptr                           = ptr,
1393        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1394        .stride_x                      = stride_x,
1395        .stride_y                      = stride_y,
1396        .stride_z                      = stride_z
1397    };
1398    return tensor;
1399}
1400
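/** Wrap 4D tensor information into a tensor structure, and make the pointer point at this workitem's data.
 *
 * @param[in] ptr                           Pointer to the starting position of the buffer
 * @param[in] offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in] stride_x                      Stride of the tensor in X dimension (in bytes)
 * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] stride_y                      Stride of the tensor in Y dimension (in bytes)
 * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] stride_z                      Stride of the tensor in Z dimension (in bytes)
 * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] stride_w                      Stride of the tensor in W dimension (in bytes)
 * @param[in] step_w                        stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in] mod_size                      Number of Z planes per W slice, used to split get_global_id(2) into the Z and W indices
 *
 * @return A 4D tensor object
 */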
1401inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
1402                                             uint step_w,
1403                                             uint mod_size)
1404{
1405    Tensor4D tensor =
1406    {
1407        .ptr                           = ptr,
1408        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1409        .stride_x                      = stride_x,
1410        .stride_y                      = stride_y,
1411        .stride_z                      = stride_z,
1412        .stride_w                      = stride_w
1413    };
1414
1415    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
1416    return tensor;
1417}
1418
1419/** Get the pointer position of a Vector
1420 *
1421 * @param[in] vec Pointer to the starting position of the buffer
1422 * @param[in] x   Relative X position
1423 */
1424inline __global const uchar *vector_offset(const Vector *vec, int x)
1425{
1426    return vec->ptr + x * vec->stride_x;
1427}
1428
1429/** Get the pointer position of an Image
1430 *
1431 * @param[in] img Pointer to the starting position of the buffer
1432 * @param[in] x   Relative X position
1433 * @param[in] y   Relative Y position
1434 */
1435inline __global uchar *offset(const Image *img, int x, int y)
1436{
1437    return img->ptr + x * img->stride_x + y * img->stride_y;
1438}
1439
1440/** Get the pointer position of a Tensor3D
1441 *
1442 * @param[in] tensor Pointer to the starting position of the buffer
1443 * @param[in] x      Relative X position
1444 * @param[in] y      Relative Y position
1445 * @param[in] z      Relative Z position
1446 */
1447inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
1448{
1449    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
1450}
1451
1452/** Get the pointer position of a Tensor4D
1453 *
1454 * @param[in] tensor Pointer to the starting position of the buffer
1455 * @param[in] x      Relative X position
1456 * @param[in] y      Relative Y position
1457 * @param[in] z      Relative Z position
1458 * @param[in] w      Relative W position
1459 */
1460inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
1461{
1462    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
1463}
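// Illustrative sketch (hypothetical tensor name "input"): the offset helpers compute byte addresses relative
// to the (already updated) work-item pointer, so a neighbouring element can be read as:
//
//     Tensor3D in    = CONVERT_TO_TENSOR3D_STRUCT(input);
//     float    right = *((__global const float *)tensor3D_offset(&in, 1, 0, 0)); // element at (x+1, y, z)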
1464
1465/** Get the offset for a given linear index of a Tensor3D
1466 *
1467 * @param[in] tensor Pointer to the starting position of the buffer
1468 * @param[in] width  Width of the input tensor
1469 * @param[in] height Height of the input tensor
1470 * @param[in] depth  Depth of the input tensor
1471 * @param[in] index  Linear index
1472 */
1473inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
1474{
1475    uint num_elements = width * height;
1476
1477    const uint z = index / num_elements;
1478
1479    index %= num_elements;
1480
1481    const uint y = index / width;
1482
1483    index %= width;
1484
1485    const uint x = index;
1486
1487    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes;
1488}
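// Worked example (hypothetical sizes, added for clarity): for width = 4, height = 3, depth = 2 and linear
// index 9, tensor3D_index2ptr computes z = 9 / (4 * 3) = 0, y = (9 % 12) / 4 = 2 and x = 9 % 4 = 1, i.e. the
// returned pointer addresses element (x=1, y=2, z=0) plus the first-element offset.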
1489
1490#endif // ARM_COMPUTE_HELPER_H
1491/*
1492 * Copyright (c) 2016, 2017 Arm Limited.
1493 *
1494 * SPDX-License-Identifier: MIT
1495 *
1496 * Permission is hereby granted, free of charge, to any person obtaining a copy
1497 * of this software and associated documentation files (the "Software"), to
1498 * deal in the Software without restriction, including without limitation the
1499 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
1500 * sell copies of the Software, and to permit persons to whom the Software is
1501 * furnished to do so, subject to the following conditions:
1502 *
1503 * The above copyright notice and this permission notice shall be included in all
1504 * copies or substantial portions of the Software.
1505 *
1506 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1507 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1508 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
1509 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
1510 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
1511 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
1512 * SOFTWARE.
1513 */
1514/*
1515 * Copyright (c) 2016-2020 Arm Limited.
1516 *
1517 * SPDX-License-Identifier: MIT
1518 *
1519 * Permission is hereby granted, free of charge, to any person obtaining a copy
1520 * of this software and associated documentation files (the "Software"), to
1521 * deal in the Software without restriction, including without limitation the
1522 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
1523 * sell copies of the Software, and to permit persons to whom the Software is
1524 * furnished to do so, subject to the following conditions:
1525 *
1526 * The above copyright notice and this permission notice shall be included in all
1527 * copies or substantial portions of the Software.
1528 *
1529 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1530 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1531 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
1532 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
1533 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
1534 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
1535 * SOFTWARE.
1536 */
1537#ifndef ARM_COMPUTE_HELPER_H
1538#define ARM_COMPUTE_HELPER_H
1539
1540/*
1541 * Copyright (c) 2020 Arm Limited.
1542 *
1543 * SPDX-License-Identifier: MIT
1544 *
1545 * Permission is hereby granted, free of charge, to any person obtaining a copy
1546 * of this software and associated documentation files (the "Software"), to
1547 * deal in the Software without restriction, including without limitation the
1548 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
1549 * sell copies of the Software, and to permit persons to whom the Software is
1550 * furnished to do so, subject to the following conditions:
1551 *
1552 * The above copyright notice and this permission notice shall be included in all
1553 * copies or substantial portions of the Software.
1554 *
1555 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1556 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1557 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
1558 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
1559 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
1560 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
1561 * SOFTWARE.
1562 */
1563
1564/** Store the 0 to (n-1)th rows of the given variables
1565 * @name STORE_ROW_n
1566 *
1567 * @param[in] N0        The width of the passed in vector. Supported: 1, 2, 3, 4, 8, 16
1568 * @param[in] DATA_TYPE The data type of the vectors
1569 * @param[in] BASENAME  The basename of the variables
1570 * @param[in] PTR       The base pointer
1571 * @param[in] STRIDE_Y  The stride value in y-axis direction
1572 * @param[in] Z         The offset in z-axis direction
1573 * @{
1574 */
1575#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1576    VSTORE(N0)                                                 \
1577    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
1578
1579#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1580    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1581    VSTORE(N0)                                                 \
1582    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
1583
1584#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1585    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1586    VSTORE(N0)                                                 \
1587    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
1588
1589#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1590    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1591    VSTORE(N0)                                                 \
1592    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
1593
1594#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1595    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1596    VSTORE(N0)                                                 \
1597    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
1598
1599#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1600    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1601    VSTORE(N0)                                                 \
1602    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
1603
1604#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1605    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1606    VSTORE(N0)                                                 \
1607    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
1608
1609#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1610    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1611    VSTORE(N0)                                                 \
1612    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
1613
1614#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1615    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1616    VSTORE(N0)                                                 \
1617    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
1618
1619#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1620    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
1621    VSTORE(N0)                                                  \
1622    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
1623
1624#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1625    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1626    VSTORE(N0)                                                  \
1627    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
1628
1629#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1630    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1631    VSTORE(N0)                                                  \
1632    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
1633
1634#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1635    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1636    VSTORE(N0)                                                  \
1637    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
1638
1639#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1640    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1641    VSTORE(N0)                                                  \
1642    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
1643
1644#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1645    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1646    VSTORE(N0)                                                  \
1647    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
1648
1649#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1650    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1651    VSTORE(N0)                                                  \
1652    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
1653/** @} */ // end of group STORE_ROW_n
1654
1655/** Convert and store the 0th to (n-1)th rows of the given variables
1656 * @name CONVERT_STORE_ROW_n
1657 *
1658 * @param[in] N0        The size of the vectors
1659 * @param[in] DATA_TYPE The data type of the vectors
1660 * @param[in] BASENAME  The basename of the variables
1661 * @param[in] PTR       The base pointer
1662 * @param[in] STRIDE_Y  The stride value in y-axis direction
1663 * @param[in] Z         The offset in z-axis direction
1664 * @{
1665 */
1666#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1667    VSTORE(N0)                                                         \
1668    (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
1669
1670#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1671    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1672    VSTORE(N0)                                                         \
1673    (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
1674
1675#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1676    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1677    VSTORE(N0)                                                         \
1678    (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
1679
1680#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1681    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1682    VSTORE(N0)                                                         \
1683    (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
1684
1685#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1686    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1687    VSTORE(N0)                                                         \
1688    (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
1689
1690#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1691    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1692    VSTORE(N0)                                                         \
1693    (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
1694
1695#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1696    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1697    VSTORE(N0)                                                         \
1698    (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
1699
1700#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1701    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1702    VSTORE(N0)                                                         \
1703    (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
1704
1705#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1706    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1707    VSTORE(N0)                                                         \
1708    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
1709
1710#define CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1711    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1712    VSTORE(N0)                                                     \
1713    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
1714
1715#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1716    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1717    VSTORE(N0)                                                          \
1718    (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
1719
1720#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1721    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1722    VSTORE(N0)                                                          \
1723    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
1724
1725#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1726    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1727    VSTORE(N0)                                                          \
1728    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
1729
1730#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1731    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1732    VSTORE(N0)                                                          \
1733    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
1734
1735#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1736    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1737    VSTORE(N0)                                                          \
1738    (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
1739
1740#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1741    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1742    VSTORE(N0)                                                          \
1743    (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
1744
1745/** @} */ // end of group CONVERT_STORE_ROW_n
1746
1747/** Store a block of the given size M0xN0
1748 * @name STORE_BLOCK
1749 *
1750 * Supported cases are M0=1,2,3,...,16 and N0=2,3,4,8,16.
1751 * The data to store is expected to have consecutive names for each row.
1752 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
1753 * The Z offset is expected to have consecutive names.
1754 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
1755 *
1756 * @param[in] M0        The number of rows to store
1757 * @param[in] N0        The size of each vector
1758 * @param[in] DATA_TYPE The data type of the vectors
1759 * @param[in] BASENAME  The basename of the variables
1760 * @param[in] PTR       The base pointer
1761 * @param[in] STRIDE_Y  The stride value in y-axis direction
1762 * @param[in] Z         The offset in z-axis direction
1763 * @{
1764 */
1765#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1766#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1767/** @} */ // end of group STORE_BLOCK
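// Illustrative sketch (hypothetical accumulators c0..c2 and destination dst_addr): with M0=3 and N0=4,
// STORE_BLOCK(3, 4, float, c, dst_addr, dst_stride_y, zout) resolves to STORE_ROW_3, i.e. three vstore4
// calls writing c0, c1 and c2 to rows dst_addr + 0/1/2 * dst_stride_y, each further offset by zout0/zout1/zout2.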
1768
1769/** Convert and store a block of the given size M0xN0
1770 * @name CONVERT_STORE_BLOCK
1771 *
1772 * Supported cases are M0=1,2,3,...,16 and N0=2,3,4,8,16.
1773 * The data to store is expected to have consecutive names for each row.
1774 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
1775 * The Z offset is expected to have consecutive names.
1776 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
1777 *
1778 * @param[in] M0        The number of rows to store
1779 * @param[in] N0        The size of each vector
1780 * @param[in] DATA_TYPE The data type of the vectors
1781 * @param[in] BASENAME  The basename of the variables
1782 * @param[in] PTR       The base pointer
1783 * @param[in] STRIDE_Y  The stride value in y-axis direction
1784 * @param[in] Z         The offset in z-axis direction
1785 * @{
1786 */
1787#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1788#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1789/** @} */ // end of group CONVERT_STORE_BLOCK
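// Illustrative sketch (hypothetical names): CONVERT_STORE_BLOCK additionally saturate-converts each row to
// the destination type before storing, e.g. CONVERT_STORE_BLOCK(2, 4, uchar, c, dst_addr, dst_stride_y, zout)
// stores CONVERT_SAT(c0, uchar4) and CONVERT_SAT(c1, uchar4) with two vstore4 calls.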
1790
1791/** Partially store the 0 to (n-1)th rows of the given variables
1792 * @name STORE_ROW_PARTIAL_n
1793 * Within each row, store the lower @p STORE_N0 elements of vectors of width @p N0
1794 *
1795 * @note in case @p STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
1796 *
1797 * @param[in] N0        The width of the passed in vector. Supported: 1, 2, 3, 4, 8, 16
1798 * @param[in] STORE_N0  The number of lower elements of the vectors to store. Supported: 1-16 and <= @p N0
1799 * @param[in] DATA_TYPE The data type of the vectors
1800 * @param[in] BASENAME  The basename of the variables
1801 * @param[in] PTR       The base pointer
1802 * @param[in] STRIDE_Y  The stride value in y-axis direction
1803 * @param[in] Z         The offset in z-axis direction
1804 * @{
1805 */
1806#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1807    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1808    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
1809
1810#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1811    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1812    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1813    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
1814
1815#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1816    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1817    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1818    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
1819
1820#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1821    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1822    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1823    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
1824
1825#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1826    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1827    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1828    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
1829
1830#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1831    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1832    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1833    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
1834
1835#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1836    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1837    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1838    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
1839
1840#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1841    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1842    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1843    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
1844
1845#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1846    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1847    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1848    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
1849
1850#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1851    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
1852    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
1853    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
1854
1855#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1856    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1857    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
1858    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
1859
1860#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1861    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1862    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
1863    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
1864
1865#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1866    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1867    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
1868    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
1869
1870#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1871    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1872    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
1873    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
1874
1875#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1876    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1877    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
1878    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
1879
1880#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1881    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1882    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
1883    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
1884/** @} */ // end of group STORE_ROW_PARTIAL_n
1885
1886/** Partially store a block of the given size STORE_M0xSTORE_N0
1887 * @name STORE_BLOCK_PARTIAL
1888 *
1889 * @note The vector width @p N0 is also required for correct partial storing behaviour.
1890 * @note in case @p STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
1891 *
1892 * The data to store is expected to have consecutive names for each row.
1893 * E.g., for STORE_M0=3 and basename=c, the expected names are c0, c1 and c2.
1894 * The Z offset is expected to have consecutive names.
1895 * E.g., for STORE_M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
1896 *
1897 * @param[in] STORE_M0  The number of rows to store. Supported: 1-16
1898 * @param[in] STORE_N0  The lower number of elements of vectors to store. Supported: 1-16 and <= @p N0
1899 * @param[in] N0        The size of each vector. Supported: 1, 2, 3, 4, 8, 16
1900 * @param[in] DATA_TYPE The data type of the vectors
1901 * @param[in] BASENAME  The basename of the variables
1902 * @param[in] PTR       The base pointer
1903 * @param[in] STRIDE_Y  The stride value in y-axis direction
1904 * @param[in] Z         The offset in z-axis direction
1905 * @{
1906 */
1907#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1908#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1909/** Store a block that can be partial in both x and y dimensions
1910 *
1911 * @note in cases @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
1912 *
1913 * The data to store is expected to have consecutive names for each row.
1914 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
1915 * The Z offset is expected to have consecutive names.
1916 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
1917 *
1918 * @param[in] M0               The number of rows to store, for non-partial blocks. Supported: 1-16
1919 * @param[in] N0               The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
1920 * @param[in] DATA_TYPE        The data type of the vectors
1921 * @param[in] BASENAME         The basename of the variables
1922 * @param[in] PTR              The base pointer
1923 * @param[in] STRIDE_Y         The stride value in y-axis direction
1924 * @param[in] Z                The offset in z-axis direction
1925 * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported range: [1, @p M0)
1926 * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported range: [1, @p N0)
1927 * @param[in] PARTIAL_COND_Y   Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0.
1928 * @param[in] PARTIAL_COND_X   Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0.
1929 */
1930#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
1931    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y))                                                                                                            \
1932    {                                                                                                                                                     \
1933        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                                           \
1934    }                                                                                                                                                     \
1935    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X))                                                                                                        \
1936    {                                                                                                                                                     \
1937        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
1938    }                                                                                                                                                     \
1939    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X))                                                                                                        \
1940    {                                                                                                                                                     \
1941        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
1942    }                                                                                                                                                     \
1943    else                                                                                                                                                  \
1944    {                                                                                                                                                     \
1945        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                               \
1946    }
1947/** Store a block that can only be partial in x but not y.
1948 *
1949 * @note in case @p N0 or @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
1950 *
1951 * The data to store is expected to have consecutive names for each row.
1952 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
1953 * The Z offset is expected to have consecutive names.
1954 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
1955 *
1956 * @param[in] M0               The number of rows to store, for non-partial blocks. Supported: 1-16
1957 * @param[in] N0               The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
1958 * @param[in] DATA_TYPE        The data type of the vectors
1959 * @param[in] BASENAME         The basename of the variables
1960 * @param[in] PTR              The base pointer
1961 * @param[in] STRIDE_Y         The stride value in y-axis direction
1962 * @param[in] Z                The offset in z-axis direction
1963 * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported range: [1, @p N0)
1964 * @param[in] PARTIAL_COND_X   Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0.
1965 */
1966#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
1967    if(!(PARTIAL_COND_X))                                                                                         \
1968    {                                                                                                             \
1969        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
1970    }                                                                                                             \
1971    else                                                                                                          \
1972    {                                                                                                             \
1973        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
1974    }
1975/** Store a block that can only be partial in y but not x.
1976 *
1977 * @note in case @p N0 or @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
1978 *
1979 * The data to store is expected to have consecutive names for each row.
1980 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
1981 * The Z offset is expected to have consecutive names.
1982 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
1983 *
1984 * @param[in] M0               The number of rows to store, for non-partial blocks. Supported: 1-16
1985 * @param[in] N0               The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
1986 * @param[in] DATA_TYPE        The data type of the vectors
1987 * @param[in] BASENAME         The basename of the variables
1988 * @param[in] PTR              The base pointer
1989 * @param[in] STRIDE_Y         The stride value in y-axis direction
1990 * @param[in] Z                The offset in z-axis direction
1991 * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported range: [1, @p M0)
1992 * @param[in] PARTIAL_COND_Y   Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0.
1993 */
1994#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
1995    if(!(PARTIAL_COND_Y))                                                                                         \
1996    {                                                                                                             \
1997        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
1998    }                                                                                                             \
1999    else                                                                                                          \
2000    {                                                                                                             \
2001        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
2002    }
2003/** @} */ // end of group STORE_BLOCK_PARTIAL
2004
2005#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
2006
2007/** Boundary-aware GEMM block store
2008 * @name STORE_BLOCK_BOUNDARY_AWARE
2009 * This macro assumes the following schemes to achieve boundary-awareness:
2010 *  - Overlapping load in Y axis from lhs tensor. This implies lhs has no padding along y dim.
2011 *  - Non-overlapping (normal) load from the rhs tensor. This implies rhs can have paddings.
2012 *  - Overlapping load in Y axis from the bias tensor. This implies bias has no padding along y dim.
2013 * The macro then ensures that the dst tensor can be stored without any padding in either the x or y dim.
2014 *
2015 * In the y dimension, we place the partial blocks **at the beginning**, while in the x dimension, we place the partial
2016 * blocks **at the end**.
2017 * Say the dst tensor is of shape MxN and we have M0 and N0 as the block sizes. This is how we define "partial blocks" /
2018 * "boundary blocks" (we use the two terms interchangeably) and their various parameters:
2019 *
2020 *  *--x-->                         x == 0                        x == 1
2021 *  |                  |<------------------------------N-------------------------->|
2022 *  y                  |<--------------N0------------->|<----PARTIAL_STORE_N0----->|
2023 *  |     -------------#############################################################
2024 *  *     |          | |...............................|...........................|
2025 * y == 0 | PAR_..._M0 |......Boundary block in y......|.Boundary block in x and y.|
2026 *        |          | |...............................|...........................|
2027 *        M          --#############################################################
2028 *        |          | |                               |...........................|
2029 * y == 1 |         M0 |      Non-boundary block       |....Boundary block in x....|
2030 *        |          | |                               |...........................|
2031 *        |------------#############################################################
2032 *
2033 * Then @p PARTIAL_STORE_M0 = M % M0      and @p PARTIAL_STORE_N0 = N % N0
2034 *
2035 * @note in case @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring a small performance penalty.
2036 *
2037 * The macro automatically detects whether a given M, N, M0, N0 combination can yield partial blocks in the X and/or Y dimension,
2038 * and selects the corresponding store method so that the boundary detection logic is only added when needed.
2039 *
2040 * The data to store is expected to have consecutive names for each row.
2041 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
2042 * The Z offset is expected to have consecutive names.
2043 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
2044 *
2045 * @param[in] M0               The number of rows to store, for non-partial blocks. Supported: 1-16
2046 * @param[in] N0               The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
2047 * @param[in] DATA_TYPE        The data type of the vectors
2048 * @param[in] BASENAME         The basename of the variables
2049 * @param[in] PTR              The base pointer
2050 * @param[in] STRIDE_Y         The stride value in y-axis direction
2051 * @param[in] Z                The offset in z-axis direction
2052 * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported: [0, @p M0)
2053 * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported: [0, @p N0)
2054 * @param[in] PARTIAL_COND_Y   Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0.
2055 * @param[in] PARTIAL_COND_X   Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0.
2056 * @{
2057 */
2058#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
2059// Case1: No partial blocks in either x or y
2060#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
2061    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
2062
2063#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
2064// Case2: Partial blocks in y
2065#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
2066    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)
2067
2068#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
2069// Case3: Partial blocks in x
2070#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
2071    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)
2072
2073#else // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
2074// Case4: Partial blocks in both x and y
2075#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
2076    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)
2077
2078#endif // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
2079
2080#endif    // defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
2081/** @} */ // end of group STORE_BLOCK_BOUNDARY_AWARE
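
/* Illustrative usage sketch (not part of the library): a GEMM-style kernel compiled with
 * -DPARTIAL_STORE_M0=<M % M0> -DPARTIAL_STORE_N0=<N % N0>, holding its M0 result rows in
 * c0..c(M0-1) and the per-row z offsets in zout0..zout(M0-1), would typically finish with
 * something like the following (dst_addr, dst_stride_y, cond_y and cond_x are kernel-local
 * names assumed for this example):
 *
 *   const bool cond_y = ...; // true only for the first block in y (partial rows sit at the beginning)
 *   const bool cond_x = ...; // true only for the last block in x (partial columns sit at the end)
 *   STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout,
 *                              PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
 */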
2082
2083#if defined(PARTIAL_STORE_M0)
2084/** Compute the start m0 row (LHS, BIAS and DST) in a boundary-aware way so as to avoid padding
2085 * @name COMPUTE_M0_START_ROW
2086 * If there are any partial blocks in the y dimension, they are placed at the beginning of the rows.
2087 * A shift is applied to the start row of all subsequent blocks so that they overlap with the partial block at the
2088 * beginning, which avoids any padding in the y dimension.
2089 * EG: M0=4, PARTIAL_STORE_M0=1:
2090 *                  | Non-overlapping | +M0_ROW_SHIFT (Overlapping)
2091 * block 0 (partial)| start row = 0   | start row = 0
2092 * block 1 (full)   | start row = 4   | start row = 1
2093 * block 2 (full)   | start row = 8   | start row = 5
2094 *
2095 * @param[in] y                Global id of current block in y.
2096 * @param[in] M0               The number of rows to store, for non-partial blocks. Supported: 1-16
2097 * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported: [0, @p M0)
2098 * @{
2099 */
2100#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
2101    ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0))))
2102#else // defined(PARTIAL_STORE_M0)
2103#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
2104    ((uint)(y * M0))
2105#endif    // defined(PARTIAL_STORE_M0)
2106/** @} */ // end of group COMPUTE_M0_START_ROW
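
/* Worked example (illustrative): with M0 = 4 and PARTIAL_STORE_M0 = 1, COMPUTE_M0_START_ROW(y, 4, 1)
 * evaluates to max(0, 4 * y - 3), i.e. y = 0 -> 0, y = 1 -> 1, y = 2 -> 5, which matches the
 * overlapping scheme in the table above. When PARTIAL_STORE_M0 is not defined, the macro falls back
 * to the non-overlapping y * M0.
 */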
2107
2108/** Store a vector that can only be partial in x.
2109 *
2110 * @note in case @p vec_size or @p leftover != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring a small performance penalty.
2111 *
2112 * The name of the data variable to store is expected to end in 0.
2113 * E.g., for basename=c, the expected name is c0.
2114 *
2115 * @param[in] basename  The name of the variable without trailing 0
2116 * @param[in] data_type The data type of the vector
2117 * @param[in] ptr       The base pointer
2118 * @param[in] vec_size  The vector size if cond = false. Supported: 1, 2, 3, 4, 8, 16
2119 * @param[in] leftover  The vector size if cond = true. Supported range: [1, @p vec_size)
2120 * @param[in] cond      Condition to select either @p vec_size (cond = false) or @p leftover (cond = true)
2121 * @{
2122 */
2123#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
2124    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)
2125/** @} */ // end of group STORE_VECTOR_SELECT
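
/* Illustrative usage sketch (not part of the library): with the output row held in a variable whose
 * name ends in 0 (e.g. res0 for basename res), and VEC_SIZE / VEC_SIZE_LEFTOVER passed as hypothetical
 * compile-time options (the leftover size must be a single literal token so the right vstore_partial_*
 * variant can be selected by token pasting), a kernel would typically issue:
 *
 *   STORE_VECTOR_SELECT(res, float, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, is_boundary_block);
 *
 * where is_boundary_block is a runtime predicate that is true only for the work-item whose block must
 * be cut short.
 */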
2126
2127#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
2128#pragma OPENCL EXTENSION cl_khr_fp16 : enable
2129#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
2130
2131#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
2132#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
2133#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
2134
2135#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
2136#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
2137#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
2138
2139#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
2140#pragma OPENCL EXTENSION cl_arm_printf : enable
2141#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
2142
2143#define GPU_ARCH_MIDGARD 0x100
2144#define GPU_ARCH_BIFROST 0x200
2145
2146/** Concatenate two inputs.
2147 *
2148 * @param[in] a The first input to be concatenated
2149 * @param[in] b The second input to be concatenated
2150 *
2151 * @return The concatenated output
2152 */
2153#define CONCAT(a, b) a##b
2154
2155/** Expand the given vector
2156 *
2157 * @param[in] x The vector to be expanded
2158 *
2159 * @return The expanded output
2160 */
2161#define EXPAND(x) x
2162
2163/** Clamp the given value between an upper and lower bound.
2164 *
2165 * @param[in] x       The value to be clamped
2166 * @param[in] min_val The lower bound
2167 * @param[in] max_val The upper bound
2168 *
2169 * @return The clamped value.
2170 */
2171#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
2172
2173/** REVn reverses the given vector whose size is n.
2174 * @name REVn
2175 *
2176 * @param[in] x The vector to be reversed
2177 *
2178 * @return The reversed vector
2179 * @{
2180 */
2181#define REV1(x) ((x))
2182#define REV2(x) ((x).s10)
2183#define REV3(x) ((x).s210)
2184#define REV4(x) ((x).s3210)
2185#define REV8(x) ((x).s76543210)
2186#define REV16(x) ((x).sFEDCBA9876543210)
2187/** @} */ // end of group REVn
2188
2189/** Reverse the given vector.
2190 * @name REVERSE
2191 *
2192 * @param[in] x The vector to be reversed
2193 * @param[in] s The size of the vector
2194 *
2195 * @return The reversed vector
2196 * @{
2197 */
2198#define REVERSE_STR(x, s) REV##s((x))
2199#define REVERSE(x, s) REVERSE_STR(x, s)
2200/** @} */ // end of group REVERSE
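
// Minimal usage sketch (illustrative only; the guard macro below is hypothetical and never defined by the library)
#if defined(HELPERS_USAGE_EXAMPLES)
inline float4 example_reverse4(float4 v)
{
    return REVERSE(v, 4); // (1, 2, 3, 4) -> (4, 3, 2, 1)
}
#endif // defined(HELPERS_USAGE_EXAMPLES)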
2201
2202/** Circular-right-shift (rotate-right) the vector of size s by n positions.
2203 * @name ROTs_n
2204 *
2205 * @param[in] x The vector to be shifted
2206 *
2207 * @return The shifted vector
2208 * @{
2209 */
2210#define ROT1_0(x) ((x))
2211
2212#define ROT2_0(x) ((x))
2213#define ROT2_1(x) ((x).s10)
2214
2215#define ROT3_0(x) ((x))
2216#define ROT3_1(x) ((x).s201)
2217#define ROT3_2(x) ((x).s120)
2218
2219#define ROT4_0(x) ((x))
2220#define ROT4_1(x) ((x).s3012)
2221#define ROT4_2(x) ((x).s2301)
2222#define ROT4_3(x) ((x).s1230)
2223
2224#define ROT8_0(x) ((x))
2225#define ROT8_1(x) ((x).s70123456)
2226#define ROT8_2(x) ((x).s67012345)
2227#define ROT8_3(x) ((x).s56701234)
2228#define ROT8_4(x) ((x).s45670123)
2229#define ROT8_5(x) ((x).s34567012)
2230#define ROT8_6(x) ((x).s23456701)
2231#define ROT8_7(x) ((x).s12345670)
2232
2233#define ROT16_0(x) ((x))
2234#define ROT16_1(x) ((x).sF0123456789ABCDE)
2235#define ROT16_2(x) ((x).sEF0123456789ABCD)
2236#define ROT16_3(x) ((x).sDEF0123456789ABC)
2237#define ROT16_4(x) ((x).sCDEF0123456789AB)
2238#define ROT16_5(x) ((x).sBCDEF0123456789A)
2239#define ROT16_6(x) ((x).sABCDEF0123456789)
2240#define ROT16_7(x) ((x).s9ABCDEF012345678)
2241#define ROT16_8(x) ((x).s89ABCDEF01234567)
2242#define ROT16_9(x) ((x).s789ABCDEF0123456)
2243#define ROT16_10(x) ((x).s6789ABCDEF012345)
2244#define ROT16_11(x) ((x).s56789ABCDEF01234)
2245#define ROT16_12(x) ((x).s456789ABCDEF0123)
2246#define ROT16_13(x) ((x).s3456789ABCDEF012)
2247#define ROT16_14(x) ((x).s23456789ABCDEF01)
2248#define ROT16_15(x) ((x).s123456789ABCDEF0)
2249/** @} */ // end of group ROTs_n
2250
2251/** Circular-right-shift (rotate-right) the given vector by the given amount.
2252 * @name ROTATE
2253 *
2254 * @param[in] x The vector to be shifted
2255 * @param[in] s The size of the vector
2256 * @param[in] n The amount to be shifted
2257 *
2258 * @return The shifted vector
2259 * @{
2260 */
2261#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
2262#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
2263/** @} */ // end of group ROTATE
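
// Minimal usage sketch (illustrative only; HELPERS_USAGE_EXAMPLES is a hypothetical guard)
#if defined(HELPERS_USAGE_EXAMPLES)
inline uchar8 example_rotate_right_3(uchar8 v)
{
    return ROTATE(v, 8, 3); // (0, 1, 2, 3, 4, 5, 6, 7) -> (5, 6, 7, 0, 1, 2, 3, 4)
}
#endif // defined(HELPERS_USAGE_EXAMPLES)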
2264
2265/** Creates a vector of size n filled with offset values corresponding to the location of each element.
2266 * @name V_OFFSn
2267 *
2268 * @param[in] dt The data type of the output vector
2269 *
2270 * @return The vector filled with offset values
2271 * @{
2272 */
2273#define V_OFFS1(dt) (dt##1)(0)
2274#define V_OFFS2(dt) (dt##2)(0, 1)
2275#define V_OFFS3(dt) (dt##3)(0, 1, 2)
2276#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3)
2277#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
2278#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
2279/** @} */ // end of group V_OFFSn
2280
2281/** Create a vector filled with offset values corresponding to the location of each element.
2282 * @name VEC_OFFS
2283 *
2284 * @param[in] dt The data type of the output vector
2285 * @param[in] s  The size of the output vector
2286 *
2287 * @return The vector filled with offset values
2288 * @{
2289 */
2290#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
2291#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
2292/** @} */ // end of group VEC_OFFS
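
// Minimal usage sketch (illustrative only; HELPERS_USAGE_EXAMPLES is a hypothetical guard)
#if defined(HELPERS_USAGE_EXAMPLES)
inline int4 example_lane_x_coords(int start)
{
    // (start, start + 1, start + 2, start + 3): per-lane x coordinates for a 4-wide access
    return VEC_OFFS(int, 4) + (int4)(start);
}
#endif // defined(HELPERS_USAGE_EXAMPLES)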
2293
2294#define VLOAD_STR(size) vload##size
2295#define VLOAD(size) VLOAD_STR(size)
2296
2297#define PIXEL_UNIT4 1
2298#define PIXEL_UNIT8 2
2299#define PIXEL_UNIT16 4
2300
2301/** Utility macro to convert a vector size into the corresponding number of pixels (pixel unit).
2302 *
2303 * @name CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT
2304 *
2305 * @param[in] vec_size Vector size. Only 4, 8 and 16 are supported
2306 *
2307 * @return The pixel unit (number of pixels)
2308 * @{
2309 */
2310#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
2311#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)
2312/** @} */ // end of group CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT
2313
2314#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
2315#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
2316#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));
2317
2318#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
2319#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
2320#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
2321#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
2322#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
2323
2324/** Utility macro to read a 2D OpenCL image object.
2325 *
2326 * @note Coordinates are not normalized
2327 *
2328 * @param[in] data_type Data type
2329 * @param[in] n0        Number of pixels to read. Only 1, 2 and 4 are supported
2330 * @param[in] img       OpenCL image object
2331 * @param[in] x_coord   The x coordinate for the top-left pixel
2332 * @param[in] y_coord   The y coordinate for the top-left pixel
2333 *
2334 * @return Pixels from the 2D OpenCL image object
2335 * @{
2336 */
2337#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
2338#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)
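/** @} */ // end of group READ_IMAGE2D

// Minimal usage sketch (illustrative only; HELPERS_USAGE_EXAMPLES is a hypothetical guard): copy rows of a
// float image into a linear buffer, four pixels (4 channels each, i.e. 16 floats) per work-item.
#if defined(HELPERS_USAGE_EXAMPLES)
__kernel void example_image_to_buffer(__read_only image2d_t src, __global float *dst, int dst_stride_in_floats)
{
    const int x = get_global_id(0); // each work-item handles 4 consecutive pixels
    const int y = get_global_id(1);

    // Read 4 pixels starting at pixel coordinate (4 * x, y)
    float16 p = READ_IMAGE2D(float, 4, src, 4 * x, y);

    vstore16(p, x, dst + y * dst_stride_in_floats);
}
#endif // defined(HELPERS_USAGE_EXAMPLES)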
2339
2340#define VSTORE_STR(size) vstore##size
2341#define VSTORE(size) VSTORE_STR(size)
2342
2343#define float1 float
2344#define half1 half
2345#define char1 char
2346#define uchar1 uchar
2347#define short1 short
2348#define ushort1 ushort
2349#define int1 int
2350#define uint1 uint
2351#define long1 long
2352#define ulong1 ulong
2353#define double1 double
2354
2355#define vload1(OFFSET, PTR) *(OFFSET + PTR)
2356#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
2357
2358/** Extended partial vstore that correctly handles scalar values as well.
2359 * Store the **lower** n elements (elements 0 to n-1) of the given vector while minimising the amount of vstore ops
2360 * @name VSTORE_PARTIAL
2361 *
2362 * @note With this macro, the passed data can be both a vector and a scalar
2363 * @note @p store_size needs to be <= @p size
2364 * eg 1: Valid
2365 * VSTORE_PARTIAL(16, 15) ...;
2366 * eg 2: Invalid
2367 * VSTORE_PARTIAL(4, 7) ...;
2368 *
2369 * @param[in] size       The width of @p DATA. Supported values: 1(scalar), 2, 3, 4, 8, 16
2370 * @param[in] store_size The number of lower elements to store. Supported values: 1-16, but has to be <= @p size
2371 * @{
2372 */
2373#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
2374#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)
2375
2376#define NO_STORE(data, offs, ptr) \
2377    {                             \
2378    }
2379
2380// Size == 1 (scalar)
2381#define vstore_partial_1_0 NO_STORE
2382#define vstore_partial_1_1 vstore1
2383#define vstore_partial_1_2 NO_STORE
2384#define vstore_partial_1_3 NO_STORE
2385#define vstore_partial_1_4 NO_STORE
2386#define vstore_partial_1_5 NO_STORE
2387#define vstore_partial_1_6 NO_STORE
2388#define vstore_partial_1_7 NO_STORE
2389#define vstore_partial_1_8 NO_STORE
2390#define vstore_partial_1_9 NO_STORE
2391#define vstore_partial_1_10 NO_STORE
2392#define vstore_partial_1_11 NO_STORE
2393#define vstore_partial_1_12 NO_STORE
2394#define vstore_partial_1_13 NO_STORE
2395#define vstore_partial_1_14 NO_STORE
2396#define vstore_partial_1_15 NO_STORE
2397#define vstore_partial_1_16 NO_STORE
2398// Size == 2
2399#define vstore_partial_2_0 NO_STORE
2400#define vstore_partial_2_1 vstore_partial_1
2401#define vstore_partial_2_2 vstore_partial_2
2402#define vstore_partial_2_3 NO_STORE
2403#define vstore_partial_2_4 NO_STORE
2404#define vstore_partial_2_5 NO_STORE
2405#define vstore_partial_2_6 NO_STORE
2406#define vstore_partial_2_7 NO_STORE
2407#define vstore_partial_2_8 NO_STORE
2408#define vstore_partial_2_9 NO_STORE
2409#define vstore_partial_2_10 NO_STORE
2410#define vstore_partial_2_11 NO_STORE
2411#define vstore_partial_2_12 NO_STORE
2412#define vstore_partial_2_13 NO_STORE
2413#define vstore_partial_2_14 NO_STORE
2414#define vstore_partial_2_15 NO_STORE
2415#define vstore_partial_2_16 NO_STORE
2416// Size == 3
2417#define vstore_partial_3_0 NO_STORE
2418#define vstore_partial_3_1 vstore_partial_1
2419#define vstore_partial_3_2 vstore_partial_2
2420#define vstore_partial_3_3 vstore_partial_3
2421#define vstore_partial_3_4 NO_STORE
2422#define vstore_partial_3_5 NO_STORE
2423#define vstore_partial_3_6 NO_STORE
2424#define vstore_partial_3_7 NO_STORE
2425#define vstore_partial_3_8 NO_STORE
2426#define vstore_partial_3_9 NO_STORE
2427#define vstore_partial_3_10 NO_STORE
2428#define vstore_partial_3_11 NO_STORE
2429#define vstore_partial_3_12 NO_STORE
2430#define vstore_partial_3_13 NO_STORE
2431#define vstore_partial_3_14 NO_STORE
2432#define vstore_partial_3_15 NO_STORE
2433#define vstore_partial_3_16 NO_STORE
2434// Size == 4
2435#define vstore_partial_4_0 NO_STORE
2436#define vstore_partial_4_1 vstore_partial_1
2437#define vstore_partial_4_2 vstore_partial_2
2438#define vstore_partial_4_3 vstore_partial_3
2439#define vstore_partial_4_4 vstore_partial_4
2440#define vstore_partial_4_5 NO_STORE
2441#define vstore_partial_4_6 NO_STORE
2442#define vstore_partial_4_7 NO_STORE
2443#define vstore_partial_4_8 NO_STORE
2444#define vstore_partial_4_9 NO_STORE
2445#define vstore_partial_4_10 NO_STORE
2446#define vstore_partial_4_11 NO_STORE
2447#define vstore_partial_4_12 NO_STORE
2448#define vstore_partial_4_13 NO_STORE
2449#define vstore_partial_4_14 NO_STORE
2450#define vstore_partial_4_15 NO_STORE
2451#define vstore_partial_4_16 NO_STORE
2452// Size == 8
2453#define vstore_partial_8_0 NO_STORE
2454#define vstore_partial_8_1 vstore_partial_1
2455#define vstore_partial_8_2 vstore_partial_2
2456#define vstore_partial_8_3 vstore_partial_3
2457#define vstore_partial_8_4 vstore_partial_4
2458#define vstore_partial_8_5 vstore_partial_5
2459#define vstore_partial_8_6 vstore_partial_6
2460#define vstore_partial_8_7 vstore_partial_7
2461#define vstore_partial_8_8 vstore_partial_8
2462#define vstore_partial_8_9 NO_STORE
2463#define vstore_partial_8_10 NO_STORE
2464#define vstore_partial_8_11 NO_STORE
2465#define vstore_partial_8_12 NO_STORE
2466#define vstore_partial_8_13 NO_STORE
2467#define vstore_partial_8_14 NO_STORE
2468#define vstore_partial_8_15 NO_STORE
2469#define vstore_partial_8_16 NO_STORE
2470// Size == 16
2471#define vstore_partial_16_0 NO_STORE
2472#define vstore_partial_16_1 vstore_partial_1
2473#define vstore_partial_16_2 vstore_partial_2
2474#define vstore_partial_16_3 vstore_partial_3
2475#define vstore_partial_16_4 vstore_partial_4
2476#define vstore_partial_16_5 vstore_partial_5
2477#define vstore_partial_16_6 vstore_partial_6
2478#define vstore_partial_16_7 vstore_partial_7
2479#define vstore_partial_16_8 vstore_partial_8
2480#define vstore_partial_16_9 vstore_partial_9
2481#define vstore_partial_16_10 vstore_partial_10
2482#define vstore_partial_16_11 vstore_partial_11
2483#define vstore_partial_16_12 vstore_partial_12
2484#define vstore_partial_16_13 vstore_partial_13
2485#define vstore_partial_16_14 vstore_partial_14
2486#define vstore_partial_16_15 vstore_partial_15
2487#define vstore_partial_16_16 vstore_partial_16
2488
2489/** Partial vstore. Store the **lower** n elements (elements 0 to n-1) of the given vector while minimising the amount of vstore ops
2490 * @name vstore_partial_n
2491 *
2492 * @note @p DATA needs to be a vector not a scalar
2493 * @note n needs to be <= the vector width of the input variable @p DATA
2494 * eg 1: Valid
2495 * vstore_partial_15(var:float16, 0, 0xabcd);
2496 * eg 2: Invalid
2497 * vstore_partial_7(var:float4, 0, 0xabcd);
2498 *
2499 * @note in cases n == 1, 2, 3, 4, 8, 16, no extra vstore is invoked, thus there's no performance penalty.
2500 *
2501 * @param[in] DATA   The name of the variable
2502 * @param[in] OFFSET Offset in n
2503 * @param[in] PTR    The base pointer
2504 * @{
2505 */
2506#define vstore_partial_1(DATA, OFFSET, PTR) \
2507    vstore1(DATA.s0, OFFSET, PTR);
2508
2509#define vstore_partial_2(DATA, OFFSET, PTR) \
2510    vstore2(DATA.s01, OFFSET, PTR);
2511
2512#define vstore_partial_3(DATA, OFFSET, PTR) \
2513    vstore3(DATA.s012, OFFSET, PTR);
2514
2515#define vstore_partial_4(DATA, OFFSET, PTR) \
2516    vstore4(DATA.s0123, OFFSET, PTR);
2517
2518#define vstore_partial_5(DATA, OFFSET, PTR)    \
2519    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
2520    vstore1(DATA.s4, OFFSET, PTR + 4);
2521
2522#define vstore_partial_6(DATA, OFFSET, PTR)    \
2523    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
2524    vstore_partial_2(DATA.s45, OFFSET, PTR + 4);
2525
2526#define vstore_partial_7(DATA, OFFSET, PTR)    \
2527    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
2528    vstore_partial_3(DATA.s456, OFFSET, PTR + 4);
2529
2530#define vstore_partial_8(DATA, OFFSET, PTR) \
2531    vstore8(DATA.s01234567, OFFSET, PTR);
2532
2533#define vstore_partial_9(DATA, OFFSET, PTR)        \
2534    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2535    vstore1(DATA.s8, OFFSET, PTR + 8);
2536
2537#define vstore_partial_10(DATA, OFFSET, PTR)       \
2538    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2539    vstore_partial_2(DATA.s89, OFFSET, PTR + 8);
2540
2541#define vstore_partial_11(DATA, OFFSET, PTR)       \
2542    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2543    vstore_partial_3(DATA.s89a, OFFSET, PTR + 8);
2544
2545#define vstore_partial_12(DATA, OFFSET, PTR)       \
2546    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2547    vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8);
2548
2549#define vstore_partial_13(DATA, OFFSET, PTR)       \
2550    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2551    vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8);
2552
2553#define vstore_partial_14(DATA, OFFSET, PTR)       \
2554    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2555    vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8);
2556
2557#define vstore_partial_15(DATA, OFFSET, PTR)       \
2558    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2559    vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8);
2560
2561#define vstore_partial_16(DATA, OFFSET, PTR) \
2562    vstore16(DATA, OFFSET, PTR);
2563/** @} */ // end of group vstore_partial_n
2564/** @} */ // end of group VSTORE_PARTIAL
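
// Minimal usage sketch (illustrative only; HELPERS_USAGE_EXAMPLES is a hypothetical guard):
// VSTORE_PARTIAL(8, 3) resolves to vstore_partial_8_3 -> vstore_partial_3, i.e. a single vstore3.
#if defined(HELPERS_USAGE_EXAMPLES)
inline void example_store_lower3(float8 v, __global float *dst)
{
    VSTORE_PARTIAL(8, 3)
    (v, 0, dst); // stores v.s0, v.s1, v.s2 at dst[0..2]
}
#endif // defined(HELPERS_USAGE_EXAMPLES)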
2565
2566// The convert built-in functions with the _sat modifier are not supported for floating-point types, so we create
2567// defines without _sat to work around this issue.
2568#define convert_float_sat convert_float
2569#define convert_float1_sat convert_float
2570#define convert_float2_sat convert_float2
2571#define convert_float3_sat convert_float3
2572#define convert_float4_sat convert_float4
2573#define convert_float8_sat convert_float8
2574#define convert_float16_sat convert_float16
2575#define convert_half_sat convert_half
2576#define convert_half1_sat convert_half
2577#define convert_half2_sat convert_half2
2578#define convert_half3_sat convert_half3
2579#define convert_half4_sat convert_half4
2580#define convert_half8_sat convert_half8
2581#define convert_half16_sat convert_half16
2582
2583#define convert_float1 convert_float
2584#define convert_half1 convert_half
2585#define convert_char1 convert_char
2586#define convert_uchar1 convert_uchar
2587#define convert_short1 convert_short
2588#define convert_ushort1 convert_ushort
2589#define convert_int1 convert_int
2590#define convert_uint1 convert_uint
2591#define convert_long1 convert_long
2592#define convert_ulong1 convert_ulong
2593#define convert_double1 convert_double
2594
2595#define convert_char1_sat convert_char_sat
2596#define convert_uchar1_sat convert_uchar_sat
2597#define convert_short1_sat convert_short_sat
2598#define convert_ushort1_sat convert_ushort_sat
2599#define convert_int1_sat convert_int_sat
2600#define convert_uint1_sat convert_uint_sat
2601#define convert_long1_sat convert_long_sat
2602#define convert_ulong1_sat convert_ulong_sat
2603#define convert_double1_sat convert_double_sat
2604
2605#define VEC_DATA_TYPE_STR(type, size) type##size
2606#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
2607
2608#define CONVERT_STR(x, type) (convert_##type((x)))
2609#define CONVERT(x, type) CONVERT_STR(x, type)
2610
2611#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
2612#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
2613
2614#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
2615#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
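
// Minimal usage sketch (illustrative only; HELPERS_USAGE_EXAMPLES is a hypothetical guard)
#if defined(HELPERS_USAGE_EXAMPLES)
inline uchar4 example_quantize4(float4 v)
{
    // Round to nearest even, then saturate to the uchar range: expands to convert_uchar4_sat_rte(v)
    return CONVERT_SAT_ROUND(v, uchar4, rte);
}
#endif // defined(HELPERS_USAGE_EXAMPLES)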
2616
2617#define select_vec_dt_uchar(size) uchar##size
2618#define select_vec_dt_char(size) char##size
2619#define select_vec_dt_ushort(size) ushort##size
2620#define select_vec_dt_short(size) short##size
2621#define select_vec_dt_half(size) short##size
2622#define select_vec_dt_uint(size) uint##size
2623#define select_vec_dt_int(size) int##size
2624#define select_vec_dt_float(size) int##size
2625#define select_vec_dt_ulong(size) ulong##size
2626#define select_vec_dt_long(size) long##size
2627
2628#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
2629#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
2630#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)
2631
2632#define sum_reduce_1(x) (x)
2633#define sum_reduce_2(x) ((x).s0) + ((x).s1)
2634#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2)
2635#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
2636#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567)
2637#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF)
2638
2639#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x)
2640#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)
2641
2642#define max_reduce_1(x) (x)
2643#define max_reduce_2(x) max(((x).s0), ((x).s1))
2644#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
2645#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
2646#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
2647#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))
2648
2649#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
2650#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)
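
// Minimal usage sketch (illustrative only; HELPERS_USAGE_EXAMPLES is a hypothetical guard)
#if defined(HELPERS_USAGE_EXAMPLES)
inline float example_horizontal_sum8(float8 v)
{
    return SUM_REDUCE(v, 8); // v.s0 + v.s1 + ... + v.s7
}

inline float example_horizontal_max8(float8 v)
{
    return MAX_REDUCE(v, 8); // maximum of all eight lanes
}
#endif // defined(HELPERS_USAGE_EXAMPLES)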
2651
2652#define VECTOR_DECLARATION(name)     \
2653    __global uchar *name##_ptr,      \
2654    uint        name##_stride_x, \
2655    uint        name##_step_x,   \
2656    uint        name##_offset_first_element_in_bytes
2657
2658#define IMAGE_DECLARATION(name)      \
2659    __global uchar *name##_ptr,      \
2660    uint        name##_stride_x, \
2661    uint        name##_step_x,   \
2662    uint        name##_stride_y, \
2663    uint        name##_step_y,   \
2664    uint        name##_offset_first_element_in_bytes
2665
2666#define TENSOR3D_DECLARATION(name)   \
2667    __global uchar *name##_ptr,      \
2668    uint        name##_stride_x, \
2669    uint        name##_step_x,   \
2670    uint        name##_stride_y, \
2671    uint        name##_step_y,   \
2672    uint        name##_stride_z, \
2673    uint        name##_step_z,   \
2674    uint        name##_offset_first_element_in_bytes
2675
2676#define TENSOR4D_DECLARATION(name)   \
2677    __global uchar *name##_ptr,      \
2678    uint        name##_stride_x, \
2679    uint        name##_step_x,   \
2680    uint        name##_stride_y, \
2681    uint        name##_step_y,   \
2682    uint        name##_stride_z, \
2683    uint        name##_step_z,   \
2684    uint        name##_stride_w, \
2685    uint        name##_step_w,   \
2686    uint        name##_offset_first_element_in_bytes
2687
2688#define CONVERT_TO_VECTOR_STRUCT(name) \
2689    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
2690
2691#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
2692    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
2693
2694#define CONVERT_TO_IMAGE_STRUCT(name) \
2695    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
2696
2697#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
2698    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
2699
2700#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
2701    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
2702
2703#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
2704    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
2705
2709#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
2710    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
2711                                 name##_stride_z, name##_step_z)
2712
2713#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
2714    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
2715
2716#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
2717    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
2718                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
2719
2720#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
2721    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
2722
2723#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name)                                                                                       \
2724    tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
2725                           name##_stride_z, name##_step_z)
2726
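/* Illustrative note (not part of the library): each *_DECLARATION macro expands to a buffer pointer plus
 * its strides, steps and start offset in a kernel signature, and is normally paired with the matching
 * CONVERT_TO_*_STRUCT macro in the kernel body, e.g.
 *
 *   __kernel void my_kernel(IMAGE_DECLARATION(src), IMAGE_DECLARATION(dst))
 *   {
 *       Image src = CONVERT_TO_IMAGE_STRUCT(src);
 *       Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
 *       ...
 *   }
 *
 * my_kernel is a hypothetical name; see the elementwise_unary kernel at the end of this file for a
 * complete example using TENSOR3D_DECLARATION / CONVERT_TO_TENSOR3D_STRUCT.
 */
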
2727/** Structure to hold Vector information */
2728typedef struct Vector
2729{
2730    __global uchar *ptr;                           /**< Pointer to the starting position of the buffer */
2731    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
2732    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
2733} Vector;
2734
2735/** Structure to hold Image information */
2736typedef struct Image
2737{
2738    __global uchar *ptr;                           /**< Pointer to the starting position of the buffer */
2739    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
2740    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
2741    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
2742} Image;
2743
2744/** Structure to hold 3D tensor information */
2745typedef struct Tensor3D
2746{
2747    __global uchar *ptr;                           /**< Pointer to the starting position of the buffer */
2748    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
2749    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
2750    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
2751    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
2752} Tensor3D;
2753
2754/** Structure to hold 4D tensor information */
2755typedef struct Tensor4D
2756{
2757    __global uchar *ptr;                           /**< Pointer to the starting position of the buffer */
2758    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
2759    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
2760    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
2761    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
2762    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
2763} Tensor4D;
2764
2765/** Wrap vector information into a Vector structure, and make the pointer point at this workitem's data.
2766 *
2767 * @param[in] ptr                           Pointer to the starting position of the buffer
2768 * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
2769 * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
2770 * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
2771 *
2772 * @return A vector object
2773 */
2774inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
2775{
2776    Vector vector =
2777    {
2778        .ptr                           = ptr,
2779        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2780        .stride_x                      = stride_x,
2781    };
2782    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
2783    return vector;
2784}
2785
2786/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
2787 *
2788 * @param[in] ptr                           Pointer to the starting position of the buffer
2789 * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
2790 * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
2791 * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
2792 * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
2793 * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
2794 *
2795 * @return An image object
2796 */
2797inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
2798{
2799    Image img =
2800    {
2801        .ptr                           = ptr,
2802        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2803        .stride_x                      = stride_x,
2804        .stride_y                      = stride_y
2805    };
2806    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
2807    return img;
2808}
2809
2810/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
2811 *
2812 * @param[in] ptr                           Pointer to the starting position of the buffer
2813 * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
2814 * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
2815 * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
2816 * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
2817 * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
2818 * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
2819 * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
2820 *
2821 * @return An image object
2822 */
2823inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
2824{
2825    Image img =
2826    {
2827        .ptr                           = ptr,
2828        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2829        .stride_x                      = stride_x,
2830        .stride_y                      = stride_y
2831    };
2832    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
2833    return img;
2834}
2835
2836/** Wrap 3D tensor information into a tensor structure, and make the pointer point at this workitem's data.
2837 *
2838 * @param[in] ptr                           Pointer to the starting position of the buffer
2839 * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
2840 * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
2841 * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
2842 * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
2843 * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
2844 * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
2845 * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
2846 *
2847 * @return A 3D tensor object
2848 */
2849inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
2850{
2851    Tensor3D tensor =
2852    {
2853        .ptr                           = ptr,
2854        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2855        .stride_x                      = stride_x,
2856        .stride_y                      = stride_y,
2857        .stride_z                      = stride_z
2858    };
2859    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
2860    return tensor;
2861}
2862
2863/** Wrap 3D tensor information into a tensor structure, without updating the pointer to this workitem's data.
2864 *
2865 * @param[in] ptr                           Pointer to the starting position of the buffer
2866 * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
2867 * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
2868 * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
2869 * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
2870 * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
2871 * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
2872 * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
2873 *
2874 * @return A 3D tensor object
2875 */
2876inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
2877{
2878    Tensor3D tensor =
2879    {
2880        .ptr                           = ptr,
2881        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2882        .stride_x                      = stride_x,
2883        .stride_y                      = stride_y,
2884        .stride_z                      = stride_z
2885    };
2886    return tensor;
2887}
2888
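/** Wrap 4D tensor information into a tensor structure, and make the pointer point at this workitem's data.
 *
 * @param[in] ptr                           Pointer to the starting position of the buffer
 * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
 * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
 * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
 * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
 * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] stride_w                      Stride of the image in W dimension (in bytes)
 * @param[in] step_w                        stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in] mod_size                      Number of steps along Z per W slice; used to split get_global_id(2) into its Z and W components
 *
 * @return A 4D tensor object
 */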
2889inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
2890                                             uint step_w,
2891                                             uint mod_size)
2892{
2893    Tensor4D tensor =
2894    {
2895        .ptr                           = ptr,
2896        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2897        .stride_x                      = stride_x,
2898        .stride_y                      = stride_y,
2899        .stride_z                      = stride_z,
2900        .stride_w                      = stride_w
2901    };
2902
2903    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
2904    return tensor;
2905}
2906
2907/** Get the pointer position of a Vector
2908 *
2909 * @param[in] vec Pointer to the Vector structure
2910 * @param[in] x   Relative X position
2911 */
2912inline __global const uchar *vector_offset(const Vector *vec, int x)
2913{
2914    return vec->ptr + x * vec->stride_x;
2915}
2916
2917/** Get the pointer position of an Image
2918 *
2919 * @param[in] img Pointer to the Image structure
2920 * @param[in] x   Relative X position
2921 * @param[in] y   Relative Y position
2922 */
2923inline __global uchar *offset(const Image *img, int x, int y)
2924{
2925    return img->ptr + x * img->stride_x + y * img->stride_y;
2926}
2927
2928/** Get the pointer position of a Tensor3D
2929 *
2930 * @param[in] tensor Pointer to the Tensor3D structure
2931 * @param[in] x      Relative X position
2932 * @param[in] y      Relative Y position
2933 * @param[in] z      Relative Z position
2934 */
2935inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
2936{
2937    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
2938}
2939
2940/** Get the pointer position of a Tensor4D
2941 *
2942 * @param[in] tensor Pointer to the Tensor4D structure
2943 * @param[in] x      Relative X position
2944 * @param[in] y      Relative Y position
2945 * @param[in] z      Relative Z position
2946 * @param[in] w      Relative W position
2947 */
2948inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
2949{
2950    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
2951}
2952
2953/** Get the offset for a given linear index of a Tensor3D
2954 *
2955 * @param[in] tensor Pointer to the Tensor3D structure
2956 * @param[in] width  Width of the input tensor
2957 * @param[in] height Height of the input tensor
2958 * @param[in] depth  Depth of the input tensor
2959 * @param[in] index  Linear index
2960 */
2961inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
2962{
2963    uint num_elements = width * height;
2964
2965    const uint z = index / num_elements;
2966
2967    index %= num_elements;
2968
2969    const uint y = index / width;
2970
2971    index %= width;
2972
2973    const uint x = index;
2974
2975    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes;
2976}
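
/* Worked example (illustrative): for width = 4, height = 3, depth = 2 and index = 17,
 * num_elements = 12, so z = 17 / 12 = 1, y = (17 % 12) / 4 = 1, x = (17 % 12) % 4 = 1, and the
 * returned pointer is ptr + 1 * stride_x + 1 * stride_y + 1 * stride_z + offset_first_element_in_bytes.
 */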
2977
2978#endif // ARM_COMPUTE_HELPER_H
2979
2980/** Clamps the given coordinates to the borders according to the border size.
2981 *
2982 * @param[in] coords      Vector of 2D coordinates to clamp. Even positions are X coords, odd positions are Y coords.
2983 * @param[in] width       Width of the image
2984 * @param[in] height      Height of the image
2985 * @param[in] border_size Border size of the image
2986 *
2987 */
2988inline const float8 clamp_to_border_with_size(float8 coords, const float width, const float height, const float border_size)
2989{
2990    const float4 clamped_x = clamp(coords.even, 0.0f - border_size, width - 1 + border_size);
2991    const float4 clamped_y = clamp(coords.odd, 0.0f - border_size, height - 1 + border_size);
2992    return (float8)(clamped_x.s0, clamped_y.s0, clamped_x.s1, clamped_y.s1, clamped_x.s2, clamped_y.s2, clamped_x.s3, clamped_y.s3);
2993}
2994
2995/* FIXME(COMPMID-682): Clamp border properly in UNDEFINED border mode in Warp, Scale, Remap */
2996/** Clamps the given coordinates to the borders.
2997 *
2998 * @param[in] coords Vector of 2D coordinates to clamp. Even positions are X coords, odd positions are Y coords.
2999 * @param[in] width  Width of the image
3000 * @param[in] height Height of the image
3001 *
3002 */
3003inline const float8 clamp_to_border(float8 coords, const float width, const float height)
3004{
3005    return clamp_to_border_with_size(coords, width, height, 1);
3006}
3007
3008/** Reads four texels from the input image. The coords vector is used to determine which texels to read.
3009 *
3010 * @param[in] in     Pointer to the source image.
3011 * @param[in] coords Vector of coordinates to be read from the image.
3012 */
3013inline const VEC_DATA_TYPE(DATA_TYPE, 4) read_texels4(const Image *in, const int8 coords)
3014{
3015    return (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)offset(in, coords.s0, coords.s1)),
3016                                         *((__global DATA_TYPE *)offset(in, coords.s2, coords.s3)),
3017                                         *((__global DATA_TYPE *)offset(in, coords.s4, coords.s5)),
3018                                         *((__global DATA_TYPE *)offset(in, coords.s6, coords.s7)));
3019}
3020
3021/** Returns the current thread coordinates. */
3022inline const float2 get_current_coords()
3023{
3024    return (float2)(get_global_id(0) * 4, get_global_id(1));
3025}
3026
3027/** Given a texel's coordinates, this function returns the following array of coordinates:
3028 * [ P, right neighbour, below neighbour, below-right neighbour ]
3029 *
3030 * @note No check is performed here on whether the coordinates fall outside the image.
3031 *
3032 * @param[in] coord Input coordinates
3033 *
3034 * @return Vector of 8 floats with the coordinates; even positions are x and odd positions are y.
3035 */
3036inline const float8 get_neighbour_coords(const float2 coord)
3037{
3038    return (float8)(/*tl*/ coord.s0, coord.s1, /*tr*/ coord.s0 + 1, coord.s1, /*bl*/ coord.s0, coord.s1 + 1, /*br*/ coord.s0 + 1, coord.s1 + 1);
3039}
3040
3041/** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values
3042 *
3043 * @param[in] in          Pointer to the source image.
3044 * @param[in] coords      Vector of four 2D coordinates. Even pos is x and odd y.
3045 * @param[in] width       Width of the image
3046 * @param[in] height      Height of the image
3047 * @param[in] border_size Border size
3048 */
3049inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate_with_border(const Image *in, const float8 coords, const float width, const float height, const float border_size)
3050{
3051    // If any of the 4 texels is out of the image's boundaries we use the border value (REPLICATE or CONSTANT) for any texel out of the image.
3052
3053    // Sets the 4x4 coordinates for each of the four input texels
3054    const float8  fc = floor(coords);
3055    const float16 c1 = (float16)(
3056                           clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s0, fc.s1)), width, height, border_size),
3057                           clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s2, fc.s3)), width, height, border_size));
3058    const float16 c2 = (float16)(
3059                           clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s4, fc.s5)), width, height, border_size),
3060                           clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s6, fc.s7)), width, height, border_size));
3061
3062    // Loads the values from the input image
3063    const float16 t = (float16)(
3064                          /* tl, tr, bl, br */
3065                          * ((__global DATA_TYPE *)offset(in, c1.s0, c1.s1)), *((__global DATA_TYPE *)offset(in, c1.s2, c1.s3)),
3066                          *((__global DATA_TYPE *)offset(in, c1.s4, c1.s5)), *((__global DATA_TYPE *)offset(in, c1.s6, c1.s7)),
3067                          *((__global DATA_TYPE *)offset(in, c1.s8, c1.s9)), *((__global DATA_TYPE *)offset(in, c1.sa, c1.sb)),
3068                          *((__global DATA_TYPE *)offset(in, c1.sc, c1.sd)), *((__global DATA_TYPE *)offset(in, c1.se, c1.sf)),
3069                          *((__global DATA_TYPE *)offset(in, c2.s0, c2.s1)), *((__global DATA_TYPE *)offset(in, c2.s2, c2.s3)),
3070                          *((__global DATA_TYPE *)offset(in, c2.s4, c2.s5)), *((__global DATA_TYPE *)offset(in, c2.s6, c2.s7)),
3071                          *((__global DATA_TYPE *)offset(in, c2.s8, c2.s9)), *((__global DATA_TYPE *)offset(in, c2.sa, c2.sb)),
3072                          *((__global DATA_TYPE *)offset(in, c2.sc, c2.sd)), *((__global DATA_TYPE *)offset(in, c2.se, c2.sf)));
3073    const float8 a  = coords - fc;
3074    const float8 b  = ((float8)(1.f)) - a;
3075    const float4 fr = (float4)(
3076                          ((t.s0 * b.s0 * b.s1) + (t.s1 * a.s0 * b.s1) + (t.s2 * b.s0 * a.s1) + (t.s3 * a.s0 * a.s1)),
3077                          ((t.s4 * b.s2 * b.s3) + (t.s5 * a.s2 * b.s3) + (t.s6 * b.s2 * a.s3) + (t.s7 * a.s2 * a.s3)),
3078                          ((t.s8 * b.s4 * b.s5) + (t.s9 * a.s4 * b.s5) + (t.sa * b.s4 * a.s5) + (t.sb * a.s4 * a.s5)),
3079                          ((t.sc * b.s6 * b.s7) + (t.sd * a.s6 * b.s7) + (t.se * b.s6 * a.s7) + (t.sf * a.s6 * a.s7)));
3080    return CONVERT(fr, VEC_DATA_TYPE(DATA_TYPE, 4));
3081}
3082
3083/* FIXME(COMPMID-682): Clamp border properly in UNDEFINED border mode in Warp, Scale, Remap */
3084/** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values
3085 *
3086 * @param[in] in     Pointer to the source image.
3087 * @param[in] coords Vector of four 2D coordinates. Even pos is x and odd y.
3088 * @param[in] width  Width of the image
3089 * @param[in] height Height of the image
3090 */
3091inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate(const Image *in, const float8 coords, const float width, const float height)
3092{
3093    return bilinear_interpolate_with_border(in, coords, width, height, 1);
3094}
3095
3096#if defined(DATA_TYPE) && defined(OPERATION)
3097
3098// Calculate exponential
3099#define exp_op(input) exp(input)
3100// Calculate reverse square root
3101#define rsqrt_op(input) rsqrt(input)
3102// Calculate negative
3103#define neg_op(input) (-input)
3104// Calculate sine
3105#define sin_op(input) sin(input)
3106// Calculate abs for floating point values
3107#define fabs_op(input) fabs(input)
3108// Calculate natural_log
3109#define natural_log_op(input) log(input)
3110// Calculate round (Cannot use round function as it rounds halfway cases away from zero).
3111#if defined(VEC_SIZE)
3112#define round_op(input) CONVERT(CONVERT_SAT_ROUND(input, VEC_DATA_TYPE(int, VEC_SIZE), rte), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))
3113#define logical_not_op(input) CONVERT((!input) & 0x1, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))
3114#else // defined(VEC_SIZE)
3115#define round_op(input) CONVERT(CONVERT_SAT_ROUND(input, int, rte), DATA_TYPE)
3116#define logical_not_op(input) ((!input) & 0x1)
3117#endif // defined(VEC_SIZE)
3118
3119/** Applies element wise unary operator in a tensor.
3120 *
3121 * @param[in]  in_ptr                            Pointer to the source image. Supported data types: F16/32.
3122 * @param[in]  in_stride_x                       Stride of the source tensor in X dimension (in bytes)
3123 * @param[in]  in_step_x                         in_stride_x * number of elements along X processed per workitem(in bytes)
3124 * @param[in]  in_stride_y                       Stride of the source tensor in Y dimension (in bytes)
3125 * @param[in]  in_step_y                         in_stride_y * number of elements along Y processed per workitem(in bytes)
3126 * @param[in]  in_stride_z                       Stride of the source tensor in Z dimension (in bytes)
3127 * @param[in]  in_step_z                         in_stride_z * number of elements along Z processed per workitem(in bytes)
3128 * @param[in]  in_offset_first_element_in_bytes  Offset of the first element in the source image
3129 * @param[out] out_ptr                           Pointer to the destination image. Supported data types: F16/32.
3130 * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
3131 * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
3132 * @param[in]  out_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
3133 * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
3134 * @param[in]  out_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
3135 * @param[in]  out_step_z                        out_stride_z * number of elements along Z processed per workitem(in bytes)
3136 * @param[in]  out_offset_first_element_in_bytes Offset of the first element in the destination image
3137 */
3138__kernel void elementwise_unary(
3139    TENSOR3D_DECLARATION(in),
3140    TENSOR3D_DECLARATION(out))
3141{
3142    Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT(in);
3143    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
3144
3145#if defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
3146    // Check if the access along the width would go out of bounds
3147    // If it does, shift the access vector so that the accessed elements stay within bounds
3148    const int xi = (int)(get_global_id(0) * VEC_SIZE);
3149    in.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * in_stride_x;
3150    out.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * out_stride_x;
3151
3152    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
3153    data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr);
3154
3155    VSTORE(VEC_SIZE)
3156    (OPERATION(data), 0, (__global DATA_TYPE *)out.ptr);
3157#else  // !defined(VEC_SIZE) || !defined(LAST_ACCESSED_X)
3158    *((__global DATA_TYPE *)(out.ptr)) = (DATA_TYPE)(OPERATION(*((__global DATA_TYPE *)in.ptr)));
3159#endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
3160}
3161#endif // defined(DATA_TYPE) && defined(OPERATION)
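
/* Illustrative build-options sketch (not part of the library): the elementwise_unary kernel above is fully
 * specialised at compile time. For example, options along the lines of
 *
 *   -DDATA_TYPE=float -DOPERATION=exp_op -DVEC_SIZE=16 -DLAST_ACCESSED_X=<width - 16, clamped to >= 0>
 *
 * would select the vectorised exponential path, while omitting VEC_SIZE and LAST_ACCESSED_X selects the
 * scalar fallback path. The concrete option values shown here are assumptions for illustration only.
 */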
3162
3163)"