• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1R"(
2
3/*
4 * Copyright (c) 2016-2020 Arm Limited.
5 *
6 * SPDX-License-Identifier: MIT
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a copy
9 * of this software and associated documentation files (the "Software"), to
10 * deal in the Software without restriction, including without limitation the
11 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
12 * sell copies of the Software, and to permit persons to whom the Software is
13 * furnished to do so, subject to the following conditions:
14 *
15 * The above copyright notice and this permission notice shall be included in all
16 * copies or substantial portions of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24 * SOFTWARE.
25 */
26/*
27 * Copyright (c) 2019-2020 Arm Limited.
28 *
29 * SPDX-License-Identifier: MIT
30 *
31 * Permission is hereby granted, free of charge, to any person obtaining a copy
32 * of this software and associated documentation files (the "Software"), to
33 * deal in the Software without restriction, including without limitation the
34 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
35 * sell copies of the Software, and to permit persons to whom the Software is
36 * furnished to do so, subject to the following conditions:
37 *
38 * The above copyright notice and this permission notice shall be included in all
39 * copies or substantial portions of the Software.
40 *
41 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
42 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
43 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
44 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
45 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
46 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
47 * SOFTWARE.
48 */
49
50/*
51 * Copyright (c) 2016-2020 Arm Limited.
52 *
53 * SPDX-License-Identifier: MIT
54 *
55 * Permission is hereby granted, free of charge, to any person obtaining a copy
56 * of this software and associated documentation files (the "Software"), to
57 * deal in the Software without restriction, including without limitation the
58 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
59 * sell copies of the Software, and to permit persons to whom the Software is
60 * furnished to do so, subject to the following conditions:
61 *
62 * The above copyright notice and this permission notice shall be included in all
63 * copies or substantial portions of the Software.
64 *
65 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
66 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
67 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
68 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
69 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
70 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
71 * SOFTWARE.
72 */
73#ifndef ARM_COMPUTE_HELPER_H
74#define ARM_COMPUTE_HELPER_H
75
76/*
77 * Copyright (c) 2020 Arm Limited.
78 *
79 * SPDX-License-Identifier: MIT
80 *
81 * Permission is hereby granted, free of charge, to any person obtaining a copy
82 * of this software and associated documentation files (the "Software"), to
83 * deal in the Software without restriction, including without limitation the
84 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
85 * sell copies of the Software, and to permit persons to whom the Software is
86 * furnished to do so, subject to the following conditions:
87 *
88 * The above copyright notice and this permission notice shall be included in all
89 * copies or substantial portions of the Software.
90 *
91 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
92 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
93 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
94 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
95 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
96 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
97 * SOFTWARE.
98 */
99
/** Store the 0 to (n-1)th rows of the given variables
 * @name STORE_ROW_n
 *
 * @param[in] N0        The width of the passed in vector. Supported: 1, 2, 3, 4, 8, 16
 * @param[in] DATA_TYPE The data type of the vectors
 * @param[in] BASENAME  The basename of the variables
 * @param[in] PTR       The base pointer
 * @param[in] STRIDE_Y  The stride value in y-axis direction
 * @param[in] Z         The offset in z-axis direction
 * @{
 */
// Each STORE_ROW_n first expands STORE_ROW_(n-1) and then stores row (n-1), so
// STORE_ROW_n emits exactly n VSTORE calls. The BASENAME##i / Z##i token pasting
// requires the caller to have declared variables with consecutive suffixes
// (0..9, then hexadecimal A..F for rows 10 and above) — see STORE_BLOCK.
#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)                                                 \
    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
    VSTORE(N0)                                                  \
    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

// From row 10 onwards the variable suffix is a hexadecimal digit (A = row 10, ..., F = row 15).
#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
/** @} */ // end of group STORE_ROW_n
190
/** Convert and store the 0th to (n-1)th rows of the given variables
 * @name CONVERT_STORE_ROW_n
 *
 * Same recursive structure as STORE_ROW_n, but each row is saturating-converted
 * to a vector of @p DATA_TYPE (via CONVERT_SAT) before being stored.
 *
 * @param[in] N0        The size of the vectors
 * @param[in] DATA_TYPE The data type of the vectors
 * @param[in] BASENAME  The basename of the variables
 * @param[in] PTR       The base pointer
 * @param[in] STRIDE_Y  The stride value in y-axis direction
 * @param[in] Z         The offset in z-axis direction
 * @{
 */
#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

// FIX: the second parameter was previously named DATA while the body used DATA_TYPE,
// so the expansion depended on an unrelated DATA_TYPE macro being defined at the call
// site. Renamed to DATA_TYPE for consistency with every other macro in this family;
// macro arguments are positional, so callers are unaffected.
#define CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

// From row 10 onwards the variable suffix is a hexadecimal digit (A = row 10, ..., F = row 15).
#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));

/** @} */ // end of group CONVERT_STORE_ROW_n
282
/** Store a block of the given size M0xN0
 * @name STORE_BLOCK
 *
 * Supported cases are M0=1,2,3,...,16 and N0=2,3,4,8,16.
 * The data to store is expected to have consecutive names for each row.
 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
 * The Z offset is expected to have consecutive names.
 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
 *
 * @param[in] M0        The number of rows to store
 * @param[in] N0        The size of each vector
 * @param[in] DATA_TYPE The data type of the vectors
 * @param[in] BASENAME  The basename of the variables
 * @param[in] PTR       The base pointer
 * @param[in] STRIDE_Y  The stride value in y-axis direction
 * @param[in] Z         The offset in z-axis direction
 * @{
 */
// Two-level indirection: STORE_BLOCK fully expands M0 first, so that a computed
// or macro-valued M0 pastes into the correct STORE_ROW_##M0 name in STORE_BLOCK_STR.
#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
/** @} */ // end of group STORE_BLOCK
304
/** Convert and store a block of the given size M0xN0
 * @name CONVERT_STORE_BLOCK
 *
 * Supported cases are M0=1,2,3,...,16 and N0=2,3,4,8,16.
 * The data to store is expected to have consecutive names for each row.
 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
 * The Z offset is expected to have consecutive names.
 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
 *
 * @param[in] M0        The number of rows to store
 * @param[in] N0        The size of each vector
 * @param[in] DATA_TYPE The data type of the vectors
 * @param[in] BASENAME  The basename of the variables
 * @param[in] PTR       The base pointer
 * @param[in] STRIDE_Y  The stride value in y-axis direction
 * @param[in] Z         The offset in z-axis direction
 * @{
 */
// Two-level indirection (same pattern as STORE_BLOCK): expand M0 before pasting
// it into the CONVERT_STORE_ROW_##M0 macro name.
#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
/** @} */ // end of group CONVERT_STORE_BLOCK
326
/** Partially store the 0 to (n-1)th rows of the given variables
 * @name STORE_ROW_PARTIAL_n
 * Within each row, store the lower @p STORE_N0 elements of vectors of width @p N0
 *
 * @note in case @p STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
 *
 * @param[in] N0        The width of the passed in vector. Supported: 1, 2, 3, 4, 8, 16
 * @param[in] STORE_N0  The **lower** size of the vectors to store. Supported: 1-16 and <= @p N0
 * @param[in] DATA_TYPE The data type of the vectors
 * @param[in] BASENAME  The basename of the variables
 * @param[in] PTR       The base pointer
 * @param[in] STRIDE_Y  The stride value in y-axis direction
 * @param[in] Z         The offset in z-axis direction
 * @{
 */
// Same recursive layout as STORE_ROW_n, but each row goes through VSTORE_PARTIAL
// so only the first STORE_N0 of the N0 elements are written.
#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

// From row 10 onwards the variable suffix is a hexadecimal digit (A = row 10, ..., F = row 15).
#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
/** @} */ // end of group STORE_ROW_PARTIAL_n
421
/** Partially store a block of the given size STORE_M0xSTORE_N0
 * @name STORE_BLOCK_PARTIAL
 *
 * @note The vector width @p N0 is also required for correct partial storing behaviour.
 * @note in case @p STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
 *
 * The data to store is expected to have consecutive names for each row.
 * E.g., for STORE_M0=3 and basename=c, the expected names are c0, c1 and c2.
 * The Z offset is expected to have consecutive names.
 * E.g., for STORE_M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
 *
 * @param[in] STORE_M0  The number of rows to store. Supported: 1-16
 * @param[in] STORE_N0  The lower number of elements of vectors to store. Supported: 1-16 and <= @p N0
 * @param[in] N0        The size of each vector. Supported: 1, 2, 3, 4, 8, 16
 * @param[in] DATA_TYPE The data type of the vectors
 * @param[in] BASENAME  The basename of the variables
 * @param[in] PTR       The base pointer
 * @param[in] STRIDE_Y  The stride value in y-axis direction
 * @param[in] Z         The offset in z-axis direction
 * @{
 */
// Two-level indirection: expand STORE_M0 before pasting it into STORE_ROW_PARTIAL_##STORE_M0.
#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
/** Store a block that can be partial in both x and y dimensions
 *
 * @note in cases @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
 *
 * The data to store is expected to have consecutive names for each row.
 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
 * The Z offset is expected to have consecutive names.
 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
 *
 * @param[in] M0               The number of rows to store, for non-partial blocks. Supported: 1-16
 * @param[in] N0               The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
 * @param[in] DATA_TYPE        The data type of the vectors
 * @param[in] BASENAME         The basename of the variables
 * @param[in] PTR              The base pointer
 * @param[in] STRIDE_Y         The stride value in y-axis direction
 * @param[in] Z                The offset in z-axis direction
 * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported range: [1, @p M0)
 * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported range: [1, @p N0)
 * @param[in] PARTIAL_COND_Y   Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0.
 * @param[in] PARTIAL_COND_X   Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0.
 */
// Dispatches on the two runtime conditions to one of four STORE_BLOCK_PARTIAL
// shapes: full M0xN0, partial-Y only, partial-X only, or partial in both axes.
// The chain ends in a plain else-block, so there is no dangling-else hazard at
// the expansion site.
#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y))                                                                                                            \
    {                                                                                                                                                     \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                                           \
    }                                                                                                                                                     \
    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X))                                                                                                        \
    {                                                                                                                                                     \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
    }                                                                                                                                                     \
    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X))                                                                                                        \
    {                                                                                                                                                     \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
    }                                                                                                                                                     \
    else                                                                                                                                                  \
    {                                                                                                                                                     \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                               \
    }
/** Store a block that can only be partial in x but not y.
 *
 * @note in case @p N0 or @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
 *
 * The data to store is expected to have consecutive names for each row.
 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
 * The Z offset is expected to have consecutive names.
 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
 *
 * @param[in] M0               The number of rows to store, for non-partial blocks. Supported: 1-16
 * @param[in] N0               The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
 * @param[in] DATA_TYPE        The data type of the vectors
 * @param[in] BASENAME         The basename of the variables
 * @param[in] PTR              The base pointer
 * @param[in] STRIDE_Y         The stride value in y-axis direction
 * @param[in] Z                The offset in z-axis direction
 * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported range: [1, @p N0)
 * @param[in] PARTIAL_COND_X   Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0.
 */
// Always stores M0 full rows; the row width is N0 unless the x-boundary
// condition holds, in which case only PARTIAL_STORE_N0 elements are written.
#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X))                                                                                         \
    {                                                                                                             \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
    }                                                                                                             \
    else                                                                                                          \
    {                                                                                                             \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
    }
/** Store a block that can only be partial in y but not x.
 *
 * @note in case @p N0 or @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
 *
 * The data to store is expected to have consecutive names for each row.
 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
 * The Z offset is expected to have consecutive names.
 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
 *
 * @param[in] M0               The number of rows to store, for non-partial blocks. Supported: 1-16
 * @param[in] N0               The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
 * @param[in] DATA_TYPE        The data type of the vectors
 * @param[in] BASENAME         The basename of the variables
 * @param[in] PTR              The base pointer
 * @param[in] STRIDE_Y         The stride value in y-axis direction
 * @param[in] Z                The offset in z-axis direction
 * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported range: [1, @p M0)
 * @param[in] PARTIAL_COND_Y   Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0.
 */
// Stores full N0-wide rows; all M0 rows are stored unless PARTIAL_COND_Y holds,
// in which case only the first PARTIAL_STORE_M0 rows are stored.
#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
    if(!(PARTIAL_COND_Y))                                                                                         \
    {                                                                                                             \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
    }                                                                                                             \
    else                                                                                                          \
    {                                                                                                             \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
    }
/** @} */ // end of group STORE_BLOCK_PARTIAL
540
541#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
542
543/** Boundary-aware GEMM block store
544 * @name STORE_BLOCK_BOUNDARY_AWARE
545 * This macro assumes the following schemes to achieve boundary-awareness:
546 *  - Overlapping load in Y axis from lhs tensor. This implies lhs has no padding along y dim.
 *  - Non-overlapping (normal) load from rhs tensor. This implies rhs can have paddings.
 *  - Overlapping load in Y axis from bias tensor. This implies the bias has no padding along y dim.
549 * The macro then ensures that the dst tensor can be stored without any paddings in both x and y dim.
550 *
551 * In the y dimension, we place the partial blocks **at the beginning** while in the x dimension, we place the partial
552 * blocks **at the end**.
553 * Say, the dst tensor is of shape MxN and we have M0 and N0 as the block size, this is how we define "partial blocks"/
554 * "boundary block" (we use the 2 terms "partial blocks" and "boundary blocks" interchangeably) and its various parameters:
555 *
556 *  *--x-->                         x == 0                        x == 1
557 *  |                  |<------------------------------N-------------------------->|
558 *  y                  |<--------------N0------------->|<----PARTIAL_STORE_N0----->|
559 *  |     -------------#############################################################
560 *  *     |          | |...............................|...........................|
561 * y == 0 | PAR_..._M0 |......Boundary block in y......|.Boundary block in x and y.|
562 *        |          | |...............................|...........................|
563 *        M          --#############################################################
564 *        |          | |                               |...........................|
565 * y == 1 |         M0 |      Non-boundary block       |....Boundary block in x....|
566 *        |          | |                               |...........................|
567 *        |------------#############################################################
568 *
569 * Then @p PARTIAL_STORE_M0 = M % M0      and @p PARTIAL_STORE_N0 = N % N0
570 *
571 * @note in cases @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
572 *
 * It automatically detects if a given M,N,M0,N0 combination can yield partial blocks in either the X or Y dimension,
 * and selects the corresponding store methods such that the boundary detection logic is only added when needed.
575 *
576 * The data to store is expected to have consecutive names for each row.
577 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
578 * The Z offset is expected to have consecutive names.
579 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
580 *
581 * @param[in] M0               The number of rows to store, for non-partial blocks. Supported: 1-16
582 * @param[in] N0               The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
583 * @param[in] DATA_TYPE        The data type of the vectors
584 * @param[in] BASENAME         The basename of the variables
585 * @param[in] PTR              The base pointer
586 * @param[in] STRIDE_Y         The stride value in y-axis direction
587 * @param[in] Z                The offset in z-axis direction
588 * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported: [0, @p M0)
589 * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported: [0, @p N0)
590 * @param[in] PARTIAL_COND_Y   Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0.
591 * @param[in] PARTIAL_COND_X   Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0.
592 * @{
593 */
// Select, at compile time, the cheapest store variant for the given partial block
// sizes so boundary-check branches are only emitted when they can actually trigger.
#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
// Case1: No partial blocks in either x or y
#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
// Case2: Partial blocks in y
#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)

#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
// Case3: Partial blocks in x
#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)

#else // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
// Case4: Partial blocks in both x and y
#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)

#endif // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
615
616#endif    // defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
617/** @} */ // end of group STORE_BLOCK_BOUNDARY_AWARE
618
#if defined(PARTIAL_STORE_M0)
/** Compute the start m0 row (LHS, BIAS and DST) in a boundary-aware way so as to avoid padding
 * @name COMPUTE_M0_START_ROW
 * If there are any partial blocks in y dimension, they are placed at the beginning of the rows.
 * This shift amount is added to all rows such that the partial block (at the beginning) overlaps with the subsequent
 * blocks in the y dimension to avoid any padding.
 * EG: M0=4, PARTIAL_STORE_M0=1:
 *                  | Non-overlapping | +M0_ROW_SHIFT (Overlapping)
 * block 0 (partial)| start row = 0   | start row = 0
 * block 1 (full)   | start row = 4   | start row = 1
 * block 2 (full)   | start row = 8   | start row = 5
 *
 * @param[in] y                Global id of current block in y.
 * @param[in] M0               The number of rows to store, for non-partial blocks. Supported: 1-16
 * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported: [0, @p M0)
 * @{
 */
// The "% M0" makes the shift 0 when PARTIAL_STORE_M0 == 0 (no partial rows);
// max(0, ...) anchors the first (partial) block at row 0.
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0))))
#else // defined(PARTIAL_STORE_M0)
// Fallback when PARTIAL_STORE_M0 is not defined: plain non-overlapping rows.
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(y * M0))
#endif    // defined(PARTIAL_STORE_M0)
/** @} */ // end of group COMPUTE_M0_START_ROW
643
/** Store a vector that can only be partial in x.
 *
 * @note in case @p vec_size or @p leftover != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
 *
 * The data to store is expected to end in a 0.
 * E.g., for basename=c, the expected name is c0.
 *
 * @param[in] basename  The name of the variable without trailing 0
 * @param[in] data_type The data type of the vector
 * @param[in] ptr       The base pointer
 * @param[in] vec_size  The vector size if cond = false. Supported: 1, 2, 3, 4, 8, 16
 * @param[in] leftover  The vector size if cond = true. Supported range: [1, @p vec_size)
 * @param[in] cond      Condition to select either @p vec_size or @p leftover
 * @{
 */
// Implemented as a single-row (M0 = 1) partial block store; the row stride and
// z-offset are irrelevant for one row and are passed as 0.
#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)
/** @} */ // end of group STORE_VECTOR_SELECT
662
663#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
664#pragma OPENCL EXTENSION cl_khr_fp16 : enable
665#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
666
667#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
668#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
669#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
670
671#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
672#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
673#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
674
675#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
676#pragma OPENCL EXTENSION cl_arm_printf : enable
677#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
678
// GPU architecture identifiers used to select architecture-specific code paths.
#define GPU_ARCH_MIDGARD 0x100
#define GPU_ARCH_BIFROST 0x200

/** Concatenate two inputs.
 *
 * @note The arguments are pasted as-is, without prior macro expansion of @p a and @p b.
 *
 * @param[in] a The first input to be concatenated
 * @param[in] b The second input to be concatenated
 *
 * @return The concatenated output
 */
#define CONCAT(a, b) a##b

/** Expand the given vector
 *
 * @param[in] x The vector to be expanded
 *
 * @return The expanded output
 */
#define EXPAND(x) x
698
/** Clamp the given value between an upper and lower bound.
 *
 * @param[in] x       The value to be clamped
 * @param[in] min_val The lower bound
 * @param[in] max_val The upper bound
 *
 * @return The clamped value.
 */
// Arguments are parenthesized (macro hygiene) so operator-containing expressions
// expand with the intended precedence even if min/max are macro-implemented.
#define CLAMP(x, min_val, max_val) min(max((x), (min_val)), (max_val))
708
/** REVn reverses the given vector whose size is n.
 * @name REVn
 *
 * @param[in] x The vector to be reversed
 *
 * @return The reversed vector
 * @{
 */
// Each REVn is a swizzle selecting the components in reverse order.
#define REV1(x) ((x))
#define REV2(x) ((x).s10)
#define REV3(x) ((x).s210)
#define REV4(x) ((x).s3210)
#define REV8(x) ((x).s76543210)
#define REV16(x) ((x).sFEDCBA9876543210)
/** @} */ // end of group REVn

/** Reverse the given vector.
 * @name REVERSE
 *
 * @param[in] x The vector to be reversed
 * @param[in] s The size of the vector
 *
 * @return The reversed vector
 * @{
 */
// Two-level expansion so that a macro passed as s is expanded before pasting.
#define REVERSE_STR(x, s) REV##s((x))
#define REVERSE(x, s) REVERSE_STR(x, s)
/** @} */ // end of group REVERSE
737
/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
 * @name ROTs_n
 *
 * @param[in] x The vector to be shifted
 *
 * @return The shifted vector
 * @{
 */
// Each ROT<s>_<n> is a swizzle that moves element i to position (i + n) % s.
#define ROT1_0(x) ((x))

#define ROT2_0(x) ((x))
#define ROT2_1(x) ((x).s10)

#define ROT3_0(x) ((x))
#define ROT3_1(x) ((x).s201)
#define ROT3_2(x) ((x).s120)

#define ROT4_0(x) ((x))
#define ROT4_1(x) ((x).s3012)
#define ROT4_2(x) ((x).s2301)
#define ROT4_3(x) ((x).s1230)

#define ROT8_0(x) ((x))
#define ROT8_1(x) ((x).s70123456)
#define ROT8_2(x) ((x).s67012345)
#define ROT8_3(x) ((x).s56701234)
#define ROT8_4(x) ((x).s45670123)
#define ROT8_5(x) ((x).s34567012)
#define ROT8_6(x) ((x).s23456701)
#define ROT8_7(x) ((x).s12345670)

#define ROT16_0(x) ((x))
#define ROT16_1(x) ((x).sF0123456789ABCDE)
#define ROT16_2(x) ((x).sEF0123456789ABCD)
#define ROT16_3(x) ((x).sDEF0123456789ABC)
#define ROT16_4(x) ((x).sCDEF0123456789AB)
#define ROT16_5(x) ((x).sBCDEF0123456789A)
#define ROT16_6(x) ((x).sABCDEF0123456789)
#define ROT16_7(x) ((x).s9ABCDEF012345678)
#define ROT16_8(x) ((x).s89ABCDEF01234567)
#define ROT16_9(x) ((x).s789ABCDEF0123456)
#define ROT16_10(x) ((x).s6789ABCDEF012345)
#define ROT16_11(x) ((x).s56789ABCDEF01234)
#define ROT16_12(x) ((x).s456789ABCDEF0123)
#define ROT16_13(x) ((x).s3456789ABCDEF012)
#define ROT16_14(x) ((x).s23456789ABCDEF01)
#define ROT16_15(x) ((x).s123456789ABCDEF0)
/** @} */ // end of group ROTs_n

/** Circular-right-shift (rotate-right) the given vector by the given amount.
 * @name ROTATE
 *
 * @param[in] x The vector to be shifted
 * @param[in] s The size of the vector
 * @param[in] n The amount to be shifted
 *
 * @return The shifted vector
 * @{
 */
// Two-level expansion so that macros passed as s or n are expanded before pasting.
#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
/** @} */ // end of group ROTATE
800
/** Creates a vector of size n filled with offset values corresponding to the location of each element.
 * @name V_OFFSn
 *
 * @param[in] dt The data type of the output vector
 *
 * @return The vector filled with offset values
 * @{
 */
// Vector literals (0, 1, ..., n-1) of the requested element type.
#define V_OFFS1(dt) (dt##1)(0)
#define V_OFFS2(dt) (dt##2)(0, 1)
#define V_OFFS3(dt) (dt##3)(0, 1, 2)
#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3)
#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
/** @} */ // end of group V_OFFSn

/** Create a vector filled with offset values corresponding to the location of each element.
 * @name VEC_OFFS
 *
 * @param[in] dt The data type of the output vector
 * @param[in] s  The size of the output vector
 *
 * @return The vector filled with offset values
 * @{
 */
// Two-level expansion so that a macro passed as s is expanded before pasting.
#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
/** @} */ // end of group VEC_OFFS
829
// Two-level expansion so that a macro passed as size is expanded before pasting.
#define VLOAD_STR(size) vload##size
#define VLOAD(size) VLOAD_STR(size)

// Pixels per vector: PIXEL_UNITn == n / 4, since each image pixel read below
// yields 4 components (float4/half4).
#define PIXEL_UNIT4 1
#define PIXEL_UNIT8 2
#define PIXEL_UNIT16 4

/** Utility macro to convert a vector size in pixel unit.
 *
 * @name CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT
 *
 * @param[in] vec_size Vector size. Only 4,8 and 16 is supported
 *
 * @return The pixel unit (number of pixels)
 * @{
 */
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)
/** @} */ // end of group CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT
849
// Read 1/2/4 horizontally-adjacent pixels (4 components each) from a 2D image
// and pack them into a single 4/8/16-wide vector.
// NOTE(review): each expansion ends with a ';', so these are only usable in
// statement context (e.g. "v = READ_IMAGE2D(...);") — verify call sites.
#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
// Half-precision variants, only available when fp16 support is compiled in.
#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)

/** Utility macro to read a 2D OpenCL image object.
 *
 * @note Coordinates are not normalized
 *
 * @param[in] data_type Data type
 * @param[in] n0        Number of pixel to read. Only 1,2 and 4 is supported
 * @param[in] img       OpenCL image object
 * @param[in] x_coord   The x coordinate for the top-left pixel
 * @param[in] y_coord   The y coordinate for the top-left pixel
 *
 * @return Pixels from the 2D OpenCL image object
 * @{
 */
#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)
875
// Two-level expansion so that a macro passed as size is expanded before pasting.
#define VSTORE_STR(size) vstore##size
#define VSTORE(size) VSTORE_STR(size)

// Scalar aliases so that VEC_DATA_TYPE(type, 1) expands to a valid scalar type.
#define float1 float
#define half1 half
#define char1 char
#define uchar1 uchar
#define short1 short
#define ushort1 ushort
#define int1 int
#define uint1 uint
#define long1 long
#define ulong1 ulong
#define double1 double

// Scalar counterparts of vloadn/vstoren.
// Fix: arguments are now parenthesized — the previous *(OFFSET + PTR) form
// mis-parsed low-precedence offset expressions, e.g. vload1(i & 1, p)
// expanded to *(i & (1 + p)).
#define vload1(OFFSET, PTR) (*((PTR) + (OFFSET)))
#define vstore1(DATA, OFFSET, PTR) (*((PTR) + (OFFSET)) = (DATA))
893
/** Extended partial vstore that correctly handles scalar values as well.
 * Store the **lower** 0 to (n-1)th elements of the given vector while minimising the amount of vstore ops
 * @name VSTORE_PARTIAL
 *
 * @note With this macro, the passed data can be both a vector and a scalar
 * @note @p store_size needs to be <= @p size
 * eg 1: Valid
 * VSTORE_PARTIAL(16, 15) ...;
 * eg 2: Invalid
 * VSTORE_PARTIAL(4, 7) ...;
 *
 * @param[in] size       The width of @p DATA. Supported values: 1(scalar), 2, 3, 4, 8, 16
 * @param[in] store_size The number of lower elements to store. Supported values: 1-16, but has to be <= @p size
 * @{
 */
#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)

// No-op store: used by the dispatch tables below for unsupported
// (size, store_size) combinations; swallows its arguments and emits no code.
#define NO_STORE(data, offs, ptr) \
    {                             \
    }
915
// Dispatch tables mapping (size, store_size) to a concrete partial-store
// implementation. store_size == 0, or store_size > size, maps to NO_STORE.
// Size == 1 (scalar)
#define vstore_partial_1_0 NO_STORE
#define vstore_partial_1_1 vstore1
#define vstore_partial_1_2 NO_STORE
#define vstore_partial_1_3 NO_STORE
#define vstore_partial_1_4 NO_STORE
#define vstore_partial_1_5 NO_STORE
#define vstore_partial_1_6 NO_STORE
#define vstore_partial_1_7 NO_STORE
#define vstore_partial_1_8 NO_STORE
#define vstore_partial_1_9 NO_STORE
#define vstore_partial_1_10 NO_STORE
#define vstore_partial_1_11 NO_STORE
#define vstore_partial_1_12 NO_STORE
#define vstore_partial_1_13 NO_STORE
#define vstore_partial_1_14 NO_STORE
#define vstore_partial_1_15 NO_STORE
#define vstore_partial_1_16 NO_STORE
// Size == 2
#define vstore_partial_2_0 NO_STORE
#define vstore_partial_2_1 vstore_partial_1
#define vstore_partial_2_2 vstore_partial_2
#define vstore_partial_2_3 NO_STORE
#define vstore_partial_2_4 NO_STORE
#define vstore_partial_2_5 NO_STORE
#define vstore_partial_2_6 NO_STORE
#define vstore_partial_2_7 NO_STORE
#define vstore_partial_2_8 NO_STORE
#define vstore_partial_2_9 NO_STORE
#define vstore_partial_2_10 NO_STORE
#define vstore_partial_2_11 NO_STORE
#define vstore_partial_2_12 NO_STORE
#define vstore_partial_2_13 NO_STORE
#define vstore_partial_2_14 NO_STORE
#define vstore_partial_2_15 NO_STORE
#define vstore_partial_2_16 NO_STORE
// Size == 3
#define vstore_partial_3_0 NO_STORE
#define vstore_partial_3_1 vstore_partial_1
#define vstore_partial_3_2 vstore_partial_2
#define vstore_partial_3_3 vstore_partial_3
#define vstore_partial_3_4 NO_STORE
#define vstore_partial_3_5 NO_STORE
#define vstore_partial_3_6 NO_STORE
#define vstore_partial_3_7 NO_STORE
#define vstore_partial_3_8 NO_STORE
#define vstore_partial_3_9 NO_STORE
#define vstore_partial_3_10 NO_STORE
#define vstore_partial_3_11 NO_STORE
#define vstore_partial_3_12 NO_STORE
#define vstore_partial_3_13 NO_STORE
#define vstore_partial_3_14 NO_STORE
#define vstore_partial_3_15 NO_STORE
#define vstore_partial_3_16 NO_STORE
// Continuation of the (size, store_size) dispatch tables for sizes 4, 8 and 16.
// Size == 4
#define vstore_partial_4_0 NO_STORE
#define vstore_partial_4_1 vstore_partial_1
#define vstore_partial_4_2 vstore_partial_2
#define vstore_partial_4_3 vstore_partial_3
#define vstore_partial_4_4 vstore_partial_4
#define vstore_partial_4_5 NO_STORE
#define vstore_partial_4_6 NO_STORE
#define vstore_partial_4_7 NO_STORE
#define vstore_partial_4_8 NO_STORE
#define vstore_partial_4_9 NO_STORE
#define vstore_partial_4_10 NO_STORE
#define vstore_partial_4_11 NO_STORE
#define vstore_partial_4_12 NO_STORE
#define vstore_partial_4_13 NO_STORE
#define vstore_partial_4_14 NO_STORE
#define vstore_partial_4_15 NO_STORE
#define vstore_partial_4_16 NO_STORE
// Size == 8
#define vstore_partial_8_0 NO_STORE
#define vstore_partial_8_1 vstore_partial_1
#define vstore_partial_8_2 vstore_partial_2
#define vstore_partial_8_3 vstore_partial_3
#define vstore_partial_8_4 vstore_partial_4
#define vstore_partial_8_5 vstore_partial_5
#define vstore_partial_8_6 vstore_partial_6
#define vstore_partial_8_7 vstore_partial_7
#define vstore_partial_8_8 vstore_partial_8
#define vstore_partial_8_9 NO_STORE
#define vstore_partial_8_10 NO_STORE
#define vstore_partial_8_11 NO_STORE
#define vstore_partial_8_12 NO_STORE
#define vstore_partial_8_13 NO_STORE
#define vstore_partial_8_14 NO_STORE
#define vstore_partial_8_15 NO_STORE
#define vstore_partial_8_16 NO_STORE
// Size == 16
#define vstore_partial_16_0 NO_STORE
#define vstore_partial_16_1 vstore_partial_1
#define vstore_partial_16_2 vstore_partial_2
#define vstore_partial_16_3 vstore_partial_3
#define vstore_partial_16_4 vstore_partial_4
#define vstore_partial_16_5 vstore_partial_5
#define vstore_partial_16_6 vstore_partial_6
#define vstore_partial_16_7 vstore_partial_7
#define vstore_partial_16_8 vstore_partial_8
#define vstore_partial_16_9 vstore_partial_9
#define vstore_partial_16_10 vstore_partial_10
#define vstore_partial_16_11 vstore_partial_11
#define vstore_partial_16_12 vstore_partial_12
#define vstore_partial_16_13 vstore_partial_13
#define vstore_partial_16_14 vstore_partial_14
#define vstore_partial_16_15 vstore_partial_15
#define vstore_partial_16_16 vstore_partial_16
1024
/** Partial vstore. Store the **lower** 0 to (n-1)th elements of the given vector while minimising the amount of vstore ops
 * @name vstore_partial_n
 *
 * @note @p DATA needs to be a vector not a scalar
 * @note n needs to be <= the vector width of the input variable @p DATA
 * eg 1: Valid
 * vstore_partial_15(var:float16, 0, 0xabcd);
 * eg 2: Invalid
 * vstore_partial_7(var:float4, 0, 0xabcd);
 *
 * @note in cases n == 1, 2, 3, 4, 8, 16, no extra vstore is invoked, thus there's no performance penalty.
 *
 * @param[in] DATA   The name of the variable
 * @param[in] OFFSET Offset in n
 * @param[in] PTR    The base pointer
 * @{
 */
#define vstore_partial_1(DATA, OFFSET, PTR) \
    vstore1(DATA.s0, OFFSET, PTR);

#define vstore_partial_2(DATA, OFFSET, PTR) \
    vstore2(DATA.s01, OFFSET, PTR);

#define vstore_partial_3(DATA, OFFSET, PTR) \
    vstore3(DATA.s012, OFFSET, PTR);

#define vstore_partial_4(DATA, OFFSET, PTR) \
    vstore4(DATA.s0123, OFFSET, PTR);

// Non-power-of-two widths decompose into a power-of-two store plus a smaller
// partial store of the remaining tail elements at PTR + 4.
#define vstore_partial_5(DATA, OFFSET, PTR)    \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore1(DATA.s4, OFFSET, PTR + 4);

#define vstore_partial_6(DATA, OFFSET, PTR)    \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore_partial_2(DATA.s45, OFFSET, PTR + 4);

#define vstore_partial_7(DATA, OFFSET, PTR)    \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore_partial_3(DATA.s456, OFFSET, PTR + 4);

#define vstore_partial_8(DATA, OFFSET, PTR) \
    vstore8(DATA.s01234567, OFFSET, PTR);
1068
// Widths 9-15: store the lower 8 elements, then the tail at PTR + 8.
#define vstore_partial_9(DATA, OFFSET, PTR)        \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore1(DATA.s8, OFFSET, PTR + 8);

#define vstore_partial_10(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_2(DATA.s89, OFFSET, PTR + 8);

#define vstore_partial_11(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_3(DATA.s89a, OFFSET, PTR + 8);

#define vstore_partial_12(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8);

// For 13-15 elements the full upper-half swizzle .s89abcdef is passed; the
// callee stores only its first 5/6/7 lanes respectively.
#define vstore_partial_13(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_14(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_15(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_16(DATA, OFFSET, PTR) \
    vstore16(DATA, OFFSET, PTR);
/** @} */ // end of group vstore_partial_n
/** @} */ // end of group VSTORE_PARTIAL
1101
// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
// without _sat to overcome this issue
#define convert_float_sat convert_float
#define convert_float1_sat convert_float
#define convert_float2_sat convert_float2
#define convert_float3_sat convert_float3
#define convert_float4_sat convert_float4
#define convert_float8_sat convert_float8
#define convert_float16_sat convert_float16
// Fix: convert_half_sat previously aliased convert_float, producing a float
// instead of a half; all other half variants map to their convert_half<n>
// counterparts, so the scalar alias now does too.
#define convert_half_sat convert_half
#define convert_half1_sat convert_half
#define convert_half2_sat convert_half2
#define convert_half3_sat convert_half3
#define convert_half4_sat convert_half4
#define convert_half8_sat convert_half8
#define convert_half16_sat convert_half16
1118
// Size-1 aliases: map convert_<type>1 to the scalar convert_<type> built-in so
// that CONVERT(x, VEC_DATA_TYPE(type, 1)) is valid.
#define convert_float1 convert_float
#define convert_half1 convert_half
#define convert_char1 convert_char
#define convert_uchar1 convert_uchar
#define convert_short1 convert_short
#define convert_ushort1 convert_ushort
#define convert_int1 convert_int
#define convert_uint1 convert_uint
#define convert_long1 convert_long
#define convert_ulong1 convert_ulong
#define convert_double1 convert_double

// Saturating size-1 aliases.
#define convert_char1_sat convert_char_sat
#define convert_uchar1_sat convert_uchar_sat
#define convert_short1_sat convert_short_sat
#define convert_ushort1_sat convert_ushort_sat
#define convert_int1_sat convert_int_sat
#define convert_uint1_sat convert_uint_sat
#define convert_long1_sat convert_long_sat
#define convert_ulong1_sat convert_ulong_sat
#define convert_double1_sat convert_double_sat
1140
// Two-level *_STR indirection: the outer macro expands its arguments first, so
// these work when the arguments are themselves macros (e.g. DATA_TYPE, N0).
#define VEC_DATA_TYPE_STR(type, size) type##size
#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)

#define CONVERT_STR(x, type) (convert_##type((x)))
#define CONVERT(x, type) CONVERT_STR(x, type)

#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)

#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
1152
// Mask types for the OpenCL select() built-in: select requires an integer
// vector whose element width matches the operand type, hence the
// half -> short and float -> int mappings below.
#define select_vec_dt_uchar(size) uchar##size
#define select_vec_dt_char(size) char##size
#define select_vec_dt_ushort(size) ushort##size
#define select_vec_dt_short(size) short##size
#define select_vec_dt_half(size) short##size
#define select_vec_dt_uint(size) uint##size
#define select_vec_dt_int(size) int##size
#define select_vec_dt_float(size) int##size
#define select_vec_dt_ulong(size) ulong##size
#define select_vec_dt_long(size) long##size

#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)
1167
// Pairwise tree reduction of a vector's elements to their scalar sum.
#define sum_reduce_1(x) (x)
#define sum_reduce_2(x) ((x).s0) + ((x).s1)
#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2)
#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567)
#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF)

#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x)
#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)

// Pairwise tree reduction of a vector's elements to their scalar maximum.
#define max_reduce_1(x) (x)
#define max_reduce_2(x) max(((x).s0), ((x).s1))
#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))

#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)
1187
// Kernel-argument list for a 1D tensor: data pointer, per-dimension stride and
// step (both in bytes), and the byte offset of the first element.
#define VECTOR_DECLARATION(name)     \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_offset_first_element_in_bytes

// Kernel-argument list for a 2D tensor (adds the y dimension).
#define IMAGE_DECLARATION(name)      \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_stride_y, \
    uint        name##_step_y,   \
    uint        name##_offset_first_element_in_bytes

// Kernel-argument list for a 3D tensor (adds the z dimension).
#define TENSOR3D_DECLARATION(name)   \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_stride_y, \
    uint        name##_step_y,   \
    uint        name##_stride_z, \
    uint        name##_step_z,   \
    uint        name##_offset_first_element_in_bytes

// Kernel-argument list for a 4D tensor (adds the w dimension).
#define TENSOR4D_DECLARATION(name)   \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_stride_y, \
    uint        name##_step_y,   \
    uint        name##_stride_z, \
    uint        name##_step_z,   \
    uint        name##_stride_w, \
    uint        name##_step_w,   \
    uint        name##_offset_first_element_in_bytes
1223
1224#define CONVERT_TO_VECTOR_STRUCT(name) \
1225    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
1226
1227#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
1228    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
1229
1230#define CONVERT_TO_IMAGE_STRUCT(name) \
1231    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
1232
1233#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
1234    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
1235
1236#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
1237    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
1238
1239#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
1240    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
1241
/* A duplicate, token-identical definition of CONVERT_TENSOR3D_TO_IMAGE_STRUCT
 * appeared here; it was redundant (the macro is already defined earlier in this
 * file) and has been removed. */
1244
/** Builds a Tensor3D struct from TENSOR3D_DECLARATION arguments, advancing the
 * pointer to this work-item's data. */
#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z)

/** As CONVERT_TO_TENSOR3D_STRUCT but with all steps forced to 0. */
#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)

/** Builds a Tensor4D struct; mod_size is used by update_tensor4D_workitem_ptr to
 * split the global Z id into Z (gid2 % mod_size) and W (gid2 / mod_size). */
#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)

/** As CONVERT_TO_TENSOR4D_STRUCT but with all steps forced to 0. */
#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)

/** Builds a Tensor3D struct whose pointer is NOT advanced to the work-item's
 * position; offset_first_element_in_bytes stays recorded in the struct instead. */
#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name)                                                                                       \
    tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                           name##_stride_z, name##_step_z)
1262
/** Structure to hold Vector information */
typedef struct Vector
{
    __global uchar *ptr;                           /**< Pointer to the starting position of the buffer */
    int             offset_first_element_in_bytes; /**< The offset of the first element in the source vector */
    int             stride_x;                      /**< Stride of the vector in X dimension (in bytes) */
} Vector;
1270
/** Structure to hold Image information */
typedef struct Image
{
    __global uchar *ptr;                           /**< Pointer to the starting position of the buffer */
    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
} Image;
1279
/** Structure to hold 3D tensor information */
typedef struct Tensor3D
{
    __global uchar *ptr;                           /**< Pointer to the starting position of the buffer */
    int             offset_first_element_in_bytes; /**< The offset of the first element in the source tensor */
    int             stride_x;                      /**< Stride of the tensor in X dimension (in bytes) */
    int             stride_y;                      /**< Stride of the tensor in Y dimension (in bytes) */
    int             stride_z;                      /**< Stride of the tensor in Z dimension (in bytes) */
} Tensor3D;
1289
/** Structure to hold 4D tensor information */
typedef struct Tensor4D
{
    __global uchar *ptr;                           /**< Pointer to the starting position of the buffer */
    int             offset_first_element_in_bytes; /**< The offset of the first element in the source tensor */
    int             stride_x;                      /**< Stride of the tensor in X dimension (in bytes) */
    int             stride_y;                      /**< Stride of the tensor in Y dimension (in bytes) */
    int             stride_z;                      /**< Stride of the tensor in Z dimension (in bytes) */
    int             stride_w;                      /**< Stride of the tensor in W dimension (in bytes) */
} Tensor4D;
1300
/** Wrap vector information into a Vector structure and advance its pointer to the
 * data owned by the calling work-item.
 *
 * @param[in] ptr                           Pointer to the starting position of the buffer
 * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
 * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
 * @param[in] step_x                        stride_x * number of elements along X processed per workitem (in bytes)
 *
 * @return A Vector object whose pointer targets this work-item's data
 */
inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
{
    Vector vec;
    vec.ptr                           = ptr;
    vec.offset_first_element_in_bytes = offset_first_element_in_bytes;
    vec.stride_x                      = stride_x;
    // Jump to the first byte processed by this work-item along X.
    vec.ptr += vec.offset_first_element_in_bytes + get_global_id(0) * step_x;
    return vec;
}
1321
/** Wrap image information into an Image structure and advance its pointer to the
 * data owned by the calling work-item.
 *
 * @param[in] ptr                           Pointer to the starting position of the buffer
 * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
 * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
 * @param[in] step_x                        stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
 * @param[in] step_y                        stride_y * number of elements along Y processed per workitem (in bytes)
 *
 * @return An Image object whose pointer targets this work-item's data
 */
inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
{
    Image image;
    image.ptr                           = ptr;
    image.offset_first_element_in_bytes = offset_first_element_in_bytes;
    image.stride_x                      = stride_x;
    image.stride_y                      = stride_y;
    // Offset by the work-item's position in the X/Y launch grid.
    image.ptr += image.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
    return image;
}
1345
/** Wrap 3D tensor information into an Image structure and advance its pointer to
 * the data owned by the calling work-item; the Z offset is folded into the pointer.
 *
 * @param[in] ptr                           Pointer to the starting position of the buffer
 * @param[in] offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in] stride_x                      Stride of the tensor in X dimension (in bytes)
 * @param[in] step_x                        stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] stride_y                      Stride of the tensor in Y dimension (in bytes)
 * @param[in] step_y                        stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] stride_z                      Stride of the tensor in Z dimension (in bytes)
 * @param[in] step_z                        stride_z * number of elements along Z processed per workitem (in bytes)
 *
 * @return An Image object whose pointer targets this work-item's data
 */
inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Image image;
    image.ptr                           = ptr;
    image.offset_first_element_in_bytes = offset_first_element_in_bytes;
    image.stride_x                      = stride_x;
    image.stride_y                      = stride_y;
    // Note: stride_z is not stored in Image; the Z displacement is baked into ptr.
    image.ptr += image.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
    return image;
}
1371
/** Wrap 3D tensor information into a Tensor3D structure and advance its pointer
 * to the data owned by the calling work-item.
 *
 * @param[in] ptr                           Pointer to the starting position of the buffer
 * @param[in] offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in] stride_x                      Stride of the tensor in X dimension (in bytes)
 * @param[in] step_x                        stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] stride_y                      Stride of the tensor in Y dimension (in bytes)
 * @param[in] step_y                        stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] stride_z                      Stride of the tensor in Z dimension (in bytes)
 * @param[in] step_z                        stride_z * number of elements along Z processed per workitem (in bytes)
 *
 * @return A Tensor3D object whose pointer targets this work-item's data
 */
inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Tensor3D t3d;
    t3d.ptr                           = ptr;
    t3d.offset_first_element_in_bytes = offset_first_element_in_bytes;
    t3d.stride_x                      = stride_x;
    t3d.stride_y                      = stride_y;
    t3d.stride_z                      = stride_z;
    // Offset by the work-item's position in the X/Y/Z launch grid.
    t3d.ptr += t3d.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
    return t3d;
}
1398
/** Wrap 3D tensor information into a Tensor3D structure WITHOUT advancing the
 * pointer; offset_first_element_in_bytes is kept in the struct for later use.
 *
 * @param[in] ptr                           Pointer to the starting position of the buffer
 * @param[in] offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in] stride_x                      Stride of the tensor in X dimension (in bytes)
 * @param[in] step_x                        Unused; kept for signature parity with update_tensor3D_workitem_ptr
 * @param[in] stride_y                      Stride of the tensor in Y dimension (in bytes)
 * @param[in] step_y                        Unused; kept for signature parity with update_tensor3D_workitem_ptr
 * @param[in] stride_z                      Stride of the tensor in Z dimension (in bytes)
 * @param[in] step_z                        Unused; kept for signature parity with update_tensor3D_workitem_ptr
 *
 * @return A Tensor3D object pointing at the start of the buffer
 */
inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Tensor3D t3d;
    t3d.ptr                           = ptr;
    t3d.offset_first_element_in_bytes = offset_first_element_in_bytes;
    t3d.stride_x                      = stride_x;
    t3d.stride_y                      = stride_y;
    t3d.stride_z                      = stride_z;
    return t3d;
}
1424
/** Wrap 4D tensor information into a Tensor4D structure and advance its pointer
 * to the data owned by the calling work-item. The global Z id encodes both the
 * Z and W coordinates: z = gid2 % mod_size, w = gid2 / mod_size.
 *
 * @param[in] ptr                           Pointer to the starting position of the buffer
 * @param[in] offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in] stride_x                      Stride of the tensor in X dimension (in bytes)
 * @param[in] step_x                        stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] stride_y                      Stride of the tensor in Y dimension (in bytes)
 * @param[in] step_y                        stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] stride_z                      Stride of the tensor in Z dimension (in bytes)
 * @param[in] step_z                        stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in] stride_w                      Stride of the tensor in W dimension (in bytes)
 * @param[in] step_w                        stride_w * number of elements along W processed per workitem (in bytes)
 * @param[in] mod_size                      Divisor used to split the global Z id into Z and W coordinates
 *
 * @return A Tensor4D object whose pointer targets this work-item's data
 */
inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
                                             uint step_w,
                                             uint mod_size)
{
    Tensor4D t4d;
    t4d.ptr                           = ptr;
    t4d.offset_first_element_in_bytes = offset_first_element_in_bytes;
    t4d.stride_x                      = stride_x;
    t4d.stride_y                      = stride_y;
    t4d.stride_z                      = stride_z;
    t4d.stride_w                      = stride_w;

    // X and Y offsets come straight from the launch grid; Z and W are unpacked
    // from the combined global Z id.
    t4d.ptr += t4d.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
    return t4d;
}
1442
/** Compute the byte address of element x of a Vector, relative to its current pointer.
 *
 * @param[in] vec Pointer to the Vector structure
 * @param[in] x   Relative X position (in elements)
 */
inline __global const uchar *vector_offset(const Vector *vec, int x)
{
    return &vec->ptr[x * vec->stride_x];
}
1452
/** Compute the byte address of pixel (x, y) of an Image, relative to its current pointer.
 *
 * @param[in] img Pointer to the Image structure
 * @param[in] x   Relative X position (in elements)
 * @param[in] y   Relative Y position (in elements)
 */
inline __global uchar *offset(const Image *img, int x, int y)
{
    const int byte_offset = x * img->stride_x + y * img->stride_y;
    return img->ptr + byte_offset;
}
1463
/** Compute the byte address of element (x, y, z) of a Tensor3D, relative to its
 * current pointer.
 *
 * @param[in] tensor Pointer to the Tensor3D structure
 * @param[in] x      Relative X position (in elements)
 * @param[in] y      Relative Y position (in elements)
 * @param[in] z      Relative Z position (in elements)
 */
inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
{
    const int byte_offset = x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
    return tensor->ptr + byte_offset;
}
1475
/** Compute the byte address of element (x, y, z, w) of a Tensor4D, relative to
 * its current pointer.
 *
 * @param[in] tensor Pointer to the Tensor4D structure
 * @param[in] x      Relative X position (in elements)
 * @param[in] y      Relative Y position (in elements)
 * @param[in] z      Relative Z position (in elements)
 * @param[in] w      Relative W position (in elements)
 */
inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
{
    const int byte_offset = x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
    return tensor->ptr + byte_offset;
}
1488
/** Resolve a linear element index of a Tensor3D into a byte address.
 * The index is decomposed in row-major order: z over width*height slices,
 * then y over rows of 'width', then x within the row.
 *
 * @param[in] tensor Pointer to the Tensor3D structure
 * @param[in] width  Width of the input tensor (elements)
 * @param[in] height Height of the input tensor (elements)
 * @param[in] depth  Depth of the input tensor (unused in the computation)
 * @param[in] index  Linear element index
 */
inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
{
    const uint slice_elems = width * height;

    const uint z   = index / slice_elems;
    const uint rem = index % slice_elems;
    const uint y   = rem / width;
    const uint x   = rem % width;

    // offset_first_element_in_bytes is applied here because this helper is used
    // with structs whose pointer was not advanced per work-item.
    return tensor->ptr + tensor->offset_first_element_in_bytes + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
}
1513
1514#endif // _HELPER_H
1515
// Vector type processed by the activation functions below (built from the
// DATA_TYPE and VEC_SIZE build options).
#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)

// Default the second (output-side) quantization parameter to the first when only
// one is provided at build time.
#if defined(S1_VAL) && !defined(S2_VAL)
#define S2_VAL S1_VAL
#endif // defined(S1_VAL) && !defined(S2_VAL)
#if defined(O1_VAL) && !defined(O2_VAL)
#define O2_VAL O1_VAL
#endif // defined(O1_VAL) && !defined(O2_VAL)
1524
// RELU Activation: clamp negative values to CONST_0.
inline TYPE relu_op(TYPE x)
{
    const TYPE zero = (TYPE)CONST_0;
    return max(zero, x);
}
// Bounded RELU Activation: clamp to [CONST_0, A_VAL].
inline TYPE brelu_op(TYPE x)
{
    const TYPE lower_clamped = max((TYPE)CONST_0, x);
    return min((TYPE)A_VAL, lower_clamped);
}
// Lower Upper Bounded RELU Activation: clamp to [B_VAL, A_VAL].
inline TYPE lu_brelu_op(TYPE x)
{
    const TYPE lower_clamped = max(x, (TYPE)B_VAL);
    return min(lower_clamped, (TYPE)A_VAL);
}
// Hard Swish Activation: x * relu6(x + 3) / 6, with 1/6 approximated by the
// literal 0.166666667f. relu6 is expressed as min(max(x + 3, 0), 6).
inline TYPE hard_swish_op(TYPE x)
{
    return  (x * ((min(max((TYPE)(x + (TYPE)3.f), (TYPE)0.f), (TYPE)6.f)) * (TYPE)0.166666667f));
}
1545
// ACTIVATION_OP(op, x) expands to <op>_op(x) (e.g. relu -> relu_op); the two-level
// expansion lets 'op' itself be a macro.
#define ACTIVATION_OP2(op, x) op##_op(x)
#define ACTIVATION_OP(op, x) ACTIVATION_OP2(op, x)

// PERFORM_ACTIVATION_QUANT(act, data): apply the activation in-place on 'data',
// then (when scale/offset build options are given) requantize through the float
// domain. Three variants are selected at compile time:
//   - S1/S2 and O1/O2 defined: full requantization with offsets,
//   - only S1/S2 defined:      scale-only requantization,
//   - otherwise:               activation only.
#if defined(S1_VAL) && defined(S2_VAL)
#if defined(O1_VAL) && defined(O2_VAL)
#define PERFORM_ACTIVATION_QUANT(act, data)                                                       \
    ({                                                                                            \
        data = ACTIVATION_OP(act, data);                                                          \
        \
        VEC_DATA_TYPE(float, VEC_SIZE)                                                            \
        fdata = CONVERT(data, VEC_DATA_TYPE(float, VEC_SIZE));                                    \
        \
        fdata = round((fdata - (float)O1_VAL) * ((float)S1_VAL / (float)S2_VAL) + (float)O2_VAL); \
        data  = CONVERT_SAT(fdata, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));                           \
    })
#else // defined(O1_VAL) && defined(O2_VAL)
#define PERFORM_ACTIVATION_QUANT(act, data)                             \
    ({                                                                  \
        data = ACTIVATION_OP(act, data);                                \
        \
        VEC_DATA_TYPE(float, VEC_SIZE)                                  \
        fdata = CONVERT(data, VEC_DATA_TYPE(float, VEC_SIZE));          \
        \
        fdata = round((fdata) * ((float)S1_VAL / (float)S2_VAL));       \
        data  = CONVERT_SAT(fdata, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)); \
    })
#endif /* defined(O1_VAL) && defined(O2_VAL) */
#else  /* defined(S1_VAL) && defined(S2_VAL) */
#define PERFORM_ACTIVATION_QUANT(act, data) \
    ({                                      \
        data = ACTIVATION_OP(act, data);    \
    })
#endif /* defined(S1_VAL) && defined(S2_VAL) */
1579
// Float vector type used for intermediate float-domain computations.
#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
1581
1582#if defined(FLOAT_DOMAIN)
1583// Activations performed in the float domain
1584
1585/*
1586 * Copyright (c) 2019-2020 Arm Limited.
1587 *
1588 * SPDX-License-Identifier: MIT
1589 *
1590 * Permission is hereby granted, free of charge, to any person obtaining a copy
1591 * of this software and associated documentation files (the "Software"), to
1592 * deal in the Software without restriction, including without limitation the
1593 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
1594 * sell copies of the Software, and to permit persons to whom the Software is
1595 * furnished to do so, subject to the following conditions:
1596 *
1597 * The above copyright notice and this permission notice shall be included in all
1598 * copies or substantial portions of the Software.
1599 *
1600 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1601 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1602 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
1603 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
1604 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
1605 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
1606 * SOFTWARE.
1607 */
1608
1609/*
1610 * Copyright (c) 2016-2020 Arm Limited.
1611 *
1612 * SPDX-License-Identifier: MIT
1613 *
1614 * Permission is hereby granted, free of charge, to any person obtaining a copy
1615 * of this software and associated documentation files (the "Software"), to
1616 * deal in the Software without restriction, including without limitation the
1617 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
1618 * sell copies of the Software, and to permit persons to whom the Software is
1619 * furnished to do so, subject to the following conditions:
1620 *
1621 * The above copyright notice and this permission notice shall be included in all
1622 * copies or substantial portions of the Software.
1623 *
1624 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1625 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1626 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
1627 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
1628 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
1629 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
1630 * SOFTWARE.
1631 */
1632#ifndef ARM_COMPUTE_HELPER_H
1633#define ARM_COMPUTE_HELPER_H
1634
1635/*
1636 * Copyright (c) 2020 Arm Limited.
1637 *
1638 * SPDX-License-Identifier: MIT
1639 *
1640 * Permission is hereby granted, free of charge, to any person obtaining a copy
1641 * of this software and associated documentation files (the "Software"), to
1642 * deal in the Software without restriction, including without limitation the
1643 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
1644 * sell copies of the Software, and to permit persons to whom the Software is
1645 * furnished to do so, subject to the following conditions:
1646 *
1647 * The above copyright notice and this permission notice shall be included in all
1648 * copies or substantial portions of the Software.
1649 *
1650 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1651 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1652 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
1653 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
1654 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
1655 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
1656 * SOFTWARE.
1657 */
1658
/** Store the 0 to (n-1)th rows of the given variables
 * @name STORE_ROW_n
 *
 * @param[in] N0        The width of the passed in vector. Supported: 1, 2, 3, 4, 8, 16
 * @param[in] DATA_TYPE The data type of the vectors
 * @param[in] BASENAME  The basename of the variables
 * @param[in] PTR       The base pointer
 * @param[in] STRIDE_Y  The stride value in y-axis direction
 * @param[in] Z         The offset in z-axis direction
 * @{
 */
// Each STORE_ROW_n first expands STORE_ROW_(n-1), then VSTOREs row n-1; rows are
// named BASENAME##0 .. BASENAME##F (hex digits) and stored STRIDE_Y bytes apart,
// with per-row Z offsets Z##0 .. Z##F.
#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)                                                 \
    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
    VSTORE(N0)                                                  \
    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
/** @} */ // end of group STORE_ROW_n
1749
/** Convert and store the 0th to (n-1)th rows of the given variables
 * @name CONVERT_STORE_ROW_n
 *
 * @param[in] N0        The size of the vectors
 * @param[in] DATA_TYPE The data type of the vectors
 * @param[in] BASENAME  The basename of the variables
 * @param[in] PTR       The base pointer
 * @param[in] STRIDE_Y  The stride value in y-axis direction
 * @param[in] Z         The offset in z-axis direction
 * @{
 */
// Same recursive layout as STORE_ROW_n, but each row is saturating-converted to
// DATA_TYPE (CONVERT_SAT) before being stored.
#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
1804
/* CONVERT_STORE_ROW_10: store rows 0..8 via CONVERT_STORE_ROW_9, then
 * saturating-convert row 9 (BASENAME##9) to DATA_TYPE and vstore it.
 * Fix: the second parameter was misnamed 'DATA', so the 'DATA_TYPE' tokens in
 * the body were not bound to the macro argument and only resolved if an
 * unrelated DATA_TYPE build-option macro happened to be defined. Renamed to
 * DATA_TYPE to match every sibling CONVERT_STORE_ROW_n macro. */
#define CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
1809
/* CONVERT_STORE_ROW_n (n = 11..16): same recursive scheme as above.
 * Rows 10..15 use hexadecimal variable suffixes A..F (e.g. BASENAME##A is
 * row 10), matching the naming produced by the corresponding load macros. */
#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
1839
/** @} */ // end of group CONVERT_STORE_ROW_n
1841
1842/** Store a block of the given size M0xN0
1843 * @name STORE_BLOCK
1844 *
1845 * Supported cases are M0=1,2,3,...,16 and N0=2,3,4,8,16.
1846 * The data to store is expected to have consecutive names for each row.
1847 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
1848 * The Z offset is expected to have consecutive names.
1849 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
1850 *
1851 * @param[in] M0        The number of rows to store
1852 * @param[in] N0        The size of each vector
1853 * @param[in] DATA_TYPE The data type of the vectors
1854 * @param[in] BASENAME  The basename of the variables
1855 * @param[in] PTR       The base pointer
1856 * @param[in] STRIDE_Y  The stride value in y-axis direction
1857 * @param[in] Z         The offset in z-axis direction
1858 * @{
1859 */
// Two-level expansion: STORE_BLOCK expands M0 first, then STORE_BLOCK_STR
// token-pastes it to select the concrete STORE_ROW_<M0> macro.
#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1862/** @} */ // end of group STORE_BLOCK
1863
1864/** Convert and store a block of the given size M0xN0
1865 * @name CONVERT_STORE_BLOCK
1866 *
1867 * Supported cases are M0=1,2,3,...,16 and N0=2,3,4,8,16.
1868 * The data to store is expected to have consecutive names for each row.
1869 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
1870 * The Z offset is expected to have consecutive names.
1871 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
1872 *
1873 * @param[in] M0        The number of rows to store
1874 * @param[in] N0        The size of each vector
1875 * @param[in] DATA_TYPE The data type of the vectors
1876 * @param[in] BASENAME  The basename of the variables
1877 * @param[in] PTR       The base pointer
1878 * @param[in] STRIDE_Y  The stride value in y-axis direction
1879 * @param[in] Z         The offset in z-axis direction
1880 * @{
1881 */
// Two-level expansion: CONVERT_STORE_BLOCK expands M0 first, then the _STR
// variant token-pastes it to select the concrete CONVERT_STORE_ROW_<M0> macro.
#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1884/** @} */ // end of group CONVERT_STORE_BLOCK
1885
1886/** Partially store the 0 to (n-1)th rows of the given variables
1887 * @name STORE_ROW_PARTIAL_n
1888 * Within each row, store the lower @p STORE_N0 elements of vectors of width @p N0
1889 *
1890 * @note in case @p STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
1891 *
1892 * @param[in] N0        The width of the passed in vector. Supported: 1, 2, 3, 4, 8, 16
 * @param[in] STORE_N0  The **lower** size of the vectors to store. Supported: 1-16 and <= @p N0
1894 * @param[in] DATA_TYPE The data type of the vectors
1895 * @param[in] BASENAME  The basename of the variables
1896 * @param[in] PTR       The base pointer
1897 * @param[in] STRIDE_Y  The stride value in y-axis direction
1898 * @param[in] Z         The offset in z-axis direction
1899 * @{
1900 */
/* STORE_ROW_PARTIAL_n (n = 1..16): recursively store rows 0..(n-2) via
 * STORE_ROW_PARTIAL_(n-1), then store only the lower STORE_N0 elements of the
 * N0-wide row (n-1) with VSTORE_PARTIAL. Rows 10..15 use hexadecimal variable
 * suffixes A..F (e.g. BASENAME##A is row 10). */
#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
/** @} */ // end of group STORE_ROW_PARTIAL_n
1980
1981/** Partially store a block of the given size STORE_M0xSTORE_N0
1982 * @name STORE_BLOCK_PARTIAL
1983 *
1984 * @note The vector width @p N0 is also required for correct partial storing behaviour.
1985 * @note in case @p STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
1986 *
1987 * The data to store is expected to have consecutive names for each row.
1988 * E.g., for STORE_M0=3 and basename=c, the expected names are c0, c1 and c2.
1989 * The Z offset is expected to have consecutive names.
1990 * E.g., for STORE_M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
1991 *
1992 * @param[in] STORE_M0  The number of rows to store. Supported: 1-16
1993 * @param[in] STORE_N0  The lower number of elements of vectors to store. Supported: 1-16 and <= @p N0
1994 * @param[in] N0        The size of each vector. Supported: 1, 2, 3, 4, 8, 16
1995 * @param[in] DATA_TYPE The data type of the vectors
1996 * @param[in] BASENAME  The basename of the variables
1997 * @param[in] PTR       The base pointer
1998 * @param[in] STRIDE_Y  The stride value in y-axis direction
1999 * @param[in] Z         The offset in z-axis direction
2000 * @{
2001 */
// Two-level expansion: STORE_BLOCK_PARTIAL expands STORE_M0 first, then the
// _STR variant token-pastes it to select STORE_ROW_PARTIAL_<STORE_M0>.
#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
2004/** Store a block that can be partial in both x and y dimensions
2005 *
2006 * @note in cases @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
2007 *
2008 * The data to store is expected to have consecutive names for each row.
2009 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
2010 * The Z offset is expected to have consecutive names.
2011 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
2012 *
2013 * @param[in] M0               The number of rows to store, for non-partial blocks. Supported: 1-16
2014 * @param[in] N0               The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
2015 * @param[in] DATA_TYPE        The data type of the vectors
2016 * @param[in] BASENAME         The basename of the variables
2017 * @param[in] PTR              The base pointer
2018 * @param[in] STRIDE_Y         The stride value in y-axis direction
2019 * @param[in] Z                The offset in z-axis direction
2020 * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported range: [1, @p M0)
2021 * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported range: [1, @p N0)
2022 * @param[in] PARTIAL_COND_Y   Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0.
2023 * @param[in] PARTIAL_COND_X   Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0.
2024 */
// Runtime dispatch over the four partial/full combinations: the two COND
// flags select PARTIAL_STORE_M0/N0 (partial) or M0/N0 (full) per dimension.
// Expands to a bare if/else chain, so invoke it as a statement.
#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y))                                                                                                            \
    {                                                                                                                                                     \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                                           \
    }                                                                                                                                                     \
    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X))                                                                                                        \
    {                                                                                                                                                     \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
    }                                                                                                                                                     \
    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X))                                                                                                        \
    {                                                                                                                                                     \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
    }                                                                                                                                                     \
    else                                                                                                                                                  \
    {                                                                                                                                                     \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                               \
    }
2042/** Store a block that can only be partial in x but not y.
2043 *
2044 * @note in case @p N0 or @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
2045 *
2046 * The data to store is expected to have consecutive names for each row.
2047 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
2048 * The Z offset is expected to have consecutive names.
2049 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
2050 *
2051 * @param[in] M0               The number of rows to store, for non-partial blocks. Supported: 1-16
2052 * @param[in] N0               The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
2053 * @param[in] DATA_TYPE        The data type of the vectors
2054 * @param[in] BASENAME         The basename of the variables
2055 * @param[in] PTR              The base pointer
2056 * @param[in] STRIDE_Y         The stride value in y-axis direction
2057 * @param[in] Z                The offset in z-axis direction
2058 * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported range: [1, @p N0)
2059 * @param[in] PARTIAL_COND_X   Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0.
2060 */
// X-only dispatch: stores PARTIAL_STORE_N0 elements per row when
// PARTIAL_COND_X holds, otherwise the full N0. Expands to an if/else chain.
#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X))                                                                                         \
    {                                                                                                             \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
    }                                                                                                             \
    else                                                                                                          \
    {                                                                                                             \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
    }
2070/** Store a block that can only be partial in y but not x.
2071 *
2072 * @note in case @p N0 or @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
2073 *
2074 * The data to store is expected to have consecutive names for each row.
2075 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
2076 * The Z offset is expected to have consecutive names.
2077 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
2078 *
2079 * @param[in] M0               The number of rows to store, for non-partial blocks. Supported: 1-16
2080 * @param[in] N0               The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
2081 * @param[in] DATA_TYPE        The data type of the vectors
2082 * @param[in] BASENAME         The basename of the variables
2083 * @param[in] PTR              The base pointer
2084 * @param[in] STRIDE_Y         The stride value in y-axis direction
2085 * @param[in] Z                The offset in z-axis direction
2086 * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported range: [1, @p M0)
2087 * @param[in] PARTIAL_COND_Y   Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0.
2088 */
// Y-only dispatch: stores PARTIAL_STORE_M0 rows when PARTIAL_COND_Y holds,
// otherwise the full M0 rows. Expands to an if/else chain.
#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
    if(!(PARTIAL_COND_Y))                                                                                         \
    {                                                                                                             \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
    }                                                                                                             \
    else                                                                                                          \
    {                                                                                                             \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
    }
2098/** @} */ // end of group STORE_BLOCK_PARTIAL
2099
2100#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
2101
2102/** Boundary-aware GEMM block store
2103 * @name STORE_BLOCK_BOUNDARY_AWARE
2104 * This macro assumes the following schemes to achieve boundary-awareness:
2105 *  - Overlapping load in Y axis from lhs tensor. This implies lhs has no padding along y dim.
 *  - Non-overlapping (normal) load from rhs tensor. This implies rhs can have paddings.
 *  - Overlapping load in Y axis from bias tensor. This implies bias has no padding along y dim.
2108 * The macro then ensures that the dst tensor can be stored without any paddings in both x and y dim.
2109 *
2110 * In the y dimension, we place the partial blocks **at the beginning** while in the x dimension, we place the partial
2111 * blocks **at the end**.
2112 * Say, the dst tensor is of shape MxN and we have M0 and N0 as the block size, this is how we define "partial blocks"/
2113 * "boundary block" (we use the 2 terms "partial blocks" and "boundary blocks" interchangeably) and its various parameters:
2114 *
2115 *  *--x-->                         x == 0                        x == 1
2116 *  |                  |<------------------------------N-------------------------->|
2117 *  y                  |<--------------N0------------->|<----PARTIAL_STORE_N0----->|
2118 *  |     -------------#############################################################
2119 *  *     |          | |...............................|...........................|
2120 * y == 0 | PAR_..._M0 |......Boundary block in y......|.Boundary block in x and y.|
2121 *        |          | |...............................|...........................|
2122 *        M          --#############################################################
2123 *        |          | |                               |...........................|
2124 * y == 1 |         M0 |      Non-boundary block       |....Boundary block in x....|
2125 *        |          | |                               |...........................|
2126 *        |------------#############################################################
2127 *
2128 * Then @p PARTIAL_STORE_M0 = M % M0      and @p PARTIAL_STORE_N0 = N % N0
2129 *
2130 * @note in cases @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
2131 *
 * It automatically detects if a given M,N,M0,N0 combination can yield partial blocks in either X or Y dimension,
 * and selects the corresponding store method such that the boundary detection logic is only added when needed.
2134 *
2135 * The data to store is expected to have consecutive names for each row.
2136 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
2137 * The Z offset is expected to have consecutive names.
2138 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
2139 *
2140 * @param[in] M0               The number of rows to store, for non-partial blocks. Supported: 1-16
2141 * @param[in] N0               The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
2142 * @param[in] DATA_TYPE        The data type of the vectors
2143 * @param[in] BASENAME         The basename of the variables
2144 * @param[in] PTR              The base pointer
2145 * @param[in] STRIDE_Y         The stride value in y-axis direction
2146 * @param[in] Z                The offset in z-axis direction
2147 * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported: [0, @p M0)
2148 * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported: [0, @p N0)
2149 * @param[in] PARTIAL_COND_Y   Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0.
2150 * @param[in] PARTIAL_COND_X   Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0.
2151 * @{
2152 */
// Compile-time selection: the preprocessor picks the cheapest store variant
// for the known PARTIAL_STORE_M0/N0 combination, so runtime boundary checks
// are only emitted when partial blocks can actually occur.
#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
// Case1: No partial blocks in either x or y
#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
// Case2: Partial blocks in y
#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)

#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
// Case3: Partial blocks in x
#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)

#else // PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 > 0
// Case4: Partial blocks in both x and y
#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)

#endif // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
2174
2175#endif    // defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
2176/** @} */ // end of group STORE_BLOCK_BOUNDARY_AWARE
2177
2178#if defined(PARTIAL_STORE_M0)
2179/** Compute the start m0 row (LHS, BIAS and DST) in a boundary-aware way so as to avoid padding
2180 * @name COMPUTE_M0_START_ROW
2181 * If there're any partial blocks in y dimension, they are placed at the beginning of the rows.
2182 * This shift amount is added to all rows such that the partial block (at the beginning) overlaps with the subsequent
2183 * blocks in the y dimension to avoid any padding.
2184 * EG: M0=4, PARTIAL_STORE_M0=1:
2185 *                  | Non-overlapping | +M0_ROW_SHIFT (Overlapping)
2186 * block 0 (partial)| start row = 0   | start row = 0
2187 * block 1 (full)   | start row = 4   | start row = 1
2188 * block 2 (full)   | start row = 8   | start row = 5
2189 *
2190 * @param[in] y                Global id of current block in y.
2191 * @param[in] M0               The number of rows to store, for non-partial blocks. Supported: 1-16
2192 * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported: [0, @p M0)
2193 * @{
2194 */
// Shift every block up by (M0 - PARTIAL_STORE_M0) % M0 rows so full blocks
// overlap the leading partial block instead of requiring y padding; the
// max(0, ...) clamp keeps block 0 (the partial one) starting at row 0.
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0))))
#else // defined(PARTIAL_STORE_M0)
// No partial y blocks configured: plain non-overlapping block start row.
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(y * M0))
#endif    // defined(PARTIAL_STORE_M0)
2201/** @} */ // end of group COMPUTE_M0_START_ROW
2202
2203/** Store a vector that can only be partial in x.
2204 *
2205 * @note in case @p vec_size or @p leftover != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
2206 *
2207 * The data to store is expected to end in a 0.
2208 * E.g., for basename=c, the expected name is c0.
2209 *
2210 * @param[in] basename  The name of the variable without trailing 0
2211 * @param[in] data_type The data type of the vector
2212 * @param[in] ptr       The base pointer
2213 * @param[in] vec_size  The vector size if cond = false. Supported: 1, 2, 3, 4, 8, 16
2214 * @param[in] leftover  The vector size if cond = true. Supported range: [1, @p vec_size0)
2215 * @param[in] cond      Condition to select either vec_size0 or vec_size1
2216 * @{
2217 */
// Single-row (M0 = 1) specialization of STORE_BLOCK_PARTIAL_IN_X; STRIDE_Y
// and Z are irrelevant for one row and are passed as 0.
#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)
2220/** @} */ // end of group STORE_VECTOR_SELECT
2221
2222#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
2223#pragma OPENCL EXTENSION cl_khr_fp16 : enable
2224#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
2225
2226#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
2227#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
2228#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
2229
2230#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
2231#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
2232#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
2233
2234#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
2235#pragma OPENCL EXTENSION cl_arm_printf : enable
2236#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
2237
// GPU architecture identifiers used to select architecture-specific code
// paths (values are distinct bit ranges, not sequential enumerators).
#define GPU_ARCH_MIDGARD 0x100
#define GPU_ARCH_BIFROST 0x200
2240
2241/** Concatenate two inputs.
2242 *
2243 * @param[in] a The first input to be concatenated
2244 * @param[in] b The second input to be concatenated
2245 *
2246 * @return The concatenated output
2247 */
#define CONCAT(a, b) a##b // note: pastes tokens without first macro-expanding a or b
2249
2250/** Expand the given vector
2251 *
2252 * @param[in] x The vector to be expanded
2253 *
2254 * @return The expanded output
2255 */
#define EXPAND(x) x // identity macro: forces an extra macro-expansion pass on x
2257
2258/** Clamp the given value between an upper and lower bound.
2259 *
2260 * @param[in] x       The value to be clamped
2261 * @param[in] min_val The lower bound
2262 * @param[in] max_val The upper bound
2263 *
2264 * @return The clamped value.
2265 */
2266#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
2267
/** REVn reverses the given vector whose size is n.
 * @name REVn
 *
 * Implemented with OpenCL swizzles, so the reversal compiles to a pure
 * register permutation (no memory traffic).
 *
 * @param[in] x The vector to be reversed
 *
 * @return The reversed vector
 * @{
 */
#define REV1(x) ((x))
#define REV2(x) ((x).s10)
#define REV3(x) ((x).s210)
#define REV4(x) ((x).s3210)
#define REV8(x) ((x).s76543210)
#define REV16(x) ((x).sFEDCBA9876543210)
/** @} */ // end of group REVn

/** Reverse the given vector.
 * @name REVERSE
 *
 * The *_STR indirection lets @p s be a macro that is expanded before pasting.
 *
 * @param[in] x The vector to be reversed
 * @param[in] s The size of the vector
 *
 * @return The reversed vector
 * @{
 */
#define REVERSE_STR(x, s) REV##s((x))
#define REVERSE(x, s) REVERSE_STR(x, s)
/** @} */ // end of group REVERSE
2296
/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
 * @name ROTs_n
 *
 * Each ROTs_n is the swizzle that moves element i to position (i + n) mod s,
 * i.e. a rotate-right by n lanes done entirely in registers.
 *
 * @param[in] x The vector to be shifted
 *
 * @return The shifted vector
 * @{
 */
#define ROT1_0(x) ((x))

#define ROT2_0(x) ((x))
#define ROT2_1(x) ((x).s10)

#define ROT3_0(x) ((x))
#define ROT3_1(x) ((x).s201)
#define ROT3_2(x) ((x).s120)

#define ROT4_0(x) ((x))
#define ROT4_1(x) ((x).s3012)
#define ROT4_2(x) ((x).s2301)
#define ROT4_3(x) ((x).s1230)

#define ROT8_0(x) ((x))
#define ROT8_1(x) ((x).s70123456)
#define ROT8_2(x) ((x).s67012345)
#define ROT8_3(x) ((x).s56701234)
#define ROT8_4(x) ((x).s45670123)
#define ROT8_5(x) ((x).s34567012)
#define ROT8_6(x) ((x).s23456701)
#define ROT8_7(x) ((x).s12345670)

#define ROT16_0(x) ((x))
#define ROT16_1(x) ((x).sF0123456789ABCDE)
#define ROT16_2(x) ((x).sEF0123456789ABCD)
#define ROT16_3(x) ((x).sDEF0123456789ABC)
#define ROT16_4(x) ((x).sCDEF0123456789AB)
#define ROT16_5(x) ((x).sBCDEF0123456789A)
#define ROT16_6(x) ((x).sABCDEF0123456789)
#define ROT16_7(x) ((x).s9ABCDEF012345678)
#define ROT16_8(x) ((x).s89ABCDEF01234567)
#define ROT16_9(x) ((x).s789ABCDEF0123456)
#define ROT16_10(x) ((x).s6789ABCDEF012345)
#define ROT16_11(x) ((x).s56789ABCDEF01234)
#define ROT16_12(x) ((x).s456789ABCDEF0123)
#define ROT16_13(x) ((x).s3456789ABCDEF012)
#define ROT16_14(x) ((x).s23456789ABCDEF01)
#define ROT16_15(x) ((x).s123456789ABCDEF0)
/** @} */ // end of group ROTs_n

/** Circular-right-shift (rotate-right) the given vector by the given amount.
 * @name ROTATE
 *
 * The *_STR indirection expands @p s and @p n before token pasting, so both
 * may themselves be macros.
 *
 * @param[in] x The vector to be shifted
 * @param[in] s The size of the vector
 * @param[in] n The amount to be shifted
 *
 * @return The shifted vector
 * @{
 */
#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
/** @} */ // end of group ROTATE
2359
/** Creates a vector of size n filled with offset values corresponding to the location of each element.
 * @name V_OFFSn
 *
 * e.g. V_OFFS4(int) expands to (int4)(0, 1, 2, 3).
 *
 * @param[in] dt The data type of the output vector
 *
 * @return The vector filled with offset values
 * @{
 */
#define V_OFFS1(dt) (dt##1)(0)
#define V_OFFS2(dt) (dt##2)(0, 1)
#define V_OFFS3(dt) (dt##3)(0, 1, 2)
#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3)
#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
/** @} */ // end of group V_OFFSn

/** Create a vector filled with offset values corresponding to the location of each element.
 * @name VEC_OFFS
 *
 * The *_STR indirection expands @p s before pasting, so the size may be a macro.
 *
 * @param[in] dt The data type of the output vector
 * @param[in] s  The size of the output vector
 *
 * @return The vector filled with offset values
 * @{
 */
#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
/** @} */ // end of group VEC_OFFS
2388
// Size-parameterised wrapper over the OpenCL vloadN builtins, e.g. VLOAD(4) -> vload4.
#define VLOAD_STR(size) vload##size
#define VLOAD(size) VLOAD_STR(size)

// Number of pixels covered by a vector of the given width when one pixel is an
// RGBA (4-channel) image texel: vec_size / 4.
#define PIXEL_UNIT4 1
#define PIXEL_UNIT8 2
#define PIXEL_UNIT16 4

/** Utility macro to convert a vector size in pixel unit.
 *
 * @name CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT
 *
 * @param[in] vec_size Vector size. Only 4,8 and 16 is supported
 *
 * @return The pixel unit (number of pixels)
 * @{
 */
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)
/** @} */ // end of group CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT
2408
// Read 1/2/4 consecutive RGBA texels starting at (x_coord, y_coord) and pack
// them into a single float4/float8/float16.
// NOTE(review): the trailing semicolons are part of the expansion; call sites
// that add their own ';' produce a harmless empty statement — confirm intended.
#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));

// Half-precision variants, compiled only when fp16 support is enabled above.
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
2418
/** Utility macro to read a 2D OpenCL image object.
 *
 * @note Coordinates are not normalized
 *
 * @param[in] data_type Data type
 * @param[in] n0        Number of pixel to read. Only 1,2 and 4 is supported
 * @param[in] img       OpenCL image object
 * @param[in] x_coord   The x coordinate for the top-left pixel
 * @param[in] y_coord   The y coordinate for the top-left pixel
 *
 * @return Pixels from the 2D OpenCL image object
 * @{
 */
#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)

// Size-parameterised wrapper over the OpenCL vstoreN builtins, e.g. VSTORE(4) -> vstore4.
#define VSTORE_STR(size) vstore##size
#define VSTORE(size) VSTORE_STR(size)
2437
// "Width 1" aliases so that VEC_DATA_TYPE(type, 1) degenerates to the plain
// scalar type, letting vector-generic code handle the scalar case uniformly.
#define float1 float
#define half1 half
#define char1 char
#define uchar1 uchar
#define short1 short
#define ushort1 ushort
#define int1 int
#define uint1 uint
#define long1 long
#define ulong1 ulong
#define double1 double

// Scalar counterparts of vloadN/vstoreN: a width-1 "vector" load/store is a
// plain dereference at PTR + OFFSET.
#define vload1(OFFSET, PTR) *(OFFSET + PTR)
#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
2452
/** Extended partial vstore that correctly handles scalar values as well.
 * Store the **lower** 0 to (n-1)th elements of the given vector while minimising the amount of vstore ops
 * @name VSTORE_PARTIAL
 *
 * @note With this macro, the passed data can be both a vector and a scalar
 * @note @p store_size needs to be <= @p size
 * eg 1: Valid
 * VSTORE_PARTIAL(16, 15) ...;
 * eg 2: Invalid
 * VSTORE_PARTIAL(4, 7) ...;
 *
 * @param[in] size       The width of @p DATA. Supported values: 1(scalar), 2, 3, 4, 8, 16
 * @param[in] store_size The number of lower elements to store. Supported values: 1-16, but has to be <= @p size
 * @{
 */
#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)

// No-op used for invalid (store_size > size) or zero-sized combinations in the
// dispatch tables below; expands to an empty block so call sites stay legal.
#define NO_STORE(data, offs, ptr) \
    {                             \
    }

// Dispatch tables: vstore_partial_<size>_<store_size> maps to the real
// implementation when store_size <= size, and to NO_STORE otherwise.
// Size == 1 (scalar)
#define vstore_partial_1_0 NO_STORE
#define vstore_partial_1_1 vstore1
#define vstore_partial_1_2 NO_STORE
#define vstore_partial_1_3 NO_STORE
#define vstore_partial_1_4 NO_STORE
#define vstore_partial_1_5 NO_STORE
#define vstore_partial_1_6 NO_STORE
#define vstore_partial_1_7 NO_STORE
#define vstore_partial_1_8 NO_STORE
#define vstore_partial_1_9 NO_STORE
#define vstore_partial_1_10 NO_STORE
#define vstore_partial_1_11 NO_STORE
#define vstore_partial_1_12 NO_STORE
#define vstore_partial_1_13 NO_STORE
#define vstore_partial_1_14 NO_STORE
#define vstore_partial_1_15 NO_STORE
#define vstore_partial_1_16 NO_STORE
// Size == 2
#define vstore_partial_2_0 NO_STORE
#define vstore_partial_2_1 vstore_partial_1
#define vstore_partial_2_2 vstore_partial_2
#define vstore_partial_2_3 NO_STORE
#define vstore_partial_2_4 NO_STORE
#define vstore_partial_2_5 NO_STORE
#define vstore_partial_2_6 NO_STORE
#define vstore_partial_2_7 NO_STORE
#define vstore_partial_2_8 NO_STORE
#define vstore_partial_2_9 NO_STORE
#define vstore_partial_2_10 NO_STORE
#define vstore_partial_2_11 NO_STORE
#define vstore_partial_2_12 NO_STORE
#define vstore_partial_2_13 NO_STORE
#define vstore_partial_2_14 NO_STORE
#define vstore_partial_2_15 NO_STORE
#define vstore_partial_2_16 NO_STORE
// Size == 3
#define vstore_partial_3_0 NO_STORE
#define vstore_partial_3_1 vstore_partial_1
#define vstore_partial_3_2 vstore_partial_2
#define vstore_partial_3_3 vstore_partial_3
#define vstore_partial_3_4 NO_STORE
#define vstore_partial_3_5 NO_STORE
#define vstore_partial_3_6 NO_STORE
#define vstore_partial_3_7 NO_STORE
#define vstore_partial_3_8 NO_STORE
#define vstore_partial_3_9 NO_STORE
#define vstore_partial_3_10 NO_STORE
#define vstore_partial_3_11 NO_STORE
#define vstore_partial_3_12 NO_STORE
#define vstore_partial_3_13 NO_STORE
#define vstore_partial_3_14 NO_STORE
#define vstore_partial_3_15 NO_STORE
#define vstore_partial_3_16 NO_STORE
// Size == 4
#define vstore_partial_4_0 NO_STORE
#define vstore_partial_4_1 vstore_partial_1
#define vstore_partial_4_2 vstore_partial_2
#define vstore_partial_4_3 vstore_partial_3
#define vstore_partial_4_4 vstore_partial_4
#define vstore_partial_4_5 NO_STORE
#define vstore_partial_4_6 NO_STORE
#define vstore_partial_4_7 NO_STORE
#define vstore_partial_4_8 NO_STORE
#define vstore_partial_4_9 NO_STORE
#define vstore_partial_4_10 NO_STORE
#define vstore_partial_4_11 NO_STORE
#define vstore_partial_4_12 NO_STORE
#define vstore_partial_4_13 NO_STORE
#define vstore_partial_4_14 NO_STORE
#define vstore_partial_4_15 NO_STORE
#define vstore_partial_4_16 NO_STORE
// Size == 8
#define vstore_partial_8_0 NO_STORE
#define vstore_partial_8_1 vstore_partial_1
#define vstore_partial_8_2 vstore_partial_2
#define vstore_partial_8_3 vstore_partial_3
#define vstore_partial_8_4 vstore_partial_4
#define vstore_partial_8_5 vstore_partial_5
#define vstore_partial_8_6 vstore_partial_6
#define vstore_partial_8_7 vstore_partial_7
#define vstore_partial_8_8 vstore_partial_8
#define vstore_partial_8_9 NO_STORE
#define vstore_partial_8_10 NO_STORE
#define vstore_partial_8_11 NO_STORE
#define vstore_partial_8_12 NO_STORE
#define vstore_partial_8_13 NO_STORE
#define vstore_partial_8_14 NO_STORE
#define vstore_partial_8_15 NO_STORE
#define vstore_partial_8_16 NO_STORE
// Size == 16
#define vstore_partial_16_0 NO_STORE
#define vstore_partial_16_1 vstore_partial_1
#define vstore_partial_16_2 vstore_partial_2
#define vstore_partial_16_3 vstore_partial_3
#define vstore_partial_16_4 vstore_partial_4
#define vstore_partial_16_5 vstore_partial_5
#define vstore_partial_16_6 vstore_partial_6
#define vstore_partial_16_7 vstore_partial_7
#define vstore_partial_16_8 vstore_partial_8
#define vstore_partial_16_9 vstore_partial_9
#define vstore_partial_16_10 vstore_partial_10
#define vstore_partial_16_11 vstore_partial_11
#define vstore_partial_16_12 vstore_partial_12
#define vstore_partial_16_13 vstore_partial_13
#define vstore_partial_16_14 vstore_partial_14
#define vstore_partial_16_15 vstore_partial_15
#define vstore_partial_16_16 vstore_partial_16
2583
/** Partial vstore. Store the **lower** 0 to (n-1)th elements of the given vector while minimising the amount of vstore ops
 * @name vstore_partial_n
 *
 * Non-power-of-two widths are decomposed greedily into the largest native
 * vstore plus a smaller partial store of the remainder (e.g. 13 = 8 + 5 = 8 + 4 + 1).
 *
 * @note @p DATA needs to be a vector not a scalar
 * @note n needs to be <= the vector width of the input variable @p DATA
 * eg 1: Valid
 * vstore_partial_15(var:float16, 0, 0xabcd);
 * eg 2: Invalid
 * vstore_partial_7(var:float4, 0, 0xabcd);
 *
 * @note in cases n == 1, 2, 3, 4, 8, 16, no extra vstore is invoked, thus there's no performance penalty.
 *
 * @param[in] DATA   The name of the variable
 * @param[in] OFFSET Offset in n
 * @param[in] PTR    The base pointer
 * @{
 */
#define vstore_partial_1(DATA, OFFSET, PTR) \
    vstore1(DATA.s0, OFFSET, PTR);

#define vstore_partial_2(DATA, OFFSET, PTR) \
    vstore2(DATA.s01, OFFSET, PTR);

#define vstore_partial_3(DATA, OFFSET, PTR) \
    vstore3(DATA.s012, OFFSET, PTR);

#define vstore_partial_4(DATA, OFFSET, PTR) \
    vstore4(DATA.s0123, OFFSET, PTR);

// 5..7: one vstore4 of the low lanes plus a partial store of the remainder at PTR + 4.
#define vstore_partial_5(DATA, OFFSET, PTR)    \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore1(DATA.s4, OFFSET, PTR + 4);

#define vstore_partial_6(DATA, OFFSET, PTR)    \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore_partial_2(DATA.s45, OFFSET, PTR + 4);

#define vstore_partial_7(DATA, OFFSET, PTR)    \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore_partial_3(DATA.s456, OFFSET, PTR + 4);

#define vstore_partial_8(DATA, OFFSET, PTR) \
    vstore8(DATA.s01234567, OFFSET, PTR);

// 9..15: one vstore8 of the low lanes plus a partial store of the remainder at PTR + 8.
// Note: passing DATA.s89abcdef (8 lanes) to a smaller vstore_partial_N only
// uses that swizzle's lowest N lanes, which is exactly the remainder.
#define vstore_partial_9(DATA, OFFSET, PTR)        \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore1(DATA.s8, OFFSET, PTR + 8);

#define vstore_partial_10(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_2(DATA.s89, OFFSET, PTR + 8);

#define vstore_partial_11(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_3(DATA.s89a, OFFSET, PTR + 8);

#define vstore_partial_12(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8);

#define vstore_partial_13(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_14(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_15(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_16(DATA, OFFSET, PTR) \
    vstore16(DATA, OFFSET, PTR);
/** @} */ // end of groupd vstore_partial_n
/** @} */ // end of groupd VSTORE_PARTIAL
2660
// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
// without _sat to overcome this issue
#define convert_float_sat convert_float
#define convert_float1_sat convert_float
#define convert_float2_sat convert_float2
#define convert_float3_sat convert_float3
#define convert_float4_sat convert_float4
#define convert_float8_sat convert_float8
#define convert_float16_sat convert_float16
// NOTE(review): maps to convert_float rather than convert_half — looks like a
// typo, but it matches the surrounding codebase; confirm before changing.
#define convert_half_sat convert_float
#define convert_half1_sat convert_half
#define convert_half2_sat convert_half2
#define convert_half3_sat convert_half3
#define convert_half4_sat convert_half4
#define convert_half8_sat convert_half8
#define convert_half16_sat convert_half16

// Width-1 convert aliases so CONVERT(x, type##1) degenerates to the scalar builtin.
#define convert_float1 convert_float
#define convert_half1 convert_half
#define convert_char1 convert_char
#define convert_uchar1 convert_uchar
#define convert_short1 convert_short
#define convert_ushort1 convert_ushort
#define convert_int1 convert_int
#define convert_uint1 convert_uint
#define convert_long1 convert_long
#define convert_ulong1 convert_ulong
#define convert_double1 convert_double

// Width-1 saturating convert aliases for the integer types (which do support _sat).
#define convert_char1_sat convert_char_sat
#define convert_uchar1_sat convert_uchar_sat
#define convert_short1_sat convert_short_sat
#define convert_ushort1_sat convert_ushort_sat
#define convert_int1_sat convert_int_sat
#define convert_uint1_sat convert_uint_sat
#define convert_long1_sat convert_long_sat
#define convert_ulong1_sat convert_ulong_sat
#define convert_double1_sat convert_double_sat
2699
// Build a vector type name from element type and width, e.g. VEC_DATA_TYPE(float, 4) -> float4.
#define VEC_DATA_TYPE_STR(type, size) type##size
#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)

// Type-parameterised wrappers over the OpenCL convert_* builtins.
#define CONVERT_STR(x, type) (convert_##type((x)))
#define CONVERT(x, type) CONVERT_STR(x, type)

#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)

#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)

// Comparison-result vector type to use with select() for each element type.
// Floating-point types map to the integer type of the same element size
// (half -> short, float -> int), as select() requires an integer mask vector.
#define select_vec_dt_uchar(size) uchar##size
#define select_vec_dt_char(size) char##size
#define select_vec_dt_ushort(size) ushort##size
#define select_vec_dt_short(size) short##size
#define select_vec_dt_half(size) short##size
#define select_vec_dt_uint(size) uint##size
#define select_vec_dt_int(size) int##size
#define select_vec_dt_float(size) int##size
#define select_vec_dt_ulong(size) ulong##size
#define select_vec_dt_long(size) long##size

#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)
2726
// Horizontal sum of a vector, defined recursively by halving the vector width.
#define sum_reduce_1(x) (x)
#define sum_reduce_2(x) ((x).s0) + ((x).s1)
#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2)
#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567)
#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF)

#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x)
#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)

// Horizontal maximum of a vector, same recursive halving scheme using max().
#define max_reduce_1(x) (x)
#define max_reduce_2(x) max(((x).s0), ((x).s1))
#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))

#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)
2746
// Expand to the kernel-parameter list describing a 1D buffer named `name`:
// data pointer, per-dimension stride/step pairs and the first-element offset.
#define VECTOR_DECLARATION(name)     \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_offset_first_element_in_bytes

// Kernel-parameter list for a 2D image buffer named `name`.
#define IMAGE_DECLARATION(name)      \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_stride_y, \
    uint        name##_step_y,   \
    uint        name##_offset_first_element_in_bytes

// Kernel-parameter list for a 3D tensor buffer named `name`.
#define TENSOR3D_DECLARATION(name)   \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_stride_y, \
    uint        name##_step_y,   \
    uint        name##_stride_z, \
    uint        name##_step_z,   \
    uint        name##_offset_first_element_in_bytes

// Kernel-parameter list for a 4D tensor buffer named `name`.
#define TENSOR4D_DECLARATION(name)   \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_stride_y, \
    uint        name##_step_y,   \
    uint        name##_stride_z, \
    uint        name##_step_z,   \
    uint        name##_stride_w, \
    uint        name##_step_w,   \
    uint        name##_offset_first_element_in_bytes
2782
// Build a Vector struct from the parameters declared by VECTOR_DECLARATION(name),
// advancing the pointer to this workitem's data.
#define CONVERT_TO_VECTOR_STRUCT(name) \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)

// Same, but with a zero step so the pointer is NOT advanced per workitem.
#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)

// Build an Image struct from the parameters declared by IMAGE_DECLARATION(name).
#define CONVERT_TO_IMAGE_STRUCT(name) \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)

#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
2794
// Build an Image struct from the parameters declared by TENSOR3D_DECLARATION(name),
// folding the Z dimension into the per-workitem pointer offset.
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)

// NOTE(review): unlike the other *_NO_STEP helpers, step_z is deliberately kept
// non-zero here, so only the X/Y steps are suppressed — confirm against callers.
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)

// A second, byte-identical definition of CONVERT_TENSOR3D_TO_IMAGE_STRUCT that
// followed here was removed as redundant (identical redefinition is legal but noise).
2803
// Build a Tensor3D struct from the parameters declared by TENSOR3D_DECLARATION(name),
// advancing the pointer to this workitem's data.
#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z)

// Same, but with zero steps so the pointer is NOT advanced per workitem.
#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)

// Build a Tensor4D struct; `mod_size` is forwarded to update_tensor4D_workitem_ptr.
#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)

#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)

// Build a Tensor3D struct WITHOUT moving the pointer (pointer stays at the buffer start).
#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name)                                                                                       \
    tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                           name##_stride_z, name##_step_z)
2821
/** Structure to hold Vector information */
typedef struct Vector
{
    __global uchar *ptr;                           /**< Pointer to the starting position of the buffer */
    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
} Vector;

/** Structure to hold Image information */
typedef struct Image
{
    __global uchar *ptr;                           /**< Pointer to the starting position of the buffer */
    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
} Image;

/** Structure to hold 3D tensor information */
typedef struct Tensor3D
{
    __global uchar *ptr;                           /**< Pointer to the starting position of the buffer */
    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
} Tensor3D;

/** Structure to hold 4D tensor information */
typedef struct Tensor4D
{
    __global uchar *ptr;                           /**< Pointer to the starting position of the buffer */
    int             offset_first_element_in_bytes; /**< The offset of the first element in the source image */
    int             stride_x;                      /**< Stride of the image in X dimension (in bytes) */
    int             stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
    int             stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
    int             stride_w;                      /**< Stride of the image in W dimension (in bytes) */
} Tensor4D;
2859
2860/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
2861 *
2862 * @param[in] ptr                           Pointer to the starting postion of the buffer
2863 * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
2864 * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
2865 * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
2866 *
2867 * @return An image object
2868 */
2869inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
2870{
2871    Vector vector =
2872    {
2873        .ptr                           = ptr,
2874        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2875        .stride_x                      = stride_x,
2876    };
2877    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
2878    return vector;
2879}
2880
2881/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
2882 *
2883 * @param[in] ptr                           Pointer to the starting postion of the buffer
2884 * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
2885 * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
2886 * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
2887 * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
2888 * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
2889 *
2890 * @return An image object
2891 */
2892inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
2893{
2894    Image img =
2895    {
2896        .ptr                           = ptr,
2897        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2898        .stride_x                      = stride_x,
2899        .stride_y                      = stride_y
2900    };
2901    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
2902    return img;
2903}
2904
2905/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
2906 *
2907 * @param[in] ptr                           Pointer to the starting postion of the buffer
2908 * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
2909 * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
2910 * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
2911 * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
2912 * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
2913 * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
2914 * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
2915 *
2916 * @return A 3D tensor object
2917 */
2918inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
2919{
2920    Image img =
2921    {
2922        .ptr                           = ptr,
2923        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2924        .stride_x                      = stride_x,
2925        .stride_y                      = stride_y
2926    };
2927    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
2928    return img;
2929}
2930
2931/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
2932 *
2933 * @param[in] ptr                           Pointer to the starting postion of the buffer
2934 * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
2935 * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
2936 * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
2937 * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
2938 * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
2939 * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
2940 * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
2941 *
2942 * @return A 3D tensor object
2943 */
2944inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
2945{
2946    Tensor3D tensor =
2947    {
2948        .ptr                           = ptr,
2949        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2950        .stride_x                      = stride_x,
2951        .stride_y                      = stride_y,
2952        .stride_z                      = stride_z
2953    };
2954    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
2955    return tensor;
2956}
2957
2958/** Wrap 3D tensor information into an tensor structure.
2959 *
2960 * @param[in] ptr                           Pointer to the starting postion of the buffer
2961 * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
2962 * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
2963 * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
2964 * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
2965 * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
2966 * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
2967 * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
2968 *
2969 * @return A 3D tensor object
2970 */
2971inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
2972{
2973    Tensor3D tensor =
2974    {
2975        .ptr                           = ptr,
2976        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2977        .stride_x                      = stride_x,
2978        .stride_y                      = stride_y,
2979        .stride_z                      = stride_z
2980    };
2981    return tensor;
2982}
2983
/** Wrap 4D tensor information into a tensor structure, and make the pointer point at this workitem's data.
 *
 * The Z and W coordinates are both packed into global id 2:
 * (get_global_id(2) % mod_size) selects the Z position and
 * (get_global_id(2) / mod_size) selects the W position.
 *
 * @param[in] ptr                           Pointer to the starting position of the buffer
 * @param[in] offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in] stride_x                      Stride of the tensor in X dimension (in bytes)
 * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] stride_y                      Stride of the tensor in Y dimension (in bytes)
 * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] stride_z                      Stride of the tensor in Z dimension (in bytes)
 * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] stride_w                      Stride of the tensor in W dimension (in bytes)
 * @param[in] step_w                        stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in] mod_size                      Divisor used to unpack Z and W from global id 2 (presumably the tensor depth — confirm with callers)
 *
 * @return A 4D tensor object
 */
inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
                                             uint step_w,
                                             uint mod_size)
{
    Tensor4D tensor =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
        .stride_y                      = stride_y,
        .stride_z                      = stride_z,
        .stride_w                      = stride_w
    };

    // Advance to this workitem's first element; Z and W are both derived from global id 2.
    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
    return tensor;
}
3001
3002/** Get the pointer position of a Vector
3003 *
3004 * @param[in] vec Pointer to the starting position of the buffer
3005 * @param[in] x   Relative X position
3006 */
3007inline __global const uchar *vector_offset(const Vector *vec, int x)
3008{
3009    return vec->ptr + x * vec->stride_x;
3010}
3011
3012/** Get the pointer position of a Image
3013 *
3014 * @param[in] img Pointer to the starting position of the buffer
3015 * @param[in] x   Relative X position
3016 * @param[in] y   Relative Y position
3017 */
3018inline __global uchar *offset(const Image *img, int x, int y)
3019{
3020    return img->ptr + x * img->stride_x + y * img->stride_y;
3021}
3022
3023/** Get the pointer position of a Tensor3D
3024 *
3025 * @param[in] tensor Pointer to the starting position of the buffer
3026 * @param[in] x      Relative X position
3027 * @param[in] y      Relative Y position
3028 * @param[in] z      Relative Z position
3029 */
3030inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
3031{
3032    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
3033}
3034
3035/** Get the pointer position of a Tensor4D
3036 *
3037 * @param[in] tensor Pointer to the starting position of the buffer
3038 * @param[in] x      Relative X position
3039 * @param[in] y      Relative Y position
3040 * @param[in] z      Relative Z position
3041 * @param[in] w      Relative W position
3042 */
3043inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
3044{
3045    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
3046}
3047
3048/** Get the offset for a given linear index of a Tensor3D
3049 *
3050 * @param[in] tensor Pointer to the starting position of the buffer
3051 * @param[in] width  Width of the input tensor
3052 * @param[in] height Height of the input tensor
3053 * @param[in] depth  Depth of the input tensor
3054 * @param[in] index  Linear index
3055 */
3056inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
3057{
3058    uint num_elements = width * height;
3059
3060    const uint z = index / num_elements;
3061
3062    index %= num_elements;
3063
3064    const uint y = index / width;
3065
3066    index %= width;
3067
3068    const uint x = index;
3069
3070    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes;
3071}
3072
3073#endif // _HELPER_H
3074
#if GPU_ARCH == GPU_ARCH_BIFROST
// Multiply-accumulate a + b * c, expressed as a single fused fma on Bifrost GPUs.
#define MLA(a, b, c) (fma(c, b, a))
#else // GPU_ARCH == GPU_ARCH_BIFROST
// Generic multiply-accumulate fallback: a + b * c.
#define MLA(a, b, c) ((b) * (c) + (a))
#endif // GPU_ARCH == GPU_ARCH_BIFROST
3080
// Activation-function macros. Each takes (DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL); A_VAL/B_VAL are the
// optional activation parameters. All macro arguments are parenthesized at every use so that callers
// may pass compound expressions without precedence surprises (e.g. square_op on `a + b`).

// Hard-Swish: x * relu6(x + 3) / 6 (0.166666667 approximates 1/6)
#define hard_swish_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((x) * ((min(max(((x) + (DATA_TYPE)3.0), (DATA_TYPE)0.0), (DATA_TYPE)6.0)) * (DATA_TYPE)0.166666667))

// Logistic Activation: 1 / (1 + e^-x)
#define logistic_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((DATA_TYPE)1.0 / ((DATA_TYPE)1.0 + exp(-(x))))

// Hyperbolic Tangent Activation: A * tanh(B * x)
#define tanh_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((DATA_TYPE)(A_VAL) * tanh((DATA_TYPE)(B_VAL) * (x)))

// RELU Activation: max(0, x)
#define relu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (max((DATA_TYPE)0.0, (x)))

// Bounded RELU Activation: min(A, max(0, x))
#define brelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (min((DATA_TYPE)(A_VAL), max((DATA_TYPE)0.0, (x))))

// Lower Upper Bounded RELU Activation: clamp x to [B, A]
#define lu_brelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (min(max((x), (DATA_TYPE)(B_VAL)), (DATA_TYPE)(A_VAL)))

// Leaky RELU Activation: x if x > 0, else A * x
#define lrelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((min((x), (DATA_TYPE)0.0) * (DATA_TYPE)(A_VAL)) + max((x), (DATA_TYPE)0.0))

// Soft RELU Activation: log(1 + e^x)
#define srelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (log((DATA_TYPE)1.0 + exp((x))))

// ELU Activation: x if x >= 0, else A * (e^x - 1)
#define elu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (select(((DATA_TYPE)(A_VAL) * (exp((x)) - (DATA_TYPE)1.0)), (x), (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))isgreaterequal((x), (DATA_TYPE)0.0)))

// Absolute Activation: |x|
#define abs_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (fabs((x)))

// Square Activation: x^2
#define square_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((x) * (x))

// Square-root Activation
#define sqrt_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (sqrt((x)))

// Linear Activation: A * x + B (via MLA)
#define linear_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (MLA((DATA_TYPE)(B_VAL), (DATA_TYPE)(A_VAL), (x)))

// Identity Activation: pass-through
#define identity_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x)

// Dispatch to the op named by `op` (token-pasted to <op>_op)
#define ACT_OP(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) op##_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL)

// Extra expansion level so `op` is macro-expanded before pasting
#define ACTIVATION(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ACT_OP(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL)
3126
/** This performs an activation function on quantized inputs with float transformations.
 *
 * @note In order to perform the activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
 *
 * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
 * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.
 * @note Quantization scales of the input/output tensors are passed in with -DS1_VAL= and -DS2_VAL= respectively.
 * @note Quantization offsets of the input/output tensors are passed in only if asymmetric with -DO1_VAL= and -DO2_VAL= respectively.
 * @note Quantized value of constant zero should be given as a preprocessor argument using -DCONST_0=value. e.g. -DCONST_0=128.
 *
 * @param[in]  input_ptr                            Pointer to the source image. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM16
 * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
 * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
 * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
 * @param[out] output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
 * @param[in]  output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
 * @param[in]  output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
 * @param[in]  output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
 */
__kernel void activation_layer_quant_f32(
    TENSOR3D_DECLARATION(input)
#ifndef IN_PLACE
    ,
    TENSOR3D_DECLARATION(output)
#endif /* not IN_PLACE */
)
{
    // X offset in bytes for this workitem. Workitems are shifted back by the leftover amount so the
    // final partial vector can still be processed as a full VEC_SIZE vector; max(..., 0) clamps the
    // first workitem to the row start.
    uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0);

    // Get pixels pointer
    __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z;
#ifdef IN_PLACE
    __global uchar *output_addr = input_addr;
#else  /* IN_PLACE */
    __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z;
#endif /* IN_PLACE */

    // Load data
    TYPE data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr);

    // Dequantize to float: subtract the input offset (asymmetric only) and multiply by the input scale
    VEC_FLOAT data_flt = CONVERT(data0, VEC_FLOAT);
#if defined(O1_VAL)
    data_flt = round(data_flt - (float)O1_VAL) * ((float)S1_VAL);
#else  // defined(O1_VAL)
    data_flt                    = round(data_flt) * ((float)S1_VAL);
#endif // defined(O1_VAL)
    // Apply the requested activation in the float domain
    data_flt = ACTIVATION(ACT, float, VEC_SIZE, data_flt, A_VAL, B_VAL);

    // Requantize: divide by the output scale, add the output offset (asymmetric only), saturate-convert back
#if defined(O2_VAL)
    data0 = CONVERT_SAT(round(data_flt / ((float)S2_VAL)) + (float)O2_VAL, TYPE);
#else  // defined(O2_VAL)
    data0                       = CONVERT_SAT(round(data_flt / ((float)S2_VAL)), TYPE);
#endif // defined(O2_VAL)

    // Store result
    STORE_VECTOR_SELECT(data, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
}
3194
3195#else // defined(FLOAT_DOMAIN)
3196// Activations performed in the quantized domain
3197
3198#if defined(ACT)
/** This performs an activation function on quantized inputs.
 *
 * @note In order to perform the activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
 *
 * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
 * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
 * @note Activation function should be given as a preprocessor argument using -DACT=name. e.g. -DACT=TANH
 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.
 * @note Quantization scales of the input/output tensors are passed in with -DS1_VAL= and -DS2_VAL= respectively.
 * @note Quantization offsets of the input/output tensors are passed in with -DO1_VAL= and -DO2_VAL= respectively.
 * @note Quantized value of constant zero should be given as a preprocessor argument using -DCONST_0=value. e.g. -DCONST_0=128.
 *
 * @param[in]  input_ptr                            Pointer to the source image. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM16
 * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
 * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
 * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
 * @param[out] output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
 * @param[in]  output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
 * @param[in]  output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
 * @param[in]  output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
 */
__kernel void activation_layer_quant(
    TENSOR3D_DECLARATION(input)
#ifndef IN_PLACE
    ,
    TENSOR3D_DECLARATION(output)
#endif /* not IN_PLACE */
)
{
    // X offset in bytes for this workitem. Workitems are shifted back by the leftover amount so the
    // final partial vector can still be processed as a full VEC_SIZE vector; max(..., 0) clamps the
    // first workitem to the row start.
    uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0);

    // Get pixels pointer
    __global uchar *input_addr  = input_ptr + input_offset_first_element_in_bytes + x_offs + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z;
#ifdef IN_PLACE
    __global uchar *output_addr = input_addr;
#else  /* IN_PLACE */
    __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z;
#endif /* IN_PLACE */

    // Load data
    TYPE data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr);

    // Apply the activation entirely in the quantized domain (no float round-trip)
    data0 = PERFORM_ACTIVATION_QUANT(ACT, data0);

    // Store result
    STORE_VECTOR_SELECT(data, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
}
3255#endif // defined(ACT)
3256#endif // defined(FLOAT_DOMAIN)
3257
3258)"