• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1R"(
2
3
4
5#ifndef ARM_COMPUTE_HELPER_H
6#define ARM_COMPUTE_HELPER_H
7
8
9
10
11#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
12    VSTORE(N0)                                                 \
13    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
14
15#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
16    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
17    VSTORE(N0)                                                 \
18    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
19
20#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
21    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
22    VSTORE(N0)                                                 \
23    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
24
25#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
26    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
27    VSTORE(N0)                                                 \
28    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
29
30#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
31    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
32    VSTORE(N0)                                                 \
33    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
34
35#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
36    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
37    VSTORE(N0)                                                 \
38    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
39
40#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
41    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
42    VSTORE(N0)                                                 \
43    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
44
45#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
46    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
47    VSTORE(N0)                                                 \
48    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
49
50#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
51    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
52    VSTORE(N0)                                                 \
53    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
54
55#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
56    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
57    VSTORE(N0)                                                  \
58    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
59
60#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
61    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
62    VSTORE(N0)                                                  \
63    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
64
65#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
66    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
67    VSTORE(N0)                                                  \
68    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
69
70#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
71    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
72    VSTORE(N0)                                                  \
73    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
74
75#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
76    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
77    VSTORE(N0)                                                  \
78    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
79
80#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
81    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
82    VSTORE(N0)                                                  \
83    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
84
85#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
86    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
87    VSTORE(N0)                                                  \
88    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
89
90
91
92#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
93    VSTORE(N0)                                                         \
94    (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
95
96#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
97    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
98    VSTORE(N0)                                                         \
99    (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
100
101#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
102    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
103    VSTORE(N0)                                                         \
104    (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
105
106#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
107    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
108    VSTORE(N0)                                                         \
109    (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
110
111#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
112    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
113    VSTORE(N0)                                                         \
114    (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
115
116#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
117    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
118    VSTORE(N0)                                                         \
119    (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
120
121#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
122    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
123    VSTORE(N0)                                                         \
124    (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
125
126#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
127    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
128    VSTORE(N0)                                                         \
129    (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
130
131#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
132    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
133    VSTORE(N0)                                                         \
134    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
135
136#define CONVERT_STORE_ROW_10(N0, DATA, BASENAME, PTR, STRIDE_Y, Z) \
137    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
138    VSTORE(N0)                                                     \
139    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
140
141#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
142    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
143    VSTORE(N0)                                                          \
144    (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
145
146#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
147    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
148    VSTORE(N0)                                                          \
149    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
150
151#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
152    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
153    VSTORE(N0)                                                          \
154    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
155
156#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
157    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
158    VSTORE(N0)                                                          \
159    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
160
161#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
162    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
163    VSTORE(N0)                                                          \
164    (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
165
166#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
167    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
168    VSTORE(N0)                                                          \
169    (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
170
171
172
173
174#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
175#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
176
177
178
179#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
180#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
181
182
183
184#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
185    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
186    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
187
188#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
189    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
190    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
191    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
192
193#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
194    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
195    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
196    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
197
198#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
199    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
200    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
201    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
202
203#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
204    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
205    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
206    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
207
208#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
209    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
210    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
211    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
212
213#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
214    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
215    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
216    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
217
218#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
219    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
220    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
221    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
222
223#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
224    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
225    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
226    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
227
228#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
229    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
230    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
231    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
232
233#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
234    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
235    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
236    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
237
238#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
239    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
240    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
241    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
242
243#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
244    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
245    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
246    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
247
248#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
249    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
250    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
251    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
252
253#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
254    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
255    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
256    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
257
258#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
259    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
260    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
261    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
262
263
264
265#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
266#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
267
268#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
269    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y))                                                                                                            \
270    {                                                                                                                                                     \
271        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                                           \
272    }                                                                                                                                                     \
273    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X))                                                                                                        \
274    {                                                                                                                                                     \
275        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
276    }                                                                                                                                                     \
277    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X))                                                                                                        \
278    {                                                                                                                                                     \
279        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
280    }                                                                                                                                                     \
281    else                                                                                                                                                  \
282    {                                                                                                                                                     \
283        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                               \
284    }
285
286#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
287    if(!(PARTIAL_COND_X))                                                                                         \
288    {                                                                                                             \
289        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
290    }                                                                                                             \
291    else                                                                                                          \
292    {                                                                                                             \
293        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
294    }
295
296#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
297    if(!(PARTIAL_COND_Y))                                                                                         \
298    {                                                                                                             \
299        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
300    }                                                                                                             \
301    else                                                                                                          \
302    {                                                                                                             \
303        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
304    }
305
306
307#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
308
309
310#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
311
312#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
313    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
314
315#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
316
317#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
318    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)
319
320#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
321
322#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
323    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)
324
325#else
326
327#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
328    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)
329
330#endif
331
332#endif
333
334
335#if defined(PARTIAL_STORE_M0)
336
337#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
338    ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0))))
339#else
340#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
341    ((uint)(y * M0))
342#endif
343
344
345
346#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
347    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)
348
349
350#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
351#pragma OPENCL EXTENSION cl_khr_fp16 : enable
352#endif
353
354#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
355#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
356#endif
357
358#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
359#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
360#endif
361
362#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
363#pragma OPENCL EXTENSION cl_arm_printf : enable
364#endif
365
366#define GPU_ARCH_MIDGARD 0x100
367#define GPU_ARCH_BIFROST 0x200
368#define GPU_ARCH_VALHALL 0x300
369
370
371#define CONCAT(a, b) a##b
372
373
374#define EXPAND(x) x
375
376
377#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
378
379
380#define REV1(x) ((x))
381#define REV2(x) ((x).s10)
382#define REV3(x) ((x).s210)
383#define REV4(x) ((x).s3210)
384#define REV8(x) ((x).s76543210)
385#define REV16(x) ((x).sFEDCBA9876543210)
386
387
388
389#define REVERSE_STR(x, s) REV##s((x))
390#define REVERSE(x, s) REVERSE_STR(x, s)
391
392
393
394#define ROT1_0(x) ((x))
395#define ROT1_1(x) ((x))
396
397#define ROT2_0(x) ((x))
398#define ROT2_1(x) ((x).s10)
399#define ROT2_2(x) ((x))
400
401#define ROT3_0(x) ((x))
402#define ROT3_1(x) ((x).s201)
403#define ROT3_2(x) ((x).s120)
404#define ROT3_3(x) ((x))
405
406#define ROT4_0(x) ((x))
407#define ROT4_1(x) ((x).s3012)
408#define ROT4_2(x) ((x).s2301)
409#define ROT4_3(x) ((x).s1230)
410#define ROT4_4(x) ((x))
411
412#define ROT8_0(x) ((x))
413#define ROT8_1(x) ((x).s70123456)
414#define ROT8_2(x) ((x).s67012345)
415#define ROT8_3(x) ((x).s56701234)
416#define ROT8_4(x) ((x).s45670123)
417#define ROT8_5(x) ((x).s34567012)
418#define ROT8_6(x) ((x).s23456701)
419#define ROT8_7(x) ((x).s12345670)
420#define ROT8_8(x) ((x))
421
422#define ROT16_0(x) ((x))
423#define ROT16_1(x) ((x).sF0123456789ABCDE)
424#define ROT16_2(x) ((x).sEF0123456789ABCD)
425#define ROT16_3(x) ((x).sDEF0123456789ABC)
426#define ROT16_4(x) ((x).sCDEF0123456789AB)
427#define ROT16_5(x) ((x).sBCDEF0123456789A)
428#define ROT16_6(x) ((x).sABCDEF0123456789)
429#define ROT16_7(x) ((x).s9ABCDEF012345678)
430#define ROT16_8(x) ((x).s89ABCDEF01234567)
431#define ROT16_9(x) ((x).s789ABCDEF0123456)
432#define ROT16_10(x) ((x).s6789ABCDEF012345)
433#define ROT16_11(x) ((x).s56789ABCDEF01234)
434#define ROT16_12(x) ((x).s456789ABCDEF0123)
435#define ROT16_13(x) ((x).s3456789ABCDEF012)
436#define ROT16_14(x) ((x).s23456789ABCDEF01)
437#define ROT16_15(x) ((x).s123456789ABCDEF0)
438#define ROT16_16(x) ((x))
439
440
441
442#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
443#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
444
445
446
447#define V_OFFS1(dt) (dt##1)(0)
448#define V_OFFS2(dt) (dt##2)(0, 1)
449#define V_OFFS3(dt) (dt##3)(0, 1, 2)
450#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3)
451#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
452#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
453
454
455
456#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
457#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
458
459
460#define VLOAD_STR(size) vload##size
461#define VLOAD(size) VLOAD_STR(size)
462
463
464#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size
465#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size)
466
467#define NO_LOAD(data, offs, ptr) \
468    {                            \
469    }
470
471
472#define vload_partial_1_0 NO_LOAD
473#define vload_partial_1_1 vload1
474#define vload_partial_1_2 NO_LOAD
475#define vload_partial_1_3 NO_LOAD
476#define vload_partial_1_4 NO_LOAD
477#define vload_partial_1_5 NO_LOAD
478#define vload_partial_1_6 NO_LOAD
479#define vload_partial_1_7 NO_LOAD
480#define vload_partial_1_8 NO_LOAD
481#define vload_partial_1_9 NO_LOAD
482#define vload_partial_1_10 NO_LOAD
483#define vload_partial_1_11 NO_LOAD
484#define vload_partial_1_12 NO_LOAD
485#define vload_partial_1_13 NO_LOAD
486#define vload_partial_1_14 NO_LOAD
487#define vload_partial_1_15 NO_LOAD
488#define vload_partial_1_16 NO_LOAD
489
490#define vload_partial_2_0 NO_LOAD
491#define vload_partial_2_1 vload_partial_1
492#define vload_partial_2_2 vload_partial_2
493#define vload_partial_2_3 NO_LOAD
494#define vload_partial_2_4 NO_LOAD
495#define vload_partial_2_5 NO_LOAD
496#define vload_partial_2_6 NO_LOAD
497#define vload_partial_2_7 NO_LOAD
498#define vload_partial_2_8 NO_LOAD
499#define vload_partial_2_9 NO_LOAD
500#define vload_partial_2_10 NO_LOAD
501#define vload_partial_2_11 NO_LOAD
502#define vload_partial_2_12 NO_LOAD
503#define vload_partial_2_13 NO_LOAD
504#define vload_partial_2_14 NO_LOAD
505#define vload_partial_2_15 NO_LOAD
506#define vload_partial_2_16 NO_LOAD
507
508#define vload_partial_3_0 NO_LOAD
509#define vload_partial_3_1 vload_partial_1
510#define vload_partial_3_2 vload_partial_2
511#define vload_partial_3_3 vload_partial_3
512#define vload_partial_3_4 NO_LOAD
513#define vload_partial_3_5 NO_LOAD
514#define vload_partial_3_6 NO_LOAD
515#define vload_partial_3_7 NO_LOAD
516#define vload_partial_3_8 NO_LOAD
517#define vload_partial_3_9 NO_LOAD
518#define vload_partial_3_10 NO_LOAD
519#define vload_partial_3_11 NO_LOAD
520#define vload_partial_3_12 NO_LOAD
521#define vload_partial_3_13 NO_LOAD
522#define vload_partial_3_14 NO_LOAD
523#define vload_partial_3_15 NO_LOAD
524#define vload_partial_3_16 NO_LOAD
525
526#define vload_partial_4_0 NO_LOAD
527#define vload_partial_4_1 vload_partial_1
528#define vload_partial_4_2 vload_partial_2
529#define vload_partial_4_3 vload_partial_3
530#define vload_partial_4_4 vload_partial_4
531#define vload_partial_4_5 NO_LOAD
532#define vload_partial_4_6 NO_LOAD
533#define vload_partial_4_7 NO_LOAD
534#define vload_partial_4_8 NO_LOAD
535#define vload_partial_4_9 NO_LOAD
536#define vload_partial_4_10 NO_LOAD
537#define vload_partial_4_11 NO_LOAD
538#define vload_partial_4_12 NO_LOAD
539#define vload_partial_4_13 NO_LOAD
540#define vload_partial_4_14 NO_LOAD
541#define vload_partial_4_15 NO_LOAD
542#define vload_partial_4_16 NO_LOAD
543
544#define vload_partial_8_0 NO_LOAD
545#define vload_partial_8_1 vload_partial_1
546#define vload_partial_8_2 vload_partial_2
547#define vload_partial_8_3 vload_partial_3
548#define vload_partial_8_4 vload_partial_4
549#define vload_partial_8_5 vload_partial_5
550#define vload_partial_8_6 vload_partial_6
551#define vload_partial_8_7 vload_partial_7
552#define vload_partial_8_8 vload_partial_8
553#define vload_partial_8_9 NO_LOAD
554#define vload_partial_8_10 NO_LOAD
555#define vload_partial_8_11 NO_LOAD
556#define vload_partial_8_12 NO_LOAD
557#define vload_partial_8_13 NO_LOAD
558#define vload_partial_8_14 NO_LOAD
559#define vload_partial_8_15 NO_LOAD
560#define vload_partial_8_16 NO_LOAD
561
562#define vload_partial_16_0 NO_LOAD
563#define vload_partial_16_1 vload_partial_1
564#define vload_partial_16_2 vload_partial_2
565#define vload_partial_16_3 vload_partial_3
566#define vload_partial_16_4 vload_partial_4
567#define vload_partial_16_5 vload_partial_5
568#define vload_partial_16_6 vload_partial_6
569#define vload_partial_16_7 vload_partial_7
570#define vload_partial_16_8 vload_partial_8
571#define vload_partial_16_9 vload_partial_9
572#define vload_partial_16_10 vload_partial_10
573#define vload_partial_16_11 vload_partial_11
574#define vload_partial_16_12 vload_partial_12
575#define vload_partial_16_13 vload_partial_13
576#define vload_partial_16_14 vload_partial_14
577#define vload_partial_16_15 vload_partial_15
578#define vload_partial_16_16 vload_partial_16
579
580
581#define vload_partial_1(DATA, OFFSET, PTR) \
582    DATA.s0 = vload1(OFFSET, PTR);
583
584#define vload_partial_2(DATA, OFFSET, PTR) \
585    DATA.s01 = vload2(OFFSET, PTR);
586
587#define vload_partial_3(DATA, OFFSET, PTR) \
588    DATA.s012 = vload3(OFFSET, PTR);
589
590#define vload_partial_4(DATA, OFFSET, PTR) \
591    DATA.s0123 = vload4(OFFSET, PTR);
592
593#define vload_partial_5(DATA, OFFSET, PTR)    \
594    vload_partial_4(DATA.s0123, OFFSET, PTR); \
595    DATA.s4 = vload1(OFFSET, PTR + 4);
596
597#define vload_partial_6(DATA, OFFSET, PTR)    \
598    vload_partial_4(DATA.s0123, OFFSET, PTR); \
599    vload_partial_2(DATA.s45, OFFSET, PTR + 4);
600
601#define vload_partial_7(DATA, OFFSET, PTR)    \
602    vload_partial_4(DATA.s0123, OFFSET, PTR); \
603    vload_partial_3(DATA.s456, OFFSET, PTR + 4);
604
605#define vload_partial_8(DATA, OFFSET, PTR) \
606    DATA.s01234567 = vload8(OFFSET, PTR);
607
608#define vload_partial_9(DATA, OFFSET, PTR)        \
609    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
610    DATA.s8 = vload1(OFFSET, PTR + 8);
611
612#define vload_partial_10(DATA, OFFSET, PTR)       \
613    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
614    vload_partial_2(DATA.s89, OFFSET, PTR + 8);
615
616#define vload_partial_11(DATA, OFFSET, PTR)       \
617    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
618    vload_partial_3(DATA.s89A, OFFSET, PTR + 8);
619
620#define vload_partial_12(DATA, OFFSET, PTR)       \
621    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
622    vload_partial_4(DATA.s89AB, OFFSET, PTR + 8);
623
624#define vload_partial_13(DATA, OFFSET, PTR)       \
625    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
626    vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);
627
628#define vload_partial_14(DATA, OFFSET, PTR)       \
629    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
630    vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);
631
632#define vload_partial_15(DATA, OFFSET, PTR)       \
633    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
634    vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);
635
636#define vload_partial_16(DATA, OFFSET, PTR) \
637    DATA = vload16(OFFSET, PTR);
638
639
640
641#define PIXEL_UNIT4 1
642#define PIXEL_UNIT8 2
643#define PIXEL_UNIT16 4
644
645
646#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
647#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)
648
649
650#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
651#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
652#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));
653
654#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
655#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
656#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
657#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
658#endif
659
660#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values));
661#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567));
662#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
663
664#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
665#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values));
666#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567));
667#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
668#endif
669
670
671#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
672#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)
673
674
675#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
676#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)
677
678#define VSTORE_STR(size) vstore##size
679#define VSTORE(size) VSTORE_STR(size)
680
681#define float1 float
682#define half1 half
683#define char1 char
684#define uchar1 uchar
685#define short1 short
686#define ushort1 ushort
687#define int1 int
688#define uint1 uint
689#define long1 long
690#define ulong1 ulong
691#define double1 double
692
693#define vload1(OFFSET, PTR) *(OFFSET + PTR)
694#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
695
696
697#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
698#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)
699
700#define NO_STORE(data, offs, ptr) \
701    {                             \
702    }
703
704
705#define vstore_partial_1_0 NO_STORE
706#define vstore_partial_1_1 vstore1
707#define vstore_partial_1_2 NO_STORE
708#define vstore_partial_1_3 NO_STORE
709#define vstore_partial_1_4 NO_STORE
710#define vstore_partial_1_5 NO_STORE
711#define vstore_partial_1_6 NO_STORE
712#define vstore_partial_1_7 NO_STORE
713#define vstore_partial_1_8 NO_STORE
714#define vstore_partial_1_9 NO_STORE
715#define vstore_partial_1_10 NO_STORE
716#define vstore_partial_1_11 NO_STORE
717#define vstore_partial_1_12 NO_STORE
718#define vstore_partial_1_13 NO_STORE
719#define vstore_partial_1_14 NO_STORE
720#define vstore_partial_1_15 NO_STORE
721#define vstore_partial_1_16 NO_STORE
722
723#define vstore_partial_2_0 NO_STORE
724#define vstore_partial_2_1 vstore_partial_1
725#define vstore_partial_2_2 vstore_partial_2
726#define vstore_partial_2_3 NO_STORE
727#define vstore_partial_2_4 NO_STORE
728#define vstore_partial_2_5 NO_STORE
729#define vstore_partial_2_6 NO_STORE
730#define vstore_partial_2_7 NO_STORE
731#define vstore_partial_2_8 NO_STORE
732#define vstore_partial_2_9 NO_STORE
733#define vstore_partial_2_10 NO_STORE
734#define vstore_partial_2_11 NO_STORE
735#define vstore_partial_2_12 NO_STORE
736#define vstore_partial_2_13 NO_STORE
737#define vstore_partial_2_14 NO_STORE
738#define vstore_partial_2_15 NO_STORE
739#define vstore_partial_2_16 NO_STORE
740
741#define vstore_partial_3_0 NO_STORE
742#define vstore_partial_3_1 vstore_partial_1
743#define vstore_partial_3_2 vstore_partial_2
744#define vstore_partial_3_3 vstore_partial_3
745#define vstore_partial_3_4 NO_STORE
746#define vstore_partial_3_5 NO_STORE
747#define vstore_partial_3_6 NO_STORE
748#define vstore_partial_3_7 NO_STORE
749#define vstore_partial_3_8 NO_STORE
750#define vstore_partial_3_9 NO_STORE
751#define vstore_partial_3_10 NO_STORE
752#define vstore_partial_3_11 NO_STORE
753#define vstore_partial_3_12 NO_STORE
754#define vstore_partial_3_13 NO_STORE
755#define vstore_partial_3_14 NO_STORE
756#define vstore_partial_3_15 NO_STORE
757#define vstore_partial_3_16 NO_STORE
758
759#define vstore_partial_4_0 NO_STORE
760#define vstore_partial_4_1 vstore_partial_1
761#define vstore_partial_4_2 vstore_partial_2
762#define vstore_partial_4_3 vstore_partial_3
763#define vstore_partial_4_4 vstore_partial_4
764#define vstore_partial_4_5 NO_STORE
765#define vstore_partial_4_6 NO_STORE
766#define vstore_partial_4_7 NO_STORE
767#define vstore_partial_4_8 NO_STORE
768#define vstore_partial_4_9 NO_STORE
769#define vstore_partial_4_10 NO_STORE
770#define vstore_partial_4_11 NO_STORE
771#define vstore_partial_4_12 NO_STORE
772#define vstore_partial_4_13 NO_STORE
773#define vstore_partial_4_14 NO_STORE
774#define vstore_partial_4_15 NO_STORE
775#define vstore_partial_4_16 NO_STORE
776
777#define vstore_partial_8_0 NO_STORE
778#define vstore_partial_8_1 vstore_partial_1
779#define vstore_partial_8_2 vstore_partial_2
780#define vstore_partial_8_3 vstore_partial_3
781#define vstore_partial_8_4 vstore_partial_4
782#define vstore_partial_8_5 vstore_partial_5
783#define vstore_partial_8_6 vstore_partial_6
784#define vstore_partial_8_7 vstore_partial_7
785#define vstore_partial_8_8 vstore_partial_8
786#define vstore_partial_8_9 NO_STORE
787#define vstore_partial_8_10 NO_STORE
788#define vstore_partial_8_11 NO_STORE
789#define vstore_partial_8_12 NO_STORE
790#define vstore_partial_8_13 NO_STORE
791#define vstore_partial_8_14 NO_STORE
792#define vstore_partial_8_15 NO_STORE
793#define vstore_partial_8_16 NO_STORE
794
795#define vstore_partial_16_0 NO_STORE
796#define vstore_partial_16_1 vstore_partial_1
797#define vstore_partial_16_2 vstore_partial_2
798#define vstore_partial_16_3 vstore_partial_3
799#define vstore_partial_16_4 vstore_partial_4
800#define vstore_partial_16_5 vstore_partial_5
801#define vstore_partial_16_6 vstore_partial_6
802#define vstore_partial_16_7 vstore_partial_7
803#define vstore_partial_16_8 vstore_partial_8
804#define vstore_partial_16_9 vstore_partial_9
805#define vstore_partial_16_10 vstore_partial_10
806#define vstore_partial_16_11 vstore_partial_11
807#define vstore_partial_16_12 vstore_partial_12
808#define vstore_partial_16_13 vstore_partial_13
809#define vstore_partial_16_14 vstore_partial_14
810#define vstore_partial_16_15 vstore_partial_15
811#define vstore_partial_16_16 vstore_partial_16
812
813
814#define vstore_partial_1(DATA, OFFSET, PTR) \
815    vstore1(DATA.s0, OFFSET, PTR);
816
817#define vstore_partial_2(DATA, OFFSET, PTR) \
818    vstore2(DATA.s01, OFFSET, PTR);
819
820#define vstore_partial_3(DATA, OFFSET, PTR) \
821    vstore3(DATA.s012, OFFSET, PTR);
822
823#define vstore_partial_4(DATA, OFFSET, PTR) \
824    vstore4(DATA.s0123, OFFSET, PTR);
825
826#define vstore_partial_5(DATA, OFFSET, PTR)    \
827    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
828    vstore1(DATA.s4, OFFSET, PTR + 4);
829
830#define vstore_partial_6(DATA, OFFSET, PTR)    \
831    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
832    vstore_partial_2(DATA.s45, OFFSET, PTR + 4);
833
834#define vstore_partial_7(DATA, OFFSET, PTR)    \
835    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
836    vstore_partial_3(DATA.s456, OFFSET, PTR + 4);
837
838#define vstore_partial_8(DATA, OFFSET, PTR) \
839    vstore8(DATA.s01234567, OFFSET, PTR);
840
841#define vstore_partial_9(DATA, OFFSET, PTR)        \
842    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
843    vstore1(DATA.s8, OFFSET, PTR + 8);
844
845#define vstore_partial_10(DATA, OFFSET, PTR)       \
846    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
847    vstore_partial_2(DATA.s89, OFFSET, PTR + 8);
848
849#define vstore_partial_11(DATA, OFFSET, PTR)       \
850    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
851    vstore_partial_3(DATA.s89a, OFFSET, PTR + 8);
852
853#define vstore_partial_12(DATA, OFFSET, PTR)       \
854    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
855    vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8);
856
857#define vstore_partial_13(DATA, OFFSET, PTR)       \
858    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
859    vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8);
860
861#define vstore_partial_14(DATA, OFFSET, PTR)       \
862    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
863    vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8);
864
865#define vstore_partial_15(DATA, OFFSET, PTR)       \
866    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
867    vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8);
868
869#define vstore_partial_16(DATA, OFFSET, PTR) \
870    vstore16(DATA, OFFSET, PTR);
871
872
873
874
875
876#define convert_float_sat convert_float
877#define convert_float1_sat convert_float
878#define convert_float2_sat convert_float2
879#define convert_float3_sat convert_float3
880#define convert_float4_sat convert_float4
881#define convert_float8_sat convert_float8
882#define convert_float16_sat convert_float16
883#define convert_half_sat convert_float
884#define convert_half1_sat convert_half
885#define convert_half2_sat convert_half2
886#define convert_half3_sat convert_half3
887#define convert_half4_sat convert_half4
888#define convert_half8_sat convert_half8
889#define convert_half16_sat convert_half16
890
891#define convert_float1 convert_float
892#define convert_half1 convert_half
893#define convert_char1 convert_char
894#define convert_uchar1 convert_uchar
895#define convert_short1 convert_short
896#define convert_ushort1 convert_ushort
897#define convert_int1 convert_int
898#define convert_uint1 convert_uint
899#define convert_long1 convert_long
900#define convert_ulong1 convert_ulong
901#define convert_double1 convert_double
902
903#define convert_char1_sat convert_char_sat
904#define convert_uchar1_sat convert_uchar_sat
905#define convert_uchar2_sat convert_uchar2_sat
906#define convert_uchar3_sat convert_uchar3_sat
907#define convert_uchar4_sat convert_uchar4_sat
908#define convert_uchar8_sat convert_uchar8_sat
909#define convert_uchar16_sat convert_uchar16_sat
910#define convert_short1_sat convert_short_sat
911#define convert_ushort1_sat convert_ushort_sat
912#define convert_int1_sat convert_int_sat
913#define convert_uint1_sat convert_uint_sat
914#define convert_long1_sat convert_long_sat
915#define convert_ulong1_sat convert_ulong_sat
916#define convert_double1_sat convert_double_sat
917
918#define VEC_DATA_TYPE_STR(type, size) type##size
919#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
920
921#define CONVERT_STR(x, type) (convert_##type((x)))
922#define CONVERT(x, type) CONVERT_STR(x, type)
923
924#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
925#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
926
927#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
928#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
929
930#define select_vec_dt_uchar(size) uchar##size
931#define select_vec_dt_char(size) char##size
932#define select_vec_dt_ushort(size) ushort##size
933#define select_vec_dt_short(size) short##size
934#define select_vec_dt_half(size) short##size
935#define select_vec_dt_uint(size) uint##size
936#define select_vec_dt_int(size) int##size
937#define select_vec_dt_float(size) int##size
938#define select_vec_dt_ulong(size) ulong##size
939#define select_vec_dt_long(size) long##size
940
941#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
942#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
943#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)
944
945#define signed_int_vec_dt_uchar(size) char##size
946#define signed_int_vec_dt_char(size) char##size
947#define signed_int_vec_dt_ushort(size) short##size
948#define signed_int_vec_dt_short(size) short##size
949#define signed_int_vec_dt_half(size) short##size
950#define signed_int_vec_dt_uint(size) int##size
951#define signed_int_vec_dt_int(size) int##size
952#define signed_int_vec_dt_float(size) int##size
953#define signed_int_vec_dt_ulong(size) long##size
954#define signed_int_vec_dt_long(size) long##size
955
956#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size)
957#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size)
958#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1)
959
960#define sum_reduce_1(x) (x)
961#define sum_reduce_2(x) ((x).s0) + ((x).s1)
962#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2)
963#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
964#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567)
965#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF)
966
967#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x)
968#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)
969
970#define prod_reduce_1(x) (x)
971#define prod_reduce_2(x) ((x).s0) * ((x).s1)
972#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2)
973#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23)
974#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567)
975#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF)
976
977#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x)
978#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)
979
980#define max_reduce_1(x) (x)
981#define max_reduce_2(x) max(((x).s0), ((x).s1))
982#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
983#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
984#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
985#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))
986
987#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
988#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)
989
990#define VECTOR_DECLARATION(name)     \
991    __global uchar *name##_ptr,      \
992    uint        name##_stride_x, \
993    uint        name##_step_x,   \
994    uint        name##_offset_first_element_in_bytes
995
996#define IMAGE_DECLARATION(name)      \
997    __global uchar *name##_ptr,      \
998    uint        name##_stride_x, \
999    uint        name##_step_x,   \
1000    uint        name##_stride_y, \
1001    uint        name##_step_y,   \
1002    uint        name##_offset_first_element_in_bytes
1003
1004#define TENSOR3D_DECLARATION(name)   \
1005    __global uchar *name##_ptr,      \
1006    uint        name##_stride_x, \
1007    uint        name##_step_x,   \
1008    uint        name##_stride_y, \
1009    uint        name##_step_y,   \
1010    uint        name##_stride_z, \
1011    uint        name##_step_z,   \
1012    uint        name##_offset_first_element_in_bytes
1013
1014#define TENSOR4D_DECLARATION(name)   \
1015    __global uchar *name##_ptr,      \
1016    uint        name##_stride_x, \
1017    uint        name##_step_x,   \
1018    uint        name##_stride_y, \
1019    uint        name##_step_y,   \
1020    uint        name##_stride_z, \
1021    uint        name##_step_z,   \
1022    uint        name##_stride_w, \
1023    uint        name##_step_w,   \
1024    uint        name##_offset_first_element_in_bytes
1025
1026#define TENSOR5D_DECLARATION(name)   \
1027    __global uchar *name##_ptr,      \
1028    uint        name##_stride_x, \
1029    uint        name##_step_x,   \
1030    uint        name##_stride_y, \
1031    uint        name##_step_y,   \
1032    uint        name##_stride_z, \
1033    uint        name##_step_z,   \
1034    uint        name##_stride_w, \
1035    uint        name##_step_w,   \
1036    uint        name##_stride_v, \
1037    uint        name##_step_v,   \
1038    uint        name##_offset_first_element_in_bytes
1039
1040#define CONVERT_TO_VECTOR_STRUCT(name) \
1041    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
1042
1043#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
1044    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
1045
1046#define CONVERT_TO_IMAGE_STRUCT(name) \
1047    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
1048
1049#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
1050    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
1051
1052#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
1053    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
1054
1055#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
1056    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
1057
1058#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
1059    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
1060
1061#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
1062    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
1063                                 name##_stride_z, name##_step_z)
1064
1065#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
1066    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
1067
1068#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
1069    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
1070                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
1071
1072#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
1073    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
1074
1075#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name)                                                                                       \
1076    tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
1077                           name##_stride_z, name##_step_z)
1078
1079
1080typedef struct Vector
1081{
1082    __global uchar *ptr;
1083    int             offset_first_element_in_bytes;
1084    int             stride_x;
1085} Vector;
1086
1087
1088typedef struct Image
1089{
1090    __global uchar *ptr;
1091    int             offset_first_element_in_bytes;
1092    int             stride_x;
1093    int             stride_y;
1094} Image;
1095
1096
1097typedef struct Tensor3D
1098{
1099    __global uchar *ptr;
1100    int             offset_first_element_in_bytes;
1101    int             stride_x;
1102    int             stride_y;
1103    int             stride_z;
1104} Tensor3D;
1105
1106
1107typedef struct Tensor4D
1108{
1109    __global uchar *ptr;
1110    int             offset_first_element_in_bytes;
1111    int             stride_x;
1112    int             stride_y;
1113    int             stride_z;
1114    int             stride_w;
1115} Tensor4D;
1116
1117
1118inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
1119{
1120    Vector vector =
1121    {
1122        .ptr                           = ptr,
1123        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1124        .stride_x                      = stride_x,
1125    };
1126    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
1127    return vector;
1128}
1129
1130
1131inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
1132{
1133    Image img =
1134    {
1135        .ptr                           = ptr,
1136        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1137        .stride_x                      = stride_x,
1138        .stride_y                      = stride_y
1139    };
1140    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
1141    return img;
1142}
1143
1144
1145inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
1146{
1147    Image img =
1148    {
1149        .ptr                           = ptr,
1150        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1151        .stride_x                      = stride_x,
1152        .stride_y                      = stride_y
1153    };
1154    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
1155    return img;
1156}
1157
1158
1159inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
1160{
1161    Tensor3D tensor =
1162    {
1163        .ptr                           = ptr,
1164        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1165        .stride_x                      = stride_x,
1166        .stride_y                      = stride_y,
1167        .stride_z                      = stride_z
1168    };
1169    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
1170    return tensor;
1171}
1172
1173
1174inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
1175{
1176    Tensor3D tensor =
1177    {
1178        .ptr                           = ptr,
1179        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1180        .stride_x                      = stride_x,
1181        .stride_y                      = stride_y,
1182        .stride_z                      = stride_z
1183    };
1184    return tensor;
1185}
1186
1187inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
1188                                             uint step_w,
1189                                             uint mod_size)
1190{
1191    Tensor4D tensor =
1192    {
1193        .ptr                           = ptr,
1194        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1195        .stride_x                      = stride_x,
1196        .stride_y                      = stride_y,
1197        .stride_z                      = stride_z,
1198        .stride_w                      = stride_w
1199    };
1200
1201    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
1202    return tensor;
1203}
1204
1205
1206inline __global const uchar *vector_offset(const Vector *vec, int x)
1207{
1208    return vec->ptr + x * vec->stride_x;
1209}
1210
1211
1212inline __global uchar *offset(const Image *img, int x, int y)
1213{
1214    return img->ptr + x * img->stride_x + y * img->stride_y;
1215}
1216
1217
1218inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
1219{
1220    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
1221}
1222
1223
1224inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
1225{
1226    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
1227}
1228
1229
1230inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
1231{
1232    uint num_elements = width * height;
1233
1234    const uint z = index / num_elements;
1235
1236    index %= num_elements;
1237
1238    const uint y = index / width;
1239
1240    index %= width;
1241
1242    const uint x = index;
1243
1244    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes;
1245}
1246
1247#endif
1248
1249#if GPU_ARCH == GPU_ARCH_BIFROST
1250#define MLA(a, b, c) (fma(c, b, a))
1251#else
1252#define MLA(a, b, c) ((b) * (c) + (a))
1253#endif
1254
1255
1256#define hard_swish_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * ((min(max((x + (DATA_TYPE)3.0), (DATA_TYPE)0.0), (DATA_TYPE)6.0)) * (DATA_TYPE)0.166666667))
1257
1258
1259#define logistic_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((DATA_TYPE)1.0 / ((DATA_TYPE)1.0 + exp(-x)))
1260
1261
1262#define tanh_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((DATA_TYPE)A_VAL * tanh((DATA_TYPE)B_VAL * x))
1263
1264
1265#define relu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (max((DATA_TYPE)0.0, x))
1266
1267
1268#define brelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (min((DATA_TYPE)A_VAL, max((DATA_TYPE)0.0, x)))
1269
1270
1271#define lu_brelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL))
1272
1273
1274#define lrelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((min(x, (DATA_TYPE)0.0) * (DATA_TYPE)A_VAL) + max(x, (DATA_TYPE)0.0))
1275
1276
1277#define srelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (log((DATA_TYPE)1.0 + exp(x)))
1278
1279
1280#define elu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (select(((DATA_TYPE)A_VAL * (exp(x) - (DATA_TYPE)1.0)), x, (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))isgreaterequal(x, (DATA_TYPE)0.0)))
1281
1282
1283#define abs_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (fabs(x))
1284
1285
1286#define square_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * x)
1287
1288
1289#define sqrt_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (sqrt(x))
1290
1291
1292#define linear_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (MLA((DATA_TYPE)B_VAL, (DATA_TYPE)A_VAL, x))
1293
1294
1295#define gelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * (DATA_TYPE)0.5 * ((DATA_TYPE)1.0 + erf(x / (DATA_TYPE)1.41421356237)))
1296
1297
1298#define identity_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x)
1299
1300#define ACT_OP(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) op##_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL)
1301
1302#define ACTIVATION(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ACT_OP(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL)
1303
1304#ifndef ARM_COMPUTE_HELPER_H
1305#define ARM_COMPUTE_HELPER_H
1306
1307
1308
1309
1310#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1311    VSTORE(N0)                                                 \
1312    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
1313
1314#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1315    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1316    VSTORE(N0)                                                 \
1317    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
1318
1319#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1320    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1321    VSTORE(N0)                                                 \
1322    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
1323
1324#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1325    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1326    VSTORE(N0)                                                 \
1327    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
1328
1329#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1330    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1331    VSTORE(N0)                                                 \
1332    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
1333
1334#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1335    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1336    VSTORE(N0)                                                 \
1337    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
1338
1339#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1340    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1341    VSTORE(N0)                                                 \
1342    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
1343
1344#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1345    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1346    VSTORE(N0)                                                 \
1347    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
1348
1349#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1350    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1351    VSTORE(N0)                                                 \
1352    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
1353
1354#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1355    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
1356    VSTORE(N0)                                                  \
1357    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
1358
1359#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1360    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1361    VSTORE(N0)                                                  \
1362    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
1363
1364#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1365    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1366    VSTORE(N0)                                                  \
1367    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
1368
1369#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1370    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1371    VSTORE(N0)                                                  \
1372    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
1373
1374#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1375    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1376    VSTORE(N0)                                                  \
1377    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
1378
1379#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1380    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1381    VSTORE(N0)                                                  \
1382    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
1383
1384#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1385    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1386    VSTORE(N0)                                                  \
1387    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
1388
1389
1390
1391#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1392    VSTORE(N0)                                                         \
1393    (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
1394
1395#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1396    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1397    VSTORE(N0)                                                         \
1398    (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
1399
1400#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1401    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1402    VSTORE(N0)                                                         \
1403    (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
1404
1405#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1406    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1407    VSTORE(N0)                                                         \
1408    (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
1409
1410#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1411    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1412    VSTORE(N0)                                                         \
1413    (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
1414
1415#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1416    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1417    VSTORE(N0)                                                         \
1418    (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
1419
1420#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1421    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1422    VSTORE(N0)                                                         \
1423    (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
1424
1425#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1426    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1427    VSTORE(N0)                                                         \
1428    (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
1429
1430#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1431    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1432    VSTORE(N0)                                                         \
1433    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
1434
1435#define CONVERT_STORE_ROW_10(N0, DATA, BASENAME, PTR, STRIDE_Y, Z) \
1436    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1437    VSTORE(N0)                                                     \
1438    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
1439
1440#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1441    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1442    VSTORE(N0)                                                          \
1443    (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
1444
1445#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1446    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1447    VSTORE(N0)                                                          \
1448    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
1449
1450#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1451    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1452    VSTORE(N0)                                                          \
1453    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
1454
1455#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1456    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1457    VSTORE(N0)                                                          \
1458    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
1459
1460#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1461    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1462    VSTORE(N0)                                                          \
1463    (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
1464
1465#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1466    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1467    VSTORE(N0)                                                          \
1468    (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
1469
1470
1471
1472
1473#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1474#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1475
1476
1477
1478#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1479#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1480
1481
1482
1483#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1484    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1485    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
1486
1487#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1488    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1489    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1490    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
1491
1492#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1493    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1494    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1495    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
1496
1497#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1498    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1499    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1500    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
1501
1502#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1503    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1504    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1505    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
1506
1507#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1508    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1509    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1510    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
1511
1512#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1513    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1514    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1515    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
1516
1517#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1518    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1519    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1520    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
1521
1522#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1523    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1524    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1525    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
1526
1527#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1528    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
1529    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
1530    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
1531
1532#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1533    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1534    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
1535    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
1536
1537#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1538    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1539    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
1540    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
1541
1542#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1543    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1544    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
1545    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
1546
1547#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1548    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1549    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
1550    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
1551
1552#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1553    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1554    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
1555    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
1556
1557#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1558    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1559    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
1560    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
1561
1562
1563
1564#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1565#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1566
1567#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
1568    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y))                                                                                                            \
1569    {                                                                                                                                                     \
1570        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                                           \
1571    }                                                                                                                                                     \
1572    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X))                                                                                                        \
1573    {                                                                                                                                                     \
1574        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
1575    }                                                                                                                                                     \
1576    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X))                                                                                                        \
1577    {                                                                                                                                                     \
1578        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
1579    }                                                                                                                                                     \
1580    else                                                                                                                                                  \
1581    {                                                                                                                                                     \
1582        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                               \
1583    }
1584
1585#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
1586    if(!(PARTIAL_COND_X))                                                                                         \
1587    {                                                                                                             \
1588        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
1589    }                                                                                                             \
1590    else                                                                                                          \
1591    {                                                                                                             \
1592        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
1593    }
1594
1595#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
1596    if(!(PARTIAL_COND_Y))                                                                                         \
1597    {                                                                                                             \
1598        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
1599    }                                                                                                             \
1600    else                                                                                                          \
1601    {                                                                                                             \
1602        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
1603    }
1604
1605
1606#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
1607
1608
1609#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
1610
1611#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
1612    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1613
1614#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
1615
1616#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
1617    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)
1618
1619#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
1620
1621#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
1622    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)
1623
1624#else
1625
1626#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
1627    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)
1628
1629#endif
1630
1631#endif
1632
1633
1634#if defined(PARTIAL_STORE_M0)
1635
1636#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
1637    ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0))))
1638#else
1639#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
1640    ((uint)(y * M0))
1641#endif
1642
1643
1644
1645#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
1646    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)
1647
1648
1649#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
1650#pragma OPENCL EXTENSION cl_khr_fp16 : enable
1651#endif
1652
1653#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
1654#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
1655#endif
1656
1657#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
1658#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
1659#endif
1660
1661#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
1662#pragma OPENCL EXTENSION cl_arm_printf : enable
1663#endif
1664
1665#define GPU_ARCH_MIDGARD 0x100
1666#define GPU_ARCH_BIFROST 0x200
1667#define GPU_ARCH_VALHALL 0x300
1668
1669
1670#define CONCAT(a, b) a##b
1671
1672
1673#define EXPAND(x) x
1674
1675
1676#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
1677
1678
1679#define REV1(x) ((x))
1680#define REV2(x) ((x).s10)
1681#define REV3(x) ((x).s210)
1682#define REV4(x) ((x).s3210)
1683#define REV8(x) ((x).s76543210)
1684#define REV16(x) ((x).sFEDCBA9876543210)
1685
1686
1687
1688#define REVERSE_STR(x, s) REV##s((x))
1689#define REVERSE(x, s) REVERSE_STR(x, s)
1690
1691
1692
1693#define ROT1_0(x) ((x))
1694#define ROT1_1(x) ((x))
1695
1696#define ROT2_0(x) ((x))
1697#define ROT2_1(x) ((x).s10)
1698#define ROT2_2(x) ((x))
1699
1700#define ROT3_0(x) ((x))
1701#define ROT3_1(x) ((x).s201)
1702#define ROT3_2(x) ((x).s120)
1703#define ROT3_3(x) ((x))
1704
1705#define ROT4_0(x) ((x))
1706#define ROT4_1(x) ((x).s3012)
1707#define ROT4_2(x) ((x).s2301)
1708#define ROT4_3(x) ((x).s1230)
1709#define ROT4_4(x) ((x))
1710
1711#define ROT8_0(x) ((x))
1712#define ROT8_1(x) ((x).s70123456)
1713#define ROT8_2(x) ((x).s67012345)
1714#define ROT8_3(x) ((x).s56701234)
1715#define ROT8_4(x) ((x).s45670123)
1716#define ROT8_5(x) ((x).s34567012)
1717#define ROT8_6(x) ((x).s23456701)
1718#define ROT8_7(x) ((x).s12345670)
1719#define ROT8_8(x) ((x))
1720
1721#define ROT16_0(x) ((x))
1722#define ROT16_1(x) ((x).sF0123456789ABCDE)
1723#define ROT16_2(x) ((x).sEF0123456789ABCD)
1724#define ROT16_3(x) ((x).sDEF0123456789ABC)
1725#define ROT16_4(x) ((x).sCDEF0123456789AB)
1726#define ROT16_5(x) ((x).sBCDEF0123456789A)
1727#define ROT16_6(x) ((x).sABCDEF0123456789)
1728#define ROT16_7(x) ((x).s9ABCDEF012345678)
1729#define ROT16_8(x) ((x).s89ABCDEF01234567)
1730#define ROT16_9(x) ((x).s789ABCDEF0123456)
1731#define ROT16_10(x) ((x).s6789ABCDEF012345)
1732#define ROT16_11(x) ((x).s56789ABCDEF01234)
1733#define ROT16_12(x) ((x).s456789ABCDEF0123)
1734#define ROT16_13(x) ((x).s3456789ABCDEF012)
1735#define ROT16_14(x) ((x).s23456789ABCDEF01)
1736#define ROT16_15(x) ((x).s123456789ABCDEF0)
1737#define ROT16_16(x) ((x))
1738
1739
1740
1741#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
1742#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
1743
1744
1745
1746#define V_OFFS1(dt) (dt##1)(0)
1747#define V_OFFS2(dt) (dt##2)(0, 1)
1748#define V_OFFS3(dt) (dt##3)(0, 1, 2)
1749#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3)
1750#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
1751#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
1752
1753
1754
1755#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
1756#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
1757
1758
1759#define VLOAD_STR(size) vload##size
1760#define VLOAD(size) VLOAD_STR(size)
1761
1762
1763#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size
1764#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size)
1765
1766#define NO_LOAD(data, offs, ptr) \
1767    {                            \
1768    }
1769
1770
1771#define vload_partial_1_0 NO_LOAD
1772#define vload_partial_1_1 vload1
1773#define vload_partial_1_2 NO_LOAD
1774#define vload_partial_1_3 NO_LOAD
1775#define vload_partial_1_4 NO_LOAD
1776#define vload_partial_1_5 NO_LOAD
1777#define vload_partial_1_6 NO_LOAD
1778#define vload_partial_1_7 NO_LOAD
1779#define vload_partial_1_8 NO_LOAD
1780#define vload_partial_1_9 NO_LOAD
1781#define vload_partial_1_10 NO_LOAD
1782#define vload_partial_1_11 NO_LOAD
1783#define vload_partial_1_12 NO_LOAD
1784#define vload_partial_1_13 NO_LOAD
1785#define vload_partial_1_14 NO_LOAD
1786#define vload_partial_1_15 NO_LOAD
1787#define vload_partial_1_16 NO_LOAD
1788
1789#define vload_partial_2_0 NO_LOAD
1790#define vload_partial_2_1 vload_partial_1
1791#define vload_partial_2_2 vload_partial_2
1792#define vload_partial_2_3 NO_LOAD
1793#define vload_partial_2_4 NO_LOAD
1794#define vload_partial_2_5 NO_LOAD
1795#define vload_partial_2_6 NO_LOAD
1796#define vload_partial_2_7 NO_LOAD
1797#define vload_partial_2_8 NO_LOAD
1798#define vload_partial_2_9 NO_LOAD
1799#define vload_partial_2_10 NO_LOAD
1800#define vload_partial_2_11 NO_LOAD
1801#define vload_partial_2_12 NO_LOAD
1802#define vload_partial_2_13 NO_LOAD
1803#define vload_partial_2_14 NO_LOAD
1804#define vload_partial_2_15 NO_LOAD
1805#define vload_partial_2_16 NO_LOAD
1806
1807#define vload_partial_3_0 NO_LOAD
1808#define vload_partial_3_1 vload_partial_1
1809#define vload_partial_3_2 vload_partial_2
1810#define vload_partial_3_3 vload_partial_3
1811#define vload_partial_3_4 NO_LOAD
1812#define vload_partial_3_5 NO_LOAD
1813#define vload_partial_3_6 NO_LOAD
1814#define vload_partial_3_7 NO_LOAD
1815#define vload_partial_3_8 NO_LOAD
1816#define vload_partial_3_9 NO_LOAD
1817#define vload_partial_3_10 NO_LOAD
1818#define vload_partial_3_11 NO_LOAD
1819#define vload_partial_3_12 NO_LOAD
1820#define vload_partial_3_13 NO_LOAD
1821#define vload_partial_3_14 NO_LOAD
1822#define vload_partial_3_15 NO_LOAD
1823#define vload_partial_3_16 NO_LOAD
1824
1825#define vload_partial_4_0 NO_LOAD
1826#define vload_partial_4_1 vload_partial_1
1827#define vload_partial_4_2 vload_partial_2
1828#define vload_partial_4_3 vload_partial_3
1829#define vload_partial_4_4 vload_partial_4
1830#define vload_partial_4_5 NO_LOAD
1831#define vload_partial_4_6 NO_LOAD
1832#define vload_partial_4_7 NO_LOAD
1833#define vload_partial_4_8 NO_LOAD
1834#define vload_partial_4_9 NO_LOAD
1835#define vload_partial_4_10 NO_LOAD
1836#define vload_partial_4_11 NO_LOAD
1837#define vload_partial_4_12 NO_LOAD
1838#define vload_partial_4_13 NO_LOAD
1839#define vload_partial_4_14 NO_LOAD
1840#define vload_partial_4_15 NO_LOAD
1841#define vload_partial_4_16 NO_LOAD
1842
1843#define vload_partial_8_0 NO_LOAD
1844#define vload_partial_8_1 vload_partial_1
1845#define vload_partial_8_2 vload_partial_2
1846#define vload_partial_8_3 vload_partial_3
1847#define vload_partial_8_4 vload_partial_4
1848#define vload_partial_8_5 vload_partial_5
1849#define vload_partial_8_6 vload_partial_6
1850#define vload_partial_8_7 vload_partial_7
1851#define vload_partial_8_8 vload_partial_8
1852#define vload_partial_8_9 NO_LOAD
1853#define vload_partial_8_10 NO_LOAD
1854#define vload_partial_8_11 NO_LOAD
1855#define vload_partial_8_12 NO_LOAD
1856#define vload_partial_8_13 NO_LOAD
1857#define vload_partial_8_14 NO_LOAD
1858#define vload_partial_8_15 NO_LOAD
1859#define vload_partial_8_16 NO_LOAD
1860
1861#define vload_partial_16_0 NO_LOAD
1862#define vload_partial_16_1 vload_partial_1
1863#define vload_partial_16_2 vload_partial_2
1864#define vload_partial_16_3 vload_partial_3
1865#define vload_partial_16_4 vload_partial_4
1866#define vload_partial_16_5 vload_partial_5
1867#define vload_partial_16_6 vload_partial_6
1868#define vload_partial_16_7 vload_partial_7
1869#define vload_partial_16_8 vload_partial_8
1870#define vload_partial_16_9 vload_partial_9
1871#define vload_partial_16_10 vload_partial_10
1872#define vload_partial_16_11 vload_partial_11
1873#define vload_partial_16_12 vload_partial_12
1874#define vload_partial_16_13 vload_partial_13
1875#define vload_partial_16_14 vload_partial_14
1876#define vload_partial_16_15 vload_partial_15
1877#define vload_partial_16_16 vload_partial_16
1878
1879
1880#define vload_partial_1(DATA, OFFSET, PTR) \
1881    DATA.s0 = vload1(OFFSET, PTR);
1882
1883#define vload_partial_2(DATA, OFFSET, PTR) \
1884    DATA.s01 = vload2(OFFSET, PTR);
1885
1886#define vload_partial_3(DATA, OFFSET, PTR) \
1887    DATA.s012 = vload3(OFFSET, PTR);
1888
1889#define vload_partial_4(DATA, OFFSET, PTR) \
1890    DATA.s0123 = vload4(OFFSET, PTR);
1891
1892#define vload_partial_5(DATA, OFFSET, PTR)    \
1893    vload_partial_4(DATA.s0123, OFFSET, PTR); \
1894    DATA.s4 = vload1(OFFSET, PTR + 4);
1895
1896#define vload_partial_6(DATA, OFFSET, PTR)    \
1897    vload_partial_4(DATA.s0123, OFFSET, PTR); \
1898    vload_partial_2(DATA.s45, OFFSET, PTR + 4);
1899
1900#define vload_partial_7(DATA, OFFSET, PTR)    \
1901    vload_partial_4(DATA.s0123, OFFSET, PTR); \
1902    vload_partial_3(DATA.s456, OFFSET, PTR + 4);
1903
1904#define vload_partial_8(DATA, OFFSET, PTR) \
1905    DATA.s01234567 = vload8(OFFSET, PTR);
1906
1907#define vload_partial_9(DATA, OFFSET, PTR)        \
1908    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1909    DATA.s8 = vload1(OFFSET, PTR + 8);
1910
1911#define vload_partial_10(DATA, OFFSET, PTR)       \
1912    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1913    vload_partial_2(DATA.s89, OFFSET, PTR + 8);
1914
1915#define vload_partial_11(DATA, OFFSET, PTR)       \
1916    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1917    vload_partial_3(DATA.s89A, OFFSET, PTR + 8);
1918
1919#define vload_partial_12(DATA, OFFSET, PTR)       \
1920    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1921    vload_partial_4(DATA.s89AB, OFFSET, PTR + 8);
1922
1923#define vload_partial_13(DATA, OFFSET, PTR)       \
1924    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1925    vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);
1926
1927#define vload_partial_14(DATA, OFFSET, PTR)       \
1928    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1929    vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);
1930
1931#define vload_partial_15(DATA, OFFSET, PTR)       \
1932    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1933    vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);
1934
1935#define vload_partial_16(DATA, OFFSET, PTR) \
1936    DATA = vload16(OFFSET, PTR);
1937
1938
1939
1940#define PIXEL_UNIT4 1
1941#define PIXEL_UNIT8 2
1942#define PIXEL_UNIT16 4
1943
1944
1945#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
1946#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)
1947
1948
1949#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
1950#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
1951#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));
1952
1953#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
1954#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
1955#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
1956#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
1957#endif
1958
1959#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values));
1960#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567));
1961#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
1962
1963#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
1964#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values));
1965#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567));
1966#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
1967#endif
1968
1969
1970#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
1971#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)
1972
1973
1974#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
1975#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)
1976
1977#define VSTORE_STR(size) vstore##size
1978#define VSTORE(size) VSTORE_STR(size)
1979
1980#define float1 float
1981#define half1 half
1982#define char1 char
1983#define uchar1 uchar
1984#define short1 short
1985#define ushort1 ushort
1986#define int1 int
1987#define uint1 uint
1988#define long1 long
1989#define ulong1 ulong
1990#define double1 double
1991
1992#define vload1(OFFSET, PTR) *(OFFSET + PTR)
1993#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
1994
1995
1996#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
1997#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)
1998
1999#define NO_STORE(data, offs, ptr) \
2000    {                             \
2001    }
2002
2003
2004#define vstore_partial_1_0 NO_STORE
2005#define vstore_partial_1_1 vstore1
2006#define vstore_partial_1_2 NO_STORE
2007#define vstore_partial_1_3 NO_STORE
2008#define vstore_partial_1_4 NO_STORE
2009#define vstore_partial_1_5 NO_STORE
2010#define vstore_partial_1_6 NO_STORE
2011#define vstore_partial_1_7 NO_STORE
2012#define vstore_partial_1_8 NO_STORE
2013#define vstore_partial_1_9 NO_STORE
2014#define vstore_partial_1_10 NO_STORE
2015#define vstore_partial_1_11 NO_STORE
2016#define vstore_partial_1_12 NO_STORE
2017#define vstore_partial_1_13 NO_STORE
2018#define vstore_partial_1_14 NO_STORE
2019#define vstore_partial_1_15 NO_STORE
2020#define vstore_partial_1_16 NO_STORE
2021
2022#define vstore_partial_2_0 NO_STORE
2023#define vstore_partial_2_1 vstore_partial_1
2024#define vstore_partial_2_2 vstore_partial_2
2025#define vstore_partial_2_3 NO_STORE
2026#define vstore_partial_2_4 NO_STORE
2027#define vstore_partial_2_5 NO_STORE
2028#define vstore_partial_2_6 NO_STORE
2029#define vstore_partial_2_7 NO_STORE
2030#define vstore_partial_2_8 NO_STORE
2031#define vstore_partial_2_9 NO_STORE
2032#define vstore_partial_2_10 NO_STORE
2033#define vstore_partial_2_11 NO_STORE
2034#define vstore_partial_2_12 NO_STORE
2035#define vstore_partial_2_13 NO_STORE
2036#define vstore_partial_2_14 NO_STORE
2037#define vstore_partial_2_15 NO_STORE
2038#define vstore_partial_2_16 NO_STORE
2039
2040#define vstore_partial_3_0 NO_STORE
2041#define vstore_partial_3_1 vstore_partial_1
2042#define vstore_partial_3_2 vstore_partial_2
2043#define vstore_partial_3_3 vstore_partial_3
2044#define vstore_partial_3_4 NO_STORE
2045#define vstore_partial_3_5 NO_STORE
2046#define vstore_partial_3_6 NO_STORE
2047#define vstore_partial_3_7 NO_STORE
2048#define vstore_partial_3_8 NO_STORE
2049#define vstore_partial_3_9 NO_STORE
2050#define vstore_partial_3_10 NO_STORE
2051#define vstore_partial_3_11 NO_STORE
2052#define vstore_partial_3_12 NO_STORE
2053#define vstore_partial_3_13 NO_STORE
2054#define vstore_partial_3_14 NO_STORE
2055#define vstore_partial_3_15 NO_STORE
2056#define vstore_partial_3_16 NO_STORE
2057
2058#define vstore_partial_4_0 NO_STORE
2059#define vstore_partial_4_1 vstore_partial_1
2060#define vstore_partial_4_2 vstore_partial_2
2061#define vstore_partial_4_3 vstore_partial_3
2062#define vstore_partial_4_4 vstore_partial_4
2063#define vstore_partial_4_5 NO_STORE
2064#define vstore_partial_4_6 NO_STORE
2065#define vstore_partial_4_7 NO_STORE
2066#define vstore_partial_4_8 NO_STORE
2067#define vstore_partial_4_9 NO_STORE
2068#define vstore_partial_4_10 NO_STORE
2069#define vstore_partial_4_11 NO_STORE
2070#define vstore_partial_4_12 NO_STORE
2071#define vstore_partial_4_13 NO_STORE
2072#define vstore_partial_4_14 NO_STORE
2073#define vstore_partial_4_15 NO_STORE
2074#define vstore_partial_4_16 NO_STORE
2075
2076#define vstore_partial_8_0 NO_STORE
2077#define vstore_partial_8_1 vstore_partial_1
2078#define vstore_partial_8_2 vstore_partial_2
2079#define vstore_partial_8_3 vstore_partial_3
2080#define vstore_partial_8_4 vstore_partial_4
2081#define vstore_partial_8_5 vstore_partial_5
2082#define vstore_partial_8_6 vstore_partial_6
2083#define vstore_partial_8_7 vstore_partial_7
2084#define vstore_partial_8_8 vstore_partial_8
2085#define vstore_partial_8_9 NO_STORE
2086#define vstore_partial_8_10 NO_STORE
2087#define vstore_partial_8_11 NO_STORE
2088#define vstore_partial_8_12 NO_STORE
2089#define vstore_partial_8_13 NO_STORE
2090#define vstore_partial_8_14 NO_STORE
2091#define vstore_partial_8_15 NO_STORE
2092#define vstore_partial_8_16 NO_STORE
2093
2094#define vstore_partial_16_0 NO_STORE
2095#define vstore_partial_16_1 vstore_partial_1
2096#define vstore_partial_16_2 vstore_partial_2
2097#define vstore_partial_16_3 vstore_partial_3
2098#define vstore_partial_16_4 vstore_partial_4
2099#define vstore_partial_16_5 vstore_partial_5
2100#define vstore_partial_16_6 vstore_partial_6
2101#define vstore_partial_16_7 vstore_partial_7
2102#define vstore_partial_16_8 vstore_partial_8
2103#define vstore_partial_16_9 vstore_partial_9
2104#define vstore_partial_16_10 vstore_partial_10
2105#define vstore_partial_16_11 vstore_partial_11
2106#define vstore_partial_16_12 vstore_partial_12
2107#define vstore_partial_16_13 vstore_partial_13
2108#define vstore_partial_16_14 vstore_partial_14
2109#define vstore_partial_16_15 vstore_partial_15
2110#define vstore_partial_16_16 vstore_partial_16
2111
2112
2113#define vstore_partial_1(DATA, OFFSET, PTR) \
2114    vstore1(DATA.s0, OFFSET, PTR);
2115
2116#define vstore_partial_2(DATA, OFFSET, PTR) \
2117    vstore2(DATA.s01, OFFSET, PTR);
2118
2119#define vstore_partial_3(DATA, OFFSET, PTR) \
2120    vstore3(DATA.s012, OFFSET, PTR);
2121
2122#define vstore_partial_4(DATA, OFFSET, PTR) \
2123    vstore4(DATA.s0123, OFFSET, PTR);
2124
2125#define vstore_partial_5(DATA, OFFSET, PTR)    \
2126    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
2127    vstore1(DATA.s4, OFFSET, PTR + 4);
2128
2129#define vstore_partial_6(DATA, OFFSET, PTR)    \
2130    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
2131    vstore_partial_2(DATA.s45, OFFSET, PTR + 4);
2132
2133#define vstore_partial_7(DATA, OFFSET, PTR)    \
2134    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
2135    vstore_partial_3(DATA.s456, OFFSET, PTR + 4);
2136
2137#define vstore_partial_8(DATA, OFFSET, PTR) \
2138    vstore8(DATA.s01234567, OFFSET, PTR);
2139
2140#define vstore_partial_9(DATA, OFFSET, PTR)        \
2141    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2142    vstore1(DATA.s8, OFFSET, PTR + 8);
2143
2144#define vstore_partial_10(DATA, OFFSET, PTR)       \
2145    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2146    vstore_partial_2(DATA.s89, OFFSET, PTR + 8);
2147
2148#define vstore_partial_11(DATA, OFFSET, PTR)       \
2149    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2150    vstore_partial_3(DATA.s89a, OFFSET, PTR + 8);
2151
2152#define vstore_partial_12(DATA, OFFSET, PTR)       \
2153    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2154    vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8);
2155
2156#define vstore_partial_13(DATA, OFFSET, PTR)       \
2157    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2158    vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8);
2159
2160#define vstore_partial_14(DATA, OFFSET, PTR)       \
2161    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2162    vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8);
2163
2164#define vstore_partial_15(DATA, OFFSET, PTR)       \
2165    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2166    vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8);
2167
2168#define vstore_partial_16(DATA, OFFSET, PTR) \
2169    vstore16(DATA, OFFSET, PTR);
2170
2171
2172
2173
2174
2175#define convert_float_sat convert_float
2176#define convert_float1_sat convert_float
2177#define convert_float2_sat convert_float2
2178#define convert_float3_sat convert_float3
2179#define convert_float4_sat convert_float4
2180#define convert_float8_sat convert_float8
2181#define convert_float16_sat convert_float16
2182#define convert_half_sat convert_float
2183#define convert_half1_sat convert_half
2184#define convert_half2_sat convert_half2
2185#define convert_half3_sat convert_half3
2186#define convert_half4_sat convert_half4
2187#define convert_half8_sat convert_half8
2188#define convert_half16_sat convert_half16
2189
2190#define convert_float1 convert_float
2191#define convert_half1 convert_half
2192#define convert_char1 convert_char
2193#define convert_uchar1 convert_uchar
2194#define convert_short1 convert_short
2195#define convert_ushort1 convert_ushort
2196#define convert_int1 convert_int
2197#define convert_uint1 convert_uint
2198#define convert_long1 convert_long
2199#define convert_ulong1 convert_ulong
2200#define convert_double1 convert_double
2201
2202#define convert_char1_sat convert_char_sat
2203#define convert_uchar1_sat convert_uchar_sat
2204#define convert_uchar2_sat convert_uchar2_sat
2205#define convert_uchar3_sat convert_uchar3_sat
2206#define convert_uchar4_sat convert_uchar4_sat
2207#define convert_uchar8_sat convert_uchar8_sat
2208#define convert_uchar16_sat convert_uchar16_sat
2209#define convert_short1_sat convert_short_sat
2210#define convert_ushort1_sat convert_ushort_sat
2211#define convert_int1_sat convert_int_sat
2212#define convert_uint1_sat convert_uint_sat
2213#define convert_long1_sat convert_long_sat
2214#define convert_ulong1_sat convert_ulong_sat
2215#define convert_double1_sat convert_double_sat
2216
2217#define VEC_DATA_TYPE_STR(type, size) type##size
2218#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
2219
2220#define CONVERT_STR(x, type) (convert_##type((x)))
2221#define CONVERT(x, type) CONVERT_STR(x, type)
2222
2223#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
2224#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
2225
2226#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
2227#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
2228
2229#define select_vec_dt_uchar(size) uchar##size
2230#define select_vec_dt_char(size) char##size
2231#define select_vec_dt_ushort(size) ushort##size
2232#define select_vec_dt_short(size) short##size
2233#define select_vec_dt_half(size) short##size
2234#define select_vec_dt_uint(size) uint##size
2235#define select_vec_dt_int(size) int##size
2236#define select_vec_dt_float(size) int##size
2237#define select_vec_dt_ulong(size) ulong##size
2238#define select_vec_dt_long(size) long##size
2239
2240#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
2241#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
2242#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)
2243
2244#define signed_int_vec_dt_uchar(size) char##size
2245#define signed_int_vec_dt_char(size) char##size
2246#define signed_int_vec_dt_ushort(size) short##size
2247#define signed_int_vec_dt_short(size) short##size
2248#define signed_int_vec_dt_half(size) short##size
2249#define signed_int_vec_dt_uint(size) int##size
2250#define signed_int_vec_dt_int(size) int##size
2251#define signed_int_vec_dt_float(size) int##size
2252#define signed_int_vec_dt_ulong(size) long##size
2253#define signed_int_vec_dt_long(size) long##size
2254
2255#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size)
2256#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size)
2257#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1)
2258
2259#define sum_reduce_1(x) (x)
2260#define sum_reduce_2(x) ((x).s0) + ((x).s1)
2261#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2)
2262#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
2263#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567)
2264#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF)
2265
2266#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x)
2267#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)
2268
2269#define prod_reduce_1(x) (x)
2270#define prod_reduce_2(x) ((x).s0) * ((x).s1)
2271#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2)
2272#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23)
2273#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567)
2274#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF)
2275
2276#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x)
2277#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)
2278
2279#define max_reduce_1(x) (x)
2280#define max_reduce_2(x) max(((x).s0), ((x).s1))
2281#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
2282#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
2283#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
2284#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))
2285
2286#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
2287#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)
2288
2289#define VECTOR_DECLARATION(name)     \
2290    __global uchar *name##_ptr,      \
2291    uint        name##_stride_x, \
2292    uint        name##_step_x,   \
2293    uint        name##_offset_first_element_in_bytes
2294
2295#define IMAGE_DECLARATION(name)      \
2296    __global uchar *name##_ptr,      \
2297    uint        name##_stride_x, \
2298    uint        name##_step_x,   \
2299    uint        name##_stride_y, \
2300    uint        name##_step_y,   \
2301    uint        name##_offset_first_element_in_bytes
2302
2303#define TENSOR3D_DECLARATION(name)   \
2304    __global uchar *name##_ptr,      \
2305    uint        name##_stride_x, \
2306    uint        name##_step_x,   \
2307    uint        name##_stride_y, \
2308    uint        name##_step_y,   \
2309    uint        name##_stride_z, \
2310    uint        name##_step_z,   \
2311    uint        name##_offset_first_element_in_bytes
2312
2313#define TENSOR4D_DECLARATION(name)   \
2314    __global uchar *name##_ptr,      \
2315    uint        name##_stride_x, \
2316    uint        name##_step_x,   \
2317    uint        name##_stride_y, \
2318    uint        name##_step_y,   \
2319    uint        name##_stride_z, \
2320    uint        name##_step_z,   \
2321    uint        name##_stride_w, \
2322    uint        name##_step_w,   \
2323    uint        name##_offset_first_element_in_bytes
2324
2325#define TENSOR5D_DECLARATION(name)   \
2326    __global uchar *name##_ptr,      \
2327    uint        name##_stride_x, \
2328    uint        name##_step_x,   \
2329    uint        name##_stride_y, \
2330    uint        name##_step_y,   \
2331    uint        name##_stride_z, \
2332    uint        name##_step_z,   \
2333    uint        name##_stride_w, \
2334    uint        name##_step_w,   \
2335    uint        name##_stride_v, \
2336    uint        name##_step_v,   \
2337    uint        name##_offset_first_element_in_bytes
2338
2339#define CONVERT_TO_VECTOR_STRUCT(name) \
2340    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
2341
2342#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
2343    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
2344
2345#define CONVERT_TO_IMAGE_STRUCT(name) \
2346    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
2347
2348#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
2349    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
2350
2351#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
2352    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
2353
2354#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
2355    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
2356
2357#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
2358    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
2359
2360#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
2361    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
2362                                 name##_stride_z, name##_step_z)
2363
2364#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
2365    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
2366
2367#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
2368    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
2369                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
2370
2371#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
2372    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
2373
2374#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name)                                                                                       \
2375    tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
2376                           name##_stride_z, name##_step_z)
2377
2378
2379typedef struct Vector
2380{
2381    __global uchar *ptr;
2382    int             offset_first_element_in_bytes;
2383    int             stride_x;
2384} Vector;
2385
2386
2387typedef struct Image
2388{
2389    __global uchar *ptr;
2390    int             offset_first_element_in_bytes;
2391    int             stride_x;
2392    int             stride_y;
2393} Image;
2394
2395
2396typedef struct Tensor3D
2397{
2398    __global uchar *ptr;
2399    int             offset_first_element_in_bytes;
2400    int             stride_x;
2401    int             stride_y;
2402    int             stride_z;
2403} Tensor3D;
2404
2405
2406typedef struct Tensor4D
2407{
2408    __global uchar *ptr;
2409    int             offset_first_element_in_bytes;
2410    int             stride_x;
2411    int             stride_y;
2412    int             stride_z;
2413    int             stride_w;
2414} Tensor4D;
2415
2416
2417inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
2418{
2419    Vector vector =
2420    {
2421        .ptr                           = ptr,
2422        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2423        .stride_x                      = stride_x,
2424    };
2425    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
2426    return vector;
2427}
2428
2429
2430inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
2431{
2432    Image img =
2433    {
2434        .ptr                           = ptr,
2435        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2436        .stride_x                      = stride_x,
2437        .stride_y                      = stride_y
2438    };
2439    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
2440    return img;
2441}
2442
2443
2444inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
2445{
2446    Image img =
2447    {
2448        .ptr                           = ptr,
2449        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2450        .stride_x                      = stride_x,
2451        .stride_y                      = stride_y
2452    };
2453    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
2454    return img;
2455}
2456
2457
2458inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
2459{
2460    Tensor3D tensor =
2461    {
2462        .ptr                           = ptr,
2463        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2464        .stride_x                      = stride_x,
2465        .stride_y                      = stride_y,
2466        .stride_z                      = stride_z
2467    };
2468    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
2469    return tensor;
2470}
2471
2472
2473inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
2474{
2475    Tensor3D tensor =
2476    {
2477        .ptr                           = ptr,
2478        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2479        .stride_x                      = stride_x,
2480        .stride_y                      = stride_y,
2481        .stride_z                      = stride_z
2482    };
2483    return tensor;
2484}
2485
2486inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
2487                                             uint step_w,
2488                                             uint mod_size)
2489{
2490    Tensor4D tensor =
2491    {
2492        .ptr                           = ptr,
2493        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2494        .stride_x                      = stride_x,
2495        .stride_y                      = stride_y,
2496        .stride_z                      = stride_z,
2497        .stride_w                      = stride_w
2498    };
2499
2500    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
2501    return tensor;
2502}
2503
2504
2505inline __global const uchar *vector_offset(const Vector *vec, int x)
2506{
2507    return vec->ptr + x * vec->stride_x;
2508}
2509
2510
2511inline __global uchar *offset(const Image *img, int x, int y)
2512{
2513    return img->ptr + x * img->stride_x + y * img->stride_y;
2514}
2515
2516
2517inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
2518{
2519    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
2520}
2521
2522
2523inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
2524{
2525    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
2526}
2527
2528
2529inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
2530{
2531    uint num_elements = width * height;
2532
2533    const uint z = index / num_elements;
2534
2535    index %= num_elements;
2536
2537    const uint y = index / width;
2538
2539    index %= width;
2540
2541    const uint x = index;
2542
2543    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes;
2544}
2545
2546#endif
2547
2548
2549#define SCALAR_ACCESS_STR(offset, n0, x) scalar_access_##offset##_##n0(x)
2550#define SCALAR_ACCESS(offset, n0, x) SCALAR_ACCESS_STR(offset, n0, x)
2551
2552
2553#define scalar_access_0_1(x) ((x).s0)
2554#define scalar_access_0_2(x) ((x).s01)
2555#define scalar_access_0_3(x) ((x).s012)
2556#define scalar_access_0_4(x) ((x).s0123)
2557#define scalar_access_0_8(x) ((x).s01234567)
2558#define scalar_access_0_16(x) ((x).s0123456789ABCDEF)
2559
2560
2561#define scalar_access_1_1(x) ((x).s1)
2562#define scalar_access_1_2(x) ((x).s12)
2563#define scalar_access_1_3(x) ((x).s123)
2564#define scalar_access_1_4(x) ((x).s1234)
2565#define scalar_access_1_8(x) ((x).s12345678)
2566
2567
2568#define scalar_access_2_1(x) ((x).s2)
2569#define scalar_access_2_2(x) ((x).s23)
2570#define scalar_access_2_3(x) ((x).s234)
2571#define scalar_access_2_4(x) ((x).s2345)
2572#define scalar_access_2_8(x) ((x).s23456789)
2573
2574
2575#define scalar_access_3_1(x) ((x).s3)
2576#define scalar_access_3_2(x) ((x).s34)
2577#define scalar_access_3_3(x) ((x).s345)
2578#define scalar_access_3_4(x) ((x).s3456)
2579#define scalar_access_3_8(x) ((x).s3456789A)
2580
2581
2582#define scalar_access_4_1(x) ((x).s4)
2583#define scalar_access_4_2(x) ((x).s45)
2584#define scalar_access_4_3(x) ((x).s456)
2585#define scalar_access_4_4(x) ((x).s4567)
2586#define scalar_access_4_8(x) ((x).s456789AB)
2587
2588
2589#define scalar_access_8_1(x) ((x).s8)
2590#define scalar_access_8_2(x) ((x).s89)
2591#define scalar_access_8_3(x) ((x).s89A)
2592#define scalar_access_8_4(x) ((x).s89AB)
2593#define scalar_access_8_8(x) ((x).s89ABCDEF)
2594
2595
2596#define scalar_access_12_1(x) ((x).sC)
2597#define scalar_access_12_2(x) ((x).sCD)
2598#define scalar_access_12_3(x) ((x).sCDE)
2599#define scalar_access_12_4(x) ((x).sCDEF)
2600
2601
2602#define scalar_access_16_1(x) ((x).sF)
2603
2604
2605#define LOAD_TENSOR_ROW_0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2606    ({})
2607
2608#define LOAD_TENSOR_ROW_1(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2609    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##0) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
2610
2611#define LOAD_TENSOR_ROW_2(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2612    LOAD_TENSOR_ROW_1(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2613    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##1) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
2614
2615#define LOAD_TENSOR_ROW_3(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2616    LOAD_TENSOR_ROW_2(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2617    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##2) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
2618
2619#define LOAD_TENSOR_ROW_4(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2620    LOAD_TENSOR_ROW_3(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2621    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##3) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
2622
2623#define LOAD_TENSOR_ROW_5(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2624    LOAD_TENSOR_ROW_4(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2625    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##4) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
2626
2627#define LOAD_TENSOR_ROW_6(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2628    LOAD_TENSOR_ROW_5(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2629    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##5) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
2630
2631#define LOAD_TENSOR_ROW_7(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2632    LOAD_TENSOR_ROW_6(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2633    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##6) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
2634
2635#define LOAD_TENSOR_ROW_8(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2636    LOAD_TENSOR_ROW_7(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2637    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##7) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
2638
2639#define LOAD_TENSOR_ROW_9(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2640    LOAD_TENSOR_ROW_8(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2641    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##8) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
2642
2643#define LOAD_TENSOR_ROW_10(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2644    LOAD_TENSOR_ROW_9(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)      \
2645    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##9) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
2646
2647#define LOAD_TENSOR_ROW_11(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2648    LOAD_TENSOR_ROW_10(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2649    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##A) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
2650
2651#define LOAD_TENSOR_ROW_12(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2652    LOAD_TENSOR_ROW_11(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2653    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##B) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
2654
2655#define LOAD_TENSOR_ROW_13(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2656    LOAD_TENSOR_ROW_12(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2657    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##C) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
2658
2659#define LOAD_TENSOR_ROW_14(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2660    LOAD_TENSOR_ROW_13(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2661    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##D) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
2662
2663#define LOAD_TENSOR_ROW_15(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2664    LOAD_TENSOR_ROW_14(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2665    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##E) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
2666
2667#define LOAD_TENSOR_ROW_16(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2668    LOAD_TENSOR_ROW_15(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2669    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##F) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
2670
2671
2672
2673#define LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) LOAD_TENSOR_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)
2674#define LOAD_TENSOR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)
2675
2676
2677
2678#define LOAD_TENSOR_M0X0(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2679    ({})
2680
2681#define LOAD_TENSOR_M0X1(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2682    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
2683
2684#define LOAD_TENSOR_M0X2(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2685    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
2686
2687#define LOAD_TENSOR_M0X3(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2688    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
2689
2690#define LOAD_TENSOR_M0X4(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2691    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
2692
2693#define LOAD_TENSOR_M0X5(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2694    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);       \
2695    LOAD_TENSOR(M0, 1, DATA_TYPE, a, input_ptr + 4 * sizeof(DATA_TYPE), 4, src_stride_y, zin);
2696
2697#define LOAD_TENSOR_M0X6(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2698    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);       \
2699    LOAD_TENSOR(M0, 2, DATA_TYPE, a, input_ptr + 4 * sizeof(DATA_TYPE), 4, src_stride_y, zin);
2700
2701#define LOAD_TENSOR_M0X7(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2702    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);       \
2703    LOAD_TENSOR(M0, 3, DATA_TYPE, a, input_ptr + 4 * sizeof(DATA_TYPE), 4, src_stride_y, zin);
2704
2705#define LOAD_TENSOR_M0X8(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2706    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
2707
2708#define LOAD_TENSOR_M0X9(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2709    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr 0, src_stride_y, zin);        \
2710    LOAD_TENSOR(M0, 1, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);
2711
2712#define LOAD_TENSOR_M0X10(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2713    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);        \
2714    LOAD_TENSOR(M0, 2, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);
2715
2716#define LOAD_TENSOR_M0X11(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2717    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);        \
2718    LOAD_TENSOR(M0, 3, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);
2719
2720#define LOAD_TENSOR_M0X12(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2721    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);        \
2722    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);
2723
2724#define LOAD_TENSOR_M0X13(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin)                  \
2725    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);                         \
2726    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); \
2727    LOAD_TENSOR(M0, 1, DATA_TYPE, a, input_ptr + 12 * sizeof(DATA_TYPE), 12, src_stride_y, zin);
2728
2729#define LOAD_TENSOR_M0X14(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin)                  \
2730    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr 0, src_stride_y, zin);                          \
2731    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); \
2732    LOAD_TENSOR(M0, 2, DATA_TYPE, a, input_ptr + 12 * sizeof(DATA_TYPE), 12, src_stride_y, zin);
2733
2734#define LOAD_TENSOR_M0X15(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin)                  \
2735    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);                         \
2736    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); \
2737    LOAD_TENSOR(M0, 3, DATA_TYPE, a, input_ptr + 12 * sizeof(DATA_TYPE), 12, src_stride_y, zin);
2738
2739#define LOAD_TENSOR_M0X16(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2740    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
2741
2742
2743
2744#define LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) LOAD_TENSOR_M0X##N0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
2745#define LOAD_TENSOR_M0XN0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
2746
2747
2748#define LOAD_ROW_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2749    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
2750    BASENAME##0 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y + Z##0));
2751
2752#define LOAD_ROW_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2753    LOAD_ROW_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2754    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
2755    BASENAME##1 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y + Z##1));
2756
2757#define LOAD_ROW_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2758    LOAD_ROW_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2759    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
2760    BASENAME##2 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y + Z##2));
2761
2762#define LOAD_ROW_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2763    LOAD_ROW_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2764    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
2765    BASENAME##3 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y + Z##3));
2766
2767#define LOAD_ROW_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2768    LOAD_ROW_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2769    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
2770    BASENAME##4 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y + Z##4));
2771
2772#define LOAD_ROW_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2773    LOAD_ROW_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2774    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
2775    BASENAME##5 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y + Z##5));
2776
2777#define LOAD_ROW_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2778    LOAD_ROW_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2779    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
2780    BASENAME##6 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y + Z##6));
2781
2782#define LOAD_ROW_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2783    LOAD_ROW_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2784    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
2785    BASENAME##7 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y + Z##7));
2786
2787#define LOAD_ROW_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2788    LOAD_ROW_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2789    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
2790    BASENAME##8 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y + Z##8));
2791
2792#define LOAD_ROW_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2793    LOAD_ROW_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)      \
2794    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
2795    BASENAME##9 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y + Z##9));
2796
2797#define LOAD_ROW_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2798    LOAD_ROW_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2799    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
2800    BASENAME##A = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y + Z##A));
2801
2802#define LOAD_ROW_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2803    LOAD_ROW_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2804    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
2805    BASENAME##B = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y + Z##B));
2806
2807#define LOAD_ROW_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2808    LOAD_ROW_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2809    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
2810    BASENAME##C = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y + Z##C));
2811
2812#define LOAD_ROW_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2813    LOAD_ROW_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2814    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
2815    BASENAME##D = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y + Z##D));
2816
2817#define LOAD_ROW_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2818    LOAD_ROW_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2819    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
2820    BASENAME##E = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y + Z##E));
2821
2822#define LOAD_ROW_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2823    LOAD_ROW_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2824    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
2825    BASENAME##F = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y + Z##F));
2826
2827
2828
2829
2830#define LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
2831#define LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
2832
2833
2834
2835#define LOAD_ROW_PARTIAL_1(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2836    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2837    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y + Z##0));
2838
2839#define LOAD_ROW_PARTIAL_2(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2840    LOAD_ROW_PARTIAL_1(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2841    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2842    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y + Z##1));
2843
2844#define LOAD_ROW_PARTIAL_3(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2845    LOAD_ROW_PARTIAL_2(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2846    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2847    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y + Z##2));
2848
2849#define LOAD_ROW_PARTIAL_4(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2850    LOAD_ROW_PARTIAL_3(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2851    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2852    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y + Z##3));
2853
2854#define LOAD_ROW_PARTIAL_5(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2855    LOAD_ROW_PARTIAL_4(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2856    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2857    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y + Z##4));
2858
2859#define LOAD_ROW_PARTIAL_6(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2860    LOAD_ROW_PARTIAL_5(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2861    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2862    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y + Z##5));
2863
2864#define LOAD_ROW_PARTIAL_7(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2865    LOAD_ROW_PARTIAL_6(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2866    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2867    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y + Z##6));
2868
2869#define LOAD_ROW_PARTIAL_8(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2870    LOAD_ROW_PARTIAL_7(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2871    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2872    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y + Z##7));
2873
2874#define LOAD_ROW_PARTIAL_9(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2875    LOAD_ROW_PARTIAL_8(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2876    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2877    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y + Z##8));
2878
2879#define LOAD_ROW_PARTIAL_10(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2880    LOAD_ROW_PARTIAL_9(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)      \
2881    VLOAD_PARTIAL(N0, LOAD_N0)                                                          \
2882    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y + Z##9));
2883
2884#define LOAD_ROW_PARTIAL_11(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2885    LOAD_ROW_PARTIAL_10(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2886    VLOAD_PARTIAL(N0, LOAD_N0)                                                          \
2887    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y + Z##A));
2888
2889#define LOAD_ROW_PARTIAL_12(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2890    LOAD_ROW_PARTIAL_11(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2891    VLOAD_PARTIAL(N0, LOAD_N0)                                                          \
2892    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y + Z##B));
2893
2894#define LOAD_ROW_PARTIAL_13(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2895    LOAD_ROW_PARTIAL_12(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2896    VLOAD_PARTIAL(N0, LOAD_N0)                                                          \
2897    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y + Z##C));
2898
2899#define LOAD_ROW_PARTIAL_14(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2900    LOAD_ROW_PARTIAL_13(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2901    VLOAD_PARTIAL(N0, LOAD_N0)                                                          \
2902    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y + Z##D));
2903
2904#define LOAD_ROW_PARTIAL_15(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2905    LOAD_ROW_PARTIAL_14(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2906    VLOAD_PARTIAL(N0, LOAD_N0)                                                          \
2907    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y + Z##E));
2908
2909#define LOAD_ROW_PARTIAL_16(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2910    LOAD_ROW_PARTIAL_15(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2911    VLOAD_PARTIAL(N0, LOAD_N0)                                                          \
2912    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y + Z##F));
2913
2914
2915
2916#define LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_ROW_PARTIAL_##LOAD_M0(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
2917#define LOAD_BLOCK_PARTIAL(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
2918
2919#define LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
2920    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y))                                                                                                                   \
2921    {                                                                                                                                                            \
2922        LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                                                           \
2923    }                                                                                                                                                            \
2924    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X))                                                                                                               \
2925    {                                                                                                                                                            \
2926        LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                                             \
2927    }                                                                                                                                                            \
2928    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X))                                                                                                               \
2929    {                                                                                                                                                            \
2930        LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                                             \
2931    }                                                                                                                                                            \
2932    else                                                                                                                                                         \
2933    {                                                                                                                                                            \
2934        LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                               \
2935    }
2936
2937#define LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
2938    if(!(PARTIAL_COND_X))                                                                                                \
2939    {                                                                                                                    \
2940        LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                   \
2941    }                                                                                                                    \
2942    else                                                                                                                 \
2943    {                                                                                                                    \
2944        LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                     \
2945    }
2946
2947#define LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
2948    if(!(PARTIAL_COND_Y))                                                                                                \
2949    {                                                                                                                    \
2950        LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                   \
2951    }                                                                                                                    \
2952    else                                                                                                                 \
2953    {                                                                                                                    \
2954        LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                     \
2955    }
2956
2957
2958#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
2959
2960#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
2961    LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
2962
2963#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
2964
2965#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
2966    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0);                                                                                 \
2967    LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)
2968
2969#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
2970
2971#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
2972    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0);                                                                                 \
2973    LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)
2974
2975#else
2976
2977#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
2978    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0);                                                                                 \
2979    LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)
2980
2981#endif
2982
2983
2984#define LOAD_TEXTURE2D_ROW_1(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
2985    BASENAME##0 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 0 * X_STEP_ROW), (Y_COORD + 0 * Y_STEP_ROW))
2986
2987#define LOAD_TEXTURE2D_ROW_2(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
2988    LOAD_TEXTURE2D_ROW_1(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
2989    BASENAME##1 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 1 * X_STEP_ROW), (Y_COORD + 1 * Y_STEP_ROW))
2990
2991#define LOAD_TEXTURE2D_ROW_3(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
2992    LOAD_TEXTURE2D_ROW_2(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
2993    BASENAME##2 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 2 * X_STEP_ROW), (Y_COORD + 2 * Y_STEP_ROW))
2994
2995#define LOAD_TEXTURE2D_ROW_4(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
2996    LOAD_TEXTURE2D_ROW_3(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
2997    BASENAME##3 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 3 * X_STEP_ROW), (Y_COORD + 3 * Y_STEP_ROW))
2998
2999#define LOAD_TEXTURE2D_ROW_5(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3000    LOAD_TEXTURE2D_ROW_4(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3001    BASENAME##4 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 4 * X_STEP_ROW), (Y_COORD + 4 * Y_STEP_ROW))
3002
3003#define LOAD_TEXTURE2D_ROW_6(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3004    LOAD_TEXTURE2D_ROW_5(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3005    BASENAME##5 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 5 * X_STEP_ROW), (Y_COORD + 5 * Y_STEP_ROW))
3006
3007#define LOAD_TEXTURE2D_ROW_7(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3008    LOAD_TEXTURE2D_ROW_6(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3009    BASENAME##6 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 6 * X_STEP_ROW), (Y_COORD + 6 * Y_STEP_ROW))
3010
3011#define LOAD_TEXTURE2D_ROW_8(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3012    LOAD_TEXTURE2D_ROW_7(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3013    BASENAME##7 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 7 * X_STEP_ROW), (Y_COORD + 7 * Y_STEP_ROW))
3014
3015#define LOAD_TEXTURE2D_ROW_9(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3016    LOAD_TEXTURE2D_ROW_8(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3017    BASENAME##8 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 8 * X_STEP_ROW), (Y_COORD + 8 * Y_STEP_ROW))
3018
3019#define LOAD_TEXTURE2D_ROW_10(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3020    LOAD_TEXTURE2D_ROW_9(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)      \
3021    BASENAME##9 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 9 * X_STEP_ROW), (Y_COORD + 9 * Y_STEP_ROW))
3022
3023#define LOAD_TEXTURE2D_ROW_11(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3024    LOAD_TEXTURE2D_ROW_10(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3025    BASENAME##A = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 10 * X_STEP_ROW), (Y_COORD + 10 * Y_STEP_ROW))
3026
3027#define LOAD_TEXTURE2D_ROW_12(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3028    LOAD_TEXTURE2D_ROW_11(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3029    BASENAME##B = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 11 * X_STEP_ROW), (Y_COORD + 11 * Y_STEP_ROW))
3030
3031#define LOAD_TEXTURE2D_ROW_13(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3032    LOAD_TEXTURE2D_ROW_12(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3033    BASENAME##C = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 12 * X_STEP_ROW), (Y_COORD + 12 * Y_STEP_ROW))
3034
3035#define LOAD_TEXTURE2D_ROW_14(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3036    LOAD_TEXTURE2D_ROW_13(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3037    BASENAME##D = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 13 * X_STEP_ROW), (Y_COORD + 13 * Y_STEP_ROW))
3038
3039#define LOAD_TEXTURE2D_ROW_15(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3040    LOAD_TEXTURE2D_ROW_14(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3041    BASENAME##E = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 14 * X_STEP_ROW), (Y_COORD + 14 * Y_STEP_ROW))
3042
3043#define LOAD_TEXTURE2D_ROW_16(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3044    LOAD_TEXTURE2D_ROW_15(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3045    BASENAME##F = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 15 * X_STEP_ROW), (Y_COORD + 15 * Y_STEP_ROW))
3046
3047
3048
3049#define LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) LOAD_TEXTURE2D_ROW_##M0(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)
3050#define LOAD_TEXTURE2D(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)
3051
3052
3053
3054#define LOAD_ROW_INDIRECT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3055    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3056    BASENAME##0;                                                                            \
3057    if(Y_MASK##0 != 0)                                                                      \
3058        BASENAME##0 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##0 * STRIDE_Y)); \
3059    else                                                                                    \
3060        BASENAME##0 = 0;
3061
3062#define LOAD_ROW_INDIRECT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3063    LOAD_ROW_INDIRECT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3064    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3065    BASENAME##1;                                                                            \
3066    if(Y_MASK##1 != 0)                                                                      \
3067        BASENAME##1 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##1 * STRIDE_Y)); \
3068    else                                                                                    \
3069        BASENAME##1 = 0;
3070
3071#define LOAD_ROW_INDIRECT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3072    LOAD_ROW_INDIRECT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3073    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3074    BASENAME##2;                                                                            \
3075    if(Y_MASK##2 != 0)                                                                      \
3076        BASENAME##2 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##2 * STRIDE_Y)); \
3077    else                                                                                    \
3078        BASENAME##2 = 0;
3079
3080#define LOAD_ROW_INDIRECT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3081    LOAD_ROW_INDIRECT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3082    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3083    BASENAME##3;                                                                            \
3084    if(Y_MASK##3 != 0)                                                                      \
3085        BASENAME##3 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##3 * STRIDE_Y)); \
3086    else                                                                                    \
3087        BASENAME##3 = 0;
3088
3089#define LOAD_ROW_INDIRECT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3090    LOAD_ROW_INDIRECT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3091    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3092    BASENAME##4;                                                                            \
3093    if(Y_MASK##4 != 0)                                                                      \
3094        BASENAME##4 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##4 * STRIDE_Y)); \
3095    else                                                                                    \
3096        BASENAME##4 = 0;
3097
3098#define LOAD_ROW_INDIRECT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3099    LOAD_ROW_INDIRECT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3100    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3101    BASENAME##5;                                                                            \
3102    if(Y_MASK##5 != 0)                                                                      \
3103        BASENAME##5 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##5 * STRIDE_Y)); \
3104    else                                                                                    \
3105        BASENAME##5 = 0;
3106
3107#define LOAD_ROW_INDIRECT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3108    LOAD_ROW_INDIRECT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3109    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3110    BASENAME##6;                                                                            \
3111    if(Y_MASK##6 != 0)                                                                      \
3112        BASENAME##6 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##6 * STRIDE_Y)); \
3113    else                                                                                    \
3114        BASENAME##6 = 0;
3115
3116#define LOAD_ROW_INDIRECT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3117    LOAD_ROW_INDIRECT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3118    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3119    BASENAME##7;                                                                            \
3120    if(Y_MASK##7 != 0)                                                                      \
3121        BASENAME##7 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##7 * STRIDE_Y)); \
3122    else                                                                                    \
3123        BASENAME##7 = 0;
3124
3125#define LOAD_ROW_INDIRECT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3126    LOAD_ROW_INDIRECT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3127    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3128    BASENAME##8;                                                                            \
3129    if(Y_MASK##8 != 0)                                                                      \
3130        BASENAME##8 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##8 * STRIDE_Y)); \
3131    else                                                                                    \
3132        BASENAME##8 = 0;
3133
3134#define LOAD_ROW_INDIRECT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
3135    LOAD_ROW_INDIRECT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3136    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3137    BASENAME##9;                                                                            \
3138    if(Y_MASK##9 != 0)                                                                      \
3139        BASENAME##9 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##9 * STRIDE_Y)); \
3140    else                                                                                    \
3141        BASENAME##9 = 0;
3142
3143#define LOAD_ROW_INDIRECT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
3144    LOAD_ROW_INDIRECT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
3145    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3146    BASENAME##A;                                                                            \
3147    if(Y_MASK##A != 0)                                                                      \
3148        BASENAME##A = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##A * STRIDE_Y)); \
3149    else                                                                                    \
3150        BASENAME##A = 0;
3151
3152#define LOAD_ROW_INDIRECT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
3153    LOAD_ROW_INDIRECT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
3154    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3155    BASENAME##B;                                                                            \
3156    if(Y_MASK##B != 0)                                                                      \
3157        BASENAME##B = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##B * STRIDE_Y)); \
3158    else                                                                                    \
3159        BASENAME##B = 0;
3160
3161#define LOAD_ROW_INDIRECT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
3162    LOAD_ROW_INDIRECT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
3163    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3164    BASENAME##C;                                                                            \
3165    if(Y_MASK##C != 0)                                                                      \
3166        BASENAME##C = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##C * STRIDE_Y)); \
3167    else                                                                                    \
3168        BASENAME##C = 0;
3169
3170#define LOAD_ROW_INDIRECT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
3171    LOAD_ROW_INDIRECT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
3172    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3173    BASENAME##D;                                                                            \
3174    if(Y_MASK##D != 0)                                                                      \
3175        BASENAME##D = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##D * STRIDE_Y)); \
3176    else                                                                                    \
3177        BASENAME##D = 0;
3178
3179#define LOAD_ROW_INDIRECT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
3180    LOAD_ROW_INDIRECT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
3181    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3182    BASENAME##E;                                                                            \
3183    if(Y_MASK##E != 0)                                                                      \
3184        BASENAME##E = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##E * STRIDE_Y)); \
3185    else                                                                                    \
3186        BASENAME##E = 0;
3187
3188#define LOAD_ROW_INDIRECT_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
3189    LOAD_ROW_INDIRECT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
3190    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3191    BASENAME##F;                                                                            \
3192    if(Y_MASK##F != 0)                                                                      \
3193        BASENAME##F = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##F * STRIDE_Y)); \
3194    else                                                                                    \
3195        BASENAME##F = 0;
3196
3197
3198#define LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) LOAD_ROW_INDIRECT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)
3199#define LOAD_BLOCK_INDIRECT(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)
3200
3201
3202#define LOAD_ELEMENT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3203    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
3204    BASENAME##0 = *((__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y));
3205
3206#define LOAD_ELEMENT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3207    LOAD_ELEMENT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
3208    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
3209    BASENAME##1 = *((__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y));
3210
3211#define LOAD_ELEMENT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3212    LOAD_ELEMENT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
3213    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
3214    BASENAME##2 = *((__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y));
3215
3216#define LOAD_ELEMENT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3217    LOAD_ELEMENT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
3218    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
3219    BASENAME##3 = *((__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y));
3220
3221#define LOAD_ELEMENT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3222    LOAD_ELEMENT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
3223    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
3224    BASENAME##4 = *((__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y));
3225
3226#define LOAD_ELEMENT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3227    LOAD_ELEMENT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
3228    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
3229    BASENAME##5 = *((__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y));
3230
3231#define LOAD_ELEMENT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3232    LOAD_ELEMENT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
3233    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
3234    BASENAME##6 = *((__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y));
3235
3236#define LOAD_ELEMENT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3237    LOAD_ELEMENT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
3238    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
3239    BASENAME##7 = *((__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y));
3240
3241#define LOAD_ELEMENT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3242    LOAD_ELEMENT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
3243    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
3244    BASENAME##8 = *((__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y));
3245
3246#define LOAD_ELEMENT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3247    LOAD_ELEMENT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)      \
3248    VEC_DATA_TYPE(DATA_TYPE, N0)                                        \
3249    BASENAME##9 = *((__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y));
3250
3251#define LOAD_ELEMENT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3252    LOAD_ELEMENT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
3253    VEC_DATA_TYPE(DATA_TYPE, N0)                                        \
3254    BASENAME##A = *((__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y));
3255
3256#define LOAD_ELEMENT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3257    LOAD_ELEMENT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
3258    VEC_DATA_TYPE(DATA_TYPE, N0)                                        \
3259    BASENAME##B = *((__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y));
3260
3261#define LOAD_ELEMENT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3262    LOAD_ELEMENT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
3263    VEC_DATA_TYPE(DATA_TYPE, N0)                                        \
3264    BASENAME##C = *((__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y));
3265
3266#define LOAD_ELEMENT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3267    LOAD_ELEMENT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
3268    VEC_DATA_TYPE(DATA_TYPE, N0)                                        \
3269    BASENAME##D = *((__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y));
3270
3271#define LOAD_ELEMENT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3272    LOAD_ELEMENT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
3273    VEC_DATA_TYPE(DATA_TYPE, N0)                                        \
3274    BASENAME##E = *((__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y));
3275
3276#define LOAD_ELEMENT_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3277    LOAD_ELEMENT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
3278    VEC_DATA_TYPE(DATA_TYPE, N0)                                        \
3279    BASENAME##F = *((__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y));
3280
3281
3282
3283
3284#define LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) LOAD_ELEMENT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)
3285#define LOAD_SCALAR_AS_VECTOR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)
3286
3287
3288
3289#define CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
3290    Z##0 = (0 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D;                                               \
3291    Z##0 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##0);                                                      \
3292    Z##0 *= (CROSS_PLANE_PAD * STRIDE_Y);
3293
3294#define CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
3295    CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
3296    Z##1 = (1 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D;                                               \
3297    Z##1 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##1);                                                      \
3298    Z##1 *= (CROSS_PLANE_PAD * STRIDE_Y);
3299
3300#define CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
3301    CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
3302    Z##2 = (2 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D;                                               \
3303    Z##2 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##2);                                                      \
3304    Z##2 *= (CROSS_PLANE_PAD * STRIDE_Y);
3305
3306#define CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
3307    CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
3308    Z##3 = (3 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D;                                               \
3309    Z##3 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##3);                                                      \
3310    Z##3 *= (CROSS_PLANE_PAD * STRIDE_Y);
3311
3312#define CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
3313    CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
3314    Z##4 = (4 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D;                                               \
3315    Z##4 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##4);                                                      \
3316    Z##4 *= (CROSS_PLANE_PAD * STRIDE_Y);
3317
3318#define CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
3319    CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
3320    Z##5 = (5 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D;                                               \
3321    Z##5 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##5);                                                      \
3322    Z##5 *= (CROSS_PLANE_PAD * STRIDE_Y);
3323
3324#define CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
3325    CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
3326    Z##6 = (6 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D;                                               \
3327    Z##6 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##6);                                                      \
3328    Z##6 *= (CROSS_PLANE_PAD * STRIDE_Y);
3329
3330#define CALCULATE_Z_OFFSET_8(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
3331    CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
3332    Z##7 = (7 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D;                                               \
3333    Z##7 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##7);                                                      \
3334    Z##7 *= (CROSS_PLANE_PAD * STRIDE_Y);
3335
3336
3337
3338
3339#define CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) CALCULATE_Z_OFFSET_##M0(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)
3340#define CALCULATE_Z_OFFSET(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)
3341
3342
3343
3344#define SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) \
3345    BASENAME##0 *= (DATA_TYPE)SCALE;
3346
3347#define SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE) \
3348    SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE)     \
3349    BASENAME##1 *= (DATA_TYPE)SCALE;
3350
3351#define SCALE_ROW_3(DATA_TYPE, BASENAME, SCALE) \
3352    SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE)     \
3353    BASENAME##2 *= (DATA_TYPE)SCALE;
3354
3355#define SCALE_ROW_4(DATA_TYPE, BASENAME, SCALE) \
3356    SCALE_ROW_3(DATA_TYPE, BASENAME, SCALE)     \
3357    BASENAME##3 *= (DATA_TYPE)SCALE;
3358
3359#define SCALE_ROW_5(DATA_TYPE, BASENAME, SCALE) \
3360    SCALE_ROW_4(DATA_TYPE, BASENAME, SCALE)     \
3361    BASENAME##4 *= (DATA_TYPE)SCALE;
3362
3363#define SCALE_ROW_6(DATA_TYPE, BASENAME, SCALE) \
3364    SCALE_ROW_5(DATA_TYPE, BASENAME, SCALE)     \
3365    BASENAME##5 *= (DATA_TYPE)SCALE;
3366
3367#define SCALE_ROW_7(DATA_TYPE, BASENAME, SCALE) \
3368    SCALE_ROW_6(DATA_TYPE, BASENAME, SCALE)     \
3369    BASENAME##6 *= (DATA_TYPE)SCALE;
3370
3371#define SCALE_ROW_8(DATA_TYPE, BASENAME, SCALE) \
3372    SCALE_ROW_7(DATA_TYPE, BASENAME, SCALE)     \
3373    BASENAME##7 *= (DATA_TYPE)SCALE;
3374
3375#define SCALE_ROW_9(DATA_TYPE, BASENAME, SCALE) \
3376    SCALE_ROW_8(DATA_TYPE, BASENAME, SCALE)     \
3377    BASENAME##8 *= (DATA_TYPE)SCALE;
3378
3379#define SCALE_ROW_10(DATA_TYPE, BASENAME, SCALE) \
3380    SCALE_ROW_9(DATA_TYPE, BASENAME, SCALE)      \
3381    BASENAME##9 *= (DATA_TYPE)SCALE;
3382
3383#define SCALE_ROW_11(DATA_TYPE, BASENAME, SCALE) \
3384    SCALE_ROW_10(DATA_TYPE, BASENAME, SCALE)     \
3385    BASENAME##A *= (DATA_TYPE)SCALE;
3386
3387#define SCALE_ROW_12(DATA_TYPE, BASENAME, SCALE) \
3388    SCALE_ROW_11(DATA_TYPE, BASENAME, SCALE)     \
3389    BASENAME##B *= (DATA_TYPE)SCALE;
3390
3391#define SCALE_ROW_13(DATA_TYPE, BASENAME, SCALE) \
3392    SCALE_ROW_12(DATA_TYPE, BASENAME, SCALE)     \
3393    BASENAME##C *= (DATA_TYPE)SCALE;
3394
3395#define SCALE_ROW_14(DATA_TYPE, BASENAME, SCALE) \
3396    SCALE_ROW_13(DATA_TYPE, BASENAME, SCALE)     \
3397    BASENAME##D *= (DATA_TYPE)SCALE;
3398
3399#define SCALE_ROW_15(DATA_TYPE, BASENAME, SCALE) \
3400    SCALE_ROW_14(DATA_TYPE, BASENAME, SCALE)     \
3401    BASENAME##E *= (DATA_TYPE)SCALE;
3402
3403#define SCALE_ROW_16(DATA_TYPE, BASENAME, SCALE) \
3404    SCALE_ROW_15(DATA_TYPE, BASENAME, SCALE)     \
3405    BASENAME##F *= (DATA_TYPE)SCALE;
3406
3407
3408
3409#define SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE) SCALE_ROW_##N(DATA_TYPE, BASENAME, SCALE)
3410#define SCALE_BLOCK(N, DATA_TYPE, BASENAME, SCALE) SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE)
3411
3412
3413
3414#define COLUMN_VECTOR1(IDX_COL, BASENAME, X, TYPE) \
3415    TYPE BASENAME##IDX_COL = (TYPE)((X##0).s##IDX_COL);
3416#define COLUMN_VECTOR2(IDX_COL, BASENAME, X, TYPE) \
3417    VEC_DATA_TYPE(TYPE, 2)                         \
3418    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0).s##IDX_COL, (X##1).s##IDX_COL);
3419#define COLUMN_VECTOR3(IDX_COL, BASENAME, X, TYPE) \
3420    VEC_DATA_TYPE(TYPE, 3)                         \
3421    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 3))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL);
3422#define COLUMN_VECTOR4(IDX_COL, BASENAME, X, TYPE) \
3423    VEC_DATA_TYPE(TYPE, 4)                         \
3424    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 4))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL);
3425#define COLUMN_VECTOR8(IDX_COL, BASENAME, X, TYPE) \
3426    VEC_DATA_TYPE(TYPE, 8)                         \
3427    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL);
3428#define COLUMN_VECTOR16(IDX_COL, BASENAME, X, TYPE) \
3429    VEC_DATA_TYPE(TYPE, 16)                         \
3430    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL, (X##8).s##IDX_COL, (X##9).s##IDX_COL, (X##A).s##IDX_COL, (X##B).s##IDX_COL, (X##C).s##IDX_COL, (X##D).s##IDX_COL, (X##E).s##IDX_COL, (X##F).s##IDX_COL);
3431
3432
3433
3434#define COLUMN_VECTOR_SCALAR1(IDX_COL, BASENAME, X, TYPE) \
3435    TYPE BASENAME##IDX_COL = (TYPE)((X##0));
3436#define COLUMN_VECTOR_SCALAR2(IDX_COL, BASENAME, X, TYPE) \
3437    VEC_DATA_TYPE(TYPE, 2)                                \
3438    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0), (X##1));
3439#define COLUMN_VECTOR_SCALAR3(IDX_COL, BASENAME, X, TYPE) \
3440    VEC_DATA_TYPE(TYPE, 3)                                \
3441    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 3))((X##0), (X##1), (X##2));
3442#define COLUMN_VECTOR_SCALAR4(IDX_COL, BASENAME, X, TYPE) \
3443    VEC_DATA_TYPE(TYPE, 4)                                \
3444    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 4))((X##0), (X##1), (X##2), (X##3));
3445#define COLUMN_VECTOR_SCALAR8(IDX_COL, BASENAME, X, TYPE) \
3446    VEC_DATA_TYPE(TYPE, 8)                                \
3447    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7));
3448#define COLUMN_VECTOR_SCALAR16(IDX_COL, BASENAME, X, TYPE) \
3449    VEC_DATA_TYPE(TYPE, 16)                                \
3450    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7), (X##8), (X##9), (X##A), (X##B), (X##C), (X##D), (X##E), (X##F));
3451
3452
3453
3454#define TRANSPOSE_K0X1(K0, BASENAME, BS, TYPE) \
3455    COLUMN_VECTOR_SCALAR(K0, 0, BASENAME, BS, TYPE);
3456#define TRANSPOSE_K0X2(K0, BASENAME, BS, TYPE) \
3457    COLUMN_VECTOR(K0, 0, BASENAME, BS, TYPE);  \
3458    COLUMN_VECTOR(K0, 1, BASENAME, BS, TYPE);
3459#define TRANSPOSE_K0X3(K0, BASENAME, BS, TYPE) \
3460    TRANSPOSE_K0X2(K0, BASENAME, BS, TYPE);    \
3461    COLUMN_VECTOR(K0, 2, BASENAME, BS, TYPE);
3462#define TRANSPOSE_K0X4(K0, BASENAME, BS, TYPE) \
3463    TRANSPOSE_K0X3(K0, BASENAME, BS, TYPE);    \
3464    COLUMN_VECTOR(K0, 3, BASENAME, BS, TYPE);
3465#define TRANSPOSE_K0X8(K0, BASENAME, BS, TYPE) \
3466    TRANSPOSE_K0X4(K0, BASENAME, BS, TYPE);    \
3467    COLUMN_VECTOR(K0, 4, BASENAME, BS, TYPE);  \
3468    COLUMN_VECTOR(K0, 5, BASENAME, BS, TYPE);  \
3469    COLUMN_VECTOR(K0, 6, BASENAME, BS, TYPE);  \
3470    COLUMN_VECTOR(K0, 7, BASENAME, BS, TYPE);
3471#define TRANSPOSE_K0X16(K0, BASENAME, BS, TYPE) \
3472    TRANSPOSE_K0X8(K0, BASENAME, BS, TYPE);     \
3473    COLUMN_VECTOR(K0, 8, BASENAME, BS, TYPE);   \
3474    COLUMN_VECTOR(K0, 9, BASENAME, BS, TYPE);   \
3475    COLUMN_VECTOR(K0, A, BASENAME, BS, TYPE);   \
3476    COLUMN_VECTOR(K0, B, BASENAME, BS, TYPE);   \
3477    COLUMN_VECTOR(K0, C, BASENAME, BS, TYPE);   \
3478    COLUMN_VECTOR(K0, D, BASENAME, BS, TYPE);   \
3479    COLUMN_VECTOR(K0, E, BASENAME, BS, TYPE);   \
3480    COLUMN_VECTOR(K0, F, BASENAME, BS, TYPE);
3481
3482
3483
3484
3485#define COLUMN_VECTOR(K0, IDX_COL, BASENAME, BS, TYPE) \
3486    CONCAT(COLUMN_VECTOR, K0)                          \
3487    (IDX_COL, BASENAME, BS, TYPE);
3488
3489
3490#define COLUMN_VECTOR_SCALAR(K0, IDX_COL, BASENAME, BS, TYPE) \
3491    CONCAT(COLUMN_VECTOR_SCALAR, K0)                          \
3492    (IDX_COL, BASENAME, BS, TYPE);
3493
3494
3495#define TRANSPOSE_K0XN0(K0, N0, BASENAME, BS, TYPE) \
3496    CONCAT(TRANSPOSE_K0X, N0)                       \
3497    (K0, BASENAME, BS, TYPE);
3498
3499
3500#define ADD_ROW_1(BASENAME, BIAS) \
3501    BASENAME##0 += BIAS##0;
3502
3503#define ADD_ROW_2(BASENAME, BIAS) \
3504    ADD_ROW_1(BASENAME, BIAS)     \
3505    BASENAME##1 += BIAS##1;
3506
3507#define ADD_ROW_3(BASENAME, BIAS) \
3508    ADD_ROW_2(BASENAME, BIAS)     \
3509    BASENAME##2 += BIAS##2;
3510
3511#define ADD_ROW_4(BASENAME, BIAS) \
3512    ADD_ROW_3(BASENAME, BIAS)     \
3513    BASENAME##3 += BIAS##3;
3514
3515#define ADD_ROW_5(BASENAME, BIAS) \
3516    ADD_ROW_4(BASENAME, BIAS)     \
3517    BASENAME##4 += BIAS##4;
3518
3519#define ADD_ROW_6(BASENAME, BIAS) \
3520    ADD_ROW_5(BASENAME, BIAS)     \
3521    BASENAME##5 += BIAS##5;
3522
3523#define ADD_ROW_7(BASENAME, BIAS) \
3524    ADD_ROW_6(BASENAME, BIAS)     \
3525    BASENAME##6 += BIAS##6;
3526
3527#define ADD_ROW_8(BASENAME, BIAS) \
3528    ADD_ROW_7(BASENAME, BIAS)     \
3529    BASENAME##7 += BIAS##7;
3530
3531#define ADD_ROW_9(BASENAME, BIAS) \
3532    ADD_ROW_8(BASENAME, BIAS)     \
3533    BASENAME##8 += BIAS##8;
3534
3535#define ADD_ROW_10(BASENAME, BIAS) \
3536    ADD_ROW_9(BASENAME, BIAS)      \
3537    BASENAME##9 += BIAS##9;
3538
3539#define ADD_ROW_11(BASENAME, BIAS) \
3540    ADD_ROW_10(BASENAME, BIAS)     \
3541    BASENAME##A += BIAS##A;
3542
3543#define ADD_ROW_12(BASENAME, BIAS) \
3544    ADD_ROW_11(BASENAME, BIAS)     \
3545    BASENAME##B += BIAS##B;
3546
3547#define ADD_ROW_13(BASENAME, BIAS) \
3548    ADD_ROW_12(BASENAME, BIAS)     \
3549    BASENAME##C += BIAS##C;
3550
3551#define ADD_ROW_14(BASENAME, BIAS) \
3552    ADD_ROW_13(BASENAME, BIAS)     \
3553    BASENAME##D += BIAS##D;
3554
3555#define ADD_ROW_15(BASENAME, BIAS) \
3556    ADD_ROW_14(BASENAME, BIAS)     \
3557    BASENAME##E += BIAS##E;
3558
3559#define ADD_ROW_16(BASENAME, BIAS) \
3560    ADD_ROW_15(BASENAME, BIAS)     \
3561    BASENAME##F += BIAS##F;
3562
3563
3564
3565
3566#define ADD_BLOCK_STR(N, BASENAME, BIAS) ADD_ROW_##N(BASENAME, BIAS)
3567#define ADD_BLOCK(N, BASENAME, BIAS) ADD_BLOCK_STR(N, BASENAME, BIAS)
3568
3569
3570
3571#define ADD_ROW_BROADCAST_1(BASENAME, BIAS) \
3572    BASENAME##0 += BIAS;
3573
3574#define ADD_ROW_BROADCAST_2(BASENAME, BIAS) \
3575    ADD_ROW_BROADCAST_1(BASENAME, BIAS)     \
3576    BASENAME##1 += BIAS;
3577
3578#define ADD_ROW_BROADCAST_3(BASENAME, BIAS) \
3579    ADD_ROW_BROADCAST_2(BASENAME, BIAS)     \
3580    BASENAME##2 += BIAS;
3581
3582#define ADD_ROW_BROADCAST_4(BASENAME, BIAS) \
3583    ADD_ROW_BROADCAST_3(BASENAME, BIAS)     \
3584    BASENAME##3 += BIAS;
3585
3586#define ADD_ROW_BROADCAST_5(BASENAME, BIAS) \
3587    ADD_ROW_BROADCAST_4(BASENAME, BIAS)     \
3588    BASENAME##4 += BIAS;
3589
3590#define ADD_ROW_BROADCAST_6(BASENAME, BIAS) \
3591    ADD_ROW_BROADCAST_5(BASENAME, BIAS)     \
3592    BASENAME##5 += BIAS;
3593
3594#define ADD_ROW_BROADCAST_7(BASENAME, BIAS) \
3595    ADD_ROW_BROADCAST_6(BASENAME, BIAS)     \
3596    BASENAME##6 += BIAS;
3597
3598#define ADD_ROW_BROADCAST_8(BASENAME, BIAS) \
3599    ADD_ROW_BROADCAST_7(BASENAME, BIAS)     \
3600    BASENAME##7 += BIAS;
3601
3602#define ADD_ROW_BROADCAST_9(BASENAME, BIAS) \
3603    ADD_ROW_BROADCAST_8(BASENAME, BIAS)     \
3604    BASENAME##8 += BIAS;
3605
3606#define ADD_ROW_BROADCAST_10(BASENAME, BIAS) \
3607    ADD_ROW_BROADCAST_9(BASENAME, BIAS)      \
3608    BASENAME##9 += BIAS;
3609
3610#define ADD_ROW_BROADCAST_11(BASENAME, BIAS) \
3611    ADD_ROW_BROADCAST_10(BASENAME, BIAS)     \
3612    BASENAME##A += BIAS;
3613
3614#define ADD_ROW_BROADCAST_12(BASENAME, BIAS) \
3615    ADD_ROW_BROADCAST_11(BASENAME, BIAS)     \
3616    BASENAME##B += BIAS;
3617
3618#define ADD_ROW_BROADCAST_13(BASENAME, BIAS) \
3619    ADD_ROW_BROADCAST_12(BASENAME, BIAS)     \
3620    BASENAME##C += BIAS;
3621
3622#define ADD_ROW_BROADCAST_14(BASENAME, BIAS) \
3623    ADD_ROW_BROADCAST_13(BASENAME, BIAS)     \
3624    BASENAME##D += BIAS;
3625
3626#define ADD_ROW_BROADCAST_15(BASENAME, BIAS) \
3627    ADD_ROW_BROADCAST_14(BASENAME, BIAS)     \
3628    BASENAME##E += BIAS;
3629
3630#define ADD_ROW_BROADCAST_16(BASENAME, BIAS) \
3631    ADD_ROW_BROADCAST_15(BASENAME, BIAS)     \
3632    BASENAME##F += BIAS;
3633
3634
3635#define ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS) ADD_ROW_BROADCAST_##N(BASENAME, BIAS)
3636#define ADD_BLOCK_BROADCAST(N, BASENAME, BIAS) ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS)
3637
3638
3639
3640#define ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3641    BASENAME##0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##0, A_VAL, B_VAL);
3642
3643#define ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3644    ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3645    BASENAME##1 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##1, A_VAL, B_VAL);
3646
3647#define ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3648    ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3649    BASENAME##2 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##2, A_VAL, B_VAL);
3650
3651#define ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3652    ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3653    BASENAME##3 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##3, A_VAL, B_VAL);
3654
3655#define ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3656    ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3657    BASENAME##4 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##4, A_VAL, B_VAL);
3658
3659#define ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3660    ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3661    BASENAME##5 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##5, A_VAL, B_VAL);
3662
3663#define ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3664    ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3665    BASENAME##6 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##6, A_VAL, B_VAL);
3666
3667#define ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3668    ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3669    BASENAME##7 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##7, A_VAL, B_VAL);
3670
3671#define ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3672    ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3673    BASENAME##8 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##8, A_VAL, B_VAL);
3674
3675#define ACTIVATION_ROW_10(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3676    ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)      \
3677    BASENAME##9 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##9, A_VAL, B_VAL);
3678
3679#define ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3680    ACTIVATION_ROW_10(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3681    BASENAME##A = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##A, A_VAL, B_VAL);
3682
3683#define ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3684    ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3685    BASENAME##B = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##B, A_VAL, B_VAL);
3686
3687#define ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3688    ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3689    BASENAME##C = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##C, A_VAL, B_VAL);
3690
3691#define ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3692    ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3693    BASENAME##D = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##D, A_VAL, B_VAL);
3694
3695#define ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3696    ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3697    BASENAME##E = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##E, A_VAL, B_VAL);
3698
3699#define ACTIVATION_ROW_16(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3700    ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3701    BASENAME##F = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##F, A_VAL, B_VAL);
3702
3703
3704
3705#define ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) ACTIVATION_ROW_##N(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)
3706#define ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)
3707
3708
3709
3710#define CONVERT_ROW_1(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3711    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
3712    BASENAME_DST##0 = CONVERT(BASENAME_SRC##0, VEC_DATA_TYPE(DATA_TYPE, N));
3713
3714#define CONVERT_ROW_2(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3715    CONVERT_ROW_1(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
3716    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
3717    BASENAME_DST##1 = CONVERT(BASENAME_SRC##1, VEC_DATA_TYPE(DATA_TYPE, N));
3718
3719#define CONVERT_ROW_3(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3720    CONVERT_ROW_2(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
3721    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
3722    BASENAME_DST##2 = CONVERT(BASENAME_SRC##2, VEC_DATA_TYPE(DATA_TYPE, N));
3723
3724#define CONVERT_ROW_4(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3725    CONVERT_ROW_3(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
3726    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
3727    BASENAME_DST##3 = CONVERT(BASENAME_SRC##3, VEC_DATA_TYPE(DATA_TYPE, N));
3728
3729#define CONVERT_ROW_5(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3730    CONVERT_ROW_4(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
3731    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
3732    BASENAME_DST##4 = CONVERT(BASENAME_SRC##4, VEC_DATA_TYPE(DATA_TYPE, N));
3733
3734#define CONVERT_ROW_6(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3735    CONVERT_ROW_5(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
3736    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
3737    BASENAME_DST##5 = CONVERT(BASENAME_SRC##5, VEC_DATA_TYPE(DATA_TYPE, N));
3738
3739#define CONVERT_ROW_7(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3740    CONVERT_ROW_6(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
3741    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
3742    BASENAME_DST##6 = CONVERT(BASENAME_SRC##6, VEC_DATA_TYPE(DATA_TYPE, N));
3743
3744#define CONVERT_ROW_8(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3745    CONVERT_ROW_7(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
3746    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
3747    BASENAME_DST##7 = CONVERT(BASENAME_SRC##7, VEC_DATA_TYPE(DATA_TYPE, N));
3748
3749#define CONVERT_ROW_9(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3750    CONVERT_ROW_8(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
3751    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
3752    BASENAME_DST##8 = CONVERT(BASENAME_SRC##8, VEC_DATA_TYPE(DATA_TYPE, N));
3753
3754#define CONVERT_ROW_10(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3755    CONVERT_ROW_9(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)      \
3756    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
3757    BASENAME_DST##9 = CONVERT(BASENAME_SRC##9, VEC_DATA_TYPE(DATA_TYPE, N));
3758
3759#define CONVERT_ROW_11(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3760    CONVERT_ROW_10(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
3761    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
3762    BASENAME_DST##A = CONVERT(BASENAME_SRC##A, VEC_DATA_TYPE(DATA_TYPE, N));
3763
3764#define CONVERT_ROW_12(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3765    CONVERT_ROW_11(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
3766    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
3767    BASENAME_DST##B = CONVERT(BASENAME_SRC##B, VEC_DATA_TYPE(DATA_TYPE, N));
3768
3769#define CONVERT_ROW_13(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3770    CONVERT_ROW_12(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
3771    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
3772    BASENAME_DST##C = CONVERT(BASENAME_SRC##C, VEC_DATA_TYPE(DATA_TYPE, N));
3773
3774#define CONVERT_ROW_14(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3775    CONVERT_ROW_13(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
3776    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
3777    BASENAME_DST##D = CONVERT(BASENAME_SRC##D, VEC_DATA_TYPE(DATA_TYPE, N));
3778
3779#define CONVERT_ROW_15(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3780    CONVERT_ROW_14(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
3781    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
3782    BASENAME_DST##E = CONVERT(BASENAME_SRC##E, VEC_DATA_TYPE(DATA_TYPE, N));
3783
3784#define CONVERT_ROW_16(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3785    CONVERT_ROW_15(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
3786    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
3787    BASENAME_DST##F = CONVERT(BASENAME_SRC##F, VEC_DATA_TYPE(DATA_TYPE, N));
3788
3789
3790
3791#define CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_ROW_##M(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
3792#define CONVERT_BLOCK(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
3793   )"