• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1R"(
2
3
4#ifndef ARM_COMPUTE_HELPER_H
5#define ARM_COMPUTE_HELPER_H
6
7
8
9
/* STORE_ROW_n(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z), n = 1..16
 *
 * Expands to n complete store statements, one per row i = 0..n-1. Row i is
 *   VSTORE(N0)(BASENAME##i, 0, (__global DATA_TYPE *)(PTR + i * STRIDE_Y + Z##i));
 * where the pasted suffix is 0-9 for rows 0..9 and A-F for rows 10..15.
 * STORE_ROW_n first expands STORE_ROW_(n-1) and then appends its own row, so
 * rows are emitted in ascending order.
 *
 *   N0        vector width handed to VSTORE (VSTORE is not defined in this part of the file)
 *   DATA_TYPE element type used for the destination pointer cast
 *   BASENAME  prefix of the per-row source variables (BASENAME0 .. BASENAMEF)
 *   PTR       base address; the row offset is added BEFORE the cast to DATA_TYPE *
 *   STRIDE_Y  distance between consecutive rows, in units of PTR's pointee
 *   Z         prefix of the per-row extra offsets (Z0 .. ZF), same units as STRIDE_Y
 *
 * Every statement already ends with ';'. PTR and STRIDE_Y are substituted
 * without parentheses, so pass simple expressions.
 */
10#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
11    VSTORE(N0)                                                 \
12    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
13
14#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
15    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
16    VSTORE(N0)                                                 \
17    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
18
19#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
20    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
21    VSTORE(N0)                                                 \
22    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
23
24#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
25    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
26    VSTORE(N0)                                                 \
27    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
28
29#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
30    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
31    VSTORE(N0)                                                 \
32    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
33
34#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
35    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
36    VSTORE(N0)                                                 \
37    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
38
39#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
40    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
41    VSTORE(N0)                                                 \
42    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
43
44#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
45    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
46    VSTORE(N0)                                                 \
47    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
48
49#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
50    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
51    VSTORE(N0)                                                 \
52    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
53
54#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
55    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
56    VSTORE(N0)                                                  \
57    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
58
59#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
60    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
61    VSTORE(N0)                                                  \
62    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
63
64#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
65    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
66    VSTORE(N0)                                                  \
67    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
68
69#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
70    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
71    VSTORE(N0)                                                  \
72    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
73
74#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
75    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
76    VSTORE(N0)                                                  \
77    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
78
79#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
80    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
81    VSTORE(N0)                                                  \
82    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
83
84#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
85    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
86    VSTORE(N0)                                                  \
87    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
88
89
90
/* CONVERT_STORE_ROW_n(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z), n = 1..9
 *
 * Same row-by-row layout as a plain vector store, except that each row value
 * is first passed through CONVERT_SAT(..., VEC_DATA_TYPE(DATA_TYPE, N0)) and
 * the converted value is what VSTORE(N0) writes. Row i is
 *   VSTORE(N0)(CONVERT_SAT((BASENAME##i), VEC_DATA_TYPE(DATA_TYPE, N0)), 0,
 *              (__global DATA_TYPE *)(PTR + i * STRIDE_Y + Z##i));
 * CONVERT_STORE_ROW_n expands CONVERT_STORE_ROW_(n-1) first, then adds row n-1.
 *
 * VSTORE, CONVERT_SAT and VEC_DATA_TYPE are not defined in this part of the file.
 * Each statement already ends with ';'. PTR and STRIDE_Y are substituted
 * without parentheses, so pass simple expressions.
 */
91#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
92    VSTORE(N0)                                                         \
93    (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
94
95#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
96    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
97    VSTORE(N0)                                                         \
98    (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
99
100#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
101    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
102    VSTORE(N0)                                                         \
103    (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
104
105#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
106    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
107    VSTORE(N0)                                                         \
108    (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
109
110#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
111    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
112    VSTORE(N0)                                                         \
113    (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
114
115#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
116    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
117    VSTORE(N0)                                                         \
118    (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
119
120#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
121    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
122    VSTORE(N0)                                                         \
123    (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
124
125#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
126    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
127    VSTORE(N0)                                                         \
128    (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
129
130#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
131    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
132    VSTORE(N0)                                                         \
133    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
134
/* CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
 *
 * Emits the saturating-convert-and-store statements for rows 0..9: expands
 * CONVERT_STORE_ROW_9 for rows 0..8 and then stores row 9
 * (BASENAME9 converted with CONVERT_SAT to VEC_DATA_TYPE(DATA_TYPE, N0)) at
 * PTR + 9 * STRIDE_Y + Z9.
 *
 * The second parameter must be named DATA_TYPE: the body refers to DATA_TYPE,
 * and with any other parameter name the caller's type argument would be
 * ignored in favour of whatever DATA_TYPE happens to mean at the call site.
 */
#define CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
139
/* CONVERT_STORE_ROW_n(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z), n = 11..16
 *
 * Continuation of the saturating-convert-and-store row macros for rows 10..15.
 * Each macro expands CONVERT_STORE_ROW_(n-1) and then appends
 *   VSTORE(N0)(CONVERT_SAT((BASENAME##h), VEC_DATA_TYPE(DATA_TYPE, N0)), 0,
 *              (__global DATA_TYPE *)(PTR + (n-1) * STRIDE_Y + Z##h));
 * where h is the hexadecimal digit A..F for row index 10..15.
 * VSTORE, CONVERT_SAT and VEC_DATA_TYPE are not defined in this part of the file.
 */
140#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
141    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
142    VSTORE(N0)                                                          \
143    (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
144
145#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
146    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
147    VSTORE(N0)                                                          \
148    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
149
150#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
151    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
152    VSTORE(N0)                                                          \
153    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
154
155#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
156    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
157    VSTORE(N0)                                                          \
158    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
159
160#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
161    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
162    VSTORE(N0)                                                          \
163    (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
164
165#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
166    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
167    VSTORE(N0)                                                          \
168    (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
169
170
171
172
/* STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
 *
 * Stores M0 rows of N0-wide vectors by dispatching to STORE_ROW_<M0>.
 * STORE_BLOCK forwards to STORE_BLOCK_STR so that M0 is macro-expanded before
 * it is pasted onto STORE_ROW_; call STORE_BLOCK, not STORE_BLOCK_STR, when M0
 * is itself a macro. M0 must expand to a literal for which STORE_ROW_<M0> exists.
 */
173#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
174#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
175
176
177
/* CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
 *
 * Converting counterpart of the block store: dispatches to
 * CONVERT_STORE_ROW_<M0>. The extra _STR level lets M0 be macro-expanded
 * before it is pasted onto CONVERT_STORE_ROW_.
 */
178#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
179#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
180
181
182
/* STORE_ROW_PARTIAL_n(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z), n = 1..16
 *
 * Row-by-row store that uses VSTORE_PARTIAL(N0, STORE_N0) instead of a full
 * vector store. Row i (i = 0..n-1, suffix 0-9 then A-F) is
 *   VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##i, 0,
 *                                (__global DATA_TYPE *)(PTR + i * STRIDE_Y + Z##i));
 * STORE_ROW_PARTIAL_n expands STORE_ROW_PARTIAL_(n-1) first, then adds row n-1.
 *
 *   N0        width of the source vectors
 *   STORE_N0  second argument of VSTORE_PARTIAL (presumably the number of
 *             leading elements actually written; VSTORE_PARTIAL is not defined
 *             in this part of the file, so confirm there)
 *   others    as for the full-width row stores
 *
 * Each statement already ends with ';'.
 */
183#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
184    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
185    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
186
187#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
188    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
189    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
190    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
191
192#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
193    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
194    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
195    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
196
197#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
198    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
199    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
200    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
201
202#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
203    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
204    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
205    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
206
207#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
208    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
209    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
210    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
211
212#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
213    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
214    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
215    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
216
217#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
218    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
219    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
220    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
221
222#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
223    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
224    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
225    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
226
227#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
228    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
229    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
230    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
231
232#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
233    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
234    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
235    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
236
237#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
238    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
239    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
240    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
241
242#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
243    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
244    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
245    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
246
247#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
248    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
249    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
250    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
251
252#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
253    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
254    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
255    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
256
257#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
258    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
259    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
260    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
261
262
263
/* STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
 *
 * Stores the first STORE_M0 rows of a block by dispatching to
 * STORE_ROW_PARTIAL_<STORE_M0>, passing N0 as the vector width and STORE_N0 as
 * the partial-store width. Note the argument order changes on the way down:
 * here it is (STORE_M0, STORE_N0, N0, ...), the row macro receives
 * (N0, STORE_N0, ...). The _STR level lets STORE_M0 be macro-expanded before
 * it is pasted.
 */
264#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
265#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
266
/* STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z,
 *                                PARTIAL_STORE_M0, PARTIAL_STORE_N0,
 *                                PARTIAL_COND_Y, PARTIAL_COND_X)
 *
 * Chooses at run time how much of an M0 x N0 block to store:
 *   !COND_X && !COND_Y : M0 rows,               N0 elements per row
 *    COND_Y && !COND_X : PARTIAL_STORE_M0 rows, N0 elements per row
 *   !COND_Y &&  COND_X : M0 rows,               PARTIAL_STORE_N0 elements per row
 *    COND_Y &&  COND_X : PARTIAL_STORE_M0 rows, PARTIAL_STORE_N0 elements per row
 * Every branch goes through STORE_BLOCK_PARTIAL, so PARTIAL_STORE_M0 must
 * expand to a literal row count and both conditions may be evaluated more
 * than once.
 *
 * The expansion is a bare if / else-if chain (not wrapped in do-while), so
 * use it as a standalone statement and never as the body of an unbraced if.
 */
267#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
268    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y))                                                                                                            \
269    {                                                                                                                                                     \
270        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                                           \
271    }                                                                                                                                                     \
272    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X))                                                                                                        \
273    {                                                                                                                                                     \
274        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
275    }                                                                                                                                                     \
276    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X))                                                                                                        \
277    {                                                                                                                                                     \
278        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
279    }                                                                                                                                                     \
280    else                                                                                                                                                  \
281    {                                                                                                                                                     \
282        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                               \
283    }
284
/* STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z,
 *                          PARTIAL_STORE_N0, PARTIAL_COND_X)
 *
 * Always stores M0 rows. When PARTIAL_COND_X is false each row is stored with
 * the full width N0; when it is true only PARTIAL_STORE_N0 is passed as the
 * partial-store width. Both branches use STORE_BLOCK_PARTIAL.
 *
 * Expands to a bare if / else pair, so use it as a standalone statement.
 */
285#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
286    if(!(PARTIAL_COND_X))                                                                                         \
287    {                                                                                                             \
288        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
289    }                                                                                                             \
290    else                                                                                                          \
291    {                                                                                                             \
292        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
293    }
294
/* STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z,
 *                          PARTIAL_STORE_M0, PARTIAL_COND_Y)
 *
 * Always stores full-width (N0) rows. When PARTIAL_COND_Y is false all M0 rows
 * are stored; when it is true only the first PARTIAL_STORE_M0 rows are.
 * Both branches use STORE_BLOCK_PARTIAL, so PARTIAL_STORE_M0 must expand to a
 * literal row count.
 *
 * Expands to a bare if / else pair, so use it as a standalone statement.
 */
295#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
296    if(!(PARTIAL_COND_Y))                                                                                         \
297    {                                                                                                             \
298        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
299    }                                                                                                             \
300    else                                                                                                          \
301    {                                                                                                             \
302        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
303    }
304
305
/* STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z,
 *                            PARTIAL_STORE_M0, PARTIAL_STORE_N0,
 *                            PARTIAL_COND_Y, PARTIAL_COND_X)
 *
 * Only defined when both PARTIAL_STORE_M0 and PARTIAL_STORE_N0 are defined as
 * macros. Their values select, at preprocessing time, which store variant the
 * macro expands to; all four variants share the same 11-parameter signature
 * and simply ignore the arguments they do not need.
 */
306#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
307
308
309#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
310
/* No partial rows or columns: unconditional full block store. */
311#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
312    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
313
314#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
315
/* Partial rows only: run-time check on PARTIAL_COND_Y. */
316#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
317    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)
318
319#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
320
/* Partial columns only: run-time check on PARTIAL_COND_X. */
321#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
322    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)
323
324#else
325
/* Every remaining combination: run-time checks on both conditions. */
326#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
327    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)
328
329#endif
330
331#endif
332
333
/* COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0)
 *
 * Returns, as a uint, the first row handled by the work-item with row index y
 * when each work-item processes M0 rows.
 *
 *  - When the PARTIAL_STORE_M0 macro is defined, the start row is
 *        y * M0 - ((M0 - PARTIAL_STORE_M0) % M0)
 *    clamped at 0, i.e. every block is shifted back by the number of rows the
 *    partial block falls short of M0, and block 0 stays at row 0.
 *  - Otherwise it is simply y * M0 and the third argument is ignored.
 *
 * Every argument is parenthesized so that compound expressions such as
 * COMPUTE_M0_START_ROW(a + b, M0, P) are multiplied as a whole. The first
 * form relies on max() being available (an OpenCL C built-in).
 */
#if defined(PARTIAL_STORE_M0)
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(max(0, (int)((y) * (M0)) - (int)(((M0) - (PARTIAL_STORE_M0)) % (M0)))))
#else
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)((y) * (M0)))
#endif
342
343
344
/* STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond)
 *
 * Stores a single vector through STORE_BLOCK_PARTIAL_IN_X with M0 = 1,
 * STRIDE_Y = 0 and Z = 0: the full vec_size elements when cond is false, or
 * the partial width `leftover` when cond is true.
 *
 * Because the row macros paste the row suffix, the variable actually stored
 * is basename##0 (e.g. pass `res` to store `res0`), and the Z offset becomes
 * the token 00, which is the integer zero.
 */
345#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
346    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)
347
348
/* Optional OpenCL extensions. Each pragma is emitted only when the build
 * requests the feature through the ARM_COMPUTE_* macro AND the extension's
 * own name macro is defined. */

/* Half-precision floating point (cl_khr_fp16). */
349#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
350#pragma OPENCL EXTENSION cl_khr_fp16 : enable
351#endif
352
/* 8-bit integer dot product (cl_arm_integer_dot_product_int8). */
353#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
354#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
355#endif
356
/* 8-bit integer dot product with accumulation (cl_arm_integer_dot_product_accumulate_int8). */
357#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
358#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
359#endif
360
/* printf support for debug builds (cl_arm_printf). */
361#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
362#pragma OPENCL EXTENSION cl_arm_printf : enable
363#endif
364
/* Numeric identifiers for the GPU architecture families; the values increase
 * in steps of 0x100 so they can be compared in preprocessor conditions. */
365#define GPU_ARCH_MIDGARD 0x100
366#define GPU_ARCH_BIFROST 0x200
367#define GPU_ARCH_VALHALL 0x300
368
369
/* CONCAT(a, b): pastes the two tokens a and b into one. The arguments are
 * pasted as written (operands of ## are not macro-expanded first). */
370#define CONCAT(a, b) a##b
371
372
/* EXPAND(x): yields x after one round of macro expansion; useful to force an
 * argument to be expanded before another macro sees it. */
373#define EXPAND(x) x
374
375
/* CLAMP(x, min_val, max_val): limits x to the range [min_val, max_val] by
 * computing min(max(x, min_val), max_val). Relies on min() and max() being
 * available; the result is only a true clamp when min_val <= max_val. */
376#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
377
378
/* REV<s>(x): x with its s components in reverse order, written as a vector
 * swizzle (s = 2, 3, 4, 8, 16). REV1 returns x unchanged. */
379#define REV1(x) ((x))
380#define REV2(x) ((x).s10)
381#define REV3(x) ((x).s210)
382#define REV4(x) ((x).s3210)
383#define REV8(x) ((x).s76543210)
384#define REV16(x) ((x).sFEDCBA9876543210)
385
386
387
/* REVERSE(x, s): reverses the s components of vector x by dispatching to
 * REV<s>. The _STR level lets s be macro-expanded before it is pasted, so s
 * may itself be a macro that expands to 1, 2, 3, 4, 8 or 16. */
388#define REVERSE_STR(x, s) REV##s((x))
389#define REVERSE(x, s) REVERSE_STR(x, s)
390
391
392
/* ROT<s>_<n>(x): the s-component vector x rotated by n positions towards
 * higher indices with wrap-around, written as a swizzle: result component i
 * is x component (i - n) mod s. For example ROT4_1(x) is x.s3012.
 * n = 0 and n = s both return x unchanged. Defined for s = 1, 2, 3, 4, 8, 16
 * and every n in 0..s. */
393#define ROT1_0(x) ((x))
394#define ROT1_1(x) ((x))
395
396#define ROT2_0(x) ((x))
397#define ROT2_1(x) ((x).s10)
398#define ROT2_2(x) ((x))
399
400#define ROT3_0(x) ((x))
401#define ROT3_1(x) ((x).s201)
402#define ROT3_2(x) ((x).s120)
403#define ROT3_3(x) ((x))
404
405#define ROT4_0(x) ((x))
406#define ROT4_1(x) ((x).s3012)
407#define ROT4_2(x) ((x).s2301)
408#define ROT4_3(x) ((x).s1230)
409#define ROT4_4(x) ((x))
410
411#define ROT8_0(x) ((x))
412#define ROT8_1(x) ((x).s70123456)
413#define ROT8_2(x) ((x).s67012345)
414#define ROT8_3(x) ((x).s56701234)
415#define ROT8_4(x) ((x).s45670123)
416#define ROT8_5(x) ((x).s34567012)
417#define ROT8_6(x) ((x).s23456701)
418#define ROT8_7(x) ((x).s12345670)
419#define ROT8_8(x) ((x))
420
421#define ROT16_0(x) ((x))
422#define ROT16_1(x) ((x).sF0123456789ABCDE)
423#define ROT16_2(x) ((x).sEF0123456789ABCD)
424#define ROT16_3(x) ((x).sDEF0123456789ABC)
425#define ROT16_4(x) ((x).sCDEF0123456789AB)
426#define ROT16_5(x) ((x).sBCDEF0123456789A)
427#define ROT16_6(x) ((x).sABCDEF0123456789)
428#define ROT16_7(x) ((x).s9ABCDEF012345678)
429#define ROT16_8(x) ((x).s89ABCDEF01234567)
430#define ROT16_9(x) ((x).s789ABCDEF0123456)
431#define ROT16_10(x) ((x).s6789ABCDEF012345)
432#define ROT16_11(x) ((x).s56789ABCDEF01234)
433#define ROT16_12(x) ((x).s456789ABCDEF0123)
434#define ROT16_13(x) ((x).s3456789ABCDEF012)
435#define ROT16_14(x) ((x).s23456789ABCDEF01)
436#define ROT16_15(x) ((x).s123456789ABCDEF0)
437#define ROT16_16(x) ((x))
438
439
440
/* ROTATE(x, s, n): rotates the s-component vector x by n positions by
 * dispatching to ROT<s>_<n>. The _STR level lets s and n be macro-expanded
 * before they are pasted; both must expand to literals for which a
 * ROT<s>_<n> macro exists. */
441#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
442#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
443
444
445
/* V_OFFS<s>(dt): a vector literal of type dt##s whose components are the
 * consecutive indices 0, 1, ..., s-1 (s = 1, 2, 3, 4, 8, 16).
 * NOTE(review): V_OFFS1 builds the type name dt##1 (e.g. int1), which is
 * presumably a typedef provided elsewhere - confirm. */
446#define V_OFFS1(dt) (dt##1)(0)
447#define V_OFFS2(dt) (dt##2)(0, 1)
448#define V_OFFS3(dt) (dt##3)(0, 1, 2)
449#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3)
450#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
451#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
452
453
454
/* VEC_OFFS(dt, s): index vector 0..s-1 of element type dt, obtained by
 * dispatching to V_OFFS<s>. The _STR level lets s be macro-expanded before it
 * is pasted. */
455#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
456#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
457
458
/* VLOAD(size): expands to the identifier vload<size> (e.g. VLOAD(4) gives
 * vload4), to be followed by the call arguments. The _STR level lets size be
 * macro-expanded before it is pasted. */
459#define VLOAD_STR(size) vload##size
460#define VLOAD(size) VLOAD_STR(size)
461
462
/* VLOAD_PARTIAL(size, load_size): expands to the identifier
 * vload_partial_<size>_<load_size>, which is itself a macro naming the
 * routine to call; follow it with (DATA, OFFSET, PTR). The _STR level lets
 * both arguments be macro-expanded before they are pasted. */
463#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size
464#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size)
465
/* NO_LOAD(data, offs, ptr): placeholder used for invalid size / load_size
 * combinations. It ignores all three arguments and expands to an empty
 * compound statement, so nothing is loaded and data is left untouched. */
466#define NO_LOAD(data, offs, ptr) \
467    {                            \
468    }
469
470
/* Dispatch table vload_partial_<size>_<load_size> for vector sizes
 * 1, 2, 3, 4, 8, 16 and load sizes 0..16.
 *
 *   load_size == 0 or load_size > size  ->  NO_LOAD (nothing is loaded)
 *   1 <= load_size <= size              ->  vload_partial_<load_size>
 *
 * The single exception is vload_partial_1_1, which maps to vload1.
 * NOTE(review): vload1 is not defined in this part of the file. Every other
 * in-range entry resolves to a three-argument (DATA, OFFSET, PTR) macro, while
 * vload_partial_1 below calls vload1 with two arguments (OFFSET, PTR) -
 * confirm that VLOAD_PARTIAL(1, 1)(DATA, OFFSET, PTR) is valid for the vload1
 * that is actually in scope.
 */
471#define vload_partial_1_0 NO_LOAD
472#define vload_partial_1_1 vload1
473#define vload_partial_1_2 NO_LOAD
474#define vload_partial_1_3 NO_LOAD
475#define vload_partial_1_4 NO_LOAD
476#define vload_partial_1_5 NO_LOAD
477#define vload_partial_1_6 NO_LOAD
478#define vload_partial_1_7 NO_LOAD
479#define vload_partial_1_8 NO_LOAD
480#define vload_partial_1_9 NO_LOAD
481#define vload_partial_1_10 NO_LOAD
482#define vload_partial_1_11 NO_LOAD
483#define vload_partial_1_12 NO_LOAD
484#define vload_partial_1_13 NO_LOAD
485#define vload_partial_1_14 NO_LOAD
486#define vload_partial_1_15 NO_LOAD
487#define vload_partial_1_16 NO_LOAD
488
489#define vload_partial_2_0 NO_LOAD
490#define vload_partial_2_1 vload_partial_1
491#define vload_partial_2_2 vload_partial_2
492#define vload_partial_2_3 NO_LOAD
493#define vload_partial_2_4 NO_LOAD
494#define vload_partial_2_5 NO_LOAD
495#define vload_partial_2_6 NO_LOAD
496#define vload_partial_2_7 NO_LOAD
497#define vload_partial_2_8 NO_LOAD
498#define vload_partial_2_9 NO_LOAD
499#define vload_partial_2_10 NO_LOAD
500#define vload_partial_2_11 NO_LOAD
501#define vload_partial_2_12 NO_LOAD
502#define vload_partial_2_13 NO_LOAD
503#define vload_partial_2_14 NO_LOAD
504#define vload_partial_2_15 NO_LOAD
505#define vload_partial_2_16 NO_LOAD
506
507#define vload_partial_3_0 NO_LOAD
508#define vload_partial_3_1 vload_partial_1
509#define vload_partial_3_2 vload_partial_2
510#define vload_partial_3_3 vload_partial_3
511#define vload_partial_3_4 NO_LOAD
512#define vload_partial_3_5 NO_LOAD
513#define vload_partial_3_6 NO_LOAD
514#define vload_partial_3_7 NO_LOAD
515#define vload_partial_3_8 NO_LOAD
516#define vload_partial_3_9 NO_LOAD
517#define vload_partial_3_10 NO_LOAD
518#define vload_partial_3_11 NO_LOAD
519#define vload_partial_3_12 NO_LOAD
520#define vload_partial_3_13 NO_LOAD
521#define vload_partial_3_14 NO_LOAD
522#define vload_partial_3_15 NO_LOAD
523#define vload_partial_3_16 NO_LOAD
524
525#define vload_partial_4_0 NO_LOAD
526#define vload_partial_4_1 vload_partial_1
527#define vload_partial_4_2 vload_partial_2
528#define vload_partial_4_3 vload_partial_3
529#define vload_partial_4_4 vload_partial_4
530#define vload_partial_4_5 NO_LOAD
531#define vload_partial_4_6 NO_LOAD
532#define vload_partial_4_7 NO_LOAD
533#define vload_partial_4_8 NO_LOAD
534#define vload_partial_4_9 NO_LOAD
535#define vload_partial_4_10 NO_LOAD
536#define vload_partial_4_11 NO_LOAD
537#define vload_partial_4_12 NO_LOAD
538#define vload_partial_4_13 NO_LOAD
539#define vload_partial_4_14 NO_LOAD
540#define vload_partial_4_15 NO_LOAD
541#define vload_partial_4_16 NO_LOAD
542
543#define vload_partial_8_0 NO_LOAD
544#define vload_partial_8_1 vload_partial_1
545#define vload_partial_8_2 vload_partial_2
546#define vload_partial_8_3 vload_partial_3
547#define vload_partial_8_4 vload_partial_4
548#define vload_partial_8_5 vload_partial_5
549#define vload_partial_8_6 vload_partial_6
550#define vload_partial_8_7 vload_partial_7
551#define vload_partial_8_8 vload_partial_8
552#define vload_partial_8_9 NO_LOAD
553#define vload_partial_8_10 NO_LOAD
554#define vload_partial_8_11 NO_LOAD
555#define vload_partial_8_12 NO_LOAD
556#define vload_partial_8_13 NO_LOAD
557#define vload_partial_8_14 NO_LOAD
558#define vload_partial_8_15 NO_LOAD
559#define vload_partial_8_16 NO_LOAD
560
561#define vload_partial_16_0 NO_LOAD
562#define vload_partial_16_1 vload_partial_1
563#define vload_partial_16_2 vload_partial_2
564#define vload_partial_16_3 vload_partial_3
565#define vload_partial_16_4 vload_partial_4
566#define vload_partial_16_5 vload_partial_5
567#define vload_partial_16_6 vload_partial_6
568#define vload_partial_16_7 vload_partial_7
569#define vload_partial_16_8 vload_partial_8
570#define vload_partial_16_9 vload_partial_9
571#define vload_partial_16_10 vload_partial_10
572#define vload_partial_16_11 vload_partial_11
573#define vload_partial_16_12 vload_partial_12
574#define vload_partial_16_13 vload_partial_13
575#define vload_partial_16_14 vload_partial_14
576#define vload_partial_16_15 vload_partial_15
577#define vload_partial_16_16 vload_partial_16
578
579
/* vload_partial_<n>(DATA, OFFSET, PTR), n = 1..16
 *
 * Loads n elements from PTR into the first n components of DATA and leaves
 * the remaining components untouched. Each macro expands to one or more
 * complete statements (already terminated by ';').
 *
 *   n = 1, 2, 3, 4, 8 : one vload<n> assigned to the leading swizzle of DATA
 *   n = 5, 6, 7       : a 4-element load into .s0123, then the remaining
 *                       1..3 elements from PTR + 4
 *   n = 9 .. 15       : an 8-element load into .s01234567, then the remaining
 *                       1..7 elements from PTR + 8; for 13..15 the upper half
 *                       DATA.s89ABCDEF is handed to vload_partial_5/6/7, which
 *                       swizzle it again
 *   n = 16            : one vload16 assigned to DATA itself
 *
 * DATA must therefore be an assignable vector with at least n components.
 * vload1 is not defined in this part of the file.
 * NOTE(review): the split forms reuse the same OFFSET for both parts while
 * advancing PTR by a fixed 4 or 8. Assuming vload<n>(offset, p) reads from
 * p + offset * n as in OpenCL C, the two parts are only contiguous when
 * OFFSET is 0 - confirm that callers always pass 0.
 */
580#define vload_partial_1(DATA, OFFSET, PTR) \
581    DATA.s0 = vload1(OFFSET, PTR);
582
583#define vload_partial_2(DATA, OFFSET, PTR) \
584    DATA.s01 = vload2(OFFSET, PTR);
585
586#define vload_partial_3(DATA, OFFSET, PTR) \
587    DATA.s012 = vload3(OFFSET, PTR);
588
589#define vload_partial_4(DATA, OFFSET, PTR) \
590    DATA.s0123 = vload4(OFFSET, PTR);
591
592#define vload_partial_5(DATA, OFFSET, PTR)    \
593    vload_partial_4(DATA.s0123, OFFSET, PTR); \
594    DATA.s4 = vload1(OFFSET, PTR + 4);
595
596#define vload_partial_6(DATA, OFFSET, PTR)    \
597    vload_partial_4(DATA.s0123, OFFSET, PTR); \
598    vload_partial_2(DATA.s45, OFFSET, PTR + 4);
599
600#define vload_partial_7(DATA, OFFSET, PTR)    \
601    vload_partial_4(DATA.s0123, OFFSET, PTR); \
602    vload_partial_3(DATA.s456, OFFSET, PTR + 4);
603
604#define vload_partial_8(DATA, OFFSET, PTR) \
605    DATA.s01234567 = vload8(OFFSET, PTR);
606
607#define vload_partial_9(DATA, OFFSET, PTR)        \
608    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
609    DATA.s8 = vload1(OFFSET, PTR + 8);
610
611#define vload_partial_10(DATA, OFFSET, PTR)       \
612    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
613    vload_partial_2(DATA.s89, OFFSET, PTR + 8);
614
615#define vload_partial_11(DATA, OFFSET, PTR)       \
616    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
617    vload_partial_3(DATA.s89A, OFFSET, PTR + 8);
618
619#define vload_partial_12(DATA, OFFSET, PTR)       \
620    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
621    vload_partial_4(DATA.s89AB, OFFSET, PTR + 8);
622
623#define vload_partial_13(DATA, OFFSET, PTR)       \
624    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
625    vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);
626
627#define vload_partial_14(DATA, OFFSET, PTR)       \
628    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
629    vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);
630
631#define vload_partial_15(DATA, OFFSET, PTR)       \
632    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
633    vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);
634
635#define vload_partial_16(DATA, OFFSET, PTR) \
636    DATA = vload16(OFFSET, PTR);
637
638
639
/* Pixel counts for vectors of 4, 8 and 16 elements (one pixel per 4 elements). */
#define PIXEL_UNIT4  1
#define PIXEL_UNIT8  2
#define PIXEL_UNIT16 4

/* Map a vector size (4, 8 or 16) to its PIXEL_UNIT<n> value.
 * The two-level form lets a macro argument expand before token pasting. */
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(n) PIXEL_UNIT##n
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(n) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(n)
647
648
/* read_image2d_<type>x<n>(img, x_coord, y_coord)
 *
 * Reads <n> horizontally adjacent pixels starting at (x_coord, y_coord) and
 * packs them into a single vector of 4 * <n> elements. Each expansion ends
 * with a semicolon.
 */
#define read_image2d_floatx1(img, x_coord, y_coord) \
    (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
#define read_image2d_floatx2(img, x_coord, y_coord)      \
    (float8)(read_imagef(img, (int2)(x_coord, y_coord)), \
             read_imagef(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_floatx4(img, x_coord, y_coord)           \
    (float16)(read_imagef(img, (int2)(x_coord, y_coord)),     \
              read_imagef(img, (int2)(x_coord + 1, y_coord)), \
              read_imagef(img, (int2)(x_coord + 2, y_coord)), \
              read_imagef(img, (int2)(x_coord + 3, y_coord)));

/* Half-precision variants, only available when fp16 support is enabled. */
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define read_image2d_halfx1(img, x_coord, y_coord) \
    (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
#define read_image2d_halfx2(img, x_coord, y_coord)      \
    (half8)(read_imageh(img, (int2)(x_coord, y_coord)), \
            read_imageh(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_halfx4(img, x_coord, y_coord)           \
    (half16)(read_imageh(img, (int2)(x_coord, y_coord)),     \
             read_imageh(img, (int2)(x_coord + 1, y_coord)), \
             read_imageh(img, (int2)(x_coord + 2, y_coord)), \
             read_imageh(img, (int2)(x_coord + 3, y_coord)));
#endif
658
/* write_image2d_<type>x<n>(img, x_coord, y_coord, values)
 *
 * Writes <n> horizontally adjacent pixels starting at (x_coord, y_coord),
 * taking four consecutive lanes of values per pixel. Each expansion ends with
 * a semicolon.
 */
#define write_image2d_floatx1(img, x_coord, y_coord, values) \
    (write_imagef(img, (int2)(x_coord, y_coord), values));
#define write_image2d_floatx2(img, x_coord, y_coord, values)    \
    (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), \
     write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567));
#define write_image2d_floatx4(img, x_coord, y_coord, values)        \
    (write_imagef(img, (int2)(x_coord, y_coord), values.s0123),     \
     write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), \
     write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), \
     write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF));

/* Half-precision variants, only available when fp16 support is enabled. */
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define write_image2d_halfx1(img, x_coord, y_coord, values) \
    (write_imageh(img, (int2)(x_coord, y_coord), values));
#define write_image2d_halfx2(img, x_coord, y_coord, values)     \
    (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), \
     write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567));
#define write_image2d_halfx4(img, x_coord, y_coord, values)         \
    (write_imageh(img, (int2)(x_coord, y_coord), values.s0123),     \
     write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), \
     write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), \
     write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
#endif
668
669
/* Select read_image2d_<data_type>x<n0> by token pasting. The outer macro
 * expands its arguments first, so data_type and n0 may themselves be macros. */
#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)

/* Select write_image2d_<data_type>x<n0> by token pasting, with the same
 * two-level expansion as the read variant. */
#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)
676
/* VSTORE(size) names the vstore<size> function; the extra level lets a macro
 * passed as size expand before it is pasted. */
#define VSTORE_STR(n) vstore##n
#define VSTORE(n) VSTORE_STR(n)
679
/* <type>1 aliases: a vector of size 1 is the plain scalar type, so names
 * built as <type><size> stay valid when size is 1. */
#define float1  float
#define half1   half
#define char1   char
#define uchar1  uchar
#define short1  short
#define ushort1 ushort
#define int1    int
#define uint1   uint
#define long1   long
#define ulong1  ulong
#define double1 double
691
/* Scalar counterparts of the vloadN / vstoreN builtins, so that code written
 * as vload<size> / vstore<size> also works for size 1.
 *
 * vload1(OFFSET, PTR)        reads  PTR[OFFSET]
 * vstore1(DATA, OFFSET, PTR) writes DATA to PTR[OFFSET]
 *
 * Every argument and the whole expansion are parenthesized so that arguments
 * containing lower-precedence operators (for example a conditional
 * expression) and uses inside larger expressions are grouped as intended.
 */
#define vload1(OFFSET, PTR) (*((OFFSET) + (PTR)))
#define vstore1(DATA, OFFSET, PTR) (*((OFFSET) + (PTR)) = (DATA))
694
695
/* VSTORE_PARTIAL(size, store_size) names the macro
 * vstore_partial_<size>_<store_size>. Arguments are expanded before pasting,
 * so both may be macros. */
#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)

/* Stand-in for the table entries that store nothing: it takes the same
 * arguments as a store macro and expands to an empty block. */
#define NO_STORE(data, offs, ptr) \
    {                             \
    }
702
703
/* Dispatch table: vstore_partial_<size>_<n> stores the first <n> elements of
 * a <size>-element vector. A count of 0, or a count larger than <size>, maps
 * to NO_STORE. */

/* size 1 (scalar) */
#define vstore_partial_1_0  NO_STORE
#define vstore_partial_1_1  vstore1
#define vstore_partial_1_2  NO_STORE
#define vstore_partial_1_3  NO_STORE
#define vstore_partial_1_4  NO_STORE
#define vstore_partial_1_5  NO_STORE
#define vstore_partial_1_6  NO_STORE
#define vstore_partial_1_7  NO_STORE
#define vstore_partial_1_8  NO_STORE
#define vstore_partial_1_9  NO_STORE
#define vstore_partial_1_10 NO_STORE
#define vstore_partial_1_11 NO_STORE
#define vstore_partial_1_12 NO_STORE
#define vstore_partial_1_13 NO_STORE
#define vstore_partial_1_14 NO_STORE
#define vstore_partial_1_15 NO_STORE
#define vstore_partial_1_16 NO_STORE

/* size 2 */
#define vstore_partial_2_0  NO_STORE
#define vstore_partial_2_1  vstore_partial_1
#define vstore_partial_2_2  vstore_partial_2
#define vstore_partial_2_3  NO_STORE
#define vstore_partial_2_4  NO_STORE
#define vstore_partial_2_5  NO_STORE
#define vstore_partial_2_6  NO_STORE
#define vstore_partial_2_7  NO_STORE
#define vstore_partial_2_8  NO_STORE
#define vstore_partial_2_9  NO_STORE
#define vstore_partial_2_10 NO_STORE
#define vstore_partial_2_11 NO_STORE
#define vstore_partial_2_12 NO_STORE
#define vstore_partial_2_13 NO_STORE
#define vstore_partial_2_14 NO_STORE
#define vstore_partial_2_15 NO_STORE
#define vstore_partial_2_16 NO_STORE

/* size 3 */
#define vstore_partial_3_0  NO_STORE
#define vstore_partial_3_1  vstore_partial_1
#define vstore_partial_3_2  vstore_partial_2
#define vstore_partial_3_3  vstore_partial_3
#define vstore_partial_3_4  NO_STORE
#define vstore_partial_3_5  NO_STORE
#define vstore_partial_3_6  NO_STORE
#define vstore_partial_3_7  NO_STORE
#define vstore_partial_3_8  NO_STORE
#define vstore_partial_3_9  NO_STORE
#define vstore_partial_3_10 NO_STORE
#define vstore_partial_3_11 NO_STORE
#define vstore_partial_3_12 NO_STORE
#define vstore_partial_3_13 NO_STORE
#define vstore_partial_3_14 NO_STORE
#define vstore_partial_3_15 NO_STORE
#define vstore_partial_3_16 NO_STORE

/* size 4 */
#define vstore_partial_4_0  NO_STORE
#define vstore_partial_4_1  vstore_partial_1
#define vstore_partial_4_2  vstore_partial_2
#define vstore_partial_4_3  vstore_partial_3
#define vstore_partial_4_4  vstore_partial_4
#define vstore_partial_4_5  NO_STORE
#define vstore_partial_4_6  NO_STORE
#define vstore_partial_4_7  NO_STORE
#define vstore_partial_4_8  NO_STORE
#define vstore_partial_4_9  NO_STORE
#define vstore_partial_4_10 NO_STORE
#define vstore_partial_4_11 NO_STORE
#define vstore_partial_4_12 NO_STORE
#define vstore_partial_4_13 NO_STORE
#define vstore_partial_4_14 NO_STORE
#define vstore_partial_4_15 NO_STORE
#define vstore_partial_4_16 NO_STORE

/* size 8 */
#define vstore_partial_8_0  NO_STORE
#define vstore_partial_8_1  vstore_partial_1
#define vstore_partial_8_2  vstore_partial_2
#define vstore_partial_8_3  vstore_partial_3
#define vstore_partial_8_4  vstore_partial_4
#define vstore_partial_8_5  vstore_partial_5
#define vstore_partial_8_6  vstore_partial_6
#define vstore_partial_8_7  vstore_partial_7
#define vstore_partial_8_8  vstore_partial_8
#define vstore_partial_8_9  NO_STORE
#define vstore_partial_8_10 NO_STORE
#define vstore_partial_8_11 NO_STORE
#define vstore_partial_8_12 NO_STORE
#define vstore_partial_8_13 NO_STORE
#define vstore_partial_8_14 NO_STORE
#define vstore_partial_8_15 NO_STORE
#define vstore_partial_8_16 NO_STORE

/* size 16 */
#define vstore_partial_16_0  NO_STORE
#define vstore_partial_16_1  vstore_partial_1
#define vstore_partial_16_2  vstore_partial_2
#define vstore_partial_16_3  vstore_partial_3
#define vstore_partial_16_4  vstore_partial_4
#define vstore_partial_16_5  vstore_partial_5
#define vstore_partial_16_6  vstore_partial_6
#define vstore_partial_16_7  vstore_partial_7
#define vstore_partial_16_8  vstore_partial_8
#define vstore_partial_16_9  vstore_partial_9
#define vstore_partial_16_10 vstore_partial_10
#define vstore_partial_16_11 vstore_partial_11
#define vstore_partial_16_12 vstore_partial_12
#define vstore_partial_16_13 vstore_partial_13
#define vstore_partial_16_14 vstore_partial_14
#define vstore_partial_16_15 vstore_partial_15
#define vstore_partial_16_16 vstore_partial_16
811
812
/* vstore_partial_<n>(DATA, OFFSET, PTR)
 *
 * Stores the first <n> lanes of DATA to consecutive elements starting at PTR.
 * Sizes with a matching builtin (2, 3, 4, 8, 16) are one store; the other
 * sizes are a 4- or 8-element store followed by a store of the leftover lanes
 * at PTR + 4 or PTR + 8.
 *
 * Every macro expands to one or more complete statements, each ending in a
 * semicolon.
 *
 * NOTE(review): the vstoreN builtins scale OFFSET by N elements, whereas
 * vstore1 in this file adds OFFSET unscaled, so the composite sizes only write
 * one contiguous run when OFFSET is 0 - confirm that callers always pass 0.
 */
#define vstore_partial_1(DATA, OFFSET, PTR) \
    vstore1(DATA.s0, OFFSET, PTR);

#define vstore_partial_2(DATA, OFFSET, PTR) \
    vstore2(DATA.s01, OFFSET, PTR);

#define vstore_partial_3(DATA, OFFSET, PTR) \
    vstore3(DATA.s012, OFFSET, PTR);

#define vstore_partial_4(DATA, OFFSET, PTR) \
    vstore4(DATA.s0123, OFFSET, PTR);

/* 5 = 4 + 1 */
#define vstore_partial_5(DATA, OFFSET, PTR) \
    vstore4(DATA.s0123, OFFSET, PTR);       \
    vstore1(DATA.s4, OFFSET, PTR + 4);

/* 6 = 4 + 2 */
#define vstore_partial_6(DATA, OFFSET, PTR) \
    vstore4(DATA.s0123, OFFSET, PTR);       \
    vstore2(DATA.s45, OFFSET, PTR + 4);

/* 7 = 4 + 3 */
#define vstore_partial_7(DATA, OFFSET, PTR) \
    vstore4(DATA.s0123, OFFSET, PTR);       \
    vstore3(DATA.s456, OFFSET, PTR + 4);

#define vstore_partial_8(DATA, OFFSET, PTR) \
    vstore8(DATA.s01234567, OFFSET, PTR);

/* 9 = 8 + 1 */
#define vstore_partial_9(DATA, OFFSET, PTR) \
    vstore8(DATA.s01234567, OFFSET, PTR);   \
    vstore1(DATA.s8, OFFSET, PTR + 8);

/* 10 = 8 + 2 */
#define vstore_partial_10(DATA, OFFSET, PTR) \
    vstore8(DATA.s01234567, OFFSET, PTR);    \
    vstore2(DATA.s89, OFFSET, PTR + 8);

/* 11 = 8 + 3 */
#define vstore_partial_11(DATA, OFFSET, PTR) \
    vstore8(DATA.s01234567, OFFSET, PTR);    \
    vstore3(DATA.s89A, OFFSET, PTR + 8);

/* 12 = 8 + 4 */
#define vstore_partial_12(DATA, OFFSET, PTR) \
    vstore8(DATA.s01234567, OFFSET, PTR);    \
    vstore4(DATA.s89AB, OFFSET, PTR + 8);

/* 13 = 8 + 5: the upper half is written by the 5-element macro */
#define vstore_partial_13(DATA, OFFSET, PTR) \
    vstore8(DATA.s01234567, OFFSET, PTR);    \
    vstore_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);

/* 14 = 8 + 6 */
#define vstore_partial_14(DATA, OFFSET, PTR) \
    vstore8(DATA.s01234567, OFFSET, PTR);    \
    vstore_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);

/* 15 = 8 + 7 */
#define vstore_partial_15(DATA, OFFSET, PTR) \
    vstore8(DATA.s01234567, OFFSET, PTR);    \
    vstore_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vstore_partial_16(DATA, OFFSET, PTR) \
    vstore16(DATA, OFFSET, PTR);
870
871
872
873
874
/* Saturated conversions to floating-point types.
 *
 * OpenCL C only provides the _sat conversion modifier for integer destination
 * types, so the _sat spellings for float and half are mapped onto the plain
 * conversions of the same destination type. This lets CONVERT_SAT be used
 * uniformly regardless of the destination type.
 *
 * convert_half_sat maps to convert_half (not convert_float) so that the
 * result type is half, matching the convert_half<n>_sat entries below it.
 */
#define convert_float_sat   convert_float
#define convert_float1_sat  convert_float
#define convert_float2_sat  convert_float2
#define convert_float3_sat  convert_float3
#define convert_float4_sat  convert_float4
#define convert_float8_sat  convert_float8
#define convert_float16_sat convert_float16
#define convert_half_sat    convert_half
#define convert_half1_sat   convert_half
#define convert_half2_sat   convert_half2
#define convert_half3_sat   convert_half3
#define convert_half4_sat   convert_half4
#define convert_half8_sat   convert_half8
#define convert_half16_sat  convert_half16
889
/* convert_<type>1 aliases: converting to a vector of size 1 is the scalar
 * conversion, so names built as convert_<type><size> stay valid for size 1. */
#define convert_float1  convert_float
#define convert_half1   convert_half
#define convert_char1   convert_char
#define convert_uchar1  convert_uchar
#define convert_short1  convert_short
#define convert_ushort1 convert_ushort
#define convert_int1    convert_int
#define convert_uint1   convert_uint
#define convert_long1   convert_long
#define convert_ulong1  convert_ulong
#define convert_double1 convert_double
901
/* convert_<type>1_sat aliases: the size-1 saturated conversion is the scalar
 * saturated conversion.
 *
 * The convert_uchar<n>_sat entries for n = 2..16 are defined as themselves;
 * they expand to the builtin of the same name.
 *
 * convert_double1_sat maps to the plain convert_double: OpenCL C defines the
 * _sat modifier only for integer destination types, so there is no
 * convert_double_sat builtin to forward to.
 */
#define convert_char1_sat   convert_char_sat
#define convert_uchar1_sat  convert_uchar_sat
#define convert_uchar2_sat  convert_uchar2_sat
#define convert_uchar3_sat  convert_uchar3_sat
#define convert_uchar4_sat  convert_uchar4_sat
#define convert_uchar8_sat  convert_uchar8_sat
#define convert_uchar16_sat convert_uchar16_sat
#define convert_short1_sat  convert_short_sat
#define convert_ushort1_sat convert_ushort_sat
#define convert_int1_sat    convert_int_sat
#define convert_uint1_sat   convert_uint_sat
#define convert_long1_sat   convert_long_sat
#define convert_ulong1_sat  convert_ulong_sat
#define convert_double1_sat convert_double
916
/* VEC_DATA_TYPE(type, size) pastes type and size into a vector type name,
 * e.g. (float, 4) gives float4. The arguments are expanded before pasting. */
#define VEC_DATA_TYPE_STR(type, size) type##size
#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)

/* CONVERT(x, type) expands to a call of convert_<type> on x. */
#define CONVERT_STR(x, type) (convert_##type((x)))
#define CONVERT(x, type) CONVERT_STR(x, type)

/* CONVERT_SAT(x, type) expands to a call of convert_<type>_sat on x. */
#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)

/* CONVERT_SAT_ROUND(x, type, round) expands to a call of
 * convert_<type>_sat_<round> on x. */
#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
928
/* select_vec_dt_<type>(size): integer vector type associated with <type>.
 * Integer types map to themselves; half maps to short and float maps to int. */
#define select_vec_dt_uchar(size)  uchar##size
#define select_vec_dt_char(size)   char##size
#define select_vec_dt_ushort(size) ushort##size
#define select_vec_dt_short(size)  short##size
#define select_vec_dt_half(size)   short##size
#define select_vec_dt_uint(size)   uint##size
#define select_vec_dt_int(size)    int##size
#define select_vec_dt_float(size)  int##size
#define select_vec_dt_ulong(size)  ulong##size
#define select_vec_dt_long(size)   long##size

/* Public entry points: look up the table above for a (type, size) pair, or
 * for a single element with SELECT_DATA_TYPE. */
#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)
943
/* signed_int_vec_dt_<type>(size): signed integer vector type associated with
 * <type>. Unsigned types map to their signed counterpart; half maps to short
 * and float maps to int. */
#define signed_int_vec_dt_uchar(size)  char##size
#define signed_int_vec_dt_char(size)   char##size
#define signed_int_vec_dt_ushort(size) short##size
#define signed_int_vec_dt_short(size)  short##size
#define signed_int_vec_dt_half(size)   short##size
#define signed_int_vec_dt_uint(size)   int##size
#define signed_int_vec_dt_int(size)    int##size
#define signed_int_vec_dt_float(size)  int##size
#define signed_int_vec_dt_ulong(size)  long##size
#define signed_int_vec_dt_long(size)   long##size

/* Public entry points: look up the table above for a (type, size) pair, or
 * for a single element with SIGNED_INT_DATA_TYPE. */
#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size)
#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size)
#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1)
958
/* sum_reduce_<n>(x): sum of the <n> lanes of x.
 *
 * These helpers expand to a flat, unparenthesized chain of additions
 * (evaluated left to right). They are building blocks for SUM_REDUCE and are
 * not safe to embed in a larger expression on their own.
 */
#define sum_reduce_1(x) (x)
#define sum_reduce_2(x) ((x).s0) + ((x).s1)
#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2)
#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567)
#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF)

/* SUM_REDUCE(x, size): sum of the first `size` lanes of x (size in
 * {1, 2, 3, 4, 8, 16}; may be a macro).
 *
 * The whole expansion is wrapped in parentheses so the result can be used as
 * an operand (e.g. SUM_REDUCE(v, 4) * k). Wrapping is done here rather than in
 * each helper so the order in which the lanes are added is unchanged.
 */
#define SUM_REDUCE_STR(x, size) (sum_reduce_##size(x))
#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)
968
/* prod_reduce_<n>(x): product of the <n> lanes of x.
 *
 * These helpers expand to a flat, unparenthesized chain of multiplications
 * (evaluated left to right). They are building blocks for PROD_REDUCE and are
 * not safe to embed in a larger expression on their own.
 */
#define prod_reduce_1(x) (x)
#define prod_reduce_2(x) ((x).s0) * ((x).s1)
#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2)
#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23)
#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567)
#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF)

/* PROD_REDUCE(x, size): product of the first `size` lanes of x (size in
 * {1, 2, 3, 4, 8, 16}; may be a macro).
 *
 * The whole expansion is wrapped in parentheses so the result can be used as
 * an operand (e.g. k / PROD_REDUCE(v, 2)). Wrapping is done here rather than
 * in each helper so the order in which the lanes are multiplied is unchanged.
 */
#define PROD_REDUCE_STR(x, size) (prod_reduce_##size(x))
#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)
978
/* max_reduce_<n>(x): maximum over the <n> lanes of x, computed by pairwise
 * max() calls on the two halves of the vector. */
#define max_reduce_1(x)  (x)
#define max_reduce_2(x)  max(((x).s0), ((x).s1))
#define max_reduce_3(x)  max(max_reduce_2((x).s01), ((x).s2))
#define max_reduce_4(x)  max(max_reduce_2((x).s01), max_reduce_2((x).s23))
#define max_reduce_8(x)  max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))

/* MAX_REDUCE(x, size): maximum of the first `size` lanes of x (size in
 * {1, 2, 3, 4, 8, 16}; may be a macro). */
#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)
988
/* Kernel parameter lists for a tensor argument called `name`.
 *
 * Each macro expands to a comma-separated list of parameters: a byte pointer
 * name_ptr, one (name_stride_<d>, name_step_<d>) pair per dimension, and
 * name_offset_first_element_in_bytes. There is no trailing comma.
 */

/* 1 dimension: x */
#define VECTOR_DECLARATION(name) \
    __global uchar *name##_ptr,  \
    uint name##_stride_x,        \
    uint name##_step_x,          \
    uint name##_offset_first_element_in_bytes

/* 2 dimensions: x, y */
#define IMAGE_DECLARATION(name) \
    __global uchar *name##_ptr, \
    uint name##_stride_x,       \
    uint name##_step_x,         \
    uint name##_stride_y,       \
    uint name##_step_y,         \
    uint name##_offset_first_element_in_bytes

/* 3 dimensions: x, y, z */
#define TENSOR3D_DECLARATION(name) \
    __global uchar *name##_ptr,    \
    uint name##_stride_x,          \
    uint name##_step_x,            \
    uint name##_stride_y,          \
    uint name##_step_y,            \
    uint name##_stride_z,          \
    uint name##_step_z,            \
    uint name##_offset_first_element_in_bytes

/* 4 dimensions: x, y, z, w */
#define TENSOR4D_DECLARATION(name) \
    __global uchar *name##_ptr,    \
    uint name##_stride_x,          \
    uint name##_step_x,            \
    uint name##_stride_y,          \
    uint name##_step_y,            \
    uint name##_stride_z,          \
    uint name##_step_z,            \
    uint name##_stride_w,          \
    uint name##_step_w,            \
    uint name##_offset_first_element_in_bytes

/* 5 dimensions: x, y, z, w, v */
#define TENSOR5D_DECLARATION(name) \
    __global uchar *name##_ptr,    \
    uint name##_stride_x,          \
    uint name##_step_x,            \
    uint name##_stride_y,          \
    uint name##_step_y,            \
    uint name##_stride_z,          \
    uint name##_step_z,            \
    uint name##_stride_w,          \
    uint name##_step_w,            \
    uint name##_stride_v,          \
    uint name##_step_v,            \
    uint name##_offset_first_element_in_bytes
1038
/* CONVERT_TO_*_STRUCT(name): build a Vector / Image / Tensor3D / Tensor4D
 * descriptor from the kernel parameters declared for `name` by the matching
 * *_DECLARATION macro.
 *
 * The plain variants forward every name_step_<d> parameter to the helper. The
 * _NO_STEP variants pass 0 for the steps instead (see each macro for which
 * steps are zeroed), and _NO_UPDATE_PTR calls tensor3D_ptr_no_update.
 */

/* Vector */
#define CONVERT_TO_VECTOR_STRUCT(name) \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)

#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)

/* Image */
#define CONVERT_TO_IMAGE_STRUCT(name) \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)

#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)

/* Image descriptor built from 3D tensor parameters */
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)

/* Only the x and y steps are zeroed here; name_step_z is still forwarded. */
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)

/* Tensor3D */
#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z)

#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)

/* Tensor4D: mod_size is passed through to the helper as its last argument. */
#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)

#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)

/* Tensor3D descriptor built by tensor3D_ptr_no_update */
#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name)                                                                                       \
    tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                           name##_stride_z, name##_step_z)
1077
1078
/** Descriptor of a 1D buffer. */
typedef struct Vector
{
    __global uchar *ptr;                           /**< Byte pointer into the buffer */
    int             offset_first_element_in_bytes; /**< Offset of the first element, in bytes */
    int             stride_x;                      /**< Byte distance between consecutive x positions */
} Vector;
1085
1086
/** Descriptor of a 2D buffer. */
typedef struct Image
{
    __global uchar *ptr;                           /**< Byte pointer into the buffer */
    int             offset_first_element_in_bytes; /**< Offset of the first element, in bytes */
    int             stride_x;                      /**< Byte distance between consecutive x positions */
    int             stride_y;                      /**< Byte distance between consecutive y positions */
} Image;
1094
1095
/** Descriptor of a 3D buffer. */
typedef struct Tensor3D
{
    __global uchar *ptr;                           /**< Byte pointer into the buffer */
    int             offset_first_element_in_bytes; /**< Offset of the first element, in bytes */
    int             stride_x;                      /**< Byte distance between consecutive x positions */
    int             stride_y;                      /**< Byte distance between consecutive y positions */
    int             stride_z;                      /**< Byte distance between consecutive z positions */
} Tensor3D;
1104
1105
/** Descriptor of a 4D buffer. */
typedef struct Tensor4D
{
    __global uchar *ptr;                           /**< Byte pointer into the buffer */
    int             offset_first_element_in_bytes; /**< Offset of the first element, in bytes */
    int             stride_x;                      /**< Byte distance between consecutive x positions */
    int             stride_y;                      /**< Byte distance between consecutive y positions */
    int             stride_z;                      /**< Byte distance between consecutive z positions */
    int             stride_w;                      /**< Byte distance between consecutive w positions */
} Tensor4D;
1115
1116
/** Build a Vector descriptor whose pointer is advanced to the calling work-item.
 *
 * @param[in] ptr                           Byte pointer to the start of the buffer
 * @param[in] offset_first_element_in_bytes Offset of the first element, in bytes
 * @param[in] stride_x                      Value stored in the descriptor's stride_x
 * @param[in] step_x                        Bytes advanced per unit of get_global_id(0)
 *
 * @return The descriptor, with ptr moved forward by
 *         offset_first_element_in_bytes + get_global_id(0) * step_x bytes.
 */
inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
{
    Vector vec;
    vec.ptr                           = ptr;
    vec.offset_first_element_in_bytes = offset_first_element_in_bytes;
    vec.stride_x                      = stride_x;

    /* The offset is taken from the descriptor field (an int), not from the uint argument. */
    vec.ptr += vec.offset_first_element_in_bytes + get_global_id(0) * step_x;
    return vec;
}
1128
1129
/** Build an Image descriptor whose pointer is advanced to the calling work-item.
 *
 * @param[in] ptr                           Byte pointer to the start of the buffer
 * @param[in] offset_first_element_in_bytes Offset of the first element, in bytes
 * @param[in] stride_x                      Value stored in the descriptor's stride_x
 * @param[in] step_x                        Bytes advanced per unit of get_global_id(0)
 * @param[in] stride_y                      Value stored in the descriptor's stride_y
 * @param[in] step_y                        Bytes advanced per unit of get_global_id(1)
 *
 * @return The descriptor, with ptr moved forward by the first-element offset
 *         plus the x and y contributions of the work-item.
 */
inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
{
    Image image;
    image.ptr                           = ptr;
    image.offset_first_element_in_bytes = offset_first_element_in_bytes;
    image.stride_x                      = stride_x;
    image.stride_y                      = stride_y;

    /* The offset is taken from the descriptor field (an int), not from the uint argument. */
    image.ptr += image.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
    return image;
}
1142
1143
/** Build an Image descriptor from 3D tensor parameters, advanced to the calling work-item.
 *
 * The z stride is not stored in the returned descriptor; only the z step
 * contributes, through the pointer advance.
 *
 * @param[in] ptr                           Byte pointer to the start of the buffer
 * @param[in] offset_first_element_in_bytes Offset of the first element, in bytes
 * @param[in] stride_x                      Value stored in the descriptor's stride_x
 * @param[in] step_x                        Bytes advanced per unit of get_global_id(0)
 * @param[in] stride_y                      Value stored in the descriptor's stride_y
 * @param[in] step_y                        Bytes advanced per unit of get_global_id(1)
 * @param[in] stride_z                      Not used by this function
 * @param[in] step_z                        Bytes advanced per unit of get_global_id(2)
 *
 * @return The descriptor, with ptr moved forward by the first-element offset
 *         plus the x, y and z contributions of the work-item.
 */
inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Image image;
    image.ptr                           = ptr;
    image.offset_first_element_in_bytes = offset_first_element_in_bytes;
    image.stride_x                      = stride_x;
    image.stride_y                      = stride_y;

    /* The offset is taken from the descriptor field (an int), not from the uint argument. */
    image.ptr += image.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
    return image;
}
1156
1157
/** Build a Tensor3D descriptor whose pointer is advanced to the calling work-item.
 *
 * @param[in] ptr                           Byte pointer to the start of the buffer
 * @param[in] offset_first_element_in_bytes Offset of the first element, in bytes
 * @param[in] stride_x                      Value stored in the descriptor's stride_x
 * @param[in] step_x                        Bytes advanced per unit of get_global_id(0)
 * @param[in] stride_y                      Value stored in the descriptor's stride_y
 * @param[in] step_y                        Bytes advanced per unit of get_global_id(1)
 * @param[in] stride_z                      Value stored in the descriptor's stride_z
 * @param[in] step_z                        Bytes advanced per unit of get_global_id(2)
 *
 * @return The descriptor, with ptr moved forward by the first-element offset
 *         plus the x, y and z contributions of the work-item.
 */
inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Tensor3D t;
    t.ptr                           = ptr;
    t.offset_first_element_in_bytes = offset_first_element_in_bytes;
    t.stride_x                      = stride_x;
    t.stride_y                      = stride_y;
    t.stride_z                      = stride_z;

    /* The offset is taken from the descriptor field (an int), not from the uint argument. */
    t.ptr += t.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
    return t;
}
1171
1172
/** Build a Tensor3D descriptor without moving its pointer.
 *
 * The step arguments are accepted but not used: the returned ptr is exactly
 * the ptr argument.
 *
 * @param[in] ptr                           Byte pointer to the start of the buffer
 * @param[in] offset_first_element_in_bytes Offset of the first element, in bytes
 * @param[in] stride_x                      Value stored in the descriptor's stride_x
 * @param[in] step_x                        Not used by this function
 * @param[in] stride_y                      Value stored in the descriptor's stride_y
 * @param[in] step_y                        Not used by this function
 * @param[in] stride_z                      Value stored in the descriptor's stride_z
 * @param[in] step_z                        Not used by this function
 *
 * @return The descriptor holding the given pointer, offset and strides.
 */
inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Tensor3D t;
    t.ptr                           = ptr;
    t.offset_first_element_in_bytes = offset_first_element_in_bytes;
    t.stride_x                      = stride_x;
    t.stride_y                      = stride_y;
    t.stride_z                      = stride_z;
    return t;
}
1185
/** Build a Tensor4D descriptor whose pointer is advanced to the calling work-item.
 *
 * The third global id carries two coordinates: get_global_id(2) % mod_size is
 * scaled by step_z and get_global_id(2) / mod_size is scaled by step_w.
 *
 * @param[in] ptr                           Byte pointer to the start of the buffer
 * @param[in] offset_first_element_in_bytes Offset of the first element, in bytes
 * @param[in] stride_x                      Value stored in the descriptor's stride_x
 * @param[in] step_x                        Bytes advanced per unit of get_global_id(0)
 * @param[in] stride_y                      Value stored in the descriptor's stride_y
 * @param[in] step_y                        Bytes advanced per unit of get_global_id(1)
 * @param[in] stride_z                      Value stored in the descriptor's stride_z
 * @param[in] step_z                        Bytes advanced per unit of (get_global_id(2) % mod_size)
 * @param[in] stride_w                      Value stored in the descriptor's stride_w
 * @param[in] step_w                        Bytes advanced per unit of (get_global_id(2) / mod_size)
 * @param[in] mod_size                      Divisor used to split get_global_id(2)
 *
 * @return The descriptor, with ptr moved forward by the first-element offset
 *         plus the x, y, z and w contributions of the work-item.
 */
inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
                                             uint step_w,
                                             uint mod_size)
{
    Tensor4D t;
    t.ptr                           = ptr;
    t.offset_first_element_in_bytes = offset_first_element_in_bytes;
    t.stride_x                      = stride_x;
    t.stride_y                      = stride_y;
    t.stride_z                      = stride_z;
    t.stride_w                      = stride_w;

    /* The offset is taken from the descriptor field (an int), not from the uint argument. */
    t.ptr += t.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
    return t;
}
1203
1204
/** Address of the element at position x, relative to the descriptor's pointer.
 *
 * @param[in] vec Vector descriptor
 * @param[in] x   Position along x, in elements (may be negative)
 *
 * @return vec->ptr moved by x * vec->stride_x bytes.
 */
inline __global const uchar *vector_offset(const Vector *vec, int x)
{
    const int shift_bytes = x * vec->stride_x;
    return vec->ptr + shift_bytes;
}
1209
1210
/** Address of the element at position (x, y), relative to the descriptor's pointer.
 *
 * @param[in] img Image descriptor
 * @param[in] x   Position along x, in elements (may be negative)
 * @param[in] y   Position along y, in elements (may be negative)
 *
 * @return img->ptr moved by x * img->stride_x and then y * img->stride_y bytes.
 */
inline __global uchar *offset(const Image *img, int x, int y)
{
    __global uchar *addr = img->ptr;
    addr += x * img->stride_x;
    addr += y * img->stride_y;
    return addr;
}
1215
1216
/** Address of the element at position (x, y, z), relative to the descriptor's pointer.
 *
 * @param[in] tensor Tensor3D descriptor
 * @param[in] x      Position along x, in elements (may be negative)
 * @param[in] y      Position along y, in elements (may be negative)
 * @param[in] z      Position along z, in elements (may be negative)
 *
 * @return tensor->ptr moved by each coordinate times its byte stride.
 */
inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
{
    __global const uchar *addr = tensor->ptr;
    addr += x * tensor->stride_x;
    addr += y * tensor->stride_y;
    addr += z * tensor->stride_z;
    return addr;
}
1221
1222
/** Address of the element at position (x, y, z, w), relative to the descriptor's pointer.
 *
 * @param[in] tensor Tensor4D descriptor
 * @param[in] x      Position along x, in elements (may be negative)
 * @param[in] y      Position along y, in elements (may be negative)
 * @param[in] z      Position along z, in elements (may be negative)
 * @param[in] w      Position along w, in elements (may be negative)
 *
 * @return tensor->ptr moved by each coordinate times its byte stride.
 */
inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
{
    __global const uchar *addr = tensor->ptr;
    addr += x * tensor->stride_x;
    addr += y * tensor->stride_y;
    addr += z * tensor->stride_z;
    addr += w * tensor->stride_w;
    return addr;
}
1227
1228
/** Address of the element with the given linear index in a 3D tensor.
 *
 * The index is decomposed as index = x + width * (y + height * z). Unlike the
 * *_offset helpers, the descriptor's first-element offset is added to the
 * result.
 *
 * @param[in] tensor Tensor3D descriptor
 * @param[in] width  Number of elements along x (used as a divisor)
 * @param[in] height Number of elements along y (width * height is used as a divisor)
 * @param[in] depth  Not used by this function
 * @param[in] index  Linear element index
 *
 * @return tensor->ptr moved by the byte strides of (x, y, z) plus
 *         tensor->offset_first_element_in_bytes.
 */
inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
{
    const uint plane_size = width * height;

    /* Peel off the coordinates from the slowest (z) to the fastest (x). */
    const uint z        = index / plane_size;
    const uint in_plane = index % plane_size;
    const uint y        = in_plane / width;
    const uint x        = in_plane % width;

    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes;
}
1245
1246#endif
1247
1248
1249
1250
/* VEC_SIZE defaults to N0 when only N0 is provided at build time. */
#if defined(N0) && !defined(VEC_SIZE)
#define VEC_SIZE N0
#endif

#if defined(VEC_SIZE) && defined(DATA_TYPE)

/* Element-wise binary operations.
 *
 * <OP>_X_POS_0(x, y) computes x <op> y.
 * <OP>_X_POS_1(x, y) computes the same operation with the operands swapped,
 * i.e. y <op> x; commutative operations simply forward to the _X_POS_0 form.
 *
 * Arguments and results of the operator-based macros are fully parenthesized
 * so they can be passed expressions and used inside larger expressions.
 * Arguments may be evaluated more than once.
 */
#define ADD_X_POS_0(x, y) ((x) + (y))
#define SUB_X_POS_0(x, y) ((x) - (y))
#define MAX_X_POS_0(x, y) max(x, y)
#define MIN_X_POS_0(x, y) min(x, y)
#define SQUARED_DIFF_X_POS_0(x, y) (((x) - (y)) * ((x) - (y)))
#define POWER_X_POS_0(x, y) pow(x, y)
#if VEC_SIZE == 1
/* Scalar PRELU: x when x > 0, x * y otherwise. */
#define PRELU_X_POS_0(x, y) ((x) > 0 ? (x) : (x) * (y))
#else

/* Vector PRELU: per-lane select between y * x and x, driven by x > 0. */
#if defined(MIXED_PRECISION)
#define PRELU_X_POS_0(x, y) (select((y) * (x), (x), CONVERT(((x) > (DATA_TYPE_ACCUMULATOR)0), SELECT_VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, VEC_SIZE))))
#else
#define PRELU_X_POS_0(x, y) (select((y) * (x), (x), CONVERT(((x) > (DATA_TYPE)0), SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))))
#endif

#endif
#define DIV_X_POS_0(x, y) ((x) / (y))
#define AND_X_POS_0(x, y) (CONVERT(((x) && (y)), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)) & ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))1))
#define OR_X_POS_0(x, y) (CONVERT(((x) || (y)), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)) & ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))1))

#define ADD_X_POS_1(x, y) ADD_X_POS_0(x, y)
#define SUB_X_POS_1(x, y) ((y) - (x))
#define MAX_X_POS_1(x, y) MAX_X_POS_0(x, y)
#define MIN_X_POS_1(x, y) MIN_X_POS_0(x, y)
#define SQUARED_DIFF_X_POS_1(x, y) SQUARED_DIFF_X_POS_0(x, y)
#define POWER_X_POS_1(x, y) pow(y, x)
#if VEC_SIZE == 1
/* Scalar PRELU with swapped operands: y when y > 0, y * x otherwise. */
#define PRELU_X_POS_1(x, y) ((y) > 0 ? (y) : (y) * (x))
#else

/* Vector PRELU with swapped operands. */
#if defined(MIXED_PRECISION)
#define PRELU_X_POS_1(x, y) (select((x) * (y), (y), CONVERT(((y) > (DATA_TYPE_ACCUMULATOR)0), SELECT_VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, VEC_SIZE))))
#else
#define PRELU_X_POS_1(x, y) (select((x) * (y), (y), CONVERT(((y) > (DATA_TYPE)0), SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))))
#endif

#endif
#define DIV_X_POS_1(x, y) ((y) / (x))
#define AND_X_POS_1(x, y) AND_X_POS_0(x, y)
#define OR_X_POS_1(x, y) OR_X_POS_0(x, y)


/* Short names: the operands are taken in their natural order. */
#define ADD(x, y) ADD_X_POS_0(x, y)
#define SUB(x, y) SUB_X_POS_0(x, y)
#define MAX(x, y) MAX_X_POS_0(x, y)
#define MIN(x, y) MIN_X_POS_0(x, y)
#define SQUARED_DIFF(x, y) SQUARED_DIFF_X_POS_0(x, y)
#define POWER(x, y) POWER_X_POS_0(x, y)
#define PRELU(x, y) PRELU_X_POS_0(x, y)
#define DIV(x, y) DIV_X_POS_0(x, y)
#define AND(x, y) AND_X_POS_0(x, y)
#define OR(x, y) OR_X_POS_0(x, y)

#endif
1312
1313
1314
/* ELTWISE_OP_ROW_<n>(OP, OPERAND1, OPERAND2)
 *
 * Applies OP in place to the first <n> rows of a block:
 *     OPERAND1<i> = OP(OPERAND1<i>, OPERAND2<i>)
 * where the row variables are named by appending the row suffix
 * (0..9, then A..F) to the two base names. Each macro handles its own row
 * after delegating the rows below it to the previous macro.
 */
#define ELTWISE_OP_ROW_1(OP, OPERAND1, OPERAND2) \
    OPERAND1##0 = OP(OPERAND1##0, OPERAND2##0);

#define ELTWISE_OP_ROW_2(OP, OPERAND1, OPERAND2) \
    ELTWISE_OP_ROW_1(OP, OPERAND1, OPERAND2)     \
    OPERAND1##1 = OP(OPERAND1##1, OPERAND2##1);

#define ELTWISE_OP_ROW_3(OP, OPERAND1, OPERAND2) \
    ELTWISE_OP_ROW_2(OP, OPERAND1, OPERAND2)     \
    OPERAND1##2 = OP(OPERAND1##2, OPERAND2##2);

#define ELTWISE_OP_ROW_4(OP, OPERAND1, OPERAND2) \
    ELTWISE_OP_ROW_3(OP, OPERAND1, OPERAND2)     \
    OPERAND1##3 = OP(OPERAND1##3, OPERAND2##3);

#define ELTWISE_OP_ROW_5(OP, OPERAND1, OPERAND2) \
    ELTWISE_OP_ROW_4(OP, OPERAND1, OPERAND2)     \
    OPERAND1##4 = OP(OPERAND1##4, OPERAND2##4);

#define ELTWISE_OP_ROW_6(OP, OPERAND1, OPERAND2) \
    ELTWISE_OP_ROW_5(OP, OPERAND1, OPERAND2)     \
    OPERAND1##5 = OP(OPERAND1##5, OPERAND2##5);

#define ELTWISE_OP_ROW_7(OP, OPERAND1, OPERAND2) \
    ELTWISE_OP_ROW_6(OP, OPERAND1, OPERAND2)     \
    OPERAND1##6 = OP(OPERAND1##6, OPERAND2##6);

#define ELTWISE_OP_ROW_8(OP, OPERAND1, OPERAND2) \
    ELTWISE_OP_ROW_7(OP, OPERAND1, OPERAND2)     \
    OPERAND1##7 = OP(OPERAND1##7, OPERAND2##7);

#define ELTWISE_OP_ROW_9(OP, OPERAND1, OPERAND2) \
    ELTWISE_OP_ROW_8(OP, OPERAND1, OPERAND2)     \
    OPERAND1##8 = OP(OPERAND1##8, OPERAND2##8);

#define ELTWISE_OP_ROW_10(OP, OPERAND1, OPERAND2) \
    ELTWISE_OP_ROW_9(OP, OPERAND1, OPERAND2)      \
    OPERAND1##9 = OP(OPERAND1##9, OPERAND2##9);

/* Rows 10..15 use the hexadecimal suffixes A..F. */
#define ELTWISE_OP_ROW_11(OP, OPERAND1, OPERAND2) \
    ELTWISE_OP_ROW_10(OP, OPERAND1, OPERAND2)     \
    OPERAND1##A = OP(OPERAND1##A, OPERAND2##A);

#define ELTWISE_OP_ROW_12(OP, OPERAND1, OPERAND2) \
    ELTWISE_OP_ROW_11(OP, OPERAND1, OPERAND2)     \
    OPERAND1##B = OP(OPERAND1##B, OPERAND2##B);

#define ELTWISE_OP_ROW_13(OP, OPERAND1, OPERAND2) \
    ELTWISE_OP_ROW_12(OP, OPERAND1, OPERAND2)     \
    OPERAND1##C = OP(OPERAND1##C, OPERAND2##C);

#define ELTWISE_OP_ROW_14(OP, OPERAND1, OPERAND2) \
    ELTWISE_OP_ROW_13(OP, OPERAND1, OPERAND2)     \
    OPERAND1##D = OP(OPERAND1##D, OPERAND2##D);

#define ELTWISE_OP_ROW_15(OP, OPERAND1, OPERAND2) \
    ELTWISE_OP_ROW_14(OP, OPERAND1, OPERAND2)     \
    OPERAND1##E = OP(OPERAND1##E, OPERAND2##E);

#define ELTWISE_OP_ROW_16(OP, OPERAND1, OPERAND2) \
    ELTWISE_OP_ROW_15(OP, OPERAND1, OPERAND2)     \
    OPERAND1##F = OP(OPERAND1##F, OPERAND2##F);
1377
1378
1379
1380
1381#define ELTWISE_OP_BLOCK_STR(OP, N, OPERAND1, OPERAND2) ELTWISE_OP_ROW_##N(OP, OPERAND1, OPERAND2)
1382#define ELTWISE_OP_BLOCK(OP, N, OPERAND1, OPERAND2) ELTWISE_OP_BLOCK_STR(OP, N, OPERAND1, OPERAND2)
1383
1384
1385
/** Element-wise helpers with a broadcast second operand.
 *
 * ELTWISE_OP_ROW_BROADCAST_n(OP, DST, VALUE) expands to n statements of the form
 *     DST##i = OP(DST##i, VALUE);
 * for i = 0 .. n-1 (suffixes 0-9, A-F). Unlike ELTWISE_OP_ROW_n, VALUE is used
 * as written for every row: no row suffix is appended to it.
 *
 * OP    : binary macro or function invoked as OP(a, b)
 * DST   : basename of the rows that are read and overwritten
 * VALUE : expression passed unchanged as the second argument of OP
 */
#define ELTWISE_OP_ROW_BROADCAST_1(OP, DST, VALUE) DST##0 = OP(DST##0, VALUE);
#define ELTWISE_OP_ROW_BROADCAST_2(OP, DST, VALUE) ELTWISE_OP_ROW_BROADCAST_1(OP, DST, VALUE) DST##1 = OP(DST##1, VALUE);
#define ELTWISE_OP_ROW_BROADCAST_3(OP, DST, VALUE) ELTWISE_OP_ROW_BROADCAST_2(OP, DST, VALUE) DST##2 = OP(DST##2, VALUE);
#define ELTWISE_OP_ROW_BROADCAST_4(OP, DST, VALUE) ELTWISE_OP_ROW_BROADCAST_3(OP, DST, VALUE) DST##3 = OP(DST##3, VALUE);
#define ELTWISE_OP_ROW_BROADCAST_5(OP, DST, VALUE) ELTWISE_OP_ROW_BROADCAST_4(OP, DST, VALUE) DST##4 = OP(DST##4, VALUE);
#define ELTWISE_OP_ROW_BROADCAST_6(OP, DST, VALUE) ELTWISE_OP_ROW_BROADCAST_5(OP, DST, VALUE) DST##5 = OP(DST##5, VALUE);
#define ELTWISE_OP_ROW_BROADCAST_7(OP, DST, VALUE) ELTWISE_OP_ROW_BROADCAST_6(OP, DST, VALUE) DST##6 = OP(DST##6, VALUE);
#define ELTWISE_OP_ROW_BROADCAST_8(OP, DST, VALUE) ELTWISE_OP_ROW_BROADCAST_7(OP, DST, VALUE) DST##7 = OP(DST##7, VALUE);
#define ELTWISE_OP_ROW_BROADCAST_9(OP, DST, VALUE) ELTWISE_OP_ROW_BROADCAST_8(OP, DST, VALUE) DST##8 = OP(DST##8, VALUE);
#define ELTWISE_OP_ROW_BROADCAST_10(OP, DST, VALUE) ELTWISE_OP_ROW_BROADCAST_9(OP, DST, VALUE) DST##9 = OP(DST##9, VALUE);
#define ELTWISE_OP_ROW_BROADCAST_11(OP, DST, VALUE) ELTWISE_OP_ROW_BROADCAST_10(OP, DST, VALUE) DST##A = OP(DST##A, VALUE);
#define ELTWISE_OP_ROW_BROADCAST_12(OP, DST, VALUE) ELTWISE_OP_ROW_BROADCAST_11(OP, DST, VALUE) DST##B = OP(DST##B, VALUE);
#define ELTWISE_OP_ROW_BROADCAST_13(OP, DST, VALUE) ELTWISE_OP_ROW_BROADCAST_12(OP, DST, VALUE) DST##C = OP(DST##C, VALUE);
#define ELTWISE_OP_ROW_BROADCAST_14(OP, DST, VALUE) ELTWISE_OP_ROW_BROADCAST_13(OP, DST, VALUE) DST##D = OP(DST##D, VALUE);
#define ELTWISE_OP_ROW_BROADCAST_15(OP, DST, VALUE) ELTWISE_OP_ROW_BROADCAST_14(OP, DST, VALUE) DST##E = OP(DST##E, VALUE);
#define ELTWISE_OP_ROW_BROADCAST_16(OP, DST, VALUE) ELTWISE_OP_ROW_BROADCAST_15(OP, DST, VALUE) DST##F = OP(DST##F, VALUE);

/** Apply OP with a broadcast VALUE to the first N rows of DST.
 *
 * The _STR indirection lets N be macro-expanded before it is pasted onto the
 * ELTWISE_OP_ROW_BROADCAST_ prefix.
 */
#define ELTWISE_OP_BLOCK_BROADCAST_STR(OP, N, DST, VALUE) ELTWISE_OP_ROW_BROADCAST_##N(OP, DST, VALUE)
#define ELTWISE_OP_BLOCK_BROADCAST(OP, N, DST, VALUE) ELTWISE_OP_BLOCK_BROADCAST_STR(OP, N, DST, VALUE)
1454
1455
1456
1457
1458
1459#ifndef ARM_COMPUTE_HELPER_H
1460#define ARM_COMPUTE_HELPER_H
1461
1462
1463
1464
/** Full-width row stores.
 *
 * STORE_ROW_n(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) expands to n store
 * statements. Row i passes the variable BASENAME##i, offset 0 and the address
 *     (__global DATA_TYPE *)(PTR + i * STRIDE_Y + Z##i)
 * to the routine selected by VSTORE(N0). Row suffixes are hexadecimal digits
 * (0-9, A-F) while the stride multiplier is the matching decimal index (0-15).
 * Each macro expands its predecessor first, so rows are emitted in ascending
 * order.
 *
 * N0        : vector width forwarded to VSTORE
 * DATA_TYPE : element type used in the destination pointer cast
 * BASENAME  : prefix of the per-row source variables
 * PTR       : base address of the destination
 * STRIDE_Y  : offset added once per row index
 * Z         : prefix of the per-row extra offsets (Z##i)
 */
#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
1543
1544
1545
/** Converting row stores, rows 0-8.
 *
 * CONVERT_STORE_ROW_n(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) behaves like
 * the plain row stores, except that each row value is first passed through
 *     CONVERT_SAT((BASENAME##i), VEC_DATA_TYPE(DATA_TYPE, N0))
 * before being handed to the routine selected by VSTORE(N0). The destination
 * of row i is (__global DATA_TYPE *)(PTR + i * STRIDE_Y + Z##i).
 *
 * N0        : vector width, used for both the conversion type and VSTORE
 * DATA_TYPE : element type of the converted value and of the destination
 * BASENAME  : prefix of the per-row source variables
 * PTR       : base address of the destination
 * STRIDE_Y  : offset added once per row index
 * Z         : prefix of the per-row extra offsets (Z##i)
 */
#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
1589
/** Converting row store for ten rows (rows 0-9).
 *
 * Expands CONVERT_STORE_ROW_9 for rows 0-8, then stores row 9: the value
 * CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)) is written through
 * VSTORE(N0) at (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9).
 *
 * The second parameter has to be spelled DATA_TYPE: both the forwarded call
 * and the row-9 store refer to that name, and with any other spelling they
 * would silently bind to a file-level DATA_TYPE definition (or fail to
 * compile when none exists) instead of the type supplied by the caller.
 *
 * N0        : vector width, used for both the conversion type and VSTORE
 * DATA_TYPE : element type of the converted value and of the destination
 * BASENAME  : prefix of the per-row source variables
 * PTR       : base address of the destination
 * STRIDE_Y  : offset added once per row index
 * Z         : prefix of the per-row extra offsets (Z##i)
 */
#define CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
1594
/** Converting row stores, rows 10-15.
 *
 * Each CONVERT_STORE_ROW_n first expands CONVERT_STORE_ROW_(n-1) and then
 * stores one more row. The row variable and its extra offset use the
 * hexadecimal suffixes A-F, while the stride multiplier is the matching
 * decimal index 10-15. The value written is
 *     CONVERT_SAT((BASENAME##<suffix>), VEC_DATA_TYPE(DATA_TYPE, N0))
 * and it goes through the routine selected by VSTORE(N0).
 */
#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
1624
1625
1626
1627
/** Store the first M0 rows of a block.
 *
 * STORE_BLOCK forwards to STORE_BLOCK_STR, which pastes M0 onto the
 * STORE_ROW_ prefix. The two-step form lets M0 be macro-expanded before the
 * paste takes place. The remaining arguments are passed to the selected
 * STORE_ROW_<M0> macro unchanged.
 */
#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

/** Store the first M0 rows of a block through the converting row stores.
 *
 * Same dispatch scheme as STORE_BLOCK, but the target is
 * CONVERT_STORE_ROW_<M0>.
 */
#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1635
1636
1637
/** Partial-width row stores.
 *
 * STORE_ROW_PARTIAL_n(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
 * expands to n store statements. Row i passes the variable BASENAME##i,
 * offset 0 and the address
 *     (__global DATA_TYPE *)(PTR + i * STRIDE_Y + Z##i)
 * to the routine selected by VSTORE_PARTIAL(N0, STORE_N0). Row suffixes are
 * hexadecimal digits (0-9, A-F); the stride multiplier is the matching
 * decimal index (0-15). Each macro expands its predecessor first.
 *
 * N0        : first argument of VSTORE_PARTIAL
 * STORE_N0  : second argument of VSTORE_PARTIAL
 * DATA_TYPE : element type used in the destination pointer cast
 * BASENAME  : prefix of the per-row source variables
 * PTR       : base address of the destination
 * STRIDE_Y  : offset added once per row index
 * Z         : prefix of the per-row extra offsets (Z##i)
 */
#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
1716
1717
1718
/** Store STORE_M0 rows through the partial-width row stores.
 *
 * STORE_BLOCK_PARTIAL forwards to STORE_BLOCK_PARTIAL_STR so that STORE_M0 is
 * macro-expanded before it is pasted onto the STORE_ROW_PARTIAL_ prefix.
 * Note the argument order changes on the way: the row macro receives
 * (N0, STORE_N0, ...) while this macro takes (STORE_M0, STORE_N0, N0, ...).
 */
#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1721
/** Store a block that may be partial in both x and y.
 *
 * Expands to an if / else-if chain over the two run-time conditions:
 *   neither condition set : M0 rows,               N0 elements per row
 *   only PARTIAL_COND_Y   : PARTIAL_STORE_M0 rows, N0 elements per row
 *   only PARTIAL_COND_X   : M0 rows,               PARTIAL_STORE_N0 elements per row
 *   both conditions set   : PARTIAL_STORE_M0 rows, PARTIAL_STORE_N0 elements per row
 * Every branch goes through STORE_BLOCK_PARTIAL with N0 as the vector width.
 * The conditions appear more than once in the expansion, so they should be
 * free of side effects.
 */
#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    if(!((PARTIAL_COND_X) || (PARTIAL_COND_Y))) \
    { \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \
    { \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \
    { \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else \
    { \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }
1739
/** Store a block that may be partial in x only.
 *
 * When PARTIAL_COND_X holds, M0 rows of PARTIAL_STORE_N0 elements are stored;
 * otherwise M0 rows of N0 elements. Both branches use STORE_BLOCK_PARTIAL
 * with N0 as the vector width.
 */
#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
    if(PARTIAL_COND_X) \
    { \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else \
    { \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }
1749
/** Store a block that may be partial in y only.
 *
 * When PARTIAL_COND_Y holds, PARTIAL_STORE_M0 rows are stored; otherwise M0
 * rows. Rows are always N0 elements wide, and both branches use
 * STORE_BLOCK_PARTIAL with N0 as the vector width.
 */
#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
    if(PARTIAL_COND_Y) \
    { \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else \
    { \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }
1759
1760
/** Boundary-aware block store.
 *
 * STORE_BLOCK_BOUNDARY_AWARE is defined only when both PARTIAL_STORE_M0 and
 * PARTIAL_STORE_N0 are defined at build time. Their values select, during
 * preprocessing, which store variant the macro expands to:
 *   M0 == 0 and N0 == 0 : STORE_BLOCK (partial arguments are ignored)
 *   M0 >  0 and N0 == 0 : STORE_BLOCK_PARTIAL_IN_Y
 *   M0 == 0 and N0 >  0 : STORE_BLOCK_PARTIAL_IN_X
 *   anything else       : STORE_BLOCK_PARTIAL_IN_X_AND_Y
 * (M0 / N0 above stand for PARTIAL_STORE_M0 / PARTIAL_STORE_N0.)
 * All four variants share one parameter list so call sites do not change.
 */
#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)

#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, \
                                   PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, \
                                   PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)

#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, \
                                   PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)

#else /* every remaining combination of PARTIAL_STORE_M0 / PARTIAL_STORE_N0 */

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, \
                                   PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, \
                                   PARTIAL_COND_Y, PARTIAL_COND_X)

#endif /* PARTIAL_STORE_M0 / PARTIAL_STORE_N0 value selection */

#endif /* defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) */
1787
1788
/** Compute the first row handled by the work-item with y index @p y.
 *
 * When PARTIAL_STORE_M0 is defined at build time the result is
 *     max(0, y * M0 - ((M0 - PARTIAL_STORE_M0) % M0))
 * converted to uint; otherwise it is simply y * M0 converted to uint, and the
 * third argument is ignored.
 * NOTE(review): the subtraction presumably shifts every block after the first
 * one back by the number of rows missing from a partial block - confirm
 * against the kernels that call this macro.
 *
 * Every parameter is wrapped in parentheses inside the expansion so that
 * compound arguments such as a + 1 keep their intended grouping.
 *
 * y                : y index of the work-item
 * M0               : number of rows per block
 * PARTIAL_STORE_M0 : number of rows in the partial block
 */
#if defined(PARTIAL_STORE_M0)

#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(max(0, (int)((y) * (M0)) - (int)(((M0) - (PARTIAL_STORE_M0)) % (M0)))))
#else
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)((y) * (M0)))
#endif
1797
1798
1799
/** Store a single vector, either completely or only its leading elements.
 *
 * Thin wrapper around STORE_BLOCK_PARTIAL_IN_X with one row (M0 = 1), a row
 * stride of 0 and a Z prefix of 0. When @p cond holds, @p leftover elements
 * are stored; otherwise all @p vec_size elements.
 *
 * basename  : prefix of the source variable (row suffix 0 is appended)
 * data_type : element type used in the destination pointer cast
 * ptr       : destination address
 * vec_size  : full vector width
 * leftover  : number of elements stored when cond is true
 * cond      : run-time condition selecting the partial store
 */
#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)
1802
1803
/* Optional OpenCL extensions.
 * Each pragma is emitted only when the corresponding ARM_COMPUTE_* build
 * switch is defined AND the extension macro itself is defined. */

/* Half-precision floating point. */
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif /* ARM_COMPUTE_OPENCL_FP16_ENABLED && cl_khr_fp16 */

/* 8-bit integer dot product. */
#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
#endif /* ARM_COMPUTE_OPENCL_DOT8_ENABLED && cl_arm_integer_dot_product_int8 */

/* 8-bit integer dot product with accumulation. */
#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
#endif /* ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED && cl_arm_integer_dot_product_accumulate_int8 */

/* printf support, debug builds only. */
#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
#pragma OPENCL EXTENSION cl_arm_printf : enable
#endif /* ARM_COMPUTE_DEBUG_ENABLED && cl_arm_printf */
1819
/* Numeric identifiers for the GPU architecture families. */
#define GPU_ARCH_MIDGARD  0x100
#define GPU_ARCH_BIFROST  0x200
#define GPU_ARCH_VALHALL  0x300
1823
1824
/** Paste two tokens together. The arguments are pasted as written, without
 *  being macro-expanded first. */
#define CONCAT(lhs, rhs) lhs##rhs

/** Expand to the argument itself; the argument is macro-expanded on the way. */
#define EXPAND(token) token

/** Limit @p value to the range [lo, hi] using the min and max routines:
 *  the result is min(max(value, lo), hi). */
#define CLAMP(value, lo, hi) min(max(value, lo), hi)
1832
1833
/** Element reversal for vectors of 1, 2, 3, 4, 8 and 16 elements.
 *
 * REVn(v) selects the components of v in descending index order, so element 0
 * of the result is the last element of v. REV1 is the identity.
 */
#define REV1(v)  ((v))
#define REV2(v)  ((v).s10)
#define REV3(v)  ((v).s210)
#define REV4(v)  ((v).s3210)
#define REV8(v)  ((v).s76543210)
#define REV16(v) ((v).sFEDCBA9876543210)

/** Reverse vector @p x of size @p s.
 *
 * REVERSE forwards to REVERSE_STR so that s is macro-expanded before it is
 * pasted onto the REV prefix. Supported sizes are those listed above.
 */
#define REVERSE_STR(x, s) REV##s((x))
#define REVERSE(x, s) REVERSE_STR(x, s)
1845
1846
1847
/** Element rotation for vectors of 1, 2, 3, 4, 8 and 16 elements.
 *
 * ROT<s>_<n>(v) rotates the s-element vector v by n positions towards the
 * higher indices: element i of v ends up at index (i + n) modulo s. Rotating
 * by 0 or by s returns v unchanged.
 */

/* 1 element */
#define ROT1_0(v) ((v))
#define ROT1_1(v) ((v))

/* 2 elements */
#define ROT2_0(v) ((v))
#define ROT2_1(v) ((v).s10)
#define ROT2_2(v) ((v))

/* 3 elements */
#define ROT3_0(v) ((v))
#define ROT3_1(v) ((v).s201)
#define ROT3_2(v) ((v).s120)
#define ROT3_3(v) ((v))

/* 4 elements */
#define ROT4_0(v) ((v))
#define ROT4_1(v) ((v).s3012)
#define ROT4_2(v) ((v).s2301)
#define ROT4_3(v) ((v).s1230)
#define ROT4_4(v) ((v))

/* 8 elements */
#define ROT8_0(v) ((v))
#define ROT8_1(v) ((v).s70123456)
#define ROT8_2(v) ((v).s67012345)
#define ROT8_3(v) ((v).s56701234)
#define ROT8_4(v) ((v).s45670123)
#define ROT8_5(v) ((v).s34567012)
#define ROT8_6(v) ((v).s23456701)
#define ROT8_7(v) ((v).s12345670)
#define ROT8_8(v) ((v))

/* 16 elements */
#define ROT16_0(v)  ((v))
#define ROT16_1(v)  ((v).sF0123456789ABCDE)
#define ROT16_2(v)  ((v).sEF0123456789ABCD)
#define ROT16_3(v)  ((v).sDEF0123456789ABC)
#define ROT16_4(v)  ((v).sCDEF0123456789AB)
#define ROT16_5(v)  ((v).sBCDEF0123456789A)
#define ROT16_6(v)  ((v).sABCDEF0123456789)
#define ROT16_7(v)  ((v).s9ABCDEF012345678)
#define ROT16_8(v)  ((v).s89ABCDEF01234567)
#define ROT16_9(v)  ((v).s789ABCDEF0123456)
#define ROT16_10(v) ((v).s6789ABCDEF012345)
#define ROT16_11(v) ((v).s56789ABCDEF01234)
#define ROT16_12(v) ((v).s456789ABCDEF0123)
#define ROT16_13(v) ((v).s3456789ABCDEF012)
#define ROT16_14(v) ((v).s23456789ABCDEF01)
#define ROT16_15(v) ((v).s123456789ABCDEF0)
#define ROT16_16(v) ((v))

/** Rotate vector @p x of size @p s by @p n positions.
 *
 * ROTATE forwards to ROTATE_STR so that s and n are macro-expanded before
 * they are pasted into the ROT<s>_<n> name.
 */
#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
1898
1899
1900
/** Index vectors.
 *
 * V_OFFS<s>(dt) builds a value of type dt<s> whose elements are the
 * consecutive integers 0, 1, ..., s-1.
 */
#define V_OFFS1(dt)  (dt##1)(0)
#define V_OFFS2(dt)  (dt##2)(0, 1)
#define V_OFFS3(dt)  (dt##3)(0, 1, 2)
#define V_OFFS4(dt)  (dt##4)(0, 1, 2, 3)
#define V_OFFS8(dt)  (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)

/** Build the index vector of element type @p dt and size @p s.
 *
 * VEC_OFFS forwards to VEC_OFFS_STR so that s is macro-expanded before it is
 * pasted onto the V_OFFS prefix.
 */
#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
1912
1913
/** Name of the vector load routine for @p size elements (vload<size>).
 *  The _STR step lets size be macro-expanded before the paste. */
#define VLOAD_STR(size) vload##size
#define VLOAD(size) VLOAD_STR(size)

/** Name of the partial load entry for a vector of @p size elements of which
 *  @p load_size are loaded (vload_partial_<size>_<load_size>).
 *  The _STR step lets both arguments be macro-expanded before the paste. */
#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size
#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size)

/** Placeholder for unsupported size combinations: expands to an empty block
 *  and discards its three arguments. */
#define NO_LOAD(data, offs, ptr) {}
1924
1925
/* Partial-load table for 1-element vectors.
 * vload_partial_1_<k> is the entry used when k elements are requested; only
 * k == 1 maps to a real load, every other k maps to NO_LOAD.
 * NOTE(review): the entries for wider vectors map to vload_partial_<k>
 * routines, whereas this one maps to vload1 - confirm that vload1 accepts the
 * same argument list as those routines. */
#define vload_partial_1_0   NO_LOAD
#define vload_partial_1_1   vload1
#define vload_partial_1_2   NO_LOAD
#define vload_partial_1_3   NO_LOAD
#define vload_partial_1_4   NO_LOAD
#define vload_partial_1_5   NO_LOAD
#define vload_partial_1_6   NO_LOAD
#define vload_partial_1_7   NO_LOAD
#define vload_partial_1_8   NO_LOAD
#define vload_partial_1_9   NO_LOAD
#define vload_partial_1_10  NO_LOAD
#define vload_partial_1_11  NO_LOAD
#define vload_partial_1_12  NO_LOAD
#define vload_partial_1_13  NO_LOAD
#define vload_partial_1_14  NO_LOAD
#define vload_partial_1_15  NO_LOAD
#define vload_partial_1_16  NO_LOAD
1943
/* Partial-load table for 2-element vectors.
 * vload_partial_2_<k> is the entry used when k elements are requested:
 * k = 1..2 map to vload_partial_<k>, every other k maps to NO_LOAD. */
#define vload_partial_2_0   NO_LOAD
#define vload_partial_2_1   vload_partial_1
#define vload_partial_2_2   vload_partial_2
#define vload_partial_2_3   NO_LOAD
#define vload_partial_2_4   NO_LOAD
#define vload_partial_2_5   NO_LOAD
#define vload_partial_2_6   NO_LOAD
#define vload_partial_2_7   NO_LOAD
#define vload_partial_2_8   NO_LOAD
#define vload_partial_2_9   NO_LOAD
#define vload_partial_2_10  NO_LOAD
#define vload_partial_2_11  NO_LOAD
#define vload_partial_2_12  NO_LOAD
#define vload_partial_2_13  NO_LOAD
#define vload_partial_2_14  NO_LOAD
#define vload_partial_2_15  NO_LOAD
#define vload_partial_2_16  NO_LOAD
1961
1962#define vload_partial_3_0 NO_LOAD
1963#define vload_partial_3_1 vload_partial_1
1964#define vload_partial_3_2 vload_partial_2
1965#define vload_partial_3_3 vload_partial_3
1966#define vload_partial_3_4 NO_LOAD
1967#define vload_partial_3_5 NO_LOAD
1968#define vload_partial_3_6 NO_LOAD
1969#define vload_partial_3_7 NO_LOAD
1970#define vload_partial_3_8 NO_LOAD
1971#define vload_partial_3_9 NO_LOAD
1972#define vload_partial_3_10 NO_LOAD
1973#define vload_partial_3_11 NO_LOAD
1974#define vload_partial_3_12 NO_LOAD
1975#define vload_partial_3_13 NO_LOAD
1976#define vload_partial_3_14 NO_LOAD
1977#define vload_partial_3_15 NO_LOAD
1978#define vload_partial_3_16 NO_LOAD
1979
1980#define vload_partial_4_0 NO_LOAD
1981#define vload_partial_4_1 vload_partial_1
1982#define vload_partial_4_2 vload_partial_2
1983#define vload_partial_4_3 vload_partial_3
1984#define vload_partial_4_4 vload_partial_4
1985#define vload_partial_4_5 NO_LOAD
1986#define vload_partial_4_6 NO_LOAD
1987#define vload_partial_4_7 NO_LOAD
1988#define vload_partial_4_8 NO_LOAD
1989#define vload_partial_4_9 NO_LOAD
1990#define vload_partial_4_10 NO_LOAD
1991#define vload_partial_4_11 NO_LOAD
1992#define vload_partial_4_12 NO_LOAD
1993#define vload_partial_4_13 NO_LOAD
1994#define vload_partial_4_14 NO_LOAD
1995#define vload_partial_4_15 NO_LOAD
1996#define vload_partial_4_16 NO_LOAD
1997
1998#define vload_partial_8_0 NO_LOAD
1999#define vload_partial_8_1 vload_partial_1
2000#define vload_partial_8_2 vload_partial_2
2001#define vload_partial_8_3 vload_partial_3
2002#define vload_partial_8_4 vload_partial_4
2003#define vload_partial_8_5 vload_partial_5
2004#define vload_partial_8_6 vload_partial_6
2005#define vload_partial_8_7 vload_partial_7
2006#define vload_partial_8_8 vload_partial_8
2007#define vload_partial_8_9 NO_LOAD
2008#define vload_partial_8_10 NO_LOAD
2009#define vload_partial_8_11 NO_LOAD
2010#define vload_partial_8_12 NO_LOAD
2011#define vload_partial_8_13 NO_LOAD
2012#define vload_partial_8_14 NO_LOAD
2013#define vload_partial_8_15 NO_LOAD
2014#define vload_partial_8_16 NO_LOAD
2015
2016#define vload_partial_16_0 NO_LOAD
2017#define vload_partial_16_1 vload_partial_1
2018#define vload_partial_16_2 vload_partial_2
2019#define vload_partial_16_3 vload_partial_3
2020#define vload_partial_16_4 vload_partial_4
2021#define vload_partial_16_5 vload_partial_5
2022#define vload_partial_16_6 vload_partial_6
2023#define vload_partial_16_7 vload_partial_7
2024#define vload_partial_16_8 vload_partial_8
2025#define vload_partial_16_9 vload_partial_9
2026#define vload_partial_16_10 vload_partial_10
2027#define vload_partial_16_11 vload_partial_11
2028#define vload_partial_16_12 vload_partial_12
2029#define vload_partial_16_13 vload_partial_13
2030#define vload_partial_16_14 vload_partial_14
2031#define vload_partial_16_15 vload_partial_15
2032#define vload_partial_16_16 vload_partial_16
2033
2034
/* vload_partial_<n>(DATA, OFFSET, PTR): load n elements from PTR into the
 * first n lanes of DATA. Each macro expands to one or more complete
 * statements (the trailing semicolons are part of the expansion). Counts that
 * are not a native vector width are assembled from smaller loads, with the
 * source pointer advanced by the number of lanes already handled. */
#define vload_partial_1(DST, OFFS, SRC) DST.s0 = vload1(OFFS, SRC);

#define vload_partial_2(DST, OFFS, SRC) DST.s01 = vload2(OFFS, SRC);

#define vload_partial_3(DST, OFFS, SRC) DST.s012 = vload3(OFFS, SRC);

#define vload_partial_4(DST, OFFS, SRC) DST.s0123 = vload4(OFFS, SRC);

/* 5..7 lanes: the first four, then the remainder starting at SRC + 4. */
#define vload_partial_5(DST, OFFS, SRC)    \
    vload_partial_4(DST.s0123, OFFS, SRC); \
    DST.s4 = vload1(OFFS, SRC + 4);

#define vload_partial_6(DST, OFFS, SRC)    \
    vload_partial_4(DST.s0123, OFFS, SRC); \
    vload_partial_2(DST.s45, OFFS, SRC + 4);

#define vload_partial_7(DST, OFFS, SRC)    \
    vload_partial_4(DST.s0123, OFFS, SRC); \
    vload_partial_3(DST.s456, OFFS, SRC + 4);

#define vload_partial_8(DST, OFFS, SRC) DST.s01234567 = vload8(OFFS, SRC);

/* 9..15 lanes: the first eight, then the remainder starting at SRC + 8. */
#define vload_partial_9(DST, OFFS, SRC)        \
    vload_partial_8(DST.s01234567, OFFS, SRC); \
    DST.s8 = vload1(OFFS, SRC + 8);

#define vload_partial_10(DST, OFFS, SRC)       \
    vload_partial_8(DST.s01234567, OFFS, SRC); \
    vload_partial_2(DST.s89, OFFS, SRC + 8);

#define vload_partial_11(DST, OFFS, SRC)       \
    vload_partial_8(DST.s01234567, OFFS, SRC); \
    vload_partial_3(DST.s89A, OFFS, SRC + 8);

#define vload_partial_12(DST, OFFS, SRC)       \
    vload_partial_8(DST.s01234567, OFFS, SRC); \
    vload_partial_4(DST.s89AB, OFFS, SRC + 8);

#define vload_partial_13(DST, OFFS, SRC)       \
    vload_partial_8(DST.s01234567, OFFS, SRC); \
    vload_partial_5(DST.s89ABCDEF, OFFS, SRC + 8);

#define vload_partial_14(DST, OFFS, SRC)       \
    vload_partial_8(DST.s01234567, OFFS, SRC); \
    vload_partial_6(DST.s89ABCDEF, OFFS, SRC + 8);

#define vload_partial_15(DST, OFFS, SRC)       \
    vload_partial_8(DST.s01234567, OFFS, SRC); \
    vload_partial_7(DST.s89ABCDEF, OFFS, SRC + 8);

#define vload_partial_16(DST, OFFS, SRC) DST = vload16(OFFS, SRC);
2092
2093
2094
/* Pixel-unit count per vector size; each table value equals vec_size / 4. */
#define PIXEL_UNIT4 1
#define PIXEL_UNIT8 2
#define PIXEL_UNIT16 4

/* CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(n) looks up PIXEL_UNIT<n>; the _STR
 * level expands a macro argument before the paste. */
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(lanes) PIXEL_UNIT##lanes
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(lanes) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(lanes)
2102
2103
/* read_image2d_<type>x<n>(img, x_coord, y_coord): read n horizontally
 * adjacent pixels starting at (x_coord, y_coord) and concatenate them into
 * one 4*n-lane vector.
 * The coordinate arguments are parenthesised so that expressions such as
 * "x << 1" or "c ? a : b" keep their meaning when "+ k" is appended.
 * NOTE: the expansions keep their historical trailing semicolon, so these
 * macros can only be used as the right-hand side of a full statement. */
#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)((x_coord), (y_coord))));
#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)((x_coord), (y_coord))), read_imagef(img, (int2)((x_coord) + 1, (y_coord))));
#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)((x_coord), (y_coord))), read_imagef(img, (int2)((x_coord) + 1, (y_coord))), read_imagef(img, (int2)((x_coord) + 2, (y_coord))), read_imagef(img, (int2)((x_coord) + 3, (y_coord))));

/* Half-precision variants are only available when fp16 support is enabled. */
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)((x_coord), (y_coord))));
#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)((x_coord), (y_coord))), read_imageh(img, (int2)((x_coord) + 1, (y_coord))));
#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)((x_coord), (y_coord))), read_imageh(img, (int2)((x_coord) + 1, (y_coord))), read_imageh(img, (int2)((x_coord) + 2, (y_coord))), read_imageh(img, (int2)((x_coord) + 3, (y_coord))));
#endif
2113
/* write_image2d_<type>x<n>(img, x_coord, y_coord, values): write the 4*n
 * lanes of values as n horizontally adjacent pixels starting at
 * (x_coord, y_coord).
 * Coordinates and values are parenthesised so that compound expressions
 * ("x << 1", "a + b") are offset and swizzled as a whole.
 * NOTE: the expansions keep their historical trailing semicolon. */
#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)((x_coord), (y_coord)), (values)));
#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)((x_coord), (y_coord)), (values).s0123), write_imagef(img, (int2)((x_coord) + 1, (y_coord)), (values).s4567));
#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)((x_coord), (y_coord)), (values).s0123), write_imagef(img, (int2)((x_coord) + 1, (y_coord)), (values).s4567), write_imagef(img, (int2)((x_coord) + 2, (y_coord)), (values).s89AB), write_imagef(img, (int2)((x_coord) + 3, (y_coord)), (values).sCDEF));

/* Half-precision variants are only available when fp16 support is enabled. */
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)((x_coord), (y_coord)), (values)));
#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)((x_coord), (y_coord)), (values).s0123), write_imageh(img, (int2)((x_coord) + 1, (y_coord)), (values).s4567));
#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)((x_coord), (y_coord)), (values).s0123), write_imageh(img, (int2)((x_coord) + 1, (y_coord)), (values).s4567), write_imageh(img, (int2)((x_coord) + 2, (y_coord)), (values).s89AB), write_imageh(img, (int2)((x_coord) + 3, (y_coord)), (values).sCDEF));
#endif
2123
2124
/* READ_IMAGE2D(data_type, n0, img, x, y) dispatches to
 * read_image2d_<data_type>x<n0>(img, x, y). The _STR level lets data_type and
 * n0 be macros that are expanded before the name is pasted together. */
#define READ_IMAGE2D_STR(elem_type, pixels, img, x_coord, y_coord) read_image2d_##elem_type##x##pixels(img, x_coord, y_coord)
#define READ_IMAGE2D(elem_type, pixels, img, x_coord, y_coord) READ_IMAGE2D_STR(elem_type, pixels, img, x_coord, y_coord)

/* WRITE_IMAGE2D(data_type, n0, img, x, y, values) dispatches to
 * write_image2d_<data_type>x<n0>(img, x, y, values) in the same way. */
#define WRITE_IMAGE2D_STR(elem_type, pixels, img, x_coord, y_coord, values) write_image2d_##elem_type##x##pixels(img, x_coord, y_coord, values)
#define WRITE_IMAGE2D(elem_type, pixels, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(elem_type, pixels, img, x_coord, y_coord, values)
2131
/* VSTORE(size) names the vstore<size> routine; a macro argument is expanded
 * by the outer macro before the paste in VSTORE_STR. */
#define VSTORE_STR(lanes) vstore##lanes
#define VSTORE(lanes) VSTORE_STR(lanes)
2134
/* One-lane spellings of the scalar types, so that names produced by pasting
 * <type> and a vector size also resolve when the size is 1. */

/* floating point */
#define half1 half
#define float1 float
#define double1 double

/* signed integers */
#define char1 char
#define short1 short
#define int1 int
#define long1 long

/* unsigned integers */
#define uchar1 uchar
#define ushort1 ushort
#define uint1 uint
#define ulong1 ulong
2146
/* Scalar counterparts of vloadN/vstoreN: access the single element at
 * PTR + OFFSET. Arguments and the resulting expression are parenthesised so
 * that compound arguments (for example a conditional pointer expression) and
 * surrounding operators cannot re-associate with the pointer arithmetic. */
#define vload1(OFFSET, PTR) (*((OFFSET) + (PTR)))
#define vstore1(DATA, OFFSET, PTR) (*((OFFSET) + (PTR)) = (DATA))
2149
2150
/* VSTORE_PARTIAL(size, store_size) names the table entry
 * vstore_partial_<size>_<store_size>; macro arguments are expanded by the
 * outer macro before the paste in VSTORE_PARTIAL_STR. */
#define VSTORE_PARTIAL_STR(lanes, count) vstore_partial_##lanes##_##count
#define VSTORE_PARTIAL(lanes, count) VSTORE_PARTIAL_STR(lanes, count)

/* Table entry for combinations that store nothing: swallows its three
 * arguments and expands to an empty compound statement. */
#define NO_STORE(data, offs, ptr) { }
2157
2158
/* Dispatch table behind VSTORE_PARTIAL(size, store_size): the entry
 * vstore_partial_<size>_<store_size> names the macro that stores store_size
 * elements of a size-wide value. Every entry is invoked with the three
 * arguments (DATA, OFFSET, PTR). store_size == 0 and store_size > size map
 * to NO_STORE. */

/* size == 1 */
#define vstore_partial_1_0 NO_STORE
#define vstore_partial_1_1 vstore1
#define vstore_partial_1_2 NO_STORE
#define vstore_partial_1_3 NO_STORE
#define vstore_partial_1_4 NO_STORE
#define vstore_partial_1_5 NO_STORE
#define vstore_partial_1_6 NO_STORE
#define vstore_partial_1_7 NO_STORE
#define vstore_partial_1_8 NO_STORE
#define vstore_partial_1_9 NO_STORE
#define vstore_partial_1_10 NO_STORE
#define vstore_partial_1_11 NO_STORE
#define vstore_partial_1_12 NO_STORE
#define vstore_partial_1_13 NO_STORE
#define vstore_partial_1_14 NO_STORE
#define vstore_partial_1_15 NO_STORE
#define vstore_partial_1_16 NO_STORE

/* size == 2 */
#define vstore_partial_2_0 NO_STORE
#define vstore_partial_2_1 vstore_partial_1
#define vstore_partial_2_2 vstore_partial_2
#define vstore_partial_2_3 NO_STORE
#define vstore_partial_2_4 NO_STORE
#define vstore_partial_2_5 NO_STORE
#define vstore_partial_2_6 NO_STORE
#define vstore_partial_2_7 NO_STORE
#define vstore_partial_2_8 NO_STORE
#define vstore_partial_2_9 NO_STORE
#define vstore_partial_2_10 NO_STORE
#define vstore_partial_2_11 NO_STORE
#define vstore_partial_2_12 NO_STORE
#define vstore_partial_2_13 NO_STORE
#define vstore_partial_2_14 NO_STORE
#define vstore_partial_2_15 NO_STORE
#define vstore_partial_2_16 NO_STORE

/* size == 3 */
#define vstore_partial_3_0 NO_STORE
#define vstore_partial_3_1 vstore_partial_1
#define vstore_partial_3_2 vstore_partial_2
#define vstore_partial_3_3 vstore_partial_3
#define vstore_partial_3_4 NO_STORE
#define vstore_partial_3_5 NO_STORE
#define vstore_partial_3_6 NO_STORE
#define vstore_partial_3_7 NO_STORE
#define vstore_partial_3_8 NO_STORE
#define vstore_partial_3_9 NO_STORE
#define vstore_partial_3_10 NO_STORE
#define vstore_partial_3_11 NO_STORE
#define vstore_partial_3_12 NO_STORE
#define vstore_partial_3_13 NO_STORE
#define vstore_partial_3_14 NO_STORE
#define vstore_partial_3_15 NO_STORE
#define vstore_partial_3_16 NO_STORE

/* size == 4 */
#define vstore_partial_4_0 NO_STORE
#define vstore_partial_4_1 vstore_partial_1
#define vstore_partial_4_2 vstore_partial_2
#define vstore_partial_4_3 vstore_partial_3
#define vstore_partial_4_4 vstore_partial_4
#define vstore_partial_4_5 NO_STORE
#define vstore_partial_4_6 NO_STORE
#define vstore_partial_4_7 NO_STORE
#define vstore_partial_4_8 NO_STORE
#define vstore_partial_4_9 NO_STORE
#define vstore_partial_4_10 NO_STORE
#define vstore_partial_4_11 NO_STORE
#define vstore_partial_4_12 NO_STORE
#define vstore_partial_4_13 NO_STORE
#define vstore_partial_4_14 NO_STORE
#define vstore_partial_4_15 NO_STORE
#define vstore_partial_4_16 NO_STORE

/* size == 8 */
#define vstore_partial_8_0 NO_STORE
#define vstore_partial_8_1 vstore_partial_1
#define vstore_partial_8_2 vstore_partial_2
#define vstore_partial_8_3 vstore_partial_3
#define vstore_partial_8_4 vstore_partial_4
#define vstore_partial_8_5 vstore_partial_5
#define vstore_partial_8_6 vstore_partial_6
#define vstore_partial_8_7 vstore_partial_7
#define vstore_partial_8_8 vstore_partial_8
#define vstore_partial_8_9 NO_STORE
#define vstore_partial_8_10 NO_STORE
#define vstore_partial_8_11 NO_STORE
#define vstore_partial_8_12 NO_STORE
#define vstore_partial_8_13 NO_STORE
#define vstore_partial_8_14 NO_STORE
#define vstore_partial_8_15 NO_STORE
#define vstore_partial_8_16 NO_STORE

/* size == 16 */
#define vstore_partial_16_0 NO_STORE
#define vstore_partial_16_1 vstore_partial_1
#define vstore_partial_16_2 vstore_partial_2
#define vstore_partial_16_3 vstore_partial_3
#define vstore_partial_16_4 vstore_partial_4
#define vstore_partial_16_5 vstore_partial_5
#define vstore_partial_16_6 vstore_partial_6
#define vstore_partial_16_7 vstore_partial_7
#define vstore_partial_16_8 vstore_partial_8
#define vstore_partial_16_9 vstore_partial_9
#define vstore_partial_16_10 vstore_partial_10
#define vstore_partial_16_11 vstore_partial_11
#define vstore_partial_16_12 vstore_partial_12
#define vstore_partial_16_13 vstore_partial_13
#define vstore_partial_16_14 vstore_partial_14
#define vstore_partial_16_15 vstore_partial_15
#define vstore_partial_16_16 vstore_partial_16
2266
2267
/* vstore_partial_<n>(DATA, OFFSET, PTR): store the first n lanes of DATA to
 * PTR. Each macro expands to one or more complete statements (the trailing
 * semicolons are part of the expansion). Counts that are not a native vector
 * width are assembled from smaller stores, with the destination pointer
 * advanced by the number of lanes already handled. */
#define vstore_partial_1(SRC, OFFS, DST) vstore1(SRC.s0, OFFS, DST);

#define vstore_partial_2(SRC, OFFS, DST) vstore2(SRC.s01, OFFS, DST);

#define vstore_partial_3(SRC, OFFS, DST) vstore3(SRC.s012, OFFS, DST);

#define vstore_partial_4(SRC, OFFS, DST) vstore4(SRC.s0123, OFFS, DST);

/* 5..7 lanes: the first four, then the remainder starting at DST + 4. */
#define vstore_partial_5(SRC, OFFS, DST)    \
    vstore_partial_4(SRC.s0123, OFFS, DST); \
    vstore1(SRC.s4, OFFS, DST + 4);

#define vstore_partial_6(SRC, OFFS, DST)    \
    vstore_partial_4(SRC.s0123, OFFS, DST); \
    vstore_partial_2(SRC.s45, OFFS, DST + 4);

#define vstore_partial_7(SRC, OFFS, DST)    \
    vstore_partial_4(SRC.s0123, OFFS, DST); \
    vstore_partial_3(SRC.s456, OFFS, DST + 4);

#define vstore_partial_8(SRC, OFFS, DST) vstore8(SRC.s01234567, OFFS, DST);

/* 9..15 lanes: the first eight, then the remainder starting at DST + 8. */
#define vstore_partial_9(SRC, OFFS, DST)        \
    vstore_partial_8(SRC.s01234567, OFFS, DST); \
    vstore1(SRC.s8, OFFS, DST + 8);

#define vstore_partial_10(SRC, OFFS, DST)       \
    vstore_partial_8(SRC.s01234567, OFFS, DST); \
    vstore_partial_2(SRC.s89, OFFS, DST + 8);

#define vstore_partial_11(SRC, OFFS, DST)       \
    vstore_partial_8(SRC.s01234567, OFFS, DST); \
    vstore_partial_3(SRC.s89A, OFFS, DST + 8);

#define vstore_partial_12(SRC, OFFS, DST)       \
    vstore_partial_8(SRC.s01234567, OFFS, DST); \
    vstore_partial_4(SRC.s89AB, OFFS, DST + 8);

#define vstore_partial_13(SRC, OFFS, DST)       \
    vstore_partial_8(SRC.s01234567, OFFS, DST); \
    vstore_partial_5(SRC.s89ABCDEF, OFFS, DST + 8);

#define vstore_partial_14(SRC, OFFS, DST)       \
    vstore_partial_8(SRC.s01234567, OFFS, DST); \
    vstore_partial_6(SRC.s89ABCDEF, OFFS, DST + 8);

#define vstore_partial_15(SRC, OFFS, DST)       \
    vstore_partial_8(SRC.s01234567, OFFS, DST); \
    vstore_partial_7(SRC.s89ABCDEF, OFFS, DST + 8);

#define vstore_partial_16(SRC, OFFS, DST) vstore16(SRC, OFFS, DST);
2325
2326
2327
2328
2329
/* Saturating spellings for floating-point destinations alias the plain
 * conversion of the same type and width, so that CONVERT_SAT can be used
 * uniformly for integer and floating-point targets. */
#define convert_float_sat convert_float
#define convert_float1_sat convert_float
#define convert_float2_sat convert_float2
#define convert_float3_sat convert_float3
#define convert_float4_sat convert_float4
#define convert_float8_sat convert_float8
#define convert_float16_sat convert_float16

/* The scalar half alias targets convert_half, matching convert_half1_sat
 * and the vector variants (it previously resolved to convert_float). */
#define convert_half_sat convert_half
#define convert_half1_sat convert_half
#define convert_half2_sat convert_half2
#define convert_half3_sat convert_half3
#define convert_half4_sat convert_half4
#define convert_half8_sat convert_half8
#define convert_half16_sat convert_half16
2344
/* One-lane spellings of the conversion routines: convert_<type>1 resolves to
 * the scalar convert_<type>. */

/* floating point */
#define convert_half1 convert_half
#define convert_float1 convert_float
#define convert_double1 convert_double

/* signed integers */
#define convert_char1 convert_char
#define convert_short1 convert_short
#define convert_int1 convert_int
#define convert_long1 convert_long

/* unsigned integers */
#define convert_uchar1 convert_uchar
#define convert_ushort1 convert_ushort
#define convert_uint1 convert_uint
#define convert_ulong1 convert_ulong
2356
/* One-lane spellings of the saturating conversions: convert_<type>1_sat
 * resolves to the scalar convert_<type>_sat. */
#define convert_char1_sat convert_char_sat
#define convert_uchar1_sat convert_uchar_sat
/* The following five definitions expand to themselves and are therefore
 * no-ops for expansion; they are kept so the names stay defined. */
#define convert_uchar2_sat convert_uchar2_sat
#define convert_uchar3_sat convert_uchar3_sat
#define convert_uchar4_sat convert_uchar4_sat
#define convert_uchar8_sat convert_uchar8_sat
#define convert_uchar16_sat convert_uchar16_sat
#define convert_short1_sat convert_short_sat
#define convert_ushort1_sat convert_ushort_sat
#define convert_int1_sat convert_int_sat
#define convert_uint1_sat convert_uint_sat
#define convert_long1_sat convert_long_sat
#define convert_ulong1_sat convert_ulong_sat
/* The _sat modifier is not available for floating-point destinations, so the
 * double spellings alias the plain conversion, in the same way the float and
 * half spellings do. */
#define convert_double_sat convert_double
#define convert_double1_sat convert_double
2371
/* VEC_DATA_TYPE(type, size) pastes the two into the type name <type><size>.
 * As with every X / X_STR pair below, the outer macro exists so that macro
 * arguments are expanded before the paste. */
#define VEC_DATA_TYPE_STR(elem_type, lanes) elem_type##lanes
#define VEC_DATA_TYPE(elem_type, lanes) VEC_DATA_TYPE_STR(elem_type, lanes)

/* CONVERT(x, type) -> convert_<type>(x) */
#define CONVERT_STR(value, dst_type) (convert_##dst_type((value)))
#define CONVERT(value, dst_type) CONVERT_STR(value, dst_type)

/* CONVERT_SAT(x, type) -> convert_<type>_sat(x) */
#define CONVERT_SAT_STR(value, dst_type) (convert_##dst_type##_sat((value)))
#define CONVERT_SAT(value, dst_type) CONVERT_SAT_STR(value, dst_type)

/* CONVERT_SAT_ROUND(x, type, round) -> convert_<type>_sat_<round>(x) */
#define CONVERT_SAT_ROUND_STR(value, dst_type, mode) (convert_##dst_type##_sat_##mode((value)))
#define CONVERT_SAT_ROUND(value, dst_type, mode) CONVERT_SAT_ROUND_STR(value, dst_type, mode)
2383
/* select_vec_dt_<type>(size): type mapped to <type><size> by this table.
 * Integer types map to themselves, half maps to short and float maps to int.
 * elu_op casts the result of isgreaterequal to this type before passing it
 * to select. */

/* 8-bit */
#define select_vec_dt_uchar(lanes) uchar##lanes
#define select_vec_dt_char(lanes) char##lanes
/* 16-bit */
#define select_vec_dt_ushort(lanes) ushort##lanes
#define select_vec_dt_short(lanes) short##lanes
#define select_vec_dt_half(lanes) short##lanes
/* 32-bit */
#define select_vec_dt_uint(lanes) uint##lanes
#define select_vec_dt_int(lanes) int##lanes
#define select_vec_dt_float(lanes) int##lanes
/* 64-bit */
#define select_vec_dt_ulong(lanes) ulong##lanes
#define select_vec_dt_long(lanes) long##lanes

/* SELECT_VEC_DATA_TYPE(type, size) looks up the table above;
 * SELECT_DATA_TYPE(type) is the one-lane case. */
#define SELECT_VEC_DATA_TYPE_STR(elem_type, lanes) select_vec_dt_##elem_type(lanes)
#define SELECT_VEC_DATA_TYPE(elem_type, lanes) SELECT_VEC_DATA_TYPE_STR(elem_type, lanes)
#define SELECT_DATA_TYPE(elem_type) SELECT_VEC_DATA_TYPE_STR(elem_type, 1)
2398
/* signed_int_vec_dt_<type>(size): type mapped to <type><size> by this table.
 * Unsigned types map to their signed counterpart, half maps to short and
 * float maps to int. */

/* 8-bit */
#define signed_int_vec_dt_uchar(lanes) char##lanes
#define signed_int_vec_dt_char(lanes) char##lanes
/* 16-bit */
#define signed_int_vec_dt_ushort(lanes) short##lanes
#define signed_int_vec_dt_short(lanes) short##lanes
#define signed_int_vec_dt_half(lanes) short##lanes
/* 32-bit */
#define signed_int_vec_dt_uint(lanes) int##lanes
#define signed_int_vec_dt_int(lanes) int##lanes
#define signed_int_vec_dt_float(lanes) int##lanes
/* 64-bit */
#define signed_int_vec_dt_ulong(lanes) long##lanes
#define signed_int_vec_dt_long(lanes) long##lanes

/* SIGNED_INT_VEC_DATA_TYPE(type, size) looks up the table above;
 * SIGNED_INT_DATA_TYPE(type) is the one-lane case. */
#define SIGNED_INT_VEC_DATA_TYPE_STR(elem_type, lanes) signed_int_vec_dt_##elem_type(lanes)
#define SIGNED_INT_VEC_DATA_TYPE(elem_type, lanes) SIGNED_INT_VEC_DATA_TYPE_STR(elem_type, lanes)
#define SIGNED_INT_DATA_TYPE(elem_type) SIGNED_INT_VEC_DATA_TYPE_STR(elem_type, 1)
2413
/* sum_reduce_<n>(x): sum of the n lanes of x, added strictly left to right
 * (lane 0 first). Each expansion is wrapped in parentheses so the result can
 * be combined with neighbouring operators, e.g. SUM_REDUCE(v, 4) * k.
 * NOTE: x is evaluated once per lane, so it must be free of side effects. */
#define sum_reduce_1(x) (x)
#define sum_reduce_2(x) ((x).s0 + (x).s1)
#define sum_reduce_3(x) ((x).s0 + (x).s1 + (x).s2)
#define sum_reduce_4(x) ((x).s0 + (x).s1 + (x).s2 + (x).s3)
#define sum_reduce_8(x) ((x).s0 + (x).s1 + (x).s2 + (x).s3 + (x).s4 + (x).s5 + (x).s6 + (x).s7)
#define sum_reduce_16(x) ((x).s0 + (x).s1 + (x).s2 + (x).s3 + (x).s4 + (x).s5 + (x).s6 + (x).s7 + (x).s8 + (x).s9 + (x).sA + (x).sB + (x).sC + (x).sD + (x).sE + (x).sF)

/* SUM_REDUCE(x, size) selects sum_reduce_<size>; the _STR level expands a
 * macro passed as size before the paste. */
#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x)
#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)
2423
/* prod_reduce_<n>(x): product of the n lanes of x, multiplied strictly left
 * to right (lane 0 first). Each expansion is wrapped in parentheses so the
 * result can be combined with neighbouring operators, e.g.
 * a / PROD_REDUCE(v, 2).
 * NOTE: x is evaluated once per lane, so it must be free of side effects. */
#define prod_reduce_1(x) (x)
#define prod_reduce_2(x) ((x).s0 * (x).s1)
#define prod_reduce_3(x) ((x).s0 * (x).s1 * (x).s2)
#define prod_reduce_4(x) ((x).s0 * (x).s1 * (x).s2 * (x).s3)
#define prod_reduce_8(x) ((x).s0 * (x).s1 * (x).s2 * (x).s3 * (x).s4 * (x).s5 * (x).s6 * (x).s7)
#define prod_reduce_16(x) ((x).s0 * (x).s1 * (x).s2 * (x).s3 * (x).s4 * (x).s5 * (x).s6 * (x).s7 * (x).s8 * (x).s9 * (x).sA * (x).sB * (x).sC * (x).sD * (x).sE * (x).sF)

/* PROD_REDUCE(x, size) selects prod_reduce_<size>; the _STR level expands a
 * macro passed as size before the paste. */
#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x)
#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)
2433
/* max_reduce_<n>(x): maximum over the n lanes of x, built as a tree of max
 * calls over the lower and upper halves of the vector.
 * NOTE: x is evaluated once per lane, so it must be free of side effects. */
#define max_reduce_1(vec) (vec)
#define max_reduce_2(vec) max(((vec).s0), ((vec).s1))
#define max_reduce_3(vec) max(max_reduce_2((vec).s01), ((vec).s2))
#define max_reduce_4(vec) max(max_reduce_2((vec).s01), max_reduce_2((vec).s23))
#define max_reduce_8(vec) max(max_reduce_4((vec).s0123), max_reduce_4((vec).s4567))
#define max_reduce_16(vec) max(max_reduce_8((vec).s01234567), max_reduce_8((vec).s89ABCDEF))

/* MAX_REDUCE(x, size) selects max_reduce_<size>; the _STR level expands a
 * macro passed as size before the paste. */
#define MAX_REDUCE_STR(vec, lanes) max_reduce_##lanes(vec)
#define MAX_REDUCE(vec, lanes) MAX_REDUCE_STR(vec, lanes)
2443
/* Kernel-parameter list fragments. <KIND>_DECLARATION(name) expands to the
 * comma-separated parameters describing one buffer: the base pointer, a
 * stride/step pair per dimension, and the offset of the first element.
 * All parameter names are prefixed with name. */

/* 1D: x */
#define VECTOR_DECLARATION(name)              \
    __global uchar *name##_ptr,               \
    uint name##_stride_x, uint name##_step_x, \
    uint name##_offset_first_element_in_bytes

/* 2D: x, y */
#define IMAGE_DECLARATION(name)               \
    __global uchar *name##_ptr,               \
    uint name##_stride_x, uint name##_step_x, \
    uint name##_stride_y, uint name##_step_y, \
    uint name##_offset_first_element_in_bytes

/* 3D: x, y, z */
#define TENSOR3D_DECLARATION(name)            \
    __global uchar *name##_ptr,               \
    uint name##_stride_x, uint name##_step_x, \
    uint name##_stride_y, uint name##_step_y, \
    uint name##_stride_z, uint name##_step_z, \
    uint name##_offset_first_element_in_bytes

/* 4D: x, y, z, w */
#define TENSOR4D_DECLARATION(name)            \
    __global uchar *name##_ptr,               \
    uint name##_stride_x, uint name##_step_x, \
    uint name##_stride_y, uint name##_step_y, \
    uint name##_stride_z, uint name##_step_z, \
    uint name##_stride_w, uint name##_step_w, \
    uint name##_offset_first_element_in_bytes

/* 5D: x, y, z, w, v */
#define TENSOR5D_DECLARATION(name)            \
    __global uchar *name##_ptr,               \
    uint name##_stride_x, uint name##_step_x, \
    uint name##_stride_y, uint name##_step_y, \
    uint name##_stride_z, uint name##_step_z, \
    uint name##_stride_w, uint name##_step_w, \
    uint name##_stride_v, uint name##_step_v, \
    uint name##_offset_first_element_in_bytes
2493
/* Build a Vector / Image from the kernel parameters declared with
 * VECTOR_DECLARATION(name) / IMAGE_DECLARATION(name). The _NO_STEP variants
 * pass 0 for every step, so the returned pointer is only advanced by the
 * first-element offset. */
#define CONVERT_TO_VECTOR_STRUCT(name)                                            \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \
                               name##_stride_x, name##_step_x)

#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name)                                    \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \
                               name##_stride_x, 0)

#define CONVERT_TO_IMAGE_STRUCT(name)                                            \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \
                              name##_stride_x, name##_step_x,                   \
                              name##_stride_y, name##_step_y)

#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name)                                    \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \
                              name##_stride_x, 0,                               \
                              name##_stride_y, 0)
2505
/* Build an Image from the kernel parameters declared with
 * TENSOR3D_DECLARATION(name). The macro used to be defined twice with
 * identical text; a single definition is kept. */
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)

/* Same, but with the x and y steps forced to 0. The z step is still
 * forwarded, so the pointer keeps advancing with get_global_id(2). */
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
2514
/* Build a Tensor3D / Tensor4D from the kernel parameters declared with
 * TENSOR3D_DECLARATION(name) / TENSOR4D_DECLARATION(name).
 *  - plain variants forward every stride/step pair;
 *  - _NO_STEP variants pass 0 for every step;
 *  - _NO_UPDATE_PTR forwards the parameters to tensor3D_ptr_no_update.
 * The 4D variants take an extra mod_size argument that is forwarded to
 * update_tensor4D_workitem_ptr. */
#define CONVERT_TO_TENSOR3D_STRUCT(name)                                            \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \
                                 name##_stride_x, name##_step_x,                   \
                                 name##_stride_y, name##_step_y,                   \
                                 name##_stride_z, name##_step_z)

#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name)                                    \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \
                                 name##_stride_x, 0,                               \
                                 name##_stride_y, 0,                               \
                                 name##_stride_z, 0)

#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                  \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \
                                 name##_stride_x, name##_step_x,                   \
                                 name##_stride_y, name##_step_y,                   \
                                 name##_stride_z, name##_step_z,                   \
                                 name##_stride_w, name##_step_w, mod_size)

#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size)                          \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \
                                 name##_stride_x, 0,                               \
                                 name##_stride_y, 0,                               \
                                 name##_stride_z, 0,                               \
                                 name##_stride_w, 0, mod_size)

#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name)                        \
    tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, \
                           name##_stride_x, name##_step_x,                   \
                           name##_stride_y, name##_step_y,                   \
                           name##_stride_z, name##_step_z)
2532
2533
2534typedef struct Vector
2535{
2536    __global uchar *ptr;
2537    int             offset_first_element_in_bytes;
2538    int             stride_x;
2539} Vector;
2540
2541
2542typedef struct Image
2543{
2544    __global uchar *ptr;
2545    int             offset_first_element_in_bytes;
2546    int             stride_x;
2547    int             stride_y;
2548} Image;
2549
2550
2551typedef struct Tensor3D
2552{
2553    __global uchar *ptr;
2554    int             offset_first_element_in_bytes;
2555    int             stride_x;
2556    int             stride_y;
2557    int             stride_z;
2558} Tensor3D;
2559
2560
2561typedef struct Tensor4D
2562{
2563    __global uchar *ptr;
2564    int             offset_first_element_in_bytes;
2565    int             stride_x;
2566    int             stride_y;
2567    int             stride_z;
2568    int             stride_w;
2569} Tensor4D;
2570
2571
2572inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
2573{
2574    Vector vector =
2575    {
2576        .ptr                           = ptr,
2577        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2578        .stride_x                      = stride_x,
2579    };
2580    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
2581    return vector;
2582}
2583
2584
2585inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
2586{
2587    Image img =
2588    {
2589        .ptr                           = ptr,
2590        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2591        .stride_x                      = stride_x,
2592        .stride_y                      = stride_y
2593    };
2594    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
2595    return img;
2596}
2597
2598
2599inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
2600{
2601    Image img =
2602    {
2603        .ptr                           = ptr,
2604        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2605        .stride_x                      = stride_x,
2606        .stride_y                      = stride_y
2607    };
2608    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
2609    return img;
2610}
2611
2612
2613inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
2614{
2615    Tensor3D tensor =
2616    {
2617        .ptr                           = ptr,
2618        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2619        .stride_x                      = stride_x,
2620        .stride_y                      = stride_y,
2621        .stride_z                      = stride_z
2622    };
2623    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
2624    return tensor;
2625}
2626
2627
2628inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
2629{
2630    Tensor3D tensor =
2631    {
2632        .ptr                           = ptr,
2633        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2634        .stride_x                      = stride_x,
2635        .stride_y                      = stride_y,
2636        .stride_z                      = stride_z
2637    };
2638    return tensor;
2639}
2640
2641inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
2642                                             uint step_w,
2643                                             uint mod_size)
2644{
2645    Tensor4D tensor =
2646    {
2647        .ptr                           = ptr,
2648        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2649        .stride_x                      = stride_x,
2650        .stride_y                      = stride_y,
2651        .stride_z                      = stride_z,
2652        .stride_w                      = stride_w
2653    };
2654
2655    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
2656    return tensor;
2657}
2658
2659
2660inline __global const uchar *vector_offset(const Vector *vec, int x)
2661{
2662    return vec->ptr + x * vec->stride_x;
2663}
2664
2665
2666inline __global uchar *offset(const Image *img, int x, int y)
2667{
2668    return img->ptr + x * img->stride_x + y * img->stride_y;
2669}
2670
2671
2672inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
2673{
2674    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
2675}
2676
2677
/** Compute the address of element (@p x, @p y, @p z, @p w) of a 4D tensor.
 *
 * @param[in] tensor Tensor4D structure providing .ptr and the four strides
 * @param[in] x      Coordinate, multiplied by tensor->stride_x
 * @param[in] y      Coordinate, multiplied by tensor->stride_y
 * @param[in] z      Coordinate, multiplied by tensor->stride_z
 * @param[in] w      Coordinate, multiplied by tensor->stride_w
 *
 * @return tensor->ptr advanced by the four coordinate/stride products
 */
inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
{
    __global const uchar *element = tensor->ptr;

    element += x * tensor->stride_x;
    element += y * tensor->stride_y;
    element += z * tensor->stride_z;
    element += w * tensor->stride_w;

    return element;
}
2682
2683
/** Convert a linear element index into a pointer inside a 3D tensor.
 *
 * The index is decomposed as index = z * (width * height) + y * width + x and
 * each coordinate is then scaled by the matching stride of @p tensor.
 *
 * @param[in] tensor Tensor3D structure providing .ptr, the strides and .offset_first_element_in_bytes
 * @param[in] width  Extent used to split the index into x and y
 * @param[in] height Extent used, together with width, to split off z
 * @param[in] depth  Unused
 * @param[in] index  Linear index to convert
 *
 * @return tensor->ptr advanced by the coordinate/stride products and by
 *         tensor->offset_first_element_in_bytes
 */
inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
{
    const uint plane_size = width * height;

    const uint z        = index / plane_size;
    const uint in_plane = index % plane_size;
    const uint y        = in_plane / width;
    const uint x        = in_plane % width;

    __global const uchar *element = tensor->ptr;

    element += x * tensor->stride_x;
    element += y * tensor->stride_y;
    element += z * tensor->stride_z;
    element += tensor->offset_first_element_in_bytes;

    return element;
}
2700
2701#endif
2702
/** Multiply-accumulate: MLA(a, b, c) evaluates to a + b * c.
 *
 * When GPU_ARCH equals GPU_ARCH_BIFROST the fma() built-in is used; otherwise
 * the expression is spelled out as a multiplication followed by an addition.
 */
#if GPU_ARCH == GPU_ARCH_BIFROST
#define MLA(addend, lhs, rhs) (fma(rhs, lhs, addend))
#else
#define MLA(addend, lhs, rhs) ((lhs) * (rhs) + (addend))
#endif
2708
2709
/** Activation function macros.
 *
 * Every <name>_op macro takes the same five arguments so that ACTIVATION() can
 * dispatch to any of them by name:
 *   DATA_TYPE - type the numeric constants, A_VAL and B_VAL are cast to
 *   VEC_SIZE  - only read by elu_op (passed to SELECT_VEC_DATA_TYPE)
 *   x         - input value
 *   A_VAL     - first parameter; ignored by the ops that do not mention it
 *   B_VAL     - second parameter; ignored by the ops that do not mention it
 *
 * The input x is parenthesised at every use so that a compound argument such
 * as `a + b` expands correctly in `x * x`, `exp(-x)`, `x / ...` and similar.
 */

/** Hard swish: x * clamp(x + 3, 0, 6) * 0.166666667 */
#define hard_swish_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((x) * ((min(max(((x) + (DATA_TYPE)3.0), (DATA_TYPE)0.0), (DATA_TYPE)6.0)) * (DATA_TYPE)0.166666667))

/** Logistic: 1 / (1 + exp(-x)) */
#define logistic_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((DATA_TYPE)1.0 / ((DATA_TYPE)1.0 + exp(-(x))))

/** Scaled hyperbolic tangent: A_VAL * tanh(B_VAL * x) */
#define tanh_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((DATA_TYPE)A_VAL * tanh((DATA_TYPE)B_VAL * (x)))

/** Rectified linear unit: max(0, x) */
#define relu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (max((DATA_TYPE)0.0, (x)))

/** Bounded ReLU: min(A_VAL, max(0, x)) */
#define brelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (min((DATA_TYPE)A_VAL, max((DATA_TYPE)0.0, (x))))

/** Lower/upper bounded ReLU: min(max(x, B_VAL), A_VAL) */
#define lu_brelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (min(max((x), (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL))

/** Leaky ReLU: min(x, 0) * A_VAL + max(x, 0) */
#define lrelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((min((x), (DATA_TYPE)0.0) * (DATA_TYPE)A_VAL) + max((x), (DATA_TYPE)0.0))

/** Soft ReLU: log(1 + exp(x)) */
#define srelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (log((DATA_TYPE)1.0 + exp((x))))

/** ELU: select() between A_VAL * (exp(x) - 1) and x, driven by isgreaterequal(x, 0) */
#define elu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (select(((DATA_TYPE)A_VAL * (exp((x)) - (DATA_TYPE)1.0)), (x), (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))isgreaterequal((x), (DATA_TYPE)0.0)))

/** Absolute value: fabs(x) */
#define abs_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (fabs((x)))

/** Square: x * x */
#define square_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((x) * (x))

/** Square root: sqrt(x) */
#define sqrt_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (sqrt((x)))

/** Linear: B_VAL + A_VAL * x, evaluated through MLA() */
#define linear_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (MLA((DATA_TYPE)B_VAL, (DATA_TYPE)A_VAL, (x)))

/** GELU: x * 0.5 * (1 + erf(x / 1.41421356237)) */
#define gelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((x) * (DATA_TYPE)0.5 * ((DATA_TYPE)1.0 + erf((x) / (DATA_TYPE)1.41421356237)))

/** Identity: x */
#define identity_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x)

/** Paste `op` in front of `_op` and invoke the resulting macro. */
#define ACT_OP(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) op##_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL)

/** Entry point: goes through one extra macro level so that `op` is fully
 *  expanded before ACT_OP pastes it.
 */
#define ACTIVATION(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ACT_OP(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL)
2757
2758#ifndef ARM_COMPUTE_HELPER_H
2759#define ARM_COMPUTE_HELPER_H
2760
2761
2762
2763
/** STORE_ROW_n: store the n rows BASENAME0 .. BASENAME(n-1).
 *
 * Each macro first expands its predecessor and then stores one more row with
 * VSTORE(N0). Row i is written to
 *     (__global DATA_TYPE *)(PTR + i * STRIDE_Y + Z<i>)
 * where the row and Z suffixes are pasted with ##. Rows 10 to 15 use the
 * suffixes A to F.
 */
#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)(BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)(BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)(BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)(BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)(BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)(BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)(BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)(BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
    VSTORE(N0)(BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)(BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)(BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)(BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)(BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)(BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)(BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
2842
2843
2844
/** CONVERT_STORE_ROW_n: convert and store the n rows BASENAME0 .. BASENAME(n-1).
 *
 * Each macro first expands its predecessor and then handles one more row. Row i
 * is passed through CONVERT_SAT(row, VEC_DATA_TYPE(DATA_TYPE, N0)) and the
 * result is stored with VSTORE(N0) to
 *     (__global DATA_TYPE *)(PTR + i * STRIDE_Y + Z<i>)
 * Rows 10 to 15 use the suffixes A to F.
 *
 * Every macro in the family names its second parameter DATA_TYPE. That
 * includes CONVERT_STORE_ROW_10, whose parameter was previously spelled DATA
 * while its body referred to DATA_TYPE, so the caller's type was ignored.
 */
#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
2923
2924
2925
2926
/** Store a block of M0 rows by dispatching to STORE_ROW_<M0>.
 *
 * STORE_BLOCK forwards to STORE_BLOCK_STR so that M0 is macro-expanded before
 * it is pasted onto STORE_ROW_.
 */
#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

/** Convert and store a block of M0 rows by dispatching to CONVERT_STORE_ROW_<M0>.
 *
 * CONVERT_STORE_BLOCK forwards to CONVERT_STORE_BLOCK_STR so that M0 is
 * macro-expanded before it is pasted onto CONVERT_STORE_ROW_.
 */
#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
2934
2935
2936
/** STORE_ROW_PARTIAL_n: partially store the n rows BASENAME0 .. BASENAME(n-1).
 *
 * Each macro first expands its predecessor and then stores one more row with
 * VSTORE_PARTIAL(N0, STORE_N0). Row i is written to
 *     (__global DATA_TYPE *)(PTR + i * STRIDE_Y + Z<i>)
 * Rows 10 to 15 use the suffixes A to F.
 */
#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
3015
3016
3017
/** Partially store a block by dispatching to STORE_ROW_PARTIAL_<STORE_M0>.
 *
 * Note the argument order: STORE_BLOCK_PARTIAL takes (STORE_M0, STORE_N0, N0, ...)
 * while the row macro is invoked with (N0, STORE_N0, ...).
 * STORE_BLOCK_PARTIAL forwards to STORE_BLOCK_PARTIAL_STR so that STORE_M0 is
 * macro-expanded before it is pasted onto STORE_ROW_PARTIAL_.
 */
#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
3020
/** Store a block that may be partial in both dimensions.
 *
 * The block shape passed to STORE_BLOCK_PARTIAL depends on the two conditions:
 *   neither set         -> (M0,               N0)
 *   only PARTIAL_COND_Y -> (PARTIAL_STORE_M0, N0)
 *   only PARTIAL_COND_X -> (M0,               PARTIAL_STORE_N0)
 *   both set            -> (PARTIAL_STORE_M0, PARTIAL_STORE_N0)
 *
 * The expansion is a bare if / else-if chain, not wrapped in do { } while(0).
 */
#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    if(!((PARTIAL_COND_X) || (PARTIAL_COND_Y)))                                                                                                           \
    {                                                                                                                                                     \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                                           \
    }                                                                                                                                                     \
    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X))                                                                                                        \
    {                                                                                                                                                     \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
    }                                                                                                                                                     \
    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X))                                                                                                        \
    {                                                                                                                                                     \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
    }                                                                                                                                                     \
    else                                                                                                                                                  \
    {                                                                                                                                                     \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                               \
    }

/** Store a block that may be partial in x only.
 *
 * When PARTIAL_COND_X holds, PARTIAL_STORE_N0 is passed as the store width;
 * otherwise the full N0 is passed. All M0 rows are stored in both cases.
 */
#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
    if((PARTIAL_COND_X))                                                                                          \
    {                                                                                                             \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
    }                                                                                                             \
    else                                                                                                          \
    {                                                                                                             \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
    }

/** Store a block that may be partial in y only.
 *
 * When PARTIAL_COND_Y holds, PARTIAL_STORE_M0 rows are stored; otherwise all
 * M0 rows are stored. The full width N0 is passed in both cases.
 */
#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
    if((PARTIAL_COND_Y))                                                                                          \
    {                                                                                                             \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
    }                                                                                                             \
    else                                                                                                          \
    {                                                                                                             \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
    }
3058
3059
/** STORE_BLOCK_BOUNDARY_AWARE: pick the cheapest store variant at preprocessing time.
 *
 * Defined only when both PARTIAL_STORE_M0 and PARTIAL_STORE_N0 are defined. Their
 * values select the implementation:
 *   M0 == 0 and N0 == 0 -> STORE_BLOCK (no boundary handling)
 *   M0 >  0 and N0 == 0 -> STORE_BLOCK_PARTIAL_IN_Y
 *   M0 == 0 and N0 >  0 -> STORE_BLOCK_PARTIAL_IN_X
 *   anything else       -> STORE_BLOCK_PARTIAL_IN_X_AND_Y
 * The argument list is the same for every variant; arguments the selected
 * implementation does not need are dropped.
 */
#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)

#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)

#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)

#else

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)

#endif

#endif
3086
3087
/** Compute the first row handled by block index @p y.
 *
 * Without PARTIAL_STORE_M0 defined the result is simply y * M0.
 * With it defined, the start row is moved back by (M0 - PARTIAL_STORE_M0) % M0
 * and clamped at 0 with max().
 *
 * Every macro argument is parenthesised so that compound expressions
 * (for example `a + b` as @p y) keep their intended grouping.
 *
 * @param[in] y                Block index
 * @param[in] M0               Number of rows per block
 * @param[in] PARTIAL_STORE_M0 Row count of the partial block (ignored when the
 *                             build does not define PARTIAL_STORE_M0)
 */
#if defined(PARTIAL_STORE_M0)

#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(max(0, (int)((y) * (M0)) - (int)(((M0) - (PARTIAL_STORE_M0)) % (M0)))))
#else
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)((y) * (M0)))
#endif
3096
3097
3098
/** Store a single vector, either completely or only its first @p leftover elements.
 *
 * Forwards to STORE_BLOCK_PARTIAL_IN_X with one row, a row stride of 0 and a Z
 * base name of 0. Because the row macros paste a row index onto the base name,
 * the vector that gets stored is the variable called <basename>0.
 *
 * @param[in] basename  Base name of the vector variable (the suffix 0 is appended)
 * @param[in] data_type Element type used for the destination pointer cast
 * @param[in] ptr       Destination address
 * @param[in] vec_size  Full vector width
 * @param[in] leftover  Width to store when @p cond holds
 * @param[in] cond      Condition selecting the partial store
 */
#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)
3101
3102
/* Optional OpenCL extensions.
 * Each extension is enabled only when the matching ARM_COMPUTE_* switch is
 * defined AND the extension's own feature macro is defined as well.
 */

/* cl_khr_fp16 */
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

/* cl_arm_integer_dot_product_int8 */
#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
#endif

/* cl_arm_integer_dot_product_accumulate_int8 */
#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
#endif

/* cl_arm_printf */
#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
#pragma OPENCL EXTENSION cl_arm_printf : enable
#endif
3118
/* Architecture identifiers intended for comparison against GPU_ARCH
 * (for example `#if GPU_ARCH == GPU_ARCH_BIFROST`).
 */
#define GPU_ARCH_MIDGARD 0x100
#define GPU_ARCH_BIFROST 0x200
#define GPU_ARCH_VALHALL 0x300
3122
3123
/** Paste two tokens together. The arguments are pasted as written; they are
 *  not macro-expanded first.
 */
#define CONCAT(lhs, rhs) lhs##rhs

/** Evaluate to the argument unchanged (forces one extra round of macro expansion). */
#define EXPAND(token) token

/** Limit @p value to the range [lower, upper] using max() followed by min(). */
#define CLAMP(value, lower, upper) min(max(value, lower), upper)
3131
3132
/** REV<s>(vec): the s-element vector @p vec with its elements in reverse order,
 *  expressed as a component swizzle. REV1 returns its argument unchanged.
 */
#define REV1(vec) ((vec))
#define REV2(vec) ((vec).s10)
#define REV3(vec) ((vec).s210)
#define REV4(vec) ((vec).s3210)
#define REV8(vec) ((vec).s76543210)
#define REV16(vec) ((vec).sFEDCBA9876543210)

/** REVERSE(x, s): reverse the s-element vector @p x.
 *
 * REVERSE forwards to REVERSE_STR so that @p s is macro-expanded before it is
 * pasted onto REV.
 */
#define REVERSE_STR(x, s) REV##s((x))
#define REVERSE(x, s) REVERSE_STR(x, s)
3144
3145
3146
/** ROT<s>_<n>(vec): the s-element vector @p vec rotated by n positions,
 *  expressed as a component swizzle.
 *
 * Element i of the result is element (i - n) mod s of the input, so ROT4_1
 * yields (vec.s3, vec.s0, vec.s1, vec.s2). n == 0 and n == s return the
 * argument unchanged.
 */

/* s = 1 */
#define ROT1_0(vec) ((vec))
#define ROT1_1(vec) ((vec))

/* s = 2 */
#define ROT2_0(vec) ((vec))
#define ROT2_1(vec) ((vec).s10)
#define ROT2_2(vec) ((vec))

/* s = 3 */
#define ROT3_0(vec) ((vec))
#define ROT3_1(vec) ((vec).s201)
#define ROT3_2(vec) ((vec).s120)
#define ROT3_3(vec) ((vec))

/* s = 4 */
#define ROT4_0(vec) ((vec))
#define ROT4_1(vec) ((vec).s3012)
#define ROT4_2(vec) ((vec).s2301)
#define ROT4_3(vec) ((vec).s1230)
#define ROT4_4(vec) ((vec))

/* s = 8 */
#define ROT8_0(vec) ((vec))
#define ROT8_1(vec) ((vec).s70123456)
#define ROT8_2(vec) ((vec).s67012345)
#define ROT8_3(vec) ((vec).s56701234)
#define ROT8_4(vec) ((vec).s45670123)
#define ROT8_5(vec) ((vec).s34567012)
#define ROT8_6(vec) ((vec).s23456701)
#define ROT8_7(vec) ((vec).s12345670)
#define ROT8_8(vec) ((vec))

/* s = 16 */
#define ROT16_0(vec) ((vec))
#define ROT16_1(vec) ((vec).sF0123456789ABCDE)
#define ROT16_2(vec) ((vec).sEF0123456789ABCD)
#define ROT16_3(vec) ((vec).sDEF0123456789ABC)
#define ROT16_4(vec) ((vec).sCDEF0123456789AB)
#define ROT16_5(vec) ((vec).sBCDEF0123456789A)
#define ROT16_6(vec) ((vec).sABCDEF0123456789)
#define ROT16_7(vec) ((vec).s9ABCDEF012345678)
#define ROT16_8(vec) ((vec).s89ABCDEF01234567)
#define ROT16_9(vec) ((vec).s789ABCDEF0123456)
#define ROT16_10(vec) ((vec).s6789ABCDEF012345)
#define ROT16_11(vec) ((vec).s56789ABCDEF01234)
#define ROT16_12(vec) ((vec).s456789ABCDEF0123)
#define ROT16_13(vec) ((vec).s3456789ABCDEF012)
#define ROT16_14(vec) ((vec).s23456789ABCDEF01)
#define ROT16_15(vec) ((vec).s123456789ABCDEF0)
#define ROT16_16(vec) ((vec))

/** ROTATE(x, s, n): rotate the s-element vector @p x by @p n positions.
 *
 * ROTATE forwards to ROTATE_STR so that @p s and @p n are macro-expanded before
 * they are pasted into the ROT<s>_<n> name.
 */
#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
3197
3198
3199
/** V_OFFS<s>(type): a vector literal of the type `<type><s>` whose elements are
 *  the ascending values 0, 1, ..., s - 1.
 */
#define V_OFFS1(type) (type##1)(0)
#define V_OFFS2(type) (type##2)(0, 1)
#define V_OFFS3(type) (type##3)(0, 1, 2)
#define V_OFFS4(type) (type##4)(0, 1, 2, 3)
#define V_OFFS8(type) (type##8)(0, 1, 2, 3, 4, 5, 6, 7)
#define V_OFFS16(type) (type##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)

/** VEC_OFFS(dt, s): the V_OFFS<s> literal for element type @p dt.
 *
 * VEC_OFFS forwards to VEC_OFFS_STR so that @p s is macro-expanded before it is
 * pasted onto V_OFFS.
 */
#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
3211
3212
/** VLOAD(size): the name `vload<size>`.
 *
 * VLOAD forwards to VLOAD_STR so that @p size is macro-expanded before pasting.
 */
#define VLOAD_STR(width) vload##width
#define VLOAD(width) VLOAD_STR(width)

/** VLOAD_PARTIAL(size, load_size): the name `vload_partial_<size>_<load_size>`.
 *
 * VLOAD_PARTIAL forwards to VLOAD_PARTIAL_STR so that both arguments are
 * macro-expanded before pasting.
 */
#define VLOAD_PARTIAL_STR(width, load_width) vload_partial_##width##_##load_width
#define VLOAD_PARTIAL(width, load_width) VLOAD_PARTIAL_STR(width, load_width)
3219
/** Placeholder used for load sizes that load nothing: expands to an empty block. */
#define NO_LOAD(data, offs, ptr) \
    {                            \
    }

/** Dispatch table for VLOAD_PARTIAL(size, load_size).
 *
 * vload_partial_<size>_<load_size> maps to:
 *   - NO_LOAD                     when load_size is 0 or larger than size
 *   - vload_partial_<load_size>   when 1 <= load_size <= size
 * The only exception is vload_partial_1_1, which maps to vload1 directly.
 *
 * NOTE(review): vload_partial_1 invokes vload1 with two arguments
 * (OFFSET, PTR), whereas every other entry of this table resolves to a
 * three-argument macro (DATA, OFFSET, PTR). Confirm that callers of the
 * size == 1 case pass the argument list vload1 expects.
 */

/* size == 1 */
#define vload_partial_1_0  NO_LOAD
#define vload_partial_1_1  vload1
#define vload_partial_1_2  NO_LOAD
#define vload_partial_1_3  NO_LOAD
#define vload_partial_1_4  NO_LOAD
#define vload_partial_1_5  NO_LOAD
#define vload_partial_1_6  NO_LOAD
#define vload_partial_1_7  NO_LOAD
#define vload_partial_1_8  NO_LOAD
#define vload_partial_1_9  NO_LOAD
#define vload_partial_1_10 NO_LOAD
#define vload_partial_1_11 NO_LOAD
#define vload_partial_1_12 NO_LOAD
#define vload_partial_1_13 NO_LOAD
#define vload_partial_1_14 NO_LOAD
#define vload_partial_1_15 NO_LOAD
#define vload_partial_1_16 NO_LOAD

/* size == 2 */
#define vload_partial_2_0  NO_LOAD
#define vload_partial_2_1  vload_partial_1
#define vload_partial_2_2  vload_partial_2
#define vload_partial_2_3  NO_LOAD
#define vload_partial_2_4  NO_LOAD
#define vload_partial_2_5  NO_LOAD
#define vload_partial_2_6  NO_LOAD
#define vload_partial_2_7  NO_LOAD
#define vload_partial_2_8  NO_LOAD
#define vload_partial_2_9  NO_LOAD
#define vload_partial_2_10 NO_LOAD
#define vload_partial_2_11 NO_LOAD
#define vload_partial_2_12 NO_LOAD
#define vload_partial_2_13 NO_LOAD
#define vload_partial_2_14 NO_LOAD
#define vload_partial_2_15 NO_LOAD
#define vload_partial_2_16 NO_LOAD

/* size == 3 */
#define vload_partial_3_0  NO_LOAD
#define vload_partial_3_1  vload_partial_1
#define vload_partial_3_2  vload_partial_2
#define vload_partial_3_3  vload_partial_3
#define vload_partial_3_4  NO_LOAD
#define vload_partial_3_5  NO_LOAD
#define vload_partial_3_6  NO_LOAD
#define vload_partial_3_7  NO_LOAD
#define vload_partial_3_8  NO_LOAD
#define vload_partial_3_9  NO_LOAD
#define vload_partial_3_10 NO_LOAD
#define vload_partial_3_11 NO_LOAD
#define vload_partial_3_12 NO_LOAD
#define vload_partial_3_13 NO_LOAD
#define vload_partial_3_14 NO_LOAD
#define vload_partial_3_15 NO_LOAD
#define vload_partial_3_16 NO_LOAD

/* size == 4 */
#define vload_partial_4_0  NO_LOAD
#define vload_partial_4_1  vload_partial_1
#define vload_partial_4_2  vload_partial_2
#define vload_partial_4_3  vload_partial_3
#define vload_partial_4_4  vload_partial_4
#define vload_partial_4_5  NO_LOAD
#define vload_partial_4_6  NO_LOAD
#define vload_partial_4_7  NO_LOAD
#define vload_partial_4_8  NO_LOAD
#define vload_partial_4_9  NO_LOAD
#define vload_partial_4_10 NO_LOAD
#define vload_partial_4_11 NO_LOAD
#define vload_partial_4_12 NO_LOAD
#define vload_partial_4_13 NO_LOAD
#define vload_partial_4_14 NO_LOAD
#define vload_partial_4_15 NO_LOAD
#define vload_partial_4_16 NO_LOAD

/* size == 8 */
#define vload_partial_8_0  NO_LOAD
#define vload_partial_8_1  vload_partial_1
#define vload_partial_8_2  vload_partial_2
#define vload_partial_8_3  vload_partial_3
#define vload_partial_8_4  vload_partial_4
#define vload_partial_8_5  vload_partial_5
#define vload_partial_8_6  vload_partial_6
#define vload_partial_8_7  vload_partial_7
#define vload_partial_8_8  vload_partial_8
#define vload_partial_8_9  NO_LOAD
#define vload_partial_8_10 NO_LOAD
#define vload_partial_8_11 NO_LOAD
#define vload_partial_8_12 NO_LOAD
#define vload_partial_8_13 NO_LOAD
#define vload_partial_8_14 NO_LOAD
#define vload_partial_8_15 NO_LOAD
#define vload_partial_8_16 NO_LOAD

/* size == 16 */
#define vload_partial_16_0  NO_LOAD
#define vload_partial_16_1  vload_partial_1
#define vload_partial_16_2  vload_partial_2
#define vload_partial_16_3  vload_partial_3
#define vload_partial_16_4  vload_partial_4
#define vload_partial_16_5  vload_partial_5
#define vload_partial_16_6  vload_partial_6
#define vload_partial_16_7  vload_partial_7
#define vload_partial_16_8  vload_partial_8
#define vload_partial_16_9  vload_partial_9
#define vload_partial_16_10 vload_partial_10
#define vload_partial_16_11 vload_partial_11
#define vload_partial_16_12 vload_partial_12
#define vload_partial_16_13 vload_partial_13
#define vload_partial_16_14 vload_partial_14
#define vload_partial_16_15 vload_partial_15
#define vload_partial_16_16 vload_partial_16
3332
3333
/* Partial vector loads: vload_partial_<k>(DATA, OFFSET, PTR) assigns the first
 * k lanes of DATA (s0 upwards) from PTR; no other lane is written.
 *
 * k = 1, 2, 3, 4, 8 and 16 use a single vload<k>. Every other k is assembled
 * from a 4- or 8-lane load followed by a load of the remainder at PTR + 4 or
 * PTR + 8.
 *
 * Each macro expands to one or more complete statements (the trailing
 * semicolon is part of the expansion), so it cannot be used as an expression
 * or as the unbraced body of an if/else.
 *
 * NOTE(review): vload1 adds OFFSET as a plain element offset, whereas the
 * vload<n> built-ins are expected to scale OFFSET by n, so the composed
 * variants only look consistent for OFFSET == 0 - confirm against callers.
 */
#define vload_partial_1(DATA, OFFSET, PTR) DATA.s0 = vload1(OFFSET, PTR);

#define vload_partial_2(DATA, OFFSET, PTR) DATA.s01 = vload2(OFFSET, PTR);

#define vload_partial_3(DATA, OFFSET, PTR) DATA.s012 = vload3(OFFSET, PTR);

#define vload_partial_4(DATA, OFFSET, PTR) DATA.s0123 = vload4(OFFSET, PTR);

/* 5..7 lanes: four lanes first, then the remaining 1..3 lanes from PTR + 4. */
#define vload_partial_5(DATA, OFFSET, PTR)      \
    vload_partial_4(DATA.s0123, OFFSET, PTR);   \
    DATA.s4 = vload1(OFFSET, PTR + 4);

#define vload_partial_6(DATA, OFFSET, PTR)      \
    vload_partial_4(DATA.s0123, OFFSET, PTR);   \
    vload_partial_2(DATA.s45, OFFSET, PTR + 4);

#define vload_partial_7(DATA, OFFSET, PTR)      \
    vload_partial_4(DATA.s0123, OFFSET, PTR);   \
    vload_partial_3(DATA.s456, OFFSET, PTR + 4);

#define vload_partial_8(DATA, OFFSET, PTR) DATA.s01234567 = vload8(OFFSET, PTR);

/* 9..15 lanes: eight lanes first, then the remaining 1..7 lanes from PTR + 8. */
#define vload_partial_9(DATA, OFFSET, PTR)          \
    vload_partial_8(DATA.s01234567, OFFSET, PTR);   \
    DATA.s8 = vload1(OFFSET, PTR + 8);

#define vload_partial_10(DATA, OFFSET, PTR)         \
    vload_partial_8(DATA.s01234567, OFFSET, PTR);   \
    vload_partial_2(DATA.s89, OFFSET, PTR + 8);

#define vload_partial_11(DATA, OFFSET, PTR)         \
    vload_partial_8(DATA.s01234567, OFFSET, PTR);   \
    vload_partial_3(DATA.s89A, OFFSET, PTR + 8);

#define vload_partial_12(DATA, OFFSET, PTR)         \
    vload_partial_8(DATA.s01234567, OFFSET, PTR);   \
    vload_partial_4(DATA.s89AB, OFFSET, PTR + 8);

/* For 13..15 the upper half is handed over whole; the nested macro only
 * touches the leading 5, 6 or 7 lanes of it. */
#define vload_partial_13(DATA, OFFSET, PTR)         \
    vload_partial_8(DATA.s01234567, OFFSET, PTR);   \
    vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vload_partial_14(DATA, OFFSET, PTR)         \
    vload_partial_8(DATA.s01234567, OFFSET, PTR);   \
    vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vload_partial_15(DATA, OFFSET, PTR)         \
    vload_partial_8(DATA.s01234567, OFFSET, PTR);   \
    vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vload_partial_16(DATA, OFFSET, PTR) DATA = vload16(OFFSET, PTR);
3391
3392
3393
/* Pixel count associated with a vector width: 4 lanes -> 1, 8 lanes -> 2,
 * 16 lanes -> 4 (one pixel per four lanes). */
#define PIXEL_UNIT4  1
#define PIXEL_UNIT8  2
#define PIXEL_UNIT16 4

/* CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) yields PIXEL_UNIT<vec_size>.
 * The extra _STR level lets vec_size itself be a macro: it is expanded before
 * the token paste happens. Only 4, 8 and 16 have a PIXEL_UNIT definition. */
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size)     CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)
3401
3402
/* read_image2d_<type>x<n>(img, x_coord, y_coord)
 *
 * Reads n horizontally adjacent pixels (x_coord .. x_coord + n - 1 at the same
 * y_coord) with read_imagef / read_imageh and concatenates them into one
 * vector of 4 * n lanes.
 *
 * The expansion ends with a semicolon, so these macros form a complete
 * statement tail and cannot be nested inside a larger expression.
 */
#define read_image2d_floatx1(img, x_coord, y_coord) \
    (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
#define read_image2d_floatx2(img, x_coord, y_coord)                \
    (float8)(read_imagef(img, (int2)(x_coord, y_coord)),           \
             read_imagef(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_floatx4(img, x_coord, y_coord)                \
    (float16)(read_imagef(img, (int2)(x_coord, y_coord)),          \
              read_imagef(img, (int2)(x_coord + 1, y_coord)),      \
              read_imagef(img, (int2)(x_coord + 2, y_coord)),      \
              read_imagef(img, (int2)(x_coord + 3, y_coord)));

/* Half-precision readers, only defined when fp16 support is enabled. */
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define read_image2d_halfx1(img, x_coord, y_coord) \
    (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
#define read_image2d_halfx2(img, x_coord, y_coord)                 \
    (half8)(read_imageh(img, (int2)(x_coord, y_coord)),            \
            read_imageh(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_halfx4(img, x_coord, y_coord)                 \
    (half16)(read_imageh(img, (int2)(x_coord, y_coord)),           \
             read_imageh(img, (int2)(x_coord + 1, y_coord)),       \
             read_imageh(img, (int2)(x_coord + 2, y_coord)),       \
             read_imageh(img, (int2)(x_coord + 3, y_coord)));
#endif

/* write_image2d_<type>x<n>(img, x_coord, y_coord, values)
 *
 * Writes the 4 * n lanes of values as n horizontally adjacent pixels starting
 * at (x_coord, y_coord), four lanes per pixel in lane order. The individual
 * writes are chained with the comma operator; the expansion again ends with a
 * semicolon.
 */
#define write_image2d_floatx1(img, x_coord, y_coord, values) \
    (write_imagef(img, (int2)(x_coord, y_coord), values));
#define write_image2d_floatx2(img, x_coord, y_coord, values)            \
    (write_imagef(img, (int2)(x_coord, y_coord), values.s0123),         \
     write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567));
#define write_image2d_floatx4(img, x_coord, y_coord, values)            \
    (write_imagef(img, (int2)(x_coord, y_coord), values.s0123),         \
     write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567),     \
     write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB),     \
     write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF));

/* Half-precision writers, only defined when fp16 support is enabled. */
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define write_image2d_halfx1(img, x_coord, y_coord, values) \
    (write_imageh(img, (int2)(x_coord, y_coord), values));
#define write_image2d_halfx2(img, x_coord, y_coord, values)             \
    (write_imageh(img, (int2)(x_coord, y_coord), values.s0123),         \
     write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567));
#define write_image2d_halfx4(img, x_coord, y_coord, values)             \
    (write_imageh(img, (int2)(x_coord, y_coord), values.s0123),         \
     write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567),     \
     write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB),     \
     write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
#endif

/* READ_IMAGE2D(data_type, n0, ...) resolves to read_image2d_<data_type>x<n0>.
 * The _STR indirection expands data_type and n0 before they are pasted. */
#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) \
    read_image2d_##data_type##x##n0(img, x_coord, y_coord)
#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) \
    READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)

/* WRITE_IMAGE2D(data_type, n0, ...) resolves to write_image2d_<data_type>x<n0>. */
#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) \
    write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) \
    WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)
3430
/* VSTORE(size) names the store routine vstore<size>. The _STR level expands
 * size first, so VSTORE(N0) works when N0 is itself a macro. */
#define VSTORE_STR(size) vstore##size
#define VSTORE(size)     VSTORE_STR(size)
3433
/* One-lane spellings of the scalar types, so that a pasted <type><size> name
 * also resolves when size is 1. */
#define float1  float
#define half1   half
#define char1   char
#define uchar1  uchar
#define short1  short
#define ushort1 ushort
#define int1    int
#define uint1   uint
#define long1   long
#define ulong1  ulong
#define double1 double
3445
/* Scalar counterparts of the vload<n> / vstore<n> routines.
 *
 * vload1(OFFSET, PTR)        reads the element at PTR + OFFSET.
 * vstore1(DATA, OFFSET, PTR) writes DATA to the element at PTR + OFFSET.
 *
 * Every argument is parenthesized so that an argument containing a low
 * precedence operator (for example a conditional expression) is evaluated as
 * a unit, and vload1 as a whole can be used inside a larger expression.
 * vstore1 still expands to an assignment expression without a trailing
 * semicolon.
 */
#define vload1(OFFSET, PTR) (*((OFFSET) + (PTR)))
#define vstore1(DATA, OFFSET, PTR) *((OFFSET) + (PTR)) = (DATA)
3448
3449
/* VSTORE_PARTIAL(size, store_size) names vstore_partial_<size>_<store_size>.
 * Both arguments are macro-expanded before being pasted. */
#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
#define VSTORE_PARTIAL(size, store_size)     VSTORE_PARTIAL_STR(size, store_size)

/* Placeholder for table entries that store nothing: expands to an empty
 * compound statement and discards all three arguments. */
#define NO_STORE(data, offs, ptr) {}
3456
3457
/* Dispatch table for partial vector stores: vstore_partial_<size>_<store_size>.
 *
 * <size> is the lane count of the source vector and <store_size> the number
 * of leading lanes to write. A valid pair (1 <= store_size <= size) forwards
 * to vstore_partial_<store_size>; every other pair maps to NO_STORE.
 * The single exception is size 1, which forwards straight to vstore1.
 */

/* size = 1 */
#define vstore_partial_1_0   NO_STORE
#define vstore_partial_1_1   vstore1
#define vstore_partial_1_2   NO_STORE
#define vstore_partial_1_3   NO_STORE
#define vstore_partial_1_4   NO_STORE
#define vstore_partial_1_5   NO_STORE
#define vstore_partial_1_6   NO_STORE
#define vstore_partial_1_7   NO_STORE
#define vstore_partial_1_8   NO_STORE
#define vstore_partial_1_9   NO_STORE
#define vstore_partial_1_10  NO_STORE
#define vstore_partial_1_11  NO_STORE
#define vstore_partial_1_12  NO_STORE
#define vstore_partial_1_13  NO_STORE
#define vstore_partial_1_14  NO_STORE
#define vstore_partial_1_15  NO_STORE
#define vstore_partial_1_16  NO_STORE

/* size = 2 */
#define vstore_partial_2_0   NO_STORE
#define vstore_partial_2_1   vstore_partial_1
#define vstore_partial_2_2   vstore_partial_2
#define vstore_partial_2_3   NO_STORE
#define vstore_partial_2_4   NO_STORE
#define vstore_partial_2_5   NO_STORE
#define vstore_partial_2_6   NO_STORE
#define vstore_partial_2_7   NO_STORE
#define vstore_partial_2_8   NO_STORE
#define vstore_partial_2_9   NO_STORE
#define vstore_partial_2_10  NO_STORE
#define vstore_partial_2_11  NO_STORE
#define vstore_partial_2_12  NO_STORE
#define vstore_partial_2_13  NO_STORE
#define vstore_partial_2_14  NO_STORE
#define vstore_partial_2_15  NO_STORE
#define vstore_partial_2_16  NO_STORE

/* size = 3 */
#define vstore_partial_3_0   NO_STORE
#define vstore_partial_3_1   vstore_partial_1
#define vstore_partial_3_2   vstore_partial_2
#define vstore_partial_3_3   vstore_partial_3
#define vstore_partial_3_4   NO_STORE
#define vstore_partial_3_5   NO_STORE
#define vstore_partial_3_6   NO_STORE
#define vstore_partial_3_7   NO_STORE
#define vstore_partial_3_8   NO_STORE
#define vstore_partial_3_9   NO_STORE
#define vstore_partial_3_10  NO_STORE
#define vstore_partial_3_11  NO_STORE
#define vstore_partial_3_12  NO_STORE
#define vstore_partial_3_13  NO_STORE
#define vstore_partial_3_14  NO_STORE
#define vstore_partial_3_15  NO_STORE
#define vstore_partial_3_16  NO_STORE

/* size = 4 */
#define vstore_partial_4_0   NO_STORE
#define vstore_partial_4_1   vstore_partial_1
#define vstore_partial_4_2   vstore_partial_2
#define vstore_partial_4_3   vstore_partial_3
#define vstore_partial_4_4   vstore_partial_4
#define vstore_partial_4_5   NO_STORE
#define vstore_partial_4_6   NO_STORE
#define vstore_partial_4_7   NO_STORE
#define vstore_partial_4_8   NO_STORE
#define vstore_partial_4_9   NO_STORE
#define vstore_partial_4_10  NO_STORE
#define vstore_partial_4_11  NO_STORE
#define vstore_partial_4_12  NO_STORE
#define vstore_partial_4_13  NO_STORE
#define vstore_partial_4_14  NO_STORE
#define vstore_partial_4_15  NO_STORE
#define vstore_partial_4_16  NO_STORE

/* size = 8 */
#define vstore_partial_8_0   NO_STORE
#define vstore_partial_8_1   vstore_partial_1
#define vstore_partial_8_2   vstore_partial_2
#define vstore_partial_8_3   vstore_partial_3
#define vstore_partial_8_4   vstore_partial_4
#define vstore_partial_8_5   vstore_partial_5
#define vstore_partial_8_6   vstore_partial_6
#define vstore_partial_8_7   vstore_partial_7
#define vstore_partial_8_8   vstore_partial_8
#define vstore_partial_8_9   NO_STORE
#define vstore_partial_8_10  NO_STORE
#define vstore_partial_8_11  NO_STORE
#define vstore_partial_8_12  NO_STORE
#define vstore_partial_8_13  NO_STORE
#define vstore_partial_8_14  NO_STORE
#define vstore_partial_8_15  NO_STORE
#define vstore_partial_8_16  NO_STORE

/* size = 16 */
#define vstore_partial_16_0  NO_STORE
#define vstore_partial_16_1  vstore_partial_1
#define vstore_partial_16_2  vstore_partial_2
#define vstore_partial_16_3  vstore_partial_3
#define vstore_partial_16_4  vstore_partial_4
#define vstore_partial_16_5  vstore_partial_5
#define vstore_partial_16_6  vstore_partial_6
#define vstore_partial_16_7  vstore_partial_7
#define vstore_partial_16_8  vstore_partial_8
#define vstore_partial_16_9  vstore_partial_9
#define vstore_partial_16_10 vstore_partial_10
#define vstore_partial_16_11 vstore_partial_11
#define vstore_partial_16_12 vstore_partial_12
#define vstore_partial_16_13 vstore_partial_13
#define vstore_partial_16_14 vstore_partial_14
#define vstore_partial_16_15 vstore_partial_15
#define vstore_partial_16_16 vstore_partial_16
3565
3566
/* Partial vector stores: vstore_partial_<k>(DATA, OFFSET, PTR) writes the
 * first k lanes of DATA (s0 upwards) to PTR.
 *
 * k = 1, 2, 3, 4, 8 and 16 use a single vstore<k>. Every other k is assembled
 * from a 4- or 8-lane store followed by a store of the remainder at PTR + 4
 * or PTR + 8.
 *
 * Each macro expands to one or more complete statements (the trailing
 * semicolon is part of the expansion), so it cannot be used as an expression
 * or as the unbraced body of an if/else.
 *
 * Swizzle letters are written in upper case throughout; OpenCL C accepts
 * either case for the hexadecimal lane names.
 */
#define vstore_partial_1(DATA, OFFSET, PTR) vstore1(DATA.s0, OFFSET, PTR);

#define vstore_partial_2(DATA, OFFSET, PTR) vstore2(DATA.s01, OFFSET, PTR);

#define vstore_partial_3(DATA, OFFSET, PTR) vstore3(DATA.s012, OFFSET, PTR);

#define vstore_partial_4(DATA, OFFSET, PTR) vstore4(DATA.s0123, OFFSET, PTR);

/* 5..7 lanes: four lanes first, then the remaining 1..3 lanes at PTR + 4. */
#define vstore_partial_5(DATA, OFFSET, PTR)      \
    vstore_partial_4(DATA.s0123, OFFSET, PTR);   \
    vstore1(DATA.s4, OFFSET, PTR + 4);

#define vstore_partial_6(DATA, OFFSET, PTR)      \
    vstore_partial_4(DATA.s0123, OFFSET, PTR);   \
    vstore_partial_2(DATA.s45, OFFSET, PTR + 4);

#define vstore_partial_7(DATA, OFFSET, PTR)      \
    vstore_partial_4(DATA.s0123, OFFSET, PTR);   \
    vstore_partial_3(DATA.s456, OFFSET, PTR + 4);

#define vstore_partial_8(DATA, OFFSET, PTR) vstore8(DATA.s01234567, OFFSET, PTR);

/* 9..15 lanes: eight lanes first, then the remaining 1..7 lanes at PTR + 8. */
#define vstore_partial_9(DATA, OFFSET, PTR)          \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR);   \
    vstore1(DATA.s8, OFFSET, PTR + 8);

#define vstore_partial_10(DATA, OFFSET, PTR)         \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR);   \
    vstore_partial_2(DATA.s89, OFFSET, PTR + 8);

#define vstore_partial_11(DATA, OFFSET, PTR)         \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR);   \
    vstore_partial_3(DATA.s89A, OFFSET, PTR + 8);

#define vstore_partial_12(DATA, OFFSET, PTR)         \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR);   \
    vstore_partial_4(DATA.s89AB, OFFSET, PTR + 8);

/* For 13..15 the upper half is handed over whole; the nested macro only
 * reads the leading 5, 6 or 7 lanes of it. */
#define vstore_partial_13(DATA, OFFSET, PTR)         \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR);   \
    vstore_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vstore_partial_14(DATA, OFFSET, PTR)         \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR);   \
    vstore_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vstore_partial_15(DATA, OFFSET, PTR)         \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR);   \
    vstore_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vstore_partial_16(DATA, OFFSET, PTR) vstore16(DATA, OFFSET, PTR);
3624
3625
3626
3627
3628
/* Saturating conversion names for floating-point destination types.
 *
 * The table maps every convert_<float|half><n>_sat name onto the plain
 * convert_<type><n> routine of the same destination type, which lets
 * CONVERT_SAT be used uniformly whatever the destination type is.
 *
 * convert_half_sat resolves to convert_half, matching convert_half1_sat and
 * the rest of the table; it previously resolved to convert_float, which
 * produced a float instead of a half for a scalar half destination.
 */
#define convert_float_sat   convert_float
#define convert_float1_sat  convert_float
#define convert_float2_sat  convert_float2
#define convert_float3_sat  convert_float3
#define convert_float4_sat  convert_float4
#define convert_float8_sat  convert_float8
#define convert_float16_sat convert_float16
#define convert_half_sat    convert_half
#define convert_half1_sat   convert_half
#define convert_half2_sat   convert_half2
#define convert_half3_sat   convert_half3
#define convert_half4_sat   convert_half4
#define convert_half8_sat   convert_half8
#define convert_half16_sat  convert_half16
3643
/* One-lane conversion names: convert_<type>1 is the scalar convert_<type>,
 * so a pasted convert_<type><size> also resolves when size is 1. */
#define convert_float1  convert_float
#define convert_half1   convert_half
#define convert_char1   convert_char
#define convert_uchar1  convert_uchar
#define convert_short1  convert_short
#define convert_ushort1 convert_ushort
#define convert_int1    convert_int
#define convert_uint1   convert_uint
#define convert_long1   convert_long
#define convert_ulong1  convert_ulong
#define convert_double1 convert_double
3655
/* One-lane saturating conversion names: convert_<type>1_sat is the scalar
 * convert_<type>_sat.
 *
 * The convert_uchar<n>_sat entries for n = 2, 3, 4, 8, 16 are defined as
 * themselves. Such a macro is not re-expanded, so the name is left unchanged
 * wherever it is used; the only visible effect of the definition is on
 * #ifdef / defined() checks.
 *
 * NOTE(review): convert_double1_sat forwards to convert_double_sat; confirm
 * that a routine of that name is available wherever this alias is used.
 */
#define convert_char1_sat    convert_char_sat
#define convert_uchar1_sat   convert_uchar_sat
#define convert_uchar2_sat   convert_uchar2_sat
#define convert_uchar3_sat   convert_uchar3_sat
#define convert_uchar4_sat   convert_uchar4_sat
#define convert_uchar8_sat   convert_uchar8_sat
#define convert_uchar16_sat  convert_uchar16_sat
#define convert_short1_sat   convert_short_sat
#define convert_ushort1_sat  convert_ushort_sat
#define convert_int1_sat     convert_int_sat
#define convert_uint1_sat    convert_uint_sat
#define convert_long1_sat    convert_long_sat
#define convert_ulong1_sat   convert_ulong_sat
#define convert_double1_sat  convert_double_sat
3670
/* Token-pasting helpers for vector type names and conversion routines.
 * Each public macro goes through a _STR twin so that its arguments are
 * macro-expanded before they are pasted. */

/* VEC_DATA_TYPE(type, size) -> <type><size> */
#define VEC_DATA_TYPE_STR(type, size) type##size
#define VEC_DATA_TYPE(type, size)     VEC_DATA_TYPE_STR(type, size)

/* CONVERT(x, type) -> (convert_<type>((x))) */
#define CONVERT_STR(x, type) (convert_##type((x)))
#define CONVERT(x, type)     CONVERT_STR(x, type)

/* CONVERT_SAT(x, type) -> (convert_<type>_sat((x))) */
#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
#define CONVERT_SAT(x, type)     CONVERT_SAT_STR(x, type)

/* CONVERT_SAT_ROUND(x, type, round) -> (convert_<type>_sat_<round>((x))) */
#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
#define CONVERT_SAT_ROUND(x, type, round)     CONVERT_SAT_ROUND_STR(x, type, round)
3682
/* select_vec_dt_<type>(size): integer vector type associated with <type> for
 * use as a select mask. Integer types map to themselves; half maps to short
 * and float maps to int. */
#define select_vec_dt_uchar(size)  uchar##size
#define select_vec_dt_char(size)   char##size
#define select_vec_dt_ushort(size) ushort##size
#define select_vec_dt_short(size)  short##size
#define select_vec_dt_half(size)   short##size
#define select_vec_dt_uint(size)   uint##size
#define select_vec_dt_int(size)    int##size
#define select_vec_dt_float(size)  int##size
#define select_vec_dt_ulong(size)  ulong##size
#define select_vec_dt_long(size)   long##size

/* SELECT_VEC_DATA_TYPE(type, size) picks the row above for <type>;
 * SELECT_DATA_TYPE(type) is the same lookup with size fixed to 1. */
#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
#define SELECT_VEC_DATA_TYPE(type, size)     SELECT_VEC_DATA_TYPE_STR(type, size)
#define SELECT_DATA_TYPE(type)               SELECT_VEC_DATA_TYPE_STR(type, 1)
3697
/* signed_int_vec_dt_<type>(size): signed integer vector type associated with
 * <type>. Unsigned types map to their signed twin, signed types to
 * themselves, half to short and float to int. */
#define signed_int_vec_dt_uchar(size)  char##size
#define signed_int_vec_dt_char(size)   char##size
#define signed_int_vec_dt_ushort(size) short##size
#define signed_int_vec_dt_short(size)  short##size
#define signed_int_vec_dt_half(size)   short##size
#define signed_int_vec_dt_uint(size)   int##size
#define signed_int_vec_dt_int(size)    int##size
#define signed_int_vec_dt_float(size)  int##size
#define signed_int_vec_dt_ulong(size)  long##size
#define signed_int_vec_dt_long(size)   long##size

/* SIGNED_INT_VEC_DATA_TYPE(type, size) picks the row above for <type>;
 * SIGNED_INT_DATA_TYPE(type) is the same lookup with size fixed to 1. */
#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size)
#define SIGNED_INT_VEC_DATA_TYPE(type, size)     SIGNED_INT_VEC_DATA_TYPE_STR(type, size)
#define SIGNED_INT_DATA_TYPE(type)               SIGNED_INT_VEC_DATA_TYPE_STR(type, 1)
3712
/* Horizontal sum of the lanes of a vector.
 *
 * SUM_REDUCE(x, size) adds up the first size lanes of x, for size in
 * {1, 2, 3, 4, 8, 16}. x is pasted several times into the expansion, so it
 * should be free of side effects.
 *
 * The public sum_reduce_<n> macros wrap the whole sum in parentheses so the
 * result combines safely with neighbouring operators, for example
 * 3 * SUM_REDUCE(v, 2). The sum_reduce_flat_<n> helpers hold the bare chain
 * of additions; nesting those (rather than the parenthesized forms) keeps the
 * additions evaluated strictly left to right, lane 0 first, which is the
 * order the unparenthesized macros produced, so floating-point results are
 * unchanged.
 */
#define sum_reduce_flat_2(x)  ((x).s0) + ((x).s1)
#define sum_reduce_flat_3(x)  sum_reduce_flat_2((x).s01) + ((x).s2)
#define sum_reduce_flat_4(x)  sum_reduce_flat_2((x).s01) + sum_reduce_flat_2((x).s23)
#define sum_reduce_flat_8(x)  sum_reduce_flat_4((x).s0123) + sum_reduce_flat_4((x).s4567)
#define sum_reduce_flat_16(x) sum_reduce_flat_8((x).s01234567) + sum_reduce_flat_8((x).s89ABCDEF)

#define sum_reduce_1(x)  (x)
#define sum_reduce_2(x)  (sum_reduce_flat_2(x))
#define sum_reduce_3(x)  (sum_reduce_flat_3(x))
#define sum_reduce_4(x)  (sum_reduce_flat_4(x))
#define sum_reduce_8(x)  (sum_reduce_flat_8(x))
#define sum_reduce_16(x) (sum_reduce_flat_16(x))

/* The _STR level expands size before it is pasted onto sum_reduce_. */
#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x)
#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)
3722
/* Horizontal product of the lanes of a vector.
 *
 * PROD_REDUCE(x, size) multiplies the first size lanes of x, for size in
 * {1, 2, 3, 4, 8, 16}. x is pasted several times into the expansion, so it
 * should be free of side effects.
 *
 * The public prod_reduce_<n> macros wrap the whole product in parentheses so
 * the result combines safely with neighbouring operators, for example
 * a / PROD_REDUCE(v, 2). The prod_reduce_flat_<n> helpers hold the bare chain
 * of multiplications; nesting those keeps the multiplications evaluated
 * strictly left to right, lane 0 first, which is the order the
 * unparenthesized macros produced, so floating-point results are unchanged.
 */
#define prod_reduce_flat_2(x)  ((x).s0) * ((x).s1)
#define prod_reduce_flat_3(x)  prod_reduce_flat_2((x).s01) * ((x).s2)
#define prod_reduce_flat_4(x)  prod_reduce_flat_2((x).s01) * prod_reduce_flat_2((x).s23)
#define prod_reduce_flat_8(x)  prod_reduce_flat_4((x).s0123) * prod_reduce_flat_4((x).s4567)
#define prod_reduce_flat_16(x) prod_reduce_flat_8((x).s01234567) * prod_reduce_flat_8((x).s89ABCDEF)

#define prod_reduce_1(x)  (x)
#define prod_reduce_2(x)  (prod_reduce_flat_2(x))
#define prod_reduce_3(x)  (prod_reduce_flat_3(x))
#define prod_reduce_4(x)  (prod_reduce_flat_4(x))
#define prod_reduce_8(x)  (prod_reduce_flat_8(x))
#define prod_reduce_16(x) (prod_reduce_flat_16(x))

/* The _STR level expands size before it is pasted onto prod_reduce_. */
#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x)
#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)
3732
/* Horizontal maximum of the lanes of a vector.
 *
 * MAX_REDUCE(x, size) returns the largest of the first size lanes of x, for
 * size in {1, 2, 3, 4, 8, 16}, by nesting calls to max() over halves of the
 * vector. x is pasted several times into the expansion, so it should be free
 * of side effects.
 */
#define max_reduce_1(x)  (x)
#define max_reduce_2(x)  max(((x).s0), ((x).s1))
#define max_reduce_3(x)  max(max_reduce_2((x).s01), ((x).s2))
#define max_reduce_4(x)  max(max_reduce_2((x).s01), max_reduce_2((x).s23))
#define max_reduce_8(x)  max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))

/* The _STR level expands size before it is pasted onto max_reduce_. */
#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
#define MAX_REDUCE(x, size)     MAX_REDUCE_STR(x, size)
3742
/* Kernel-parameter list builders.
 *
 * <KIND>_DECLARATION(name) expands to a comma-separated parameter list (no
 * leading or trailing comma) made of:
 *   - __global uchar *<name>_ptr
 *   - one uint <name>_stride_<d> / uint <name>_step_<d> pair per dimension d
 *   - uint <name>_offset_first_element_in_bytes
 *
 * Dimensions per kind: VECTOR x; IMAGE x, y; TENSOR3D x, y, z;
 * TENSOR4D x, y, z, w; TENSOR5D x, y, z, w, v.
 */
#define VECTOR_DECLARATION(name)  \
    __global uchar *name##_ptr,   \
    uint name##_stride_x,         \
    uint name##_step_x,           \
    uint name##_offset_first_element_in_bytes

#define IMAGE_DECLARATION(name)   \
    __global uchar *name##_ptr,   \
    uint name##_stride_x,         \
    uint name##_step_x,           \
    uint name##_stride_y,         \
    uint name##_step_y,           \
    uint name##_offset_first_element_in_bytes

#define TENSOR3D_DECLARATION(name) \
    __global uchar *name##_ptr,    \
    uint name##_stride_x,          \
    uint name##_step_x,            \
    uint name##_stride_y,          \
    uint name##_step_y,            \
    uint name##_stride_z,          \
    uint name##_step_z,            \
    uint name##_offset_first_element_in_bytes

#define TENSOR4D_DECLARATION(name) \
    __global uchar *name##_ptr,    \
    uint name##_stride_x,          \
    uint name##_step_x,            \
    uint name##_stride_y,          \
    uint name##_step_y,            \
    uint name##_stride_z,          \
    uint name##_step_z,            \
    uint name##_stride_w,          \
    uint name##_step_w,            \
    uint name##_offset_first_element_in_bytes

#define TENSOR5D_DECLARATION(name) \
    __global uchar *name##_ptr,    \
    uint name##_stride_x,          \
    uint name##_step_x,            \
    uint name##_stride_y,          \
    uint name##_step_y,            \
    uint name##_stride_z,          \
    uint name##_step_z,            \
    uint name##_stride_w,          \
    uint name##_step_w,            \
    uint name##_stride_v,          \
    uint name##_step_v,            \
    uint name##_offset_first_element_in_bytes
3792
/* Struct builders for kernel parameters declared with <KIND>_DECLARATION(name).
 *
 * Each macro forwards the <name>_ptr, <name>_offset_first_element_in_bytes,
 * <name>_stride_* and <name>_step_* parameters to the matching helper
 * function and yields the struct that helper returns.
 *
 * The _NO_STEP variants pass 0 in place of the step arguments. Note that
 * CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP zeroes only the x and y steps and
 * still forwards <name>_step_z.
 *
 * CONVERT_TENSOR3D_TO_IMAGE_STRUCT used to be defined twice with identical
 * text; it is now defined once.
 */

/* Vector */
#define CONVERT_TO_VECTOR_STRUCT(name) \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)

#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)

/* Image */
#define CONVERT_TO_IMAGE_STRUCT(name) \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)

#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)

/* Image built from 3D tensor parameters */
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)

#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)

/* Tensor3D */
#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z)

#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)

/* Tensor4D: mod_size is forwarded unchanged as the last argument. */
#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)

#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)

/* Tensor3D whose pointer is left at the base address (tensor3D_ptr_no_update). */
#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name)                                                                                       \
    tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                           name##_stride_z, name##_step_z)
3831
3832
/** View over a 1D buffer in global memory. */
typedef struct Vector
{
    __global uchar *ptr;                           /**< Byte pointer into the buffer */
    int             offset_first_element_in_bytes; /**< Byte offset of the first element, as passed to the kernel */
    int             stride_x;                      /**< Byte distance between consecutive x indices */
} Vector;
3839
3840
/** View over a 2D buffer in global memory. */
typedef struct Image
{
    __global uchar *ptr;                           /**< Byte pointer into the buffer */
    int             offset_first_element_in_bytes; /**< Byte offset of the first element, as passed to the kernel */
    int             stride_x;                      /**< Byte distance between consecutive x indices */
    int             stride_y;                      /**< Byte distance between consecutive y indices */
} Image;
3848
3849
/** View over a 3D buffer in global memory. */
typedef struct Tensor3D
{
    __global uchar *ptr;                           /**< Byte pointer into the buffer */
    int             offset_first_element_in_bytes; /**< Byte offset of the first element, as passed to the kernel */
    int             stride_x;                      /**< Byte distance between consecutive x indices */
    int             stride_y;                      /**< Byte distance between consecutive y indices */
    int             stride_z;                      /**< Byte distance between consecutive z indices */
} Tensor3D;
3858
3859
/** View over a 4D buffer in global memory. */
typedef struct Tensor4D
{
    __global uchar *ptr;                           /**< Byte pointer into the buffer */
    int             offset_first_element_in_bytes; /**< Byte offset of the first element, as passed to the kernel */
    int             stride_x;                      /**< Byte distance between consecutive x indices */
    int             stride_y;                      /**< Byte distance between consecutive y indices */
    int             stride_z;                      /**< Byte distance between consecutive z indices */
    int             stride_w;                      /**< Byte distance between consecutive w indices */
} Tensor4D;
3869
3870
/** Build a Vector view whose pointer is positioned for the calling work-item.
 *
 * @param[in] ptr                           Base pointer of the buffer
 * @param[in] offset_first_element_in_bytes Stored in the view and added to @p ptr
 * @param[in] stride_x                      Stored in the view as stride_x
 * @param[in] step_x                        Multiplied by get_global_id(0) and added to @p ptr
 *
 * @return Vector with ptr = ptr + offset_first_element_in_bytes + get_global_id(0) * step_x
 */
inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
{
    Vector view;
    view.ptr                           = ptr;
    view.offset_first_element_in_bytes = offset_first_element_in_bytes;
    view.stride_x                      = stride_x;

    // The offset is read back from the (int) struct field, not from the uint parameter.
    const size_t x_bytes = get_global_id(0) * step_x;
    view.ptr += view.offset_first_element_in_bytes + x_bytes;
    return view;
}
3882
3883
/** Build an Image view whose pointer is positioned for the calling work-item.
 *
 * @param[in] ptr                           Base pointer of the buffer
 * @param[in] offset_first_element_in_bytes Stored in the view and added to @p ptr
 * @param[in] stride_x                      Stored in the view as stride_x
 * @param[in] step_x                        Multiplied by get_global_id(0) and added to @p ptr
 * @param[in] stride_y                      Stored in the view as stride_y
 * @param[in] step_y                        Multiplied by get_global_id(1) and added to @p ptr
 *
 * @return Image with ptr advanced by the offset plus the x and y work-item displacements
 */
inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
{
    Image view;
    view.ptr                           = ptr;
    view.offset_first_element_in_bytes = offset_first_element_in_bytes;
    view.stride_x                      = stride_x;
    view.stride_y                      = stride_y;

    // The offset is read back from the (int) struct field, not from the uint parameter.
    const size_t x_bytes = get_global_id(0) * step_x;
    const size_t y_bytes = get_global_id(1) * step_y;
    view.ptr += view.offset_first_element_in_bytes + x_bytes + y_bytes;
    return view;
}
3896
3897
/** Build an Image view from 3D tensor parameters, positioned for the calling work-item.
 *
 * The returned Image keeps only the x and y strides; the z dimension
 * contributes to the pointer displacement only.
 *
 * @param[in] ptr                           Base pointer of the buffer
 * @param[in] offset_first_element_in_bytes Stored in the view and added to @p ptr
 * @param[in] stride_x                      Stored in the view as stride_x
 * @param[in] step_x                        Multiplied by get_global_id(0) and added to @p ptr
 * @param[in] stride_y                      Stored in the view as stride_y
 * @param[in] step_y                        Multiplied by get_global_id(1) and added to @p ptr
 * @param[in] stride_z                      Accepted but not used
 * @param[in] step_z                        Multiplied by get_global_id(2) and added to @p ptr
 *
 * @return Image with ptr advanced by the offset plus the x, y and z work-item displacements
 */
inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Image view;
    view.ptr                           = ptr;
    view.offset_first_element_in_bytes = offset_first_element_in_bytes;
    view.stride_x                      = stride_x;
    view.stride_y                      = stride_y;

    // The offset is read back from the (int) struct field, not from the uint parameter.
    const size_t x_bytes = get_global_id(0) * step_x;
    const size_t y_bytes = get_global_id(1) * step_y;
    const size_t z_bytes = get_global_id(2) * step_z;
    view.ptr += view.offset_first_element_in_bytes + x_bytes + y_bytes + z_bytes;
    return view;
}
3910
3911
/** Build a Tensor3D view whose pointer is positioned for the calling work-item.
 *
 * @param[in] ptr                           Base pointer of the buffer
 * @param[in] offset_first_element_in_bytes Stored in the view and added to @p ptr
 * @param[in] stride_x                      Stored in the view as stride_x
 * @param[in] step_x                        Multiplied by get_global_id(0) and added to @p ptr
 * @param[in] stride_y                      Stored in the view as stride_y
 * @param[in] step_y                        Multiplied by get_global_id(1) and added to @p ptr
 * @param[in] stride_z                      Stored in the view as stride_z
 * @param[in] step_z                        Multiplied by get_global_id(2) and added to @p ptr
 *
 * @return Tensor3D with ptr advanced by the offset plus the x, y and z work-item displacements
 */
inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Tensor3D view;
    view.ptr                           = ptr;
    view.offset_first_element_in_bytes = offset_first_element_in_bytes;
    view.stride_x                      = stride_x;
    view.stride_y                      = stride_y;
    view.stride_z                      = stride_z;

    // The offset is read back from the (int) struct field, not from the uint parameter.
    const size_t x_bytes = get_global_id(0) * step_x;
    const size_t y_bytes = get_global_id(1) * step_y;
    const size_t z_bytes = get_global_id(2) * step_z;
    view.ptr += view.offset_first_element_in_bytes + x_bytes + y_bytes + z_bytes;
    return view;
}
3925
3926
/** Build a Tensor3D view without moving the pointer.
 *
 * Unlike update_tensor3D_workitem_ptr, the returned ptr is exactly @p ptr:
 * neither the first-element offset nor any work-item displacement is applied.
 *
 * @param[in] ptr                           Base pointer, stored unchanged
 * @param[in] offset_first_element_in_bytes Stored in the view
 * @param[in] stride_x                      Stored in the view as stride_x
 * @param[in] step_x                        Accepted but not used
 * @param[in] stride_y                      Stored in the view as stride_y
 * @param[in] step_y                        Accepted but not used
 * @param[in] stride_z                      Stored in the view as stride_z
 * @param[in] step_z                        Accepted but not used
 *
 * @return Tensor3D holding the given pointer, offset and strides
 */
inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Tensor3D view;
    view.ptr                           = ptr;
    view.offset_first_element_in_bytes = offset_first_element_in_bytes;
    view.stride_x                      = stride_x;
    view.stride_y                      = stride_y;
    view.stride_z                      = stride_z;
    return view;
}
3939
/** Build a Tensor4D view whose pointer is positioned for the calling work-item.
 *
 * The third global id carries both the z and the w coordinate:
 * z = get_global_id(2) % mod_size and w = get_global_id(2) / mod_size.
 *
 * @param[in] ptr                           Base pointer of the buffer
 * @param[in] offset_first_element_in_bytes Stored in the view and added to @p ptr
 * @param[in] stride_x                      Stored in the view as stride_x
 * @param[in] step_x                        Multiplied by get_global_id(0) and added to @p ptr
 * @param[in] stride_y                      Stored in the view as stride_y
 * @param[in] step_y                        Multiplied by get_global_id(1) and added to @p ptr
 * @param[in] stride_z                      Stored in the view as stride_z
 * @param[in] step_z                        Multiplied by the z coordinate and added to @p ptr
 * @param[in] stride_w                      Stored in the view as stride_w
 * @param[in] step_w                        Multiplied by the w coordinate and added to @p ptr
 * @param[in] mod_size                      Divisor used to split get_global_id(2) into z and w
 *
 * @return Tensor4D with ptr advanced by the offset plus the x, y, z and w displacements
 */
inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
                                             uint step_w,
                                             uint mod_size)
{
    Tensor4D view;
    view.ptr                           = ptr;
    view.offset_first_element_in_bytes = offset_first_element_in_bytes;
    view.stride_x                      = stride_x;
    view.stride_y                      = stride_y;
    view.stride_z                      = stride_z;
    view.stride_w                      = stride_w;

    const size_t zw_id   = get_global_id(2);
    const size_t x_bytes = get_global_id(0) * step_x;
    const size_t y_bytes = get_global_id(1) * step_y;
    const size_t z_bytes = (zw_id % mod_size) * step_z;
    const size_t w_bytes = (zw_id / mod_size) * step_w;

    // The offset is read back from the (int) struct field, not from the uint parameter.
    view.ptr += view.offset_first_element_in_bytes + x_bytes + y_bytes + z_bytes + w_bytes;
    return view;
}
3957
3958
/** Address of element @p x relative to the current pointer of a Vector.
 *
 * @param[in] vec View to index
 * @param[in] x   Index, multiplied by vec->stride_x
 *
 * @return vec->ptr + x * vec->stride_x
 */
inline __global const uchar *vector_offset(const Vector *vec, int x)
{
    const int x_bytes = x * vec->stride_x;
    return vec->ptr + x_bytes;
}
3963
3964
/** Address of element (@p x, @p y) relative to the current pointer of an Image.
 *
 * @param[in] img View to index
 * @param[in] x   Index along x, multiplied by img->stride_x
 * @param[in] y   Index along y, multiplied by img->stride_y
 *
 * @return img->ptr + x * img->stride_x + y * img->stride_y
 */
inline __global uchar *offset(const Image *img, int x, int y)
{
    // Apply the displacements one at a time, x first.
    __global uchar *addr = img->ptr;
    addr += x * img->stride_x;
    addr += y * img->stride_y;
    return addr;
}
3969
3970
/** Address of element (@p x, @p y, @p z) relative to the current pointer of a Tensor3D.
 *
 * @param[in] tensor View to index
 * @param[in] x      Index along x, multiplied by tensor->stride_x
 * @param[in] y      Index along y, multiplied by tensor->stride_y
 * @param[in] z      Index along z, multiplied by tensor->stride_z
 *
 * @return tensor->ptr displaced by the three index-times-stride products
 */
inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
{
    // Apply the displacements one at a time, in x, y, z order.
    __global const uchar *addr = tensor->ptr;
    addr += x * tensor->stride_x;
    addr += y * tensor->stride_y;
    addr += z * tensor->stride_z;
    return addr;
}
3975
3976
/** Address of element (@p x, @p y, @p z, @p w) relative to the current pointer of a Tensor4D.
 *
 * @param[in] tensor View to index
 * @param[in] x      Index along x, multiplied by tensor->stride_x
 * @param[in] y      Index along y, multiplied by tensor->stride_y
 * @param[in] z      Index along z, multiplied by tensor->stride_z
 * @param[in] w      Index along w, multiplied by tensor->stride_w
 *
 * @return tensor->ptr displaced by the four index-times-stride products
 */
inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
{
    // Apply the displacements one at a time, in x, y, z, w order.
    __global const uchar *addr = tensor->ptr;
    addr += x * tensor->stride_x;
    addr += y * tensor->stride_y;
    addr += z * tensor->stride_z;
    addr += w * tensor->stride_w;
    return addr;
}
3981
3982
/** Convert a linear element index into an address inside a Tensor3D.
 *
 * The index is decomposed as index = (z * height + y) * width + x, and the
 * resulting coordinates are scaled by the tensor strides. The first-element
 * offset stored in the tensor is added as well.
 *
 * @param[in] tensor View to index
 * @param[in] width  Extent along x; used as a divisor
 * @param[in] height Extent along y; width * height is used as a divisor
 * @param[in] depth  Accepted but not used
 * @param[in] index  Linear element index
 *
 * @return tensor->ptr displaced by x, y, z times their strides plus offset_first_element_in_bytes
 */
inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
{
    const uint plane_size = width * height;

    // Split off the plane number, then the row and column inside that plane.
    const uint z        = index / plane_size;
    const uint in_plane = index % plane_size;
    const uint y        = in_plane / width;
    const uint x        = in_plane % width;

    // Apply the displacements one at a time: x, y, z, then the base offset.
    __global const uchar *addr = tensor->ptr;
    addr += x * tensor->stride_x;
    addr += y * tensor->stride_y;
    addr += z * tensor->stride_z;
    addr += tensor->offset_first_element_in_bytes;
    return addr;
}
3999
4000#endif
4001
4002
/* SCALAR_ACCESS(offset, n0, x): the n0 consecutive lanes of vector x that
 * start at lane <offset>, expressed as a swizzle. Resolves to
 * scalar_access_<offset>_<n0>(x); the _STR level expands offset and n0 before
 * they are pasted. Only the (offset, n0) pairs listed below exist.
 */
#define SCALAR_ACCESS_STR(offset, n0, x) scalar_access_##offset##_##n0(x)
#define SCALAR_ACCESS(offset, n0, x)     SCALAR_ACCESS_STR(offset, n0, x)

/* offset 0 */
#define scalar_access_0_1(x)  ((x).s0)
#define scalar_access_0_2(x)  ((x).s01)
#define scalar_access_0_3(x)  ((x).s012)
#define scalar_access_0_4(x)  ((x).s0123)
#define scalar_access_0_8(x)  ((x).s01234567)
#define scalar_access_0_16(x) ((x).s0123456789ABCDEF)

/* offset 1 */
#define scalar_access_1_1(x) ((x).s1)
#define scalar_access_1_2(x) ((x).s12)
#define scalar_access_1_3(x) ((x).s123)
#define scalar_access_1_4(x) ((x).s1234)
#define scalar_access_1_8(x) ((x).s12345678)

/* offset 2 */
#define scalar_access_2_1(x) ((x).s2)
#define scalar_access_2_2(x) ((x).s23)
#define scalar_access_2_3(x) ((x).s234)
#define scalar_access_2_4(x) ((x).s2345)
#define scalar_access_2_8(x) ((x).s23456789)

/* offset 3 */
#define scalar_access_3_1(x) ((x).s3)
#define scalar_access_3_2(x) ((x).s34)
#define scalar_access_3_3(x) ((x).s345)
#define scalar_access_3_4(x) ((x).s3456)
#define scalar_access_3_8(x) ((x).s3456789A)

/* offset 4 */
#define scalar_access_4_1(x) ((x).s4)
#define scalar_access_4_2(x) ((x).s45)
#define scalar_access_4_3(x) ((x).s456)
#define scalar_access_4_4(x) ((x).s4567)
#define scalar_access_4_8(x) ((x).s456789AB)

/* offset 8 */
#define scalar_access_8_1(x) ((x).s8)
#define scalar_access_8_2(x) ((x).s89)
#define scalar_access_8_3(x) ((x).s89A)
#define scalar_access_8_4(x) ((x).s89AB)
#define scalar_access_8_8(x) ((x).s89ABCDEF)

/* offset 12 */
#define scalar_access_12_1(x) ((x).sC)
#define scalar_access_12_2(x) ((x).sCD)
#define scalar_access_12_3(x) ((x).sCDE)
#define scalar_access_12_4(x) ((x).sCDEF)

/* offset 16
 * NOTE(review): this entry selects lane sF (index 15), unlike the other rows
 * where the first selected lane equals the offset - confirm this is intended. */
#define scalar_access_16_1(x) ((x).sF)
4057
4058
/* LOAD_TENSOR_ROW_<n> loads rows 0 .. n-1 into already-declared variables.
 * Row i is read with VLOAD(N0) from (PTR + i * STRIDE_Y + Z##i), cast to
 * __global DATA_TYPE *, and assigned to the N0 components of BASENAME##i that
 * start at component COL_OFFSET (see SCALAR_ACCESS). Row suffixes are the hex
 * digits 0-9, A-F. Each macro chains to the one below it, so the rows are
 * emitted in ascending order. LOAD_TENSOR_ROW_0 is an empty statement
 * expression. */
#define LOAD_TENSOR_ROW_0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) ({})

#define LOAD_TENSOR_ROW_1(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##0) =                                 \
        VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define LOAD_TENSOR_ROW_2(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_1(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##1) =                                 \
        VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define LOAD_TENSOR_ROW_3(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_2(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##2) =                                 \
        VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define LOAD_TENSOR_ROW_4(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_3(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##3) =                                 \
        VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define LOAD_TENSOR_ROW_5(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_4(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##4) =                                 \
        VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define LOAD_TENSOR_ROW_6(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_5(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##5) =                                 \
        VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define LOAD_TENSOR_ROW_7(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_6(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##6) =                                 \
        VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define LOAD_TENSOR_ROW_8(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_7(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##7) =                                 \
        VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define LOAD_TENSOR_ROW_9(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_8(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##8) =                                 \
        VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define LOAD_TENSOR_ROW_10(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_9(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)      \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##9) =                                  \
        VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define LOAD_TENSOR_ROW_11(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_10(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##A) =                                  \
        VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define LOAD_TENSOR_ROW_12(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_11(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##B) =                                  \
        VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define LOAD_TENSOR_ROW_13(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_12(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##C) =                                  \
        VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define LOAD_TENSOR_ROW_14(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_13(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##D) =                                  \
        VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define LOAD_TENSOR_ROW_15(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_14(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##E) =                                  \
        VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define LOAD_TENSOR_ROW_16(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_15(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##F) =                                  \
        VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));

/* LOAD_TENSOR(M0, N0, ...) picks LOAD_TENSOR_ROW_<M0>. The _STR level lets M0
 * be macro-expanded before it is pasted into the row-macro name. */
#define LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)
#define LOAD_TENSOR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)
4129
4130
4131
/* LOAD_TENSOR_M0X<n> loads an M0 x n block through LOAD_TENSOR.
 * Widths 1, 2, 3, 4 and 8 are loaded in one pass at column offset 0.
 * Widths 5, 6 and 7 are split into a 4-wide pass at column 0 followed by a
 * 1/2/3-wide pass at column 4, with PTR advanced by 4 * sizeof(DATA_TYPE).
 * LOAD_TENSOR_M0X0 is an empty statement expression. */
#define LOAD_TENSOR_M0X0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) ({})

#define LOAD_TENSOR_M0X1(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    LOAD_TENSOR(M0, N0, DATA_TYPE, BASENAME, PTR, 0, STRIDE_Y, Z);

#define LOAD_TENSOR_M0X2(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    LOAD_TENSOR(M0, N0, DATA_TYPE, BASENAME, PTR, 0, STRIDE_Y, Z);

#define LOAD_TENSOR_M0X3(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    LOAD_TENSOR(M0, N0, DATA_TYPE, BASENAME, PTR, 0, STRIDE_Y, Z);

#define LOAD_TENSOR_M0X4(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    LOAD_TENSOR(M0, N0, DATA_TYPE, BASENAME, PTR, 0, STRIDE_Y, Z);

#define LOAD_TENSOR_M0X5(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    LOAD_TENSOR(M0, 4, DATA_TYPE, BASENAME, PTR, 0, STRIDE_Y, Z);       \
    LOAD_TENSOR(M0, 1, DATA_TYPE, BASENAME, PTR + 4 * sizeof(DATA_TYPE), 4, STRIDE_Y, Z);

#define LOAD_TENSOR_M0X6(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    LOAD_TENSOR(M0, 4, DATA_TYPE, BASENAME, PTR, 0, STRIDE_Y, Z);       \
    LOAD_TENSOR(M0, 2, DATA_TYPE, BASENAME, PTR + 4 * sizeof(DATA_TYPE), 4, STRIDE_Y, Z);

#define LOAD_TENSOR_M0X7(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    LOAD_TENSOR(M0, 4, DATA_TYPE, BASENAME, PTR, 0, STRIDE_Y, Z);       \
    LOAD_TENSOR(M0, 3, DATA_TYPE, BASENAME, PTR + 4 * sizeof(DATA_TYPE), 4, STRIDE_Y, Z);

#define LOAD_TENSOR_M0X8(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    LOAD_TENSOR(M0, N0, DATA_TYPE, BASENAME, PTR, 0, STRIDE_Y, Z);
4161
/* Load an M0 x 9 block as an 8-wide pass at column 0 followed by a 1-wide pass
 * at column 8, with PTR advanced by 8 * sizeof(DATA_TYPE).
 * The first pass previously read `input_ptr 0` with no comma, so LOAD_TENSOR
 * received 7 arguments instead of 8 and any use of this macro failed to
 * preprocess. */
#define LOAD_TENSOR_M0X9(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    LOAD_TENSOR(M0, 8, DATA_TYPE, BASENAME, PTR, 0, STRIDE_Y, Z);       \
    LOAD_TENSOR(M0, 1, DATA_TYPE, BASENAME, PTR + 8 * sizeof(DATA_TYPE), 8, STRIDE_Y, Z);
4165
/* Widths 10, 11 and 12: an 8-wide pass at column 0, then a 2/3/4-wide pass at
 * column 8 with PTR advanced by 8 * sizeof(DATA_TYPE). */
#define LOAD_TENSOR_M0X10(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    LOAD_TENSOR(M0, 8, DATA_TYPE, BASENAME, PTR, 0, STRIDE_Y, Z);        \
    LOAD_TENSOR(M0, 2, DATA_TYPE, BASENAME, PTR + 8 * sizeof(DATA_TYPE), 8, STRIDE_Y, Z);

#define LOAD_TENSOR_M0X11(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    LOAD_TENSOR(M0, 8, DATA_TYPE, BASENAME, PTR, 0, STRIDE_Y, Z);        \
    LOAD_TENSOR(M0, 3, DATA_TYPE, BASENAME, PTR + 8 * sizeof(DATA_TYPE), 8, STRIDE_Y, Z);

#define LOAD_TENSOR_M0X12(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    LOAD_TENSOR(M0, 8, DATA_TYPE, BASENAME, PTR, 0, STRIDE_Y, Z);        \
    LOAD_TENSOR(M0, 4, DATA_TYPE, BASENAME, PTR + 8 * sizeof(DATA_TYPE), 8, STRIDE_Y, Z);

/* Width 13: 8-wide at column 0, 4-wide at column 8, 1-wide at column 12. */
#define LOAD_TENSOR_M0X13(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)                  \
    LOAD_TENSOR(M0, 8, DATA_TYPE, BASENAME, PTR, 0, STRIDE_Y, Z);                         \
    LOAD_TENSOR(M0, 4, DATA_TYPE, BASENAME, PTR + 8 * sizeof(DATA_TYPE), 8, STRIDE_Y, Z); \
    LOAD_TENSOR(M0, 1, DATA_TYPE, BASENAME, PTR + 12 * sizeof(DATA_TYPE), 12, STRIDE_Y, Z);
4182
/* Load an M0 x 14 block in three passes: 8-wide at column 0, 4-wide at
 * column 8 and 2-wide at column 12, with PTR advanced by 8 and 12 times
 * sizeof(DATA_TYPE) for the second and third pass.
 * The first pass previously read `input_ptr 0` with no comma, so LOAD_TENSOR
 * received 7 arguments instead of 8 and any use of this macro failed to
 * preprocess. */
#define LOAD_TENSOR_M0X14(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)                  \
    LOAD_TENSOR(M0, 8, DATA_TYPE, BASENAME, PTR, 0, STRIDE_Y, Z);                         \
    LOAD_TENSOR(M0, 4, DATA_TYPE, BASENAME, PTR + 8 * sizeof(DATA_TYPE), 8, STRIDE_Y, Z); \
    LOAD_TENSOR(M0, 2, DATA_TYPE, BASENAME, PTR + 12 * sizeof(DATA_TYPE), 12, STRIDE_Y, Z);
4187
/* Width 15: 8-wide at column 0, 4-wide at column 8, 3-wide at column 12. */
#define LOAD_TENSOR_M0X15(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)                  \
    LOAD_TENSOR(M0, 8, DATA_TYPE, BASENAME, PTR, 0, STRIDE_Y, Z);                         \
    LOAD_TENSOR(M0, 4, DATA_TYPE, BASENAME, PTR + 8 * sizeof(DATA_TYPE), 8, STRIDE_Y, Z); \
    LOAD_TENSOR(M0, 3, DATA_TYPE, BASENAME, PTR + 12 * sizeof(DATA_TYPE), 12, STRIDE_Y, Z);

/* Width 16: a single N0-wide pass at column 0. */
#define LOAD_TENSOR_M0X16(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    LOAD_TENSOR(M0, N0, DATA_TYPE, BASENAME, PTR, 0, STRIDE_Y, Z);

/* LOAD_TENSOR_M0XN0(M0, N0, ...) picks LOAD_TENSOR_M0X<N0>. The _STR level lets
 * N0 be macro-expanded before it is pasted into the macro name. */
#define LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    LOAD_TENSOR_M0X##N0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define LOAD_TENSOR_M0XN0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
4200
4201
/* LOAD_ROW_<n> declares and loads rows 0 .. n-1.
 * Row i becomes a new variable BASENAME##i of type VEC_DATA_TYPE(DATA_TYPE, N0),
 * initialised with VLOAD(N0) from (PTR + OFFSET + i * STRIDE_Y + Z##i) cast to
 * __global DATA_TYPE *. Row suffixes are the hex digits 0-9, A-F. Because the
 * macros declare variables, a given BASENAME can be loaded only once per scope. */
#define LOAD_ROW_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) BASENAME##0 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y + Z##0));

#define LOAD_ROW_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
    VEC_DATA_TYPE(DATA_TYPE, N0) BASENAME##1 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y + Z##1));

#define LOAD_ROW_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
    VEC_DATA_TYPE(DATA_TYPE, N0) BASENAME##2 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y + Z##2));

#define LOAD_ROW_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
    VEC_DATA_TYPE(DATA_TYPE, N0) BASENAME##3 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y + Z##3));

#define LOAD_ROW_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
    VEC_DATA_TYPE(DATA_TYPE, N0) BASENAME##4 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y + Z##4));

#define LOAD_ROW_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
    VEC_DATA_TYPE(DATA_TYPE, N0) BASENAME##5 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y + Z##5));

#define LOAD_ROW_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
    VEC_DATA_TYPE(DATA_TYPE, N0) BASENAME##6 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y + Z##6));

#define LOAD_ROW_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
    VEC_DATA_TYPE(DATA_TYPE, N0) BASENAME##7 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y + Z##7));

#define LOAD_ROW_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
    VEC_DATA_TYPE(DATA_TYPE, N0) BASENAME##8 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y + Z##8));

#define LOAD_ROW_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)      \
    VEC_DATA_TYPE(DATA_TYPE, N0) BASENAME##9 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y + Z##9));

#define LOAD_ROW_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
    VEC_DATA_TYPE(DATA_TYPE, N0) BASENAME##A = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y + Z##A));

#define LOAD_ROW_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
    VEC_DATA_TYPE(DATA_TYPE, N0) BASENAME##B = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y + Z##B));

#define LOAD_ROW_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
    VEC_DATA_TYPE(DATA_TYPE, N0) BASENAME##C = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y + Z##C));

#define LOAD_ROW_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
    VEC_DATA_TYPE(DATA_TYPE, N0) BASENAME##D = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y + Z##D));

#define LOAD_ROW_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
    VEC_DATA_TYPE(DATA_TYPE, N0) BASENAME##E = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y + Z##E));

#define LOAD_ROW_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
    VEC_DATA_TYPE(DATA_TYPE, N0) BASENAME##F = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y + Z##F));

/* LOAD_BLOCK(M0, N0, ...) picks LOAD_ROW_<M0>. The _STR level lets M0 be
 * macro-expanded before it is pasted into the row-macro name. */
#define LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
#define LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
4286
4287
4288
/* LOAD_ROW_PARTIAL_<n> loads rows 0 .. n-1 into existing variables.
 * Row i invokes VLOAD_PARTIAL(N0, LOAD_N0) with BASENAME##i as destination,
 * offset 0, and the address (PTR + OFFSET + i * STRIDE_Y + Z##i) cast to
 * __global DATA_TYPE *. Unlike LOAD_ROW_<n>, nothing is declared here, so the
 * BASENAME##i variables must already exist. Row suffixes are the hex digits
 * 0-9, A-F.
 * NOTE(review): presumably N0 is the full vector width and LOAD_N0 the number
 * of leading elements actually read; confirm against VLOAD_PARTIAL. */
#define LOAD_ROW_PARTIAL_1(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0)(BASENAME##0, 0, (__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y + Z##0));

#define LOAD_ROW_PARTIAL_2(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_1(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
    VLOAD_PARTIAL(N0, LOAD_N0)(BASENAME##1, 0, (__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y + Z##1));

#define LOAD_ROW_PARTIAL_3(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_2(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
    VLOAD_PARTIAL(N0, LOAD_N0)(BASENAME##2, 0, (__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y + Z##2));

#define LOAD_ROW_PARTIAL_4(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_3(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
    VLOAD_PARTIAL(N0, LOAD_N0)(BASENAME##3, 0, (__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y + Z##3));

#define LOAD_ROW_PARTIAL_5(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_4(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
    VLOAD_PARTIAL(N0, LOAD_N0)(BASENAME##4, 0, (__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y + Z##4));

#define LOAD_ROW_PARTIAL_6(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_5(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
    VLOAD_PARTIAL(N0, LOAD_N0)(BASENAME##5, 0, (__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y + Z##5));

#define LOAD_ROW_PARTIAL_7(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_6(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
    VLOAD_PARTIAL(N0, LOAD_N0)(BASENAME##6, 0, (__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y + Z##6));

#define LOAD_ROW_PARTIAL_8(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_7(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
    VLOAD_PARTIAL(N0, LOAD_N0)(BASENAME##7, 0, (__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y + Z##7));

#define LOAD_ROW_PARTIAL_9(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_8(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
    VLOAD_PARTIAL(N0, LOAD_N0)(BASENAME##8, 0, (__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y + Z##8));

#define LOAD_ROW_PARTIAL_10(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_9(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)      \
    VLOAD_PARTIAL(N0, LOAD_N0)(BASENAME##9, 0, (__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y + Z##9));

#define LOAD_ROW_PARTIAL_11(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_10(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
    VLOAD_PARTIAL(N0, LOAD_N0)(BASENAME##A, 0, (__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y + Z##A));

#define LOAD_ROW_PARTIAL_12(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_11(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
    VLOAD_PARTIAL(N0, LOAD_N0)(BASENAME##B, 0, (__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y + Z##B));

#define LOAD_ROW_PARTIAL_13(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_12(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
    VLOAD_PARTIAL(N0, LOAD_N0)(BASENAME##C, 0, (__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y + Z##C));

#define LOAD_ROW_PARTIAL_14(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_13(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
    VLOAD_PARTIAL(N0, LOAD_N0)(BASENAME##D, 0, (__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y + Z##D));

#define LOAD_ROW_PARTIAL_15(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_14(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
    VLOAD_PARTIAL(N0, LOAD_N0)(BASENAME##E, 0, (__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y + Z##E));

#define LOAD_ROW_PARTIAL_16(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_15(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
    VLOAD_PARTIAL(N0, LOAD_N0)(BASENAME##F, 0, (__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y + Z##F));

/* LOAD_BLOCK_PARTIAL(LOAD_M0, LOAD_N0, N0, ...) picks LOAD_ROW_PARTIAL_<LOAD_M0>.
 * The argument order changes on the way down: the row macros take
 * (N0, LOAD_N0, ...). The _STR level lets LOAD_M0 be macro-expanded before it
 * is pasted into the row-macro name. */
#define LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_##LOAD_M0(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
#define LOAD_BLOCK_PARTIAL(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
4372
/* Load a block that may be partial in both dimensions, choosing the size at
 * run time from the two conditions:
 *   PARTIAL_COND_X  PARTIAL_COND_Y   rows loaded        columns loaded
 *   false           false            M0                 N0
 *   false           true             PARTIAL_STORE_M0   N0
 *   true            false            M0                 PARTIAL_STORE_N0
 *   true            true             PARTIAL_STORE_M0   PARTIAL_STORE_N0
 * The conditions are assumed to be free of side effects. The macro expands to
 * a complete if/else statement and does not end with a semicolon. */
#define LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    if(PARTIAL_COND_X)                                                                                                                                           \
    {                                                                                                                                                            \
        if(PARTIAL_COND_Y)                                                                                                                                       \
        {                                                                                                                                                        \
            LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                           \
        }                                                                                                                                                        \
        else                                                                                                                                                     \
        {                                                                                                                                                        \
            LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                                         \
        }                                                                                                                                                        \
    }                                                                                                                                                            \
    else                                                                                                                                                         \
    {                                                                                                                                                            \
        if(PARTIAL_COND_Y)                                                                                                                                       \
        {                                                                                                                                                        \
            LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                                         \
        }                                                                                                                                                        \
        else                                                                                                                                                     \
        {                                                                                                                                                        \
            LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                                                       \
        }                                                                                                                                                        \
    }
4390
/* Load a block that may be partial along x only: when PARTIAL_COND_X holds,
 * PARTIAL_STORE_N0 columns are loaded per row, otherwise the full N0 columns.
 * All M0 rows are loaded in both cases. The macro expands to a complete
 * if/else statement and does not end with a semicolon. */
#define LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
    if(PARTIAL_COND_X)                                                                                                   \
    {                                                                                                                    \
        LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                     \
    }                                                                                                                    \
    else                                                                                                                 \
    {                                                                                                                    \
        LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                   \
    }
4400
/* Load a block that may be partial along y only: when PARTIAL_COND_Y holds,
 * PARTIAL_STORE_M0 rows are loaded, otherwise the full M0 rows. Every loaded
 * row is N0 columns wide. The macro expands to a complete if/else statement
 * and does not end with a semicolon. */
#define LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
    if(PARTIAL_COND_Y)                                                                                                   \
    {                                                                                                                    \
        LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                     \
    }                                                                                                                    \
    else                                                                                                                 \
    {                                                                                                                    \
        LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                   \
    }
4410
4411
/* LOAD_BLOCK_BOUNDARY_AWARE: one of four definitions is selected at preprocess
 * time from the PARTIAL_STORE_M0 / PARTIAL_STORE_N0 macros visible at this
 * point in the file (an undefined macro evaluates to 0 in #if):
 *   both 0          -> plain LOAD_BLOCK, which declares the row variables itself
 *   only M0 > 0     -> rows declared first, then LOAD_BLOCK_PARTIAL_IN_Y
 *   only N0 > 0     -> rows declared first, then LOAD_BLOCK_PARTIAL_IN_X
 *   anything else   -> rows declared first, then LOAD_BLOCK_PARTIAL_IN_X_AND_Y
 * In the partial variants the rows are set up by
 * REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0)
 * (presumably declaring M0 row variables initialised to 0; confirm against
 * its definition). All four variants share the same parameter list. */
#if PARTIAL_STORE_N0 == 0 && PARTIAL_STORE_M0 == 0

#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)

#elif PARTIAL_STORE_N0 == 0 && PARTIAL_STORE_M0 > 0

#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0);                                                                                 \
    LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)

#elif PARTIAL_STORE_N0 > 0 && PARTIAL_STORE_M0 == 0

#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0);                                                                                 \
    LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)

#else

#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0);                                                                                 \
    LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)

#endif
4436
4437
/* LOAD_TEXTURE2D_ROW_<n> assigns rows 0 .. n-1 from a 2D image.
 * Row i is BASENAME##i = READ_IMAGE2D(DATA_TYPE, N0, IMG, x, y) with
 * x = X_COORD + i * X_STEP_ROW and y = Y_COORD + i * Y_STEP_ROW.
 * Row suffixes are the hex digits 0-9, A-F.
 * NOTE(review): no ';' is written between consecutive row assignments here;
 * presumably the READ_IMAGE2D expansion supplies the terminator. Confirm
 * against its definition before changing this layout. */
#define LOAD_TEXTURE2D_ROW_1(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##0 = READ_IMAGE2D(DATA_TYPE, N0, IMG,                                                   \
                               (X_COORD + 0 * X_STEP_ROW),                                           \
                               (Y_COORD + 0 * Y_STEP_ROW))

#define LOAD_TEXTURE2D_ROW_2(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_1(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
    BASENAME##1 = READ_IMAGE2D(DATA_TYPE, N0, IMG,                                                   \
                               (X_COORD + 1 * X_STEP_ROW),                                           \
                               (Y_COORD + 1 * Y_STEP_ROW))

#define LOAD_TEXTURE2D_ROW_3(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_2(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
    BASENAME##2 = READ_IMAGE2D(DATA_TYPE, N0, IMG,                                                   \
                               (X_COORD + 2 * X_STEP_ROW),                                           \
                               (Y_COORD + 2 * Y_STEP_ROW))

#define LOAD_TEXTURE2D_ROW_4(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_3(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
    BASENAME##3 = READ_IMAGE2D(DATA_TYPE, N0, IMG,                                                   \
                               (X_COORD + 3 * X_STEP_ROW),                                           \
                               (Y_COORD + 3 * Y_STEP_ROW))

#define LOAD_TEXTURE2D_ROW_5(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_4(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
    BASENAME##4 = READ_IMAGE2D(DATA_TYPE, N0, IMG,                                                   \
                               (X_COORD + 4 * X_STEP_ROW),                                           \
                               (Y_COORD + 4 * Y_STEP_ROW))

#define LOAD_TEXTURE2D_ROW_6(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_5(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
    BASENAME##5 = READ_IMAGE2D(DATA_TYPE, N0, IMG,                                                   \
                               (X_COORD + 5 * X_STEP_ROW),                                           \
                               (Y_COORD + 5 * Y_STEP_ROW))

#define LOAD_TEXTURE2D_ROW_7(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_6(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
    BASENAME##6 = READ_IMAGE2D(DATA_TYPE, N0, IMG,                                                   \
                               (X_COORD + 6 * X_STEP_ROW),                                           \
                               (Y_COORD + 6 * Y_STEP_ROW))

#define LOAD_TEXTURE2D_ROW_8(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_7(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
    BASENAME##7 = READ_IMAGE2D(DATA_TYPE, N0, IMG,                                                   \
                               (X_COORD + 7 * X_STEP_ROW),                                           \
                               (Y_COORD + 7 * Y_STEP_ROW))

#define LOAD_TEXTURE2D_ROW_9(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_8(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
    BASENAME##8 = READ_IMAGE2D(DATA_TYPE, N0, IMG,                                                   \
                               (X_COORD + 8 * X_STEP_ROW),                                           \
                               (Y_COORD + 8 * Y_STEP_ROW))

#define LOAD_TEXTURE2D_ROW_10(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_9(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)      \
    BASENAME##9 = READ_IMAGE2D(DATA_TYPE, N0, IMG,                                                    \
                               (X_COORD + 9 * X_STEP_ROW),                                            \
                               (Y_COORD + 9 * Y_STEP_ROW))

#define LOAD_TEXTURE2D_ROW_11(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_10(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
    BASENAME##A = READ_IMAGE2D(DATA_TYPE, N0, IMG,                                                    \
                               (X_COORD + 10 * X_STEP_ROW),                                           \
                               (Y_COORD + 10 * Y_STEP_ROW))

#define LOAD_TEXTURE2D_ROW_12(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_11(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
    BASENAME##B = READ_IMAGE2D(DATA_TYPE, N0, IMG,                                                    \
                               (X_COORD + 11 * X_STEP_ROW),                                           \
                               (Y_COORD + 11 * Y_STEP_ROW))

#define LOAD_TEXTURE2D_ROW_13(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_12(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
    BASENAME##C = READ_IMAGE2D(DATA_TYPE, N0, IMG,                                                    \
                               (X_COORD + 12 * X_STEP_ROW),                                           \
                               (Y_COORD + 12 * Y_STEP_ROW))

#define LOAD_TEXTURE2D_ROW_14(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_13(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
    BASENAME##D = READ_IMAGE2D(DATA_TYPE, N0, IMG,                                                    \
                               (X_COORD + 13 * X_STEP_ROW),                                           \
                               (Y_COORD + 13 * Y_STEP_ROW))
4492
4493#define LOAD_TEXTURE2D_ROW_15(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
4494    LOAD_TEXTURE2D_ROW_14(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
4495    BASENAME##E = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 14 * X_STEP_ROW), (Y_COORD + 14 * Y_STEP_ROW))
4496
4497#define LOAD_TEXTURE2D_ROW_16(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
4498    LOAD_TEXTURE2D_ROW_15(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
4499    BASENAME##F = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 15 * X_STEP_ROW), (Y_COORD + 15 * Y_STEP_ROW))
4500
4501
4502
/*
 * LOAD_TEXTURE2D(M0, N0, ...): entry point that selects LOAD_TEXTURE2D_ROW_<M0>
 * and forwards every other argument unchanged.
 *
 * Two macros are used so that M0 is macro-expanded before it is pasted:
 * LOAD_TEXTURE2D expands its arguments and hands them to LOAD_TEXTURE2D_STR,
 * which performs the LOAD_TEXTURE2D_ROW_##M0 token paste. M0 must therefore
 * expand to the suffix of an existing LOAD_TEXTURE2D_ROW_n macro.
 *
 *   M0         - number of rows to read (selects the ROW_n macro)
 *   N0         - forwarded to READ_IMAGE2D
 *   DATA_TYPE  - forwarded to READ_IMAGE2D
 *   BASENAME   - prefix of the destination variables (BASENAME##0, ...)
 *   IMG        - image argument forwarded to READ_IMAGE2D
 *   X_COORD, Y_COORD       - coordinates used for row 0
 *   X_STEP_ROW, Y_STEP_ROW - per-row increments added to the coordinates
 */
4503#define LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) LOAD_TEXTURE2D_ROW_##M0(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)
4504#define LOAD_TEXTURE2D(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)
4505
4506
4507
/*
 * LOAD_ROW_INDIRECT_n: declare and conditionally load rows 0..n-1 of a block,
 * where each row has its own row index (Y##i) and its own mask (Y_MASK##i).
 *
 * Each macro expands the macro for n-1 rows and then, for row i = n-1:
 *   - declares   VEC_DATA_TYPE(DATA_TYPE, N0) BASENAME##i;
 *   - if Y_MASK##i != 0, loads it with VLOAD(N0) at element offset 0 from
 *     (__global DATA_TYPE *)(PTR + OFFSET + Y##i * STRIDE_Y);
 *   - otherwise sets BASENAME##i to 0, so no memory is read for that row.
 * The row suffix is a single hexadecimal digit: 0-9, then A-F for rows 10-15.
 *
 * The expansion is a sequence of declarations and if/else statements, so it
 * introduces the BASENAME##i names into the enclosing scope and cannot be
 * used as the single statement of an unbraced if/for.
 *
 * NOTE(review): STRIDE_Y is used as "Y##i * STRIDE_Y" without its own
 * parentheses; pass a simple identifier or literal.
 */
4508#define LOAD_ROW_INDIRECT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
4509    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
4510    BASENAME##0;                                                                            \
4511    if(Y_MASK##0 != 0)                                                                      \
4512        BASENAME##0 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##0 * STRIDE_Y)); \
4513    else                                                                                    \
4514        BASENAME##0 = 0;
4515
4516#define LOAD_ROW_INDIRECT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
4517    LOAD_ROW_INDIRECT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
4518    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
4519    BASENAME##1;                                                                            \
4520    if(Y_MASK##1 != 0)                                                                      \
4521        BASENAME##1 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##1 * STRIDE_Y)); \
4522    else                                                                                    \
4523        BASENAME##1 = 0;
4524
4525#define LOAD_ROW_INDIRECT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
4526    LOAD_ROW_INDIRECT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
4527    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
4528    BASENAME##2;                                                                            \
4529    if(Y_MASK##2 != 0)                                                                      \
4530        BASENAME##2 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##2 * STRIDE_Y)); \
4531    else                                                                                    \
4532        BASENAME##2 = 0;
4533
4534#define LOAD_ROW_INDIRECT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
4535    LOAD_ROW_INDIRECT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
4536    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
4537    BASENAME##3;                                                                            \
4538    if(Y_MASK##3 != 0)                                                                      \
4539        BASENAME##3 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##3 * STRIDE_Y)); \
4540    else                                                                                    \
4541        BASENAME##3 = 0;
4542
4543#define LOAD_ROW_INDIRECT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
4544    LOAD_ROW_INDIRECT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
4545    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
4546    BASENAME##4;                                                                            \
4547    if(Y_MASK##4 != 0)                                                                      \
4548        BASENAME##4 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##4 * STRIDE_Y)); \
4549    else                                                                                    \
4550        BASENAME##4 = 0;
4551
4552#define LOAD_ROW_INDIRECT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
4553    LOAD_ROW_INDIRECT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
4554    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
4555    BASENAME##5;                                                                            \
4556    if(Y_MASK##5 != 0)                                                                      \
4557        BASENAME##5 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##5 * STRIDE_Y)); \
4558    else                                                                                    \
4559        BASENAME##5 = 0;
4560
4561#define LOAD_ROW_INDIRECT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
4562    LOAD_ROW_INDIRECT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
4563    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
4564    BASENAME##6;                                                                            \
4565    if(Y_MASK##6 != 0)                                                                      \
4566        BASENAME##6 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##6 * STRIDE_Y)); \
4567    else                                                                                    \
4568        BASENAME##6 = 0;
4569
4570#define LOAD_ROW_INDIRECT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
4571    LOAD_ROW_INDIRECT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
4572    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
4573    BASENAME##7;                                                                            \
4574    if(Y_MASK##7 != 0)                                                                      \
4575        BASENAME##7 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##7 * STRIDE_Y)); \
4576    else                                                                                    \
4577        BASENAME##7 = 0;
4578
4579#define LOAD_ROW_INDIRECT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
4580    LOAD_ROW_INDIRECT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
4581    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
4582    BASENAME##8;                                                                            \
4583    if(Y_MASK##8 != 0)                                                                      \
4584        BASENAME##8 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##8 * STRIDE_Y)); \
4585    else                                                                                    \
4586        BASENAME##8 = 0;
4587
4588#define LOAD_ROW_INDIRECT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
4589    LOAD_ROW_INDIRECT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
4590    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
4591    BASENAME##9;                                                                            \
4592    if(Y_MASK##9 != 0)                                                                      \
4593        BASENAME##9 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##9 * STRIDE_Y)); \
4594    else                                                                                    \
4595        BASENAME##9 = 0;
4596
4597#define LOAD_ROW_INDIRECT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
4598    LOAD_ROW_INDIRECT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
4599    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
4600    BASENAME##A;                                                                            \
4601    if(Y_MASK##A != 0)                                                                      \
4602        BASENAME##A = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##A * STRIDE_Y)); \
4603    else                                                                                    \
4604        BASENAME##A = 0;
4605
4606#define LOAD_ROW_INDIRECT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
4607    LOAD_ROW_INDIRECT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
4608    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
4609    BASENAME##B;                                                                            \
4610    if(Y_MASK##B != 0)                                                                      \
4611        BASENAME##B = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##B * STRIDE_Y)); \
4612    else                                                                                    \
4613        BASENAME##B = 0;
4614
4615#define LOAD_ROW_INDIRECT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
4616    LOAD_ROW_INDIRECT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
4617    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
4618    BASENAME##C;                                                                            \
4619    if(Y_MASK##C != 0)                                                                      \
4620        BASENAME##C = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##C * STRIDE_Y)); \
4621    else                                                                                    \
4622        BASENAME##C = 0;
4623
4624#define LOAD_ROW_INDIRECT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
4625    LOAD_ROW_INDIRECT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
4626    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
4627    BASENAME##D;                                                                            \
4628    if(Y_MASK##D != 0)                                                                      \
4629        BASENAME##D = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##D * STRIDE_Y)); \
4630    else                                                                                    \
4631        BASENAME##D = 0;
4632
4633#define LOAD_ROW_INDIRECT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
4634    LOAD_ROW_INDIRECT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
4635    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
4636    BASENAME##E;                                                                            \
4637    if(Y_MASK##E != 0)                                                                      \
4638        BASENAME##E = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##E * STRIDE_Y)); \
4639    else                                                                                    \
4640        BASENAME##E = 0;
4641
4642#define LOAD_ROW_INDIRECT_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
4643    LOAD_ROW_INDIRECT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
4644    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
4645    BASENAME##F;                                                                            \
4646    if(Y_MASK##F != 0)                                                                      \
4647        BASENAME##F = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##F * STRIDE_Y)); \
4648    else                                                                                    \
4649        BASENAME##F = 0;
4650
4651
/*
 * LOAD_BLOCK_INDIRECT(M0, N0, ...): entry point that selects
 * LOAD_ROW_INDIRECT_<M0> and forwards every other argument unchanged.
 *
 * The extra _STR level makes sure M0 is macro-expanded before it is pasted
 * onto LOAD_ROW_INDIRECT_, so M0 may itself be a macro (e.g. a build option).
 *
 *   M0        - number of rows to load (LOAD_ROW_INDIRECT_1..16 are defined)
 *   N0        - vector width used for VEC_DATA_TYPE and VLOAD
 *   DATA_TYPE - element type
 *   BASENAME  - prefix of the variables declared by the expansion
 *   PTR, OFFSET, STRIDE_Y - address is PTR + OFFSET + Y##i * STRIDE_Y
 *   Y         - prefix of the per-row index variables (Y##0, Y##1, ...)
 *   Y_MASK    - prefix of the per-row masks; a row whose mask is 0 is set to
 *               0 instead of being loaded
 */
4652#define LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) LOAD_ROW_INDIRECT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)
4653#define LOAD_BLOCK_INDIRECT(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)
4654
4655
/*
 * LOAD_ELEMENT_n: declare rows 0..n-1 and initialise each one from a single
 * element read from memory.
 *
 * Each macro expands the macro for n-1 rows and then, for row i = n-1, emits
 *   VEC_DATA_TYPE(DATA_TYPE, N0) BASENAME##i =
 *       *((__global DATA_TYPE *)(PTR + OFFSET + i * STRIDE_Y));
 * i.e. exactly one DATA_TYPE value is dereferenced per row and used as the
 * initialiser of the VEC_DATA_TYPE(DATA_TYPE, N0) variable.
 * The row suffix is a single hexadecimal digit (0-9, A-F) while the stride
 * multiplier is the decimal row number (0..15).
 *
 * NOTE(review): VEC_DATA_TYPE is defined elsewhere; presumably it names the
 * N0-wide OpenCL vector type, in which case the scalar is replicated to all
 * lanes by the usual scalar-to-vector conversion - confirm.
 * NOTE(review): STRIDE_Y is used as "i * STRIDE_Y" without its own
 * parentheses; pass a simple identifier or literal.
 */
4656#define LOAD_ELEMENT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
4657    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
4658    BASENAME##0 = *((__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y));
4659
4660#define LOAD_ELEMENT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
4661    LOAD_ELEMENT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
4662    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
4663    BASENAME##1 = *((__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y));
4664
4665#define LOAD_ELEMENT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
4666    LOAD_ELEMENT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
4667    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
4668    BASENAME##2 = *((__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y));
4669
4670#define LOAD_ELEMENT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
4671    LOAD_ELEMENT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
4672    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
4673    BASENAME##3 = *((__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y));
4674
4675#define LOAD_ELEMENT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
4676    LOAD_ELEMENT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
4677    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
4678    BASENAME##4 = *((__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y));
4679
4680#define LOAD_ELEMENT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
4681    LOAD_ELEMENT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
4682    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
4683    BASENAME##5 = *((__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y));
4684
4685#define LOAD_ELEMENT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
4686    LOAD_ELEMENT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
4687    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
4688    BASENAME##6 = *((__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y));
4689
4690#define LOAD_ELEMENT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
4691    LOAD_ELEMENT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
4692    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
4693    BASENAME##7 = *((__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y));
4694
4695#define LOAD_ELEMENT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
4696    LOAD_ELEMENT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
4697    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
4698    BASENAME##8 = *((__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y));
4699
4700#define LOAD_ELEMENT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
4701    LOAD_ELEMENT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)      \
4702    VEC_DATA_TYPE(DATA_TYPE, N0)                                        \
4703    BASENAME##9 = *((__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y));
4704
4705#define LOAD_ELEMENT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
4706    LOAD_ELEMENT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
4707    VEC_DATA_TYPE(DATA_TYPE, N0)                                        \
4708    BASENAME##A = *((__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y));
4709
4710#define LOAD_ELEMENT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
4711    LOAD_ELEMENT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
4712    VEC_DATA_TYPE(DATA_TYPE, N0)                                        \
4713    BASENAME##B = *((__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y));
4714
4715#define LOAD_ELEMENT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
4716    LOAD_ELEMENT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
4717    VEC_DATA_TYPE(DATA_TYPE, N0)                                        \
4718    BASENAME##C = *((__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y));
4719
4720#define LOAD_ELEMENT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
4721    LOAD_ELEMENT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
4722    VEC_DATA_TYPE(DATA_TYPE, N0)                                        \
4723    BASENAME##D = *((__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y));
4724
4725#define LOAD_ELEMENT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
4726    LOAD_ELEMENT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
4727    VEC_DATA_TYPE(DATA_TYPE, N0)                                        \
4728    BASENAME##E = *((__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y));
4729
4730#define LOAD_ELEMENT_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
4731    LOAD_ELEMENT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
4732    VEC_DATA_TYPE(DATA_TYPE, N0)                                        \
4733    BASENAME##F = *((__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y));
4734
4735
4736
4737
/*
 * LOAD_SCALAR_AS_VECTOR(M0, N0, ...): entry point that selects
 * LOAD_ELEMENT_<M0> and forwards every other argument unchanged.
 *
 * The extra _STR level makes sure M0 is macro-expanded before it is pasted
 * onto LOAD_ELEMENT_.
 *
 *   M0        - number of rows to declare and load (LOAD_ELEMENT_1..16 exist)
 *   N0        - width passed to VEC_DATA_TYPE for each declared row
 *   DATA_TYPE - type of the single element read per row
 *   BASENAME  - prefix of the declared variables (BASENAME##0, ...)
 *   PTR, OFFSET, STRIDE_Y - row i is read from PTR + OFFSET + i * STRIDE_Y
 */
4738#define LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) LOAD_ELEMENT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)
4739#define LOAD_SCALAR_AS_VECTOR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)
4740
4741
4742
/*
 * CALCULATE_Z_OFFSET_n: compute the per-row offsets Z##0 .. Z##(n-1).
 *
 * Each macro expands the macro for n-1 rows and then, for row i = n-1:
 *   Z##i  = (i + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D;
 *   Z##i  = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##i);   clamp to the last plane
 *   Z##i *= (CROSS_PLANE_PAD * STRIDE_Y);
 *
 * The Z##i variables are assigned, not declared, so they must already exist
 * at the expansion site. M0 is accepted and forwarded down the chain but is
 * not used in any of the generated statements. Only variants for 1..8 rows
 * are defined in this family.
 *
 * NOTE(review): the cast in "(DATA_TYPE)HEIGHT_GEMM3D" binds only to the
 * first operand if HEIGHT_GEMM3D expands to a compound expression; pass a
 * simple identifier or literal.
 */
4743#define CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
4744    Z##0 = (0 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D;                                               \
4745    Z##0 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##0);                                                      \
4746    Z##0 *= (CROSS_PLANE_PAD * STRIDE_Y);
4747
4748#define CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
4749    CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
4750    Z##1 = (1 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D;                                               \
4751    Z##1 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##1);                                                      \
4752    Z##1 *= (CROSS_PLANE_PAD * STRIDE_Y);
4753
4754#define CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
4755    CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
4756    Z##2 = (2 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D;                                               \
4757    Z##2 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##2);                                                      \
4758    Z##2 *= (CROSS_PLANE_PAD * STRIDE_Y);
4759
4760#define CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
4761    CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
4762    Z##3 = (3 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D;                                               \
4763    Z##3 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##3);                                                      \
4764    Z##3 *= (CROSS_PLANE_PAD * STRIDE_Y);
4765
4766#define CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
4767    CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
4768    Z##4 = (4 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D;                                               \
4769    Z##4 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##4);                                                      \
4770    Z##4 *= (CROSS_PLANE_PAD * STRIDE_Y);
4771
4772#define CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
4773    CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
4774    Z##5 = (5 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D;                                               \
4775    Z##5 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##5);                                                      \
4776    Z##5 *= (CROSS_PLANE_PAD * STRIDE_Y);
4777
4778#define CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
4779    CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
4780    Z##6 = (6 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D;                                               \
4781    Z##6 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##6);                                                      \
4782    Z##6 *= (CROSS_PLANE_PAD * STRIDE_Y);
4783
4784#define CALCULATE_Z_OFFSET_8(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
4785    CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
4786    Z##7 = (7 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D;                                               \
4787    Z##7 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##7);                                                      \
4788    Z##7 *= (CROSS_PLANE_PAD * STRIDE_Y);
4789
4790
4791
4792
/*
 * CALCULATE_Z_OFFSET(M0, ...): entry point that selects
 * CALCULATE_Z_OFFSET_<M0> and forwards all arguments, M0 included.
 *
 * The extra _STR level makes sure M0 is macro-expanded before it is pasted
 * onto CALCULATE_Z_OFFSET_. Only CALCULATE_Z_OFFSET_1..8 are defined, so M0
 * must expand to a value in 1..8.
 *
 *   M0              - number of rows (selects the _n variant)
 *   DATA_TYPE       - type used for the casts in the offset arithmetic
 *   Z               - prefix of the pre-declared output variables (Z##0, ...)
 *   Y               - base row index; row i uses i + Y
 *   HEIGHT_GEMM3D   - divisor applied to the row index
 *   DEPTH_GEMM3D    - results are clamped to DEPTH_GEMM3D - 1
 *   CROSS_PLANE_PAD, STRIDE_Y - the clamped value is scaled by their product
 */
4793#define CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) CALCULATE_Z_OFFSET_##M0(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)
4794#define CALCULATE_Z_OFFSET(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)
4795
4796
4797
/*
 * SCALE_ROW_n: multiply rows 0..n-1 in place by a common factor.
 *
 * Each macro expands the macro for n-1 rows and then emits, for row i = n-1,
 *   BASENAME##i *= (DATA_TYPE)SCALE;
 * The row suffix is a single hexadecimal digit (0-9, then A-F for rows 10-15).
 * The BASENAME##i variables must already be declared.
 *
 * NOTE(review): the cast in "(DATA_TYPE)SCALE" binds only to the first
 * operand if SCALE expands to a compound expression; pass a simple
 * identifier or literal.
 */
4798#define SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) \
4799    BASENAME##0 *= (DATA_TYPE)SCALE;
4800
4801#define SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE) \
4802    SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE)     \
4803    BASENAME##1 *= (DATA_TYPE)SCALE;
4804
4805#define SCALE_ROW_3(DATA_TYPE, BASENAME, SCALE) \
4806    SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE)     \
4807    BASENAME##2 *= (DATA_TYPE)SCALE;
4808
4809#define SCALE_ROW_4(DATA_TYPE, BASENAME, SCALE) \
4810    SCALE_ROW_3(DATA_TYPE, BASENAME, SCALE)     \
4811    BASENAME##3 *= (DATA_TYPE)SCALE;
4812
4813#define SCALE_ROW_5(DATA_TYPE, BASENAME, SCALE) \
4814    SCALE_ROW_4(DATA_TYPE, BASENAME, SCALE)     \
4815    BASENAME##4 *= (DATA_TYPE)SCALE;
4816
4817#define SCALE_ROW_6(DATA_TYPE, BASENAME, SCALE) \
4818    SCALE_ROW_5(DATA_TYPE, BASENAME, SCALE)     \
4819    BASENAME##5 *= (DATA_TYPE)SCALE;
4820
4821#define SCALE_ROW_7(DATA_TYPE, BASENAME, SCALE) \
4822    SCALE_ROW_6(DATA_TYPE, BASENAME, SCALE)     \
4823    BASENAME##6 *= (DATA_TYPE)SCALE;
4824
4825#define SCALE_ROW_8(DATA_TYPE, BASENAME, SCALE) \
4826    SCALE_ROW_7(DATA_TYPE, BASENAME, SCALE)     \
4827    BASENAME##7 *= (DATA_TYPE)SCALE;
4828
4829#define SCALE_ROW_9(DATA_TYPE, BASENAME, SCALE) \
4830    SCALE_ROW_8(DATA_TYPE, BASENAME, SCALE)     \
4831    BASENAME##8 *= (DATA_TYPE)SCALE;
4832
4833#define SCALE_ROW_10(DATA_TYPE, BASENAME, SCALE) \
4834    SCALE_ROW_9(DATA_TYPE, BASENAME, SCALE)      \
4835    BASENAME##9 *= (DATA_TYPE)SCALE;
4836
4837#define SCALE_ROW_11(DATA_TYPE, BASENAME, SCALE) \
4838    SCALE_ROW_10(DATA_TYPE, BASENAME, SCALE)     \
4839    BASENAME##A *= (DATA_TYPE)SCALE;
4840
4841#define SCALE_ROW_12(DATA_TYPE, BASENAME, SCALE) \
4842    SCALE_ROW_11(DATA_TYPE, BASENAME, SCALE)     \
4843    BASENAME##B *= (DATA_TYPE)SCALE;
4844
4845#define SCALE_ROW_13(DATA_TYPE, BASENAME, SCALE) \
4846    SCALE_ROW_12(DATA_TYPE, BASENAME, SCALE)     \
4847    BASENAME##C *= (DATA_TYPE)SCALE;
4848
4849#define SCALE_ROW_14(DATA_TYPE, BASENAME, SCALE) \
4850    SCALE_ROW_13(DATA_TYPE, BASENAME, SCALE)     \
4851    BASENAME##D *= (DATA_TYPE)SCALE;
4852
4853#define SCALE_ROW_15(DATA_TYPE, BASENAME, SCALE) \
4854    SCALE_ROW_14(DATA_TYPE, BASENAME, SCALE)     \
4855    BASENAME##E *= (DATA_TYPE)SCALE;
4856
4857#define SCALE_ROW_16(DATA_TYPE, BASENAME, SCALE) \
4858    SCALE_ROW_15(DATA_TYPE, BASENAME, SCALE)     \
4859    BASENAME##F *= (DATA_TYPE)SCALE;
4860
4861
4862
/*
 * SCALE_BLOCK(N, DATA_TYPE, BASENAME, SCALE): entry point that selects
 * SCALE_ROW_<N> and so multiplies BASENAME##0 .. BASENAME##(N-1) by
 * (DATA_TYPE)SCALE.
 *
 * The extra _STR level makes sure N is macro-expanded before it is pasted
 * onto SCALE_ROW_. SCALE_ROW_1..16 are defined, so N must expand to 1..16.
 */
4863#define SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE) SCALE_ROW_##N(DATA_TYPE, BASENAME, SCALE)
4864#define SCALE_BLOCK(N, DATA_TYPE, BASENAME, SCALE) SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE)
4865
4866
4867
/*
 * COLUMN_VECTOR<K0>: declare BASENAME##IDX_COL and fill it with component
 * .s<IDX_COL> taken from each of the K0 row vectors X##0 .. X##(K0-1), i.e.
 * gather one column of a block whose rows are stored in separate vectors.
 *
 *   IDX_COL  - component to extract; it is pasted after ".s" and also used
 *              as the suffix of the declared variable
 *   BASENAME - prefix of the declared result
 *   X        - prefix of the source row vectors (rows 10-15 use suffixes A-F)
 *   TYPE     - element type of the result
 *
 * K0 = 1 declares a plain TYPE scalar; K0 = 2, 3, 4, 8, 16 declare
 * VEC_DATA_TYPE(TYPE, K0). No other K0 values are defined. Each expansion
 * already ends with ';'.
 */
4868#define COLUMN_VECTOR1(IDX_COL, BASENAME, X, TYPE) \
4869    TYPE BASENAME##IDX_COL = (TYPE)((X##0).s##IDX_COL);
4870#define COLUMN_VECTOR2(IDX_COL, BASENAME, X, TYPE) \
4871    VEC_DATA_TYPE(TYPE, 2)                         \
4872    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0).s##IDX_COL, (X##1).s##IDX_COL);
4873#define COLUMN_VECTOR3(IDX_COL, BASENAME, X, TYPE) \
4874    VEC_DATA_TYPE(TYPE, 3)                         \
4875    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 3))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL);
4876#define COLUMN_VECTOR4(IDX_COL, BASENAME, X, TYPE) \
4877    VEC_DATA_TYPE(TYPE, 4)                         \
4878    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 4))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL);
4879#define COLUMN_VECTOR8(IDX_COL, BASENAME, X, TYPE) \
4880    VEC_DATA_TYPE(TYPE, 8)                         \
4881    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL);
4882#define COLUMN_VECTOR16(IDX_COL, BASENAME, X, TYPE) \
4883    VEC_DATA_TYPE(TYPE, 16)                         \
4884    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL, (X##8).s##IDX_COL, (X##9).s##IDX_COL, (X##A).s##IDX_COL, (X##B).s##IDX_COL, (X##C).s##IDX_COL, (X##D).s##IDX_COL, (X##E).s##IDX_COL, (X##F).s##IDX_COL);
4885
4886
4887
/*
 * COLUMN_VECTOR_SCALAR<K0>: declare BASENAME##IDX_COL and fill it with the
 * K0 values X##0 .. X##(K0-1) used as they are (no ".s" component select).
 * This is the counterpart of COLUMN_VECTOR<K0> for rows that hold a single
 * value each.
 *
 *   IDX_COL  - only used as the suffix of the declared variable
 *   BASENAME - prefix of the declared result
 *   X        - prefix of the source rows (rows 10-15 use suffixes A-F)
 *   TYPE     - element type of the result
 *
 * K0 = 1 declares a plain TYPE scalar; K0 = 2, 3, 4, 8, 16 declare
 * VEC_DATA_TYPE(TYPE, K0). No other K0 values are defined. Each expansion
 * already ends with ';'.
 */
4888#define COLUMN_VECTOR_SCALAR1(IDX_COL, BASENAME, X, TYPE) \
4889    TYPE BASENAME##IDX_COL = (TYPE)((X##0));
4890#define COLUMN_VECTOR_SCALAR2(IDX_COL, BASENAME, X, TYPE) \
4891    VEC_DATA_TYPE(TYPE, 2)                                \
4892    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0), (X##1));
4893#define COLUMN_VECTOR_SCALAR3(IDX_COL, BASENAME, X, TYPE) \
4894    VEC_DATA_TYPE(TYPE, 3)                                \
4895    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 3))((X##0), (X##1), (X##2));
4896#define COLUMN_VECTOR_SCALAR4(IDX_COL, BASENAME, X, TYPE) \
4897    VEC_DATA_TYPE(TYPE, 4)                                \
4898    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 4))((X##0), (X##1), (X##2), (X##3));
4899#define COLUMN_VECTOR_SCALAR8(IDX_COL, BASENAME, X, TYPE) \
4900    VEC_DATA_TYPE(TYPE, 8)                                \
4901    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7));
4902#define COLUMN_VECTOR_SCALAR16(IDX_COL, BASENAME, X, TYPE) \
4903    VEC_DATA_TYPE(TYPE, 16)                                \
4904    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7), (X##8), (X##9), (X##A), (X##B), (X##C), (X##D), (X##E), (X##F));
4905
4906
4907
/*
 * TRANSPOSE_K0X<N0>: build N0 column variables BASENAME##0 .. BASENAME##(N0-1)
 * from K0 row variables BS##0 .. BS##(K0-1).
 *
 *   - N0 = 1 uses COLUMN_VECTOR_SCALAR for column 0, so the rows are taken
 *     as they are (no ".s" component select).
 *   - N0 = 2, 3, 4, 8, 16 use COLUMN_VECTOR once per column; each variant
 *     expands the next smaller one and adds its own columns (X2: 0-1,
 *     X3: +2, X4: +3, X8: +4..7, X16: +8..F).
 *
 * COLUMN_VECTOR and COLUMN_VECTOR_SCALAR are defined further down in this
 * file; that is fine because macros are only resolved where TRANSPOSE_K0X<N0>
 * is expanded. Both already end in ';', so the ';' written after each
 * invocation here only adds an empty statement.
 */
4908#define TRANSPOSE_K0X1(K0, BASENAME, BS, TYPE) \
4909    COLUMN_VECTOR_SCALAR(K0, 0, BASENAME, BS, TYPE);
4910#define TRANSPOSE_K0X2(K0, BASENAME, BS, TYPE) \
4911    COLUMN_VECTOR(K0, 0, BASENAME, BS, TYPE);  \
4912    COLUMN_VECTOR(K0, 1, BASENAME, BS, TYPE);
4913#define TRANSPOSE_K0X3(K0, BASENAME, BS, TYPE) \
4914    TRANSPOSE_K0X2(K0, BASENAME, BS, TYPE);    \
4915    COLUMN_VECTOR(K0, 2, BASENAME, BS, TYPE);
4916#define TRANSPOSE_K0X4(K0, BASENAME, BS, TYPE) \
4917    TRANSPOSE_K0X3(K0, BASENAME, BS, TYPE);    \
4918    COLUMN_VECTOR(K0, 3, BASENAME, BS, TYPE);
4919#define TRANSPOSE_K0X8(K0, BASENAME, BS, TYPE) \
4920    TRANSPOSE_K0X4(K0, BASENAME, BS, TYPE);    \
4921    COLUMN_VECTOR(K0, 4, BASENAME, BS, TYPE);  \
4922    COLUMN_VECTOR(K0, 5, BASENAME, BS, TYPE);  \
4923    COLUMN_VECTOR(K0, 6, BASENAME, BS, TYPE);  \
4924    COLUMN_VECTOR(K0, 7, BASENAME, BS, TYPE);
4925#define TRANSPOSE_K0X16(K0, BASENAME, BS, TYPE) \
4926    TRANSPOSE_K0X8(K0, BASENAME, BS, TYPE);     \
4927    COLUMN_VECTOR(K0, 8, BASENAME, BS, TYPE);   \
4928    COLUMN_VECTOR(K0, 9, BASENAME, BS, TYPE);   \
4929    COLUMN_VECTOR(K0, A, BASENAME, BS, TYPE);   \
4930    COLUMN_VECTOR(K0, B, BASENAME, BS, TYPE);   \
4931    COLUMN_VECTOR(K0, C, BASENAME, BS, TYPE);   \
4932    COLUMN_VECTOR(K0, D, BASENAME, BS, TYPE);   \
4933    COLUMN_VECTOR(K0, E, BASENAME, BS, TYPE);   \
4934    COLUMN_VECTOR(K0, F, BASENAME, BS, TYPE);
4935
4936
4937
4938
/*
 * COLUMN_VECTOR(K0, IDX_COL, BASENAME, BS, TYPE): dispatch to the
 * COLUMN_VECTOR<K0> variant, which declares BASENAME##IDX_COL from component
 * .s<IDX_COL> of the rows BS##0 .. BS##(K0-1).
 *
 * The variant name is produced by CONCAT(COLUMN_VECTOR, K0); only
 * K0 = 1, 2, 3, 4, 8, 16 have a matching definition in this file.
 * The expansion ends with ';' (on top of the one inside the variant).
 *
 * NOTE(review): CONCAT is defined elsewhere; presumably it expands its
 * arguments before pasting so K0 may be a macro - confirm.
 */
4939#define COLUMN_VECTOR(K0, IDX_COL, BASENAME, BS, TYPE) \
4940    CONCAT(COLUMN_VECTOR, K0)                          \
4941    (IDX_COL, BASENAME, BS, TYPE);
4942
4943
/*
 * COLUMN_VECTOR_SCALAR(K0, IDX_COL, BASENAME, BS, TYPE): dispatch to the
 * COLUMN_VECTOR_SCALAR<K0> variant, which declares BASENAME##IDX_COL from
 * the rows BS##0 .. BS##(K0-1) taken as they are.
 *
 * The variant name is produced by CONCAT(COLUMN_VECTOR_SCALAR, K0); only
 * K0 = 1, 2, 3, 4, 8, 16 have a matching definition in this file.
 * The expansion ends with ';' (on top of the one inside the variant).
 *
 * NOTE(review): CONCAT is defined elsewhere; presumably it expands its
 * arguments before pasting so K0 may be a macro - confirm.
 */
4944#define COLUMN_VECTOR_SCALAR(K0, IDX_COL, BASENAME, BS, TYPE) \
4945    CONCAT(COLUMN_VECTOR_SCALAR, K0)                          \
4946    (IDX_COL, BASENAME, BS, TYPE);
4947
4948
/*
 * TRANSPOSE_K0XN0(K0, N0, BASENAME, BS, TYPE): entry point that dispatches
 * to TRANSPOSE_K0X<N0>, building the column variables BASENAME##0 ..
 * BASENAME##(N0-1) from the K0 row variables BS##0 .. BS##(K0-1).
 *
 *   K0       - number of source rows; must be 1, 2, 3, 4, 8 or 16 (the
 *              COLUMN_VECTOR variants that exist)
 *   N0       - number of columns to build; must be 1, 2, 3, 4, 8 or 16 (the
 *              TRANSPOSE_K0X variants that exist)
 *   BASENAME - prefix of the declared column variables
 *   BS       - prefix of the source row variables
 *   TYPE     - element type of the result
 *
 * NOTE(review): CONCAT is defined elsewhere; presumably it expands its
 * arguments before pasting so N0 may be a macro - confirm.
 */
4949#define TRANSPOSE_K0XN0(K0, N0, BASENAME, BS, TYPE) \
4950    CONCAT(TRANSPOSE_K0X, N0)                       \
4951    (K0, BASENAME, BS, TYPE);
4952
4953
/*
 * ADD_ROW_n: add a block to another block row by row, in place.
 *
 * Each macro expands the macro for n-1 rows and then emits, for row i = n-1,
 *   BASENAME##i += BIAS##i;
 * so every destination row has its own matching BIAS row. The row suffix is
 * a single hexadecimal digit (0-9, then A-F for rows 10-15). Both sets of
 * variables must already be declared.
 */
4954#define ADD_ROW_1(BASENAME, BIAS) \
4955    BASENAME##0 += BIAS##0;
4956
4957#define ADD_ROW_2(BASENAME, BIAS) \
4958    ADD_ROW_1(BASENAME, BIAS)     \
4959    BASENAME##1 += BIAS##1;
4960
4961#define ADD_ROW_3(BASENAME, BIAS) \
4962    ADD_ROW_2(BASENAME, BIAS)     \
4963    BASENAME##2 += BIAS##2;
4964
4965#define ADD_ROW_4(BASENAME, BIAS) \
4966    ADD_ROW_3(BASENAME, BIAS)     \
4967    BASENAME##3 += BIAS##3;
4968
4969#define ADD_ROW_5(BASENAME, BIAS) \
4970    ADD_ROW_4(BASENAME, BIAS)     \
4971    BASENAME##4 += BIAS##4;
4972
4973#define ADD_ROW_6(BASENAME, BIAS) \
4974    ADD_ROW_5(BASENAME, BIAS)     \
4975    BASENAME##5 += BIAS##5;
4976
4977#define ADD_ROW_7(BASENAME, BIAS) \
4978    ADD_ROW_6(BASENAME, BIAS)     \
4979    BASENAME##6 += BIAS##6;
4980
4981#define ADD_ROW_8(BASENAME, BIAS) \
4982    ADD_ROW_7(BASENAME, BIAS)     \
4983    BASENAME##7 += BIAS##7;
4984
4985#define ADD_ROW_9(BASENAME, BIAS) \
4986    ADD_ROW_8(BASENAME, BIAS)     \
4987    BASENAME##8 += BIAS##8;
4988
4989#define ADD_ROW_10(BASENAME, BIAS) \
4990    ADD_ROW_9(BASENAME, BIAS)      \
4991    BASENAME##9 += BIAS##9;
4992
4993#define ADD_ROW_11(BASENAME, BIAS) \
4994    ADD_ROW_10(BASENAME, BIAS)     \
4995    BASENAME##A += BIAS##A;
4996
4997#define ADD_ROW_12(BASENAME, BIAS) \
4998    ADD_ROW_11(BASENAME, BIAS)     \
4999    BASENAME##B += BIAS##B;
5000
5001#define ADD_ROW_13(BASENAME, BIAS) \
5002    ADD_ROW_12(BASENAME, BIAS)     \
5003    BASENAME##C += BIAS##C;
5004
5005#define ADD_ROW_14(BASENAME, BIAS) \
5006    ADD_ROW_13(BASENAME, BIAS)     \
5007    BASENAME##D += BIAS##D;
5008
5009#define ADD_ROW_15(BASENAME, BIAS) \
5010    ADD_ROW_14(BASENAME, BIAS)     \
5011    BASENAME##E += BIAS##E;
5012
5013#define ADD_ROW_16(BASENAME, BIAS) \
5014    ADD_ROW_15(BASENAME, BIAS)     \
5015    BASENAME##F += BIAS##F;
5016
5017
5018
5019
// ADD_BLOCK(N, BASENAME, BIAS)
//
// Selects ADD_ROW_<N> by pasting N onto the name ADD_ROW_. The call goes
// through ADD_BLOCK_STR so that N is macro-expanded first: an argument that is
// an operand of ## is not expanded, so pasting directly in ADD_BLOCK would
// glue the unexpanded spelling of N onto the name.
// N has to expand to a literal for which an ADD_ROW_<N> macro exists.
5020#define ADD_BLOCK_STR(N, BASENAME, BIAS) ADD_ROW_##N(BASENAME, BIAS)
5021#define ADD_BLOCK(N, BASENAME, BIAS) ADD_BLOCK_STR(N, BASENAME, BIAS)
5022
5023
5024
// ADD_ROW_BROADCAST_<n>(BASENAME, BIAS), n = 1 .. 16
//
// Emits the statements
//     BASENAME##i += BIAS;
// for rows i = 0 .. n-1, i.e. the very same BIAS expression is added to every
// row (no suffix is pasted onto BIAS). Each macro expands the previous one and
// appends one more row; rows 10-15 use the suffixes A-F.
// BIAS is substituted textually into every row, so an expression with side
// effects would be evaluated once per row.
5025#define ADD_ROW_BROADCAST_1(BASENAME, BIAS) \
5026    BASENAME##0 += BIAS;
5027
5028#define ADD_ROW_BROADCAST_2(BASENAME, BIAS) \
5029    ADD_ROW_BROADCAST_1(BASENAME, BIAS)     \
5030    BASENAME##1 += BIAS;
5031
5032#define ADD_ROW_BROADCAST_3(BASENAME, BIAS) \
5033    ADD_ROW_BROADCAST_2(BASENAME, BIAS)     \
5034    BASENAME##2 += BIAS;
5035
5036#define ADD_ROW_BROADCAST_4(BASENAME, BIAS) \
5037    ADD_ROW_BROADCAST_3(BASENAME, BIAS)     \
5038    BASENAME##3 += BIAS;
5039
5040#define ADD_ROW_BROADCAST_5(BASENAME, BIAS) \
5041    ADD_ROW_BROADCAST_4(BASENAME, BIAS)     \
5042    BASENAME##4 += BIAS;
5043
5044#define ADD_ROW_BROADCAST_6(BASENAME, BIAS) \
5045    ADD_ROW_BROADCAST_5(BASENAME, BIAS)     \
5046    BASENAME##5 += BIAS;
5047
5048#define ADD_ROW_BROADCAST_7(BASENAME, BIAS) \
5049    ADD_ROW_BROADCAST_6(BASENAME, BIAS)     \
5050    BASENAME##6 += BIAS;
5051
5052#define ADD_ROW_BROADCAST_8(BASENAME, BIAS) \
5053    ADD_ROW_BROADCAST_7(BASENAME, BIAS)     \
5054    BASENAME##7 += BIAS;
5055
5056#define ADD_ROW_BROADCAST_9(BASENAME, BIAS) \
5057    ADD_ROW_BROADCAST_8(BASENAME, BIAS)     \
5058    BASENAME##8 += BIAS;
5059
5060#define ADD_ROW_BROADCAST_10(BASENAME, BIAS) \
5061    ADD_ROW_BROADCAST_9(BASENAME, BIAS)      \
5062    BASENAME##9 += BIAS;
5063
5064#define ADD_ROW_BROADCAST_11(BASENAME, BIAS) \
5065    ADD_ROW_BROADCAST_10(BASENAME, BIAS)     \
5066    BASENAME##A += BIAS;
5067
5068#define ADD_ROW_BROADCAST_12(BASENAME, BIAS) \
5069    ADD_ROW_BROADCAST_11(BASENAME, BIAS)     \
5070    BASENAME##B += BIAS;
5071
5072#define ADD_ROW_BROADCAST_13(BASENAME, BIAS) \
5073    ADD_ROW_BROADCAST_12(BASENAME, BIAS)     \
5074    BASENAME##C += BIAS;
5075
5076#define ADD_ROW_BROADCAST_14(BASENAME, BIAS) \
5077    ADD_ROW_BROADCAST_13(BASENAME, BIAS)     \
5078    BASENAME##D += BIAS;
5079
5080#define ADD_ROW_BROADCAST_15(BASENAME, BIAS) \
5081    ADD_ROW_BROADCAST_14(BASENAME, BIAS)     \
5082    BASENAME##E += BIAS;
5083
5084#define ADD_ROW_BROADCAST_16(BASENAME, BIAS) \
5085    ADD_ROW_BROADCAST_15(BASENAME, BIAS)     \
5086    BASENAME##F += BIAS;
5087
5088
// ADD_BLOCK_BROADCAST(N, BASENAME, BIAS)
//
// Selects ADD_ROW_BROADCAST_<N> by pasting N onto the name. The extra hop
// through ADD_BLOCK_BROADCAST_STR lets N be macro-expanded before the ##
// paste takes place. N has to expand to a literal for which an
// ADD_ROW_BROADCAST_<N> macro exists.
5089#define ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS) ADD_ROW_BROADCAST_##N(BASENAME, BIAS)
5090#define ADD_BLOCK_BROADCAST(N, BASENAME, BIAS) ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS)
5091
5092
5093
// ACTIVATION_ROW_<n>(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL), n = 1 .. 16
//
// Emits, for rows i = 0 .. n-1, the in-place update
//     BASENAME##i = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##i, A_VAL, B_VAL);
// Each macro expands the previous one and appends one more row; rows 10-15
// use the suffixes A-F. All arguments other than BASENAME are forwarded
// unchanged to ACTIVATION, which is defined outside this section.
5094#define ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
5095    BASENAME##0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##0, A_VAL, B_VAL);
5096
5097#define ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
5098    ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
5099    BASENAME##1 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##1, A_VAL, B_VAL);
5100
5101#define ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
5102    ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
5103    BASENAME##2 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##2, A_VAL, B_VAL);
5104
5105#define ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
5106    ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
5107    BASENAME##3 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##3, A_VAL, B_VAL);
5108
5109#define ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
5110    ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
5111    BASENAME##4 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##4, A_VAL, B_VAL);
5112
5113#define ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
5114    ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
5115    BASENAME##5 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##5, A_VAL, B_VAL);
5116
5117#define ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
5118    ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
5119    BASENAME##6 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##6, A_VAL, B_VAL);
5120
5121#define ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
5122    ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
5123    BASENAME##7 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##7, A_VAL, B_VAL);
5124
5125#define ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
5126    ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
5127    BASENAME##8 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##8, A_VAL, B_VAL);
5128
5129#define ACTIVATION_ROW_10(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
5130    ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)      \
5131    BASENAME##9 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##9, A_VAL, B_VAL);
5132
5133#define ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
5134    ACTIVATION_ROW_10(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
5135    BASENAME##A = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##A, A_VAL, B_VAL);
5136
5137#define ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
5138    ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
5139    BASENAME##B = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##B, A_VAL, B_VAL);
5140
5141#define ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
5142    ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
5143    BASENAME##C = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##C, A_VAL, B_VAL);
5144
5145#define ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
5146    ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
5147    BASENAME##D = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##D, A_VAL, B_VAL);
5148
5149#define ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
5150    ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
5151    BASENAME##E = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##E, A_VAL, B_VAL);
5152
5153#define ACTIVATION_ROW_16(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
5154    ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
5155    BASENAME##F = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##F, A_VAL, B_VAL);
5156
5157
5158
// ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)
//
// Selects ACTIVATION_ROW_<N> by pasting N onto the name and forwards the
// remaining arguments unchanged. The hop through ACTIVATION_BLOCK_STR lets N
// be macro-expanded before the ## paste. N has to expand to a literal for
// which an ACTIVATION_ROW_<N> macro exists.
5159#define ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) ACTIVATION_ROW_##N(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)
5160#define ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)
5161
5162
5163
// CONVERT_ROW_<m>(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST), m = 1 .. 16
//
// Emits, for rows i = 0 .. m-1, a declaration with initialiser:
//     VEC_DATA_TYPE(DATA_TYPE, N) BASENAME_DST##i = CONVERT(BASENAME_SRC##i, VEC_DATA_TYPE(DATA_TYPE, N));
// Because each row is a declaration, the BASENAME_DST##i names are introduced
// by this macro and must not already be declared in the enclosing scope.
// Each macro expands the previous one and appends one more row; rows 10-15
// use the suffixes A-F. VEC_DATA_TYPE and CONVERT are defined outside this
// section.
5164#define CONVERT_ROW_1(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
5165    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
5166    BASENAME_DST##0 = CONVERT(BASENAME_SRC##0, VEC_DATA_TYPE(DATA_TYPE, N));
5167
5168#define CONVERT_ROW_2(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
5169    CONVERT_ROW_1(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
5170    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
5171    BASENAME_DST##1 = CONVERT(BASENAME_SRC##1, VEC_DATA_TYPE(DATA_TYPE, N));
5172
5173#define CONVERT_ROW_3(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
5174    CONVERT_ROW_2(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
5175    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
5176    BASENAME_DST##2 = CONVERT(BASENAME_SRC##2, VEC_DATA_TYPE(DATA_TYPE, N));
5177
5178#define CONVERT_ROW_4(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
5179    CONVERT_ROW_3(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
5180    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
5181    BASENAME_DST##3 = CONVERT(BASENAME_SRC##3, VEC_DATA_TYPE(DATA_TYPE, N));
5182
5183#define CONVERT_ROW_5(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
5184    CONVERT_ROW_4(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
5185    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
5186    BASENAME_DST##4 = CONVERT(BASENAME_SRC##4, VEC_DATA_TYPE(DATA_TYPE, N));
5187
5188#define CONVERT_ROW_6(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
5189    CONVERT_ROW_5(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
5190    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
5191    BASENAME_DST##5 = CONVERT(BASENAME_SRC##5, VEC_DATA_TYPE(DATA_TYPE, N));
5192
5193#define CONVERT_ROW_7(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
5194    CONVERT_ROW_6(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
5195    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
5196    BASENAME_DST##6 = CONVERT(BASENAME_SRC##6, VEC_DATA_TYPE(DATA_TYPE, N));
5197
5198#define CONVERT_ROW_8(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
5199    CONVERT_ROW_7(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
5200    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
5201    BASENAME_DST##7 = CONVERT(BASENAME_SRC##7, VEC_DATA_TYPE(DATA_TYPE, N));
5202
5203#define CONVERT_ROW_9(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
5204    CONVERT_ROW_8(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
5205    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
5206    BASENAME_DST##8 = CONVERT(BASENAME_SRC##8, VEC_DATA_TYPE(DATA_TYPE, N));
5207
5208#define CONVERT_ROW_10(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
5209    CONVERT_ROW_9(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)      \
5210    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
5211    BASENAME_DST##9 = CONVERT(BASENAME_SRC##9, VEC_DATA_TYPE(DATA_TYPE, N));
5212
5213#define CONVERT_ROW_11(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
5214    CONVERT_ROW_10(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
5215    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
5216    BASENAME_DST##A = CONVERT(BASENAME_SRC##A, VEC_DATA_TYPE(DATA_TYPE, N));
5217
5218#define CONVERT_ROW_12(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
5219    CONVERT_ROW_11(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
5220    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
5221    BASENAME_DST##B = CONVERT(BASENAME_SRC##B, VEC_DATA_TYPE(DATA_TYPE, N));
5222
5223#define CONVERT_ROW_13(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
5224    CONVERT_ROW_12(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
5225    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
5226    BASENAME_DST##C = CONVERT(BASENAME_SRC##C, VEC_DATA_TYPE(DATA_TYPE, N));
5227
5228#define CONVERT_ROW_14(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
5229    CONVERT_ROW_13(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
5230    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
5231    BASENAME_DST##D = CONVERT(BASENAME_SRC##D, VEC_DATA_TYPE(DATA_TYPE, N));
5232
5233#define CONVERT_ROW_15(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
5234    CONVERT_ROW_14(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
5235    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
5236    BASENAME_DST##E = CONVERT(BASENAME_SRC##E, VEC_DATA_TYPE(DATA_TYPE, N));
5237
5238#define CONVERT_ROW_16(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
5239    CONVERT_ROW_15(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
5240    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
5241    BASENAME_DST##F = CONVERT(BASENAME_SRC##F, VEC_DATA_TYPE(DATA_TYPE, N));
5242
5243
5244
// CONVERT_BLOCK(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
//
// Selects CONVERT_ROW_<M> by pasting M onto the name and forwards
// (N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) unchanged. M is the row count
// that picks the macro; N is only passed through. The hop through
// CONVERT_BLOCK_STR lets M be macro-expanded before the ## paste. M has to
// expand to a literal for which a CONVERT_ROW_<M> macro exists.
5245#define CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_ROW_##M(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
5246#define CONVERT_BLOCK(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
5247
5248
5249
5250
// STORE_ROW_<m>(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z), m = 1 .. 16
//
// Emits, for rows i = 0 .. m-1, one store call of the form
//     VSTORE(N0)(BASENAME##i, 0, (__global DATA_TYPE *)(PTR + i * STRIDE_Y + Z##i));
// i.e. row i is written at PTR plus i times STRIDE_Y plus the per-row value
// Z##i, with the address cast to a __global DATA_TYPE pointer. Each macro
// expands the previous one and appends one more row.
// For rows 10-15 the variable suffix is the hex digit A-F while the stride
// multiplier is written in decimal (10-15).
// VSTORE is defined outside this section.
// NOTE(review): the head of this file already defines STORE_ROW_<m> (at least
// m = 1..7 are visible there) with the same text; the preprocessor accepts a
// redefinition only while the two stay token-identical, so keep them in sync.
5251#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5252    VSTORE(N0)                                                 \
5253    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
5254
5255#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5256    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5257    VSTORE(N0)                                                 \
5258    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
5259
5260#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5261    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5262    VSTORE(N0)                                                 \
5263    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
5264
5265#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5266    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5267    VSTORE(N0)                                                 \
5268    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
5269
5270#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5271    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5272    VSTORE(N0)                                                 \
5273    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
5274
5275#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5276    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5277    VSTORE(N0)                                                 \
5278    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
5279
5280#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5281    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5282    VSTORE(N0)                                                 \
5283    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
5284
5285#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5286    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5287    VSTORE(N0)                                                 \
5288    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
5289
5290#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5291    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5292    VSTORE(N0)                                                 \
5293    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
5294
5295#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5296    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
5297    VSTORE(N0)                                                  \
5298    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
5299
5300#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5301    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5302    VSTORE(N0)                                                  \
5303    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
5304
5305#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5306    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5307    VSTORE(N0)                                                  \
5308    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
5309
5310#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5311    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5312    VSTORE(N0)                                                  \
5313    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
5314
5315#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5316    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5317    VSTORE(N0)                                                  \
5318    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
5319
5320#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5321    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5322    VSTORE(N0)                                                  \
5323    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
5324
5325#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5326    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5327    VSTORE(N0)                                                  \
5328    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
5329
5330
5331
// CONVERT_STORE_ROW_<m>(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z), m = 1 .. 9
//
// Emits, for rows i = 0 .. m-1, one store call of the form
//     VSTORE(N0)(CONVERT_SAT((BASENAME##i), VEC_DATA_TYPE(DATA_TYPE, N0)), 0,
//                (__global DATA_TYPE *)(PTR + i * STRIDE_Y + Z##i));
// so each row is passed through CONVERT_SAT with the target type
// VEC_DATA_TYPE(DATA_TYPE, N0) and the result is what gets stored. The
// destination address is PTR plus i times STRIDE_Y plus the per-row value
// Z##i. Each macro expands the previous one and appends one more row.
// VSTORE, CONVERT_SAT and VEC_DATA_TYPE are defined outside this section.
5332#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5333    VSTORE(N0)                                                         \
5334    (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
5335
5336#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5337    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5338    VSTORE(N0)                                                         \
5339    (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
5340
5341#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5342    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5343    VSTORE(N0)                                                         \
5344    (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
5345
5346#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5347    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5348    VSTORE(N0)                                                         \
5349    (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
5350
5351#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5352    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5353    VSTORE(N0)                                                         \
5354    (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
5355
5356#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5357    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5358    VSTORE(N0)                                                         \
5359    (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
5360
5361#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5362    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5363    VSTORE(N0)                                                         \
5364    (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
5365
5366#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5367    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5368    VSTORE(N0)                                                         \
5369    (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
5370
5371#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5372    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5373    VSTORE(N0)                                                         \
5374    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
5375
// CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
//
// Converts (CONVERT_SAT to VEC_DATA_TYPE(DATA_TYPE, N0)) and stores rows
// 0 .. 9: rows 0 .. 8 come from CONVERT_STORE_ROW_9, row 9 is appended here.
//
// The second parameter must be spelled DATA_TYPE, because that is the token
// the body uses. With any other parameter name the caller's type argument is
// silently discarded and every DATA_TYPE in the body (including the one
// forwarded to CONVERT_STORE_ROW_9) resolves to whatever DATA_TYPE means at
// the expansion site instead.
5376#define CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5377    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
5378    VSTORE(N0)                                                          \
5379    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
5380
// CONVERT_STORE_ROW_<m>(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z), m = 11 .. 16
//
// Upper part of the CONVERT_STORE_ROW ladder. Each macro first expands the
// previous one (m-1 rows) and then appends the store for row m-1:
//     VSTORE(N0)(CONVERT_SAT((BASENAME##x), VEC_DATA_TYPE(DATA_TYPE, N0)), 0,
//                (__global DATA_TYPE *)(PTR + i * STRIDE_Y + Z##x));
// where x is the hex-digit suffix A-F and i the matching decimal row index
// 10-15 used as the stride multiplier.
// VSTORE, CONVERT_SAT and VEC_DATA_TYPE are defined outside this section.
5381#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5382    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5383    VSTORE(N0)                                                          \
5384    (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
5385
5386#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5387    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5388    VSTORE(N0)                                                          \
5389    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
5390
5391#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5392    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5393    VSTORE(N0)                                                          \
5394    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
5395
5396#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5397    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5398    VSTORE(N0)                                                          \
5399    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
5400
5401#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5402    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5403    VSTORE(N0)                                                          \
5404    (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
5405
5406#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5407    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5408    VSTORE(N0)                                                          \
5409    (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
5410
5411
5412
5413
// STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
//
// Selects STORE_ROW_<M0> by pasting M0 onto the name and forwards
// (N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) unchanged. The hop through
// STORE_BLOCK_STR lets M0 be macro-expanded before the ## paste. M0 has to
// expand to a literal for which a STORE_ROW_<M0> macro exists.
5414#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
5415#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
5416
5417
5418
// CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
//
// Selects CONVERT_STORE_ROW_<M0> by pasting M0 onto the name and forwards
// (N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) unchanged. The hop through
// CONVERT_STORE_BLOCK_STR lets M0 be macro-expanded before the ## paste.
// M0 has to expand to a literal for which a CONVERT_STORE_ROW_<M0> macro
// exists.
5419#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
5420#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
5421
5422
5423
// STORE_ROW_PARTIAL_<m>(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z), m = 1 .. 16
//
// Emits, for rows i = 0 .. m-1, one store call of the form
//     VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##i, 0, (__global DATA_TYPE *)(PTR + i * STRIDE_Y + Z##i));
// Same addressing as the STORE_ROW ladder, but the store routine is picked by
// VSTORE_PARTIAL(N0, STORE_N0) instead of VSTORE(N0). Each macro expands the
// previous one and appends one more row; rows 10-15 use the suffixes A-F with
// decimal stride multipliers 10-15.
// VSTORE_PARTIAL is defined outside this section; presumably it stores only
// STORE_N0 of the N0 vector elements - confirm against its definition.
5424#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5425    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
5426    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
5427
5428#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5429    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5430    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
5431    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
5432
5433#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5434    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5435    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
5436    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
5437
5438#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5439    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5440    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
5441    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
5442
5443#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5444    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5445    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
5446    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
5447
5448#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5449    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5450    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
5451    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
5452
5453#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5454    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5455    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
5456    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
5457
5458#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5459    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5460    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
5461    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
5462
5463#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5464    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5465    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
5466    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
5467
5468#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5469    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
5470    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
5471    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
5472
5473#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5474    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5475    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
5476    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
5477
5478#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5479    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5480    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
5481    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
5482
5483#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5484    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5485    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
5486    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
5487
5488#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5489    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5490    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
5491    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
5492
5493#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5494    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5495    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
5496    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
5497
5498#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5499    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5500    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
5501    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
5502
5503
5504
// STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
//
// Selects STORE_ROW_PARTIAL_<STORE_M0> by pasting STORE_M0 onto the name.
// Note the argument reordering: this macro takes (STORE_M0, STORE_N0, N0, ...)
// whereas the row macros take (N0, STORE_N0, ...), so N0 and STORE_N0 are
// swapped when forwarded. The hop through STORE_BLOCK_PARTIAL_STR lets
// STORE_M0 be macro-expanded before the ## paste. STORE_M0 has to expand to a
// literal for which a STORE_ROW_PARTIAL_<STORE_M0> macro exists.
5505#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
5506#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
5507
// STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z,
//                                PARTIAL_STORE_M0, PARTIAL_STORE_N0,
//                                PARTIAL_COND_Y, PARTIAL_COND_X)
//
// Picks one of four STORE_BLOCK_PARTIAL invocations from the two conditions;
// the first two arguments passed to STORE_BLOCK_PARTIAL are:
//   neither condition true        -> (M0,               N0)
//   only PARTIAL_COND_Y true      -> (PARTIAL_STORE_M0, N0)
//   only PARTIAL_COND_X true      -> (M0,               PARTIAL_STORE_N0)
//   both conditions true          -> (PARTIAL_STORE_M0, PARTIAL_STORE_N0)
// N0 is always passed as the third argument.
//
// All four branches are expanded by the preprocessor, so M0 and
// PARTIAL_STORE_M0 must both expand to literals for which a
// STORE_ROW_PARTIAL_<n> macro exists, whatever the conditions evaluate to.
// The expansion is a bare if / else-if chain (no enclosing braces or
// do-while wrapper), and each condition argument appears in several tests, so
// pass side-effect-free expressions and avoid using the macro as the
// unbraced body of another if.
5508#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
5509    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y))                                                                                                            \
5510    {                                                                                                                                                     \
5511        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                                           \
5512    }                                                                                                                                                     \
5513    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X))                                                                                                        \
5514    {                                                                                                                                                     \
5515        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
5516    }                                                                                                                                                     \
5517    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X))                                                                                                        \
5518    {                                                                                                                                                     \
5519        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
5520    }                                                                                                                                                     \
5521    else                                                                                                                                                  \
5522    {                                                                                                                                                     \
5523        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                               \
5524    }
5525
/* Store a block that may be clipped in X only, chosen at run time.
 * When PARTIAL_COND_X holds, PARTIAL_STORE_N0 is passed as the STORE_N0
 * argument of STORE_BLOCK_PARTIAL; otherwise the full N0 is passed.
 * M0 is always forwarded unchanged as STORE_M0.
 * The expansion is a bare if/else statement. */
#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
    if(PARTIAL_COND_X)                                                                        \
    {                                                                                         \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }                                                                                         \
    else                                                                                      \
    {                                                                                         \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);               \
    }
5535
/* Store a block that may be clipped in Y only, chosen at run time.
 * When PARTIAL_COND_Y holds, PARTIAL_STORE_M0 is passed as the STORE_M0
 * argument of STORE_BLOCK_PARTIAL; otherwise the full M0 is passed.
 * N0 is always forwarded unchanged as both STORE_N0 and the vector width.
 * The expansion is a bare if/else statement. */
#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
    if(PARTIAL_COND_Y)                                                                        \
    {                                                                                         \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }                                                                                         \
    else                                                                                      \
    {                                                                                         \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);               \
    }
5545
5546
/* STORE_BLOCK_BOUNDARY_AWARE: compile-time choice of the store strategy.
 * Only defined when both PARTIAL_STORE_M0 and PARTIAL_STORE_N0 are defined at
 * build time. All four variants share one parameter list so call sites do not
 * depend on which one was selected; parameters a variant does not need are
 * simply dropped. */
#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)

#if PARTIAL_STORE_N0 == 0 && PARTIAL_STORE_M0 == 0

/* Both values are zero: plain STORE_BLOCK, no run-time conditions used. */
#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

#elif PARTIAL_STORE_N0 == 0 && PARTIAL_STORE_M0 > 0

/* Only PARTIAL_STORE_M0 is positive: PARTIAL_COND_Y alone is consulted. */
#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)

#elif PARTIAL_STORE_N0 > 0 && PARTIAL_STORE_M0 == 0

/* Only PARTIAL_STORE_N0 is positive: PARTIAL_COND_X alone is consulted. */
#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)

#else

/* Every remaining combination: both run-time conditions are consulted. */
#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)

#endif /* dispatch on PARTIAL_STORE_M0 / PARTIAL_STORE_N0 values */

#endif /* defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) */
5573
5574
/* COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0): first row index of block y.
 *
 * With PARTIAL_STORE_M0 defined at build time:
 *     max(0, y * M0 - ((M0 - PARTIAL_STORE_M0) % M0)), as uint
 *   i.e. the nominal start row y * M0 is moved back by
 *   (M0 - PARTIAL_STORE_M0) % M0 rows and clamped at zero.
 *   NOTE(review): presumably this makes the partial block sit at the start of
 *   the rows so later blocks overlap it instead of needing padding - confirm
 *   with the kernels that use it.
 * Without it: simply y * M0, as uint (the third argument is ignored).
 *
 * Every macro argument is parenthesized so that expression arguments such as
 * (a + b) keep their intended grouping. */
#if defined(PARTIAL_STORE_M0)

#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(max(0, (int)((y) * (M0)) - (int)(((M0) - (PARTIAL_STORE_M0)) % (M0)))))
#else
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)((y) * (M0)))
#endif
5583
5584
5585
/* Store one vector, either whole or only its leftover part.
 * Thin wrapper over STORE_BLOCK_PARTIAL_IN_X with a single row (M0 = 1),
 * STRIDE_Y = 0 and Z = 0: when cond holds, leftover is used as the partial
 * width, otherwise the full vec_size is stored.
 * NOTE(review): with Z = 0 the row macros presumably paste Z##0 into the
 * literal 00, which is still zero - confirm in STORE_ROW_PARTIAL_1. */
#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr,             \
                             0, 0, leftover, cond)
5588
5589
5590
5591
5592
/* Apply ACTIVATION_BLOCK using the element type that matches the build mode:
 * DATA_TYPE_ACCUMULATOR when MIXED_PRECISION is defined, DATA_TYPE otherwise.
 * All other arguments are forwarded unchanged. The expansion already ends
 * with a semicolon. */
#ifdef MIXED_PRECISION
#define MIXED_PRECISION_ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL, DATA_TYPE_ACCUMULATOR) \
    ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, BASENAME, A_VAL, B_VAL);
#else /* !MIXED_PRECISION */
#define MIXED_PRECISION_ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL, DATA_TYPE_ACCUMULATOR) \
    ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL);
#endif /* MIXED_PRECISION */
5600
5601
5602
/* Element-wise OP between OPERAND1 and OPERAND2 over M0 rows.
 * With MIXED_PRECISION defined, OPERAND2 is first converted to
 * DATA_TYPE_ACCUMULATOR into CONVERTED_OPERAND2 via CONVERT_BLOCK(M0, N0, ...)
 * and the converted block is used as the second operand; the expansion is then
 * two statements. Otherwise OPERAND2 is used directly and N0,
 * DATA_TYPE_ACCUMULATOR and CONVERTED_OPERAND2 are ignored. */
#ifdef MIXED_PRECISION
#define MIXED_PRECISION_ELTWISE_OP_BLOCK(OP, M0, N0, OPERAND1, OPERAND2, DATA_TYPE_ACCUMULATOR, CONVERTED_OPERAND2) \
    CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, OPERAND2, CONVERTED_OPERAND2);                                     \
    ELTWISE_OP_BLOCK(OP, M0, OPERAND1, CONVERTED_OPERAND2);
#else /* !MIXED_PRECISION */
#define MIXED_PRECISION_ELTWISE_OP_BLOCK(OP, M0, N0, OPERAND1, OPERAND2, DATA_TYPE_ACCUMULATOR, CONVERTED_OPERAND2) \
    ELTWISE_OP_BLOCK(OP, M0, OPERAND1, OPERAND2);
#endif /* MIXED_PRECISION */
5611
5612
5613
/* Element-wise OP between the M0 rows of OPERAND1 and row 0 of OPERAND2.
 * With MIXED_PRECISION defined, one row of OPERAND2 is converted to
 * DATA_TYPE_ACCUMULATOR via CONVERT_BLOCK(1, N0, ...) and
 * CONVERTED_OPERAND2##0 is passed on; the expansion is then two statements.
 * Otherwise OPERAND2##0 is passed on directly and N0, DATA_TYPE_ACCUMULATOR
 * and CONVERTED_OPERAND2 are ignored. */
#ifdef MIXED_PRECISION
#define MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(OP, M0, N0, OPERAND1, OPERAND2, DATA_TYPE_ACCUMULATOR, CONVERTED_OPERAND2) \
    CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, OPERAND2, CONVERTED_OPERAND2);                                                \
    ELTWISE_OP_BLOCK_BROADCAST(OP, M0, OPERAND1, CONVERTED_OPERAND2##0);
#else /* !MIXED_PRECISION */
#define MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(OP, M0, N0, OPERAND1, OPERAND2, DATA_TYPE_ACCUMULATOR, CONVERTED_OPERAND2) \
    ELTWISE_OP_BLOCK_BROADCAST(OP, M0, OPERAND1, OPERAND2##0);
#endif /* MIXED_PRECISION */
5622
5623
5624
/* Boundary-aware block store that accounts for the build's precision mode.
 * With MIXED_PRECISION defined, BASENAME is first converted to DATA_TYPE into
 * BASENAME_LP via CONVERT_BLOCK(M0, N0, ...) and BASENAME_LP is what gets
 * stored; the expansion is then two statements. Otherwise BASENAME is stored
 * directly and BASENAME_LP is ignored. Everything else is forwarded unchanged
 * to STORE_BLOCK_BOUNDARY_AWARE. */
#ifdef MIXED_PRECISION
#define MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X, BASENAME_LP) \
    CONVERT_BLOCK(M0, N0, DATA_TYPE, BASENAME, BASENAME_LP);                                                                                                                       \
    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME_LP, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X);
#else /* !MIXED_PRECISION */
#define MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X, BASENAME_LP) \
    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X);
#endif /* MIXED_PRECISION */
5633   )"