• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1R"(
2
3#ifndef ARM_COMPUTE_HELPER_H
4#define ARM_COMPUTE_HELPER_H
5
6
7
8
/** Store the first n rows of a block of row variables, one VSTORE(N0) per row.
 *
 * STORE_ROW_n expands to STORE_ROW_(n-1) followed by the store of row n-1, so
 * it emits n stores in ascending row order. Row i passes the variable
 * BASENAME##i, offset 0 and the address
 * (__global DATA_TYPE *)(PTR + i * STRIDE_Y + Z##i) to VSTORE(N0).
 * Rows 10 to 15 use the suffixes A to F for both BASENAME and Z.
 *
 * @param[in] N0        Vector width forwarded to VSTORE
 * @param[in] DATA_TYPE Element type used in the destination pointer cast
 * @param[in] BASENAME  Prefix of the row variables (BASENAME0, BASENAME1, ...)
 * @param[in] PTR       Base address of the destination
 * @param[in] STRIDE_Y  Per-row step; multiplied by the row index and added to PTR
 * @param[in] Z         Prefix of the per-row extra offsets (Z0, Z1, ...)
 *
 * PTR and STRIDE_Y are substituted without parentheses, so pass identifiers or
 * already parenthesised expressions. Every expansion ends with ';'.
 */
#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
87
88
89
/** Convert and store the first n rows (n = 1..9) of a block of row variables.
 *
 * CONVERT_STORE_ROW_n expands to CONVERT_STORE_ROW_(n-1) followed by the store
 * of row n-1. Row i passes
 * CONVERT_SAT((BASENAME##i), VEC_DATA_TYPE(DATA_TYPE, N0)), offset 0 and the
 * address (__global DATA_TYPE *)(PTR + i * STRIDE_Y + Z##i) to VSTORE(N0).
 *
 * @param[in] N0        Vector width forwarded to VSTORE and VEC_DATA_TYPE
 * @param[in] DATA_TYPE Element type of the conversion target and pointer cast
 * @param[in] BASENAME  Prefix of the row variables (BASENAME0, BASENAME1, ...)
 * @param[in] PTR       Base address of the destination
 * @param[in] STRIDE_Y  Per-row step; multiplied by the row index and added to PTR
 * @param[in] Z         Prefix of the per-row extra offsets (Z0, Z1, ...)
 *
 * PTR and STRIDE_Y are substituted without parentheses. Every expansion ends
 * with ';'.
 */
#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
133
/** Convert and store rows 0 to 9 of a block of row variables.
 *
 * Expands to CONVERT_STORE_ROW_9 followed by the store of row 9, which passes
 * CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), offset 0 and the
 * address (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9) to VSTORE(N0).
 *
 * The second parameter is named DATA_TYPE so that every DATA_TYPE in the
 * expansion is the caller's argument; it must not resolve to an unrelated
 * identifier or build-time define of the same name.
 *
 * @param[in] N0        Vector width forwarded to VSTORE and VEC_DATA_TYPE
 * @param[in] DATA_TYPE Element type of the conversion target and pointer cast
 * @param[in] BASENAME  Prefix of the row variables (BASENAME0 ... BASENAME9)
 * @param[in] PTR       Base address of the destination
 * @param[in] STRIDE_Y  Per-row step; multiplied by the row index and added to PTR
 * @param[in] Z         Prefix of the per-row extra offsets (Z0 ... Z9)
 */
#define CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
138
/** Convert and store the first n rows (n = 11..16) of a block of row variables.
 *
 * CONVERT_STORE_ROW_n expands to CONVERT_STORE_ROW_(n-1) followed by the store
 * of row n-1. Rows 10 to 15 use the suffixes A to F: row i passes
 * CONVERT_SAT((BASENAME##<suffix>), VEC_DATA_TYPE(DATA_TYPE, N0)), offset 0 and
 * the address (__global DATA_TYPE *)(PTR + i * STRIDE_Y + Z##<suffix>) to
 * VSTORE(N0).
 *
 * @param[in] N0        Vector width forwarded to VSTORE and VEC_DATA_TYPE
 * @param[in] DATA_TYPE Element type of the conversion target and pointer cast
 * @param[in] BASENAME  Prefix of the row variables
 * @param[in] PTR       Base address of the destination
 * @param[in] STRIDE_Y  Per-row step; multiplied by the row index and added to PTR
 * @param[in] Z         Prefix of the per-row extra offsets
 */
#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
168
169
170
171
/** Store a block of M0 rows by dispatching to STORE_ROW_<M0>.
 *
 * STORE_BLOCK forwards to STORE_BLOCK_STR so that M0 is macro-expanded before
 * STORE_BLOCK_STR pastes it onto the STORE_ROW_ prefix. The remaining
 * arguments are passed through unchanged.
 *
 * @param[in] M0        Number of rows; selects the STORE_ROW_<M0> macro
 * @param[in] N0        Forwarded as the first STORE_ROW_<M0> argument
 * @param[in] DATA_TYPE Forwarded unchanged
 * @param[in] BASENAME  Forwarded unchanged
 * @param[in] PTR       Forwarded unchanged
 * @param[in] STRIDE_Y  Forwarded unchanged
 * @param[in] Z         Forwarded unchanged
 */
#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
174
175
176
/** Convert and store a block of M0 rows by dispatching to CONVERT_STORE_ROW_<M0>.
 *
 * CONVERT_STORE_BLOCK forwards to CONVERT_STORE_BLOCK_STR so that M0 is
 * macro-expanded before it is pasted onto the CONVERT_STORE_ROW_ prefix. The
 * remaining arguments are passed through unchanged.
 *
 * @param[in] M0        Number of rows; selects the CONVERT_STORE_ROW_<M0> macro
 * @param[in] N0        Forwarded as the first CONVERT_STORE_ROW_<M0> argument
 * @param[in] DATA_TYPE Forwarded unchanged
 * @param[in] BASENAME  Forwarded unchanged
 * @param[in] PTR       Forwarded unchanged
 * @param[in] STRIDE_Y  Forwarded unchanged
 * @param[in] Z         Forwarded unchanged
 */
#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
179
180
181
/** Store the first n rows of a block, one VSTORE_PARTIAL(N0, STORE_N0) per row.
 *
 * STORE_ROW_PARTIAL_n expands to STORE_ROW_PARTIAL_(n-1) followed by the store
 * of row n-1. Row i passes the variable BASENAME##i, offset 0 and the address
 * (__global DATA_TYPE *)(PTR + i * STRIDE_Y + Z##i) to
 * VSTORE_PARTIAL(N0, STORE_N0). Rows 10 to 15 use the suffixes A to F.
 *
 * @param[in] N0        First argument of VSTORE_PARTIAL
 * @param[in] STORE_N0  Second argument of VSTORE_PARTIAL
 * @param[in] DATA_TYPE Element type used in the destination pointer cast
 * @param[in] BASENAME  Prefix of the row variables (BASENAME0, BASENAME1, ...)
 * @param[in] PTR       Base address of the destination
 * @param[in] STRIDE_Y  Per-row step; multiplied by the row index and added to PTR
 * @param[in] Z         Prefix of the per-row extra offsets (Z0, Z1, ...)
 *
 * PTR and STRIDE_Y are substituted without parentheses. Every expansion ends
 * with ';'.
 */
#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
260
261
262
/** Store a block of STORE_M0 rows by dispatching to STORE_ROW_PARTIAL_<STORE_M0>.
 *
 * STORE_BLOCK_PARTIAL forwards to STORE_BLOCK_PARTIAL_STR so that STORE_M0 is
 * macro-expanded before it is pasted onto the STORE_ROW_PARTIAL_ prefix.
 * Note the argument order changes: the row macro receives (N0, STORE_N0, ...)
 * while this macro takes (STORE_M0, STORE_N0, N0, ...).
 *
 * @param[in] STORE_M0  Number of rows; selects STORE_ROW_PARTIAL_<STORE_M0>
 * @param[in] STORE_N0  Forwarded as the second row-macro argument
 * @param[in] N0        Forwarded as the first row-macro argument
 * @param[in] DATA_TYPE Forwarded unchanged
 * @param[in] BASENAME  Forwarded unchanged
 * @param[in] PTR       Forwarded unchanged
 * @param[in] STRIDE_Y  Forwarded unchanged
 * @param[in] Z         Forwarded unchanged
 */
#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
265
/** Store a block whose row count and column count each depend on a condition.
 *
 * Expands to an if / else-if chain that runs exactly one STORE_BLOCK_PARTIAL:
 *  - neither condition true:     M0 rows,               N0 columns
 *  - only PARTIAL_COND_Y true:   PARTIAL_STORE_M0 rows, N0 columns
 *  - only PARTIAL_COND_X true:   M0 rows,               PARTIAL_STORE_N0 columns
 *  - both conditions true:       PARTIAL_STORE_M0 rows, PARTIAL_STORE_N0 columns
 *
 * The expansion is a bare if/else statement (not wrapped in do { } while (0)),
 * and each condition may be evaluated more than once.
 *
 * @param[in] M0               Row count used when PARTIAL_COND_Y is false
 * @param[in] N0               Vector width; also the column count when PARTIAL_COND_X is false
 * @param[in] DATA_TYPE        Forwarded to STORE_BLOCK_PARTIAL
 * @param[in] BASENAME         Forwarded to STORE_BLOCK_PARTIAL
 * @param[in] PTR              Forwarded to STORE_BLOCK_PARTIAL
 * @param[in] STRIDE_Y         Forwarded to STORE_BLOCK_PARTIAL
 * @param[in] Z                Forwarded to STORE_BLOCK_PARTIAL
 * @param[in] PARTIAL_STORE_M0 Row count used when PARTIAL_COND_Y is true
 * @param[in] PARTIAL_STORE_N0 Column count used when PARTIAL_COND_X is true
 * @param[in] PARTIAL_COND_Y   Condition selecting the reduced row count
 * @param[in] PARTIAL_COND_X   Condition selecting the reduced column count
 */
#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \
    { \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \
    { \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \
    { \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else \
    { \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }
283
/** Store a block whose column count depends on a condition.
 *
 * Expands to an if/else that runs exactly one STORE_BLOCK_PARTIAL with M0 rows:
 * N0 columns when PARTIAL_COND_X is false, PARTIAL_STORE_N0 columns otherwise.
 * The expansion is a bare if/else statement (not wrapped in do { } while (0)).
 *
 * @param[in] M0               Row count
 * @param[in] N0               Vector width; also the column count when PARTIAL_COND_X is false
 * @param[in] DATA_TYPE        Forwarded to STORE_BLOCK_PARTIAL
 * @param[in] BASENAME         Forwarded to STORE_BLOCK_PARTIAL
 * @param[in] PTR              Forwarded to STORE_BLOCK_PARTIAL
 * @param[in] STRIDE_Y         Forwarded to STORE_BLOCK_PARTIAL
 * @param[in] Z                Forwarded to STORE_BLOCK_PARTIAL
 * @param[in] PARTIAL_STORE_N0 Column count used when PARTIAL_COND_X is true
 * @param[in] PARTIAL_COND_X   Condition selecting the reduced column count
 */
#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X)) \
    { \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else \
    { \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }
293
/** Store a block whose row count depends on a condition.
 *
 * Expands to an if/else that runs exactly one STORE_BLOCK_PARTIAL with N0
 * columns: M0 rows when PARTIAL_COND_Y is false, PARTIAL_STORE_M0 rows
 * otherwise. The expansion is a bare if/else statement (not wrapped in
 * do { } while (0)).
 *
 * @param[in] M0               Row count used when PARTIAL_COND_Y is false
 * @param[in] N0               Vector width and column count
 * @param[in] DATA_TYPE        Forwarded to STORE_BLOCK_PARTIAL
 * @param[in] BASENAME         Forwarded to STORE_BLOCK_PARTIAL
 * @param[in] PTR              Forwarded to STORE_BLOCK_PARTIAL
 * @param[in] STRIDE_Y         Forwarded to STORE_BLOCK_PARTIAL
 * @param[in] Z                Forwarded to STORE_BLOCK_PARTIAL
 * @param[in] PARTIAL_STORE_M0 Row count used when PARTIAL_COND_Y is true
 * @param[in] PARTIAL_COND_Y   Condition selecting the reduced row count
 */
#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
    if(!(PARTIAL_COND_Y)) \
    { \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else \
    { \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }
303
304
/** Store a block, choosing the boundary handling at preprocessing time.
 *
 * Defined only when both PARTIAL_STORE_M0 and PARTIAL_STORE_N0 are defined as
 * macros before this point. Their values select the expansion:
 *  - M0 == 0 and N0 == 0: STORE_BLOCK (no conditions are evaluated)
 *  - M0 >  0 and N0 == 0: STORE_BLOCK_PARTIAL_IN_Y
 *  - M0 == 0 and N0 >  0: STORE_BLOCK_PARTIAL_IN_X
 *  - otherwise:           STORE_BLOCK_PARTIAL_IN_X_AND_Y
 * (M0 / N0 above refer to the PARTIAL_STORE_M0 / PARTIAL_STORE_N0 macros.)
 *
 * All four variants share one parameter list; arguments that the selected
 * variant does not need are dropped.
 *
 * @param[in] M0               Row count of a full block
 * @param[in] N0               Vector width / column count of a full block
 * @param[in] DATA_TYPE        Forwarded to the selected store macro
 * @param[in] BASENAME         Forwarded to the selected store macro
 * @param[in] PTR              Forwarded to the selected store macro
 * @param[in] STRIDE_Y         Forwarded to the selected store macro
 * @param[in] Z                Forwarded to the selected store macro
 * @param[in] PARTIAL_STORE_M0 Reduced row count (used by the Y-aware variants)
 * @param[in] PARTIAL_STORE_N0 Reduced column count (used by the X-aware variants)
 * @param[in] PARTIAL_COND_Y   Condition selecting the reduced row count
 * @param[in] PARTIAL_COND_X   Condition selecting the reduced column count
 */
#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)

#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)

#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)

#else

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)

#endif

#endif
331
332
/** Compute the first row handled for block index y.
 *
 * When the PARTIAL_STORE_M0 macro is defined before this point, the result is
 *   (uint) max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0))
 * i.e. the unshifted start y * M0 moved back by (M0 - PARTIAL_STORE_M0) % M0
 * and clamped at 0. Otherwise the result is (uint)(y * M0) and the third
 * argument is ignored.
 *
 * Every argument is parenthesised in the expansion, so compound expressions
 * such as COMPUTE_M0_START_ROW(a + b, M0, P) are evaluated as a whole.
 * y and M0 may be evaluated more than once.
 *
 * @param[in] y                Block index along the row dimension
 * @param[in] M0               Number of rows per block
 * @param[in] PARTIAL_STORE_M0 Row count of the partial block (used only in the
 *                             first variant)
 */
#if defined(PARTIAL_STORE_M0)

#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(max(0, (int)((y) * (M0)) - (int)(((M0) - (PARTIAL_STORE_M0)) % (M0)))))
#else
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)((y) * (M0)))
#endif
341
342
343
/** Store a single row, choosing between the full and the leftover width.
 *
 * Thin wrapper over STORE_BLOCK_PARTIAL_IN_X with one row, a row stride of 0
 * and the literal 0 as the Z prefix: vec_size columns are stored when cond is
 * false, leftover columns when cond is true.
 *
 * @param[in] basename  Prefix of the row variable
 * @param[in] data_type Element type used in the destination pointer cast
 * @param[in] ptr       Destination base address
 * @param[in] vec_size  Full vector width
 * @param[in] leftover  Column count used when cond is true
 * @param[in] cond      Condition selecting the leftover width
 */
#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)
346
347
/* Optional OpenCL extensions.
 * Each pragma is emitted only when BOTH the matching ARM_COMPUTE_* switch and
 * the extension's own macro (cl_khr_fp16, cl_arm_integer_dot_product_int8,
 * cl_arm_integer_dot_product_accumulate_int8, cl_arm_printf) are defined.
 */
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
#endif

#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
#endif

#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
#pragma OPENCL EXTENSION cl_arm_printf : enable
#endif
363
/* GPU architecture identifiers. Distinct integer constants intended for
 * comparison against a build-time architecture selector (the selector itself
 * is not defined in this part of the file). */
#define GPU_ARCH_MIDGARD 0x100
#define GPU_ARCH_BIFROST 0x200
#define GPU_ARCH_VALHALL 0x300
367
368
/** Paste two tokens together.
 *
 * The arguments are pasted as written: a macro passed as a or b is NOT
 * expanded first, because both are direct operands of ##.
 *
 * @param[in] a Left token
 * @param[in] b Right token
 */
#define CONCAT(a, b) a##b
370
371
/** Expand to the argument itself (after the argument has been macro-expanded).
 *
 * @param[in] x Tokens to expand
 */
#define EXPAND(x) x
373
374
/** Clamp x to the range [min_val, max_val].
 *
 * Expands to min(max(x, min_val), max_val); min and max must be available
 * where the macro is used.
 *
 * @param[in] x       Value to clamp
 * @param[in] min_val Lower bound
 * @param[in] max_val Upper bound
 */
#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
376
377
/** Reverse the component order of a vector of the given width.
 *
 * REV<s>(x) selects the components of x in descending index order through a
 * component selector (.s10, .s210, .s3210, ...); REV1 returns x unchanged.
 *
 * @param[in] x Vector (or scalar for REV1) to reverse
 */
#define REV1(x) ((x))
#define REV2(x) ((x).s10)
#define REV3(x) ((x).s210)
#define REV4(x) ((x).s3210)
#define REV8(x) ((x).s76543210)
#define REV16(x) ((x).sFEDCBA9876543210)
384
385
386
/** Reverse a vector of width s by dispatching to REV<s>.
 *
 * REVERSE forwards to REVERSE_STR so that s is macro-expanded before it is
 * pasted onto the REV prefix.
 *
 * @param[in] x Vector to reverse
 * @param[in] s Vector width; a REV<s> macro must exist for it
 */
#define REVERSE_STR(x, s) REV##s((x))
#define REVERSE(x, s) REVERSE_STR(x, s)
389
390
391
/** Rotate the components of a vector: ROT<s>_<n>(x) for width s and amount n.
 *
 * For 0 < n < s the expansion is a component selector in which result
 * component i is source component (i - n) mod s, e.g. ROT4_1(x) is (x).s3012.
 * ROT<s>_0 and ROT<s>_<s> return x unchanged.
 *
 * @param[in] x Vector (or scalar for the ROT1_* forms) to rotate
 */
#define ROT1_0(x) ((x))
#define ROT1_1(x) ((x))

#define ROT2_0(x) ((x))
#define ROT2_1(x) ((x).s10)
#define ROT2_2(x) ((x))

#define ROT3_0(x) ((x))
#define ROT3_1(x) ((x).s201)
#define ROT3_2(x) ((x).s120)
#define ROT3_3(x) ((x))

#define ROT4_0(x) ((x))
#define ROT4_1(x) ((x).s3012)
#define ROT4_2(x) ((x).s2301)
#define ROT4_3(x) ((x).s1230)
#define ROT4_4(x) ((x))

#define ROT8_0(x) ((x))
#define ROT8_1(x) ((x).s70123456)
#define ROT8_2(x) ((x).s67012345)
#define ROT8_3(x) ((x).s56701234)
#define ROT8_4(x) ((x).s45670123)
#define ROT8_5(x) ((x).s34567012)
#define ROT8_6(x) ((x).s23456701)
#define ROT8_7(x) ((x).s12345670)
#define ROT8_8(x) ((x))

#define ROT16_0(x) ((x))
#define ROT16_1(x) ((x).sF0123456789ABCDE)
#define ROT16_2(x) ((x).sEF0123456789ABCD)
#define ROT16_3(x) ((x).sDEF0123456789ABC)
#define ROT16_4(x) ((x).sCDEF0123456789AB)
#define ROT16_5(x) ((x).sBCDEF0123456789A)
#define ROT16_6(x) ((x).sABCDEF0123456789)
#define ROT16_7(x) ((x).s9ABCDEF012345678)
#define ROT16_8(x) ((x).s89ABCDEF01234567)
#define ROT16_9(x) ((x).s789ABCDEF0123456)
#define ROT16_10(x) ((x).s6789ABCDEF012345)
#define ROT16_11(x) ((x).s56789ABCDEF01234)
#define ROT16_12(x) ((x).s456789ABCDEF0123)
#define ROT16_13(x) ((x).s3456789ABCDEF012)
#define ROT16_14(x) ((x).s23456789ABCDEF01)
#define ROT16_15(x) ((x).s123456789ABCDEF0)
#define ROT16_16(x) ((x))
437
438
439
/** Rotate a vector of width s by n components by dispatching to ROT<s>_<n>.
 *
 * ROTATE forwards to ROTATE_STR so that s and n are macro-expanded before
 * they are pasted into the ROT<s>_<n> name.
 *
 * @param[in] x Vector to rotate
 * @param[in] s Vector width
 * @param[in] n Rotation amount; a ROT<s>_<n> macro must exist for the pair
 */
#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
442
443
444
/** Build a vector literal holding the ascending indices 0, 1, ..., s-1.
 *
 * V_OFFS<s>(dt) expands to (dt<s>)(0, 1, ..., s-1), where dt<s> is the type
 * name formed by pasting the width onto dt (for example dt##4).
 *
 * @param[in] dt Element type name to which the width is appended
 */
#define V_OFFS1(dt) (dt##1)(0)
#define V_OFFS2(dt) (dt##2)(0, 1)
#define V_OFFS3(dt) (dt##3)(0, 1, 2)
#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3)
#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
451
452
453
/** Build the index vector of width s by dispatching to V_OFFS<s>.
 *
 * VEC_OFFS forwards to VEC_OFFS_STR so that s is macro-expanded before it is
 * pasted onto the V_OFFS prefix.
 *
 * @param[in] dt Element type name forwarded to V_OFFS<s>
 * @param[in] s  Vector width; a V_OFFS<s> macro must exist for it
 */
#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
456
457
/** Name of the vector load function for the given width: vload<size>.
 *
 * VLOAD forwards to VLOAD_STR so that size is macro-expanded before it is
 * pasted onto the vload prefix. The result is only the function name; the
 * argument list is written by the caller, e.g. VLOAD(4)(0, ptr).
 *
 * @param[in] size Vector width
 */
#define VLOAD_STR(size) vload##size
#define VLOAD(size) VLOAD_STR(size)
460
461
/** Name of the partial-load macro for a vector of width size of which
 *  load_size components are loaded: vload_partial_<size>_<load_size>.
 *
 * VLOAD_PARTIAL forwards to VLOAD_PARTIAL_STR so that both arguments are
 * macro-expanded before they are pasted into the name. The result is only the
 * name; the argument list is written by the caller.
 *
 * @param[in] size      Vector width
 * @param[in] load_size Number of components to load
 */
#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size
#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size)
464
/** Placeholder used for partial-load combinations that load nothing.
 *
 * Expands to an empty compound statement; none of the arguments is evaluated.
 *
 * @param[in] data Ignored
 * @param[in] offs Ignored
 * @param[in] ptr  Ignored
 */
#define NO_LOAD(data, offs, ptr) \
    { \
    }
468
469
/* Dispatch table vload_partial_<size>_<load_size>, for size in {1, 2, 3, 4, 8, 16}
 * and load_size in 0..16.
 *
 * For sizes 2 to 16: the entry maps to vload_partial_<load_size> when
 * 1 <= load_size <= size, and to NO_LOAD otherwise (load_size == 0 or
 * load_size > size).
 * For size 1: only load_size == 1 loads, and it maps to vload1.
 *
 * NOTE(review): vload_partial_1_1 forwards to vload1, whereas every other
 * loading entry forwards to a vload_partial_<n> macro taking
 * (DATA, OFFSET, PTR). vload1 is not defined in this part of the file and is
 * invoked elsewhere here as vload1(OFFSET, PTR) - confirm that a three-argument
 * invocation through this table is valid.
 */
#define vload_partial_1_0 NO_LOAD
#define vload_partial_1_1 vload1
#define vload_partial_1_2 NO_LOAD
#define vload_partial_1_3 NO_LOAD
#define vload_partial_1_4 NO_LOAD
#define vload_partial_1_5 NO_LOAD
#define vload_partial_1_6 NO_LOAD
#define vload_partial_1_7 NO_LOAD
#define vload_partial_1_8 NO_LOAD
#define vload_partial_1_9 NO_LOAD
#define vload_partial_1_10 NO_LOAD
#define vload_partial_1_11 NO_LOAD
#define vload_partial_1_12 NO_LOAD
#define vload_partial_1_13 NO_LOAD
#define vload_partial_1_14 NO_LOAD
#define vload_partial_1_15 NO_LOAD
#define vload_partial_1_16 NO_LOAD

#define vload_partial_2_0 NO_LOAD
#define vload_partial_2_1 vload_partial_1
#define vload_partial_2_2 vload_partial_2
#define vload_partial_2_3 NO_LOAD
#define vload_partial_2_4 NO_LOAD
#define vload_partial_2_5 NO_LOAD
#define vload_partial_2_6 NO_LOAD
#define vload_partial_2_7 NO_LOAD
#define vload_partial_2_8 NO_LOAD
#define vload_partial_2_9 NO_LOAD
#define vload_partial_2_10 NO_LOAD
#define vload_partial_2_11 NO_LOAD
#define vload_partial_2_12 NO_LOAD
#define vload_partial_2_13 NO_LOAD
#define vload_partial_2_14 NO_LOAD
#define vload_partial_2_15 NO_LOAD
#define vload_partial_2_16 NO_LOAD

#define vload_partial_3_0 NO_LOAD
#define vload_partial_3_1 vload_partial_1
#define vload_partial_3_2 vload_partial_2
#define vload_partial_3_3 vload_partial_3
#define vload_partial_3_4 NO_LOAD
#define vload_partial_3_5 NO_LOAD
#define vload_partial_3_6 NO_LOAD
#define vload_partial_3_7 NO_LOAD
#define vload_partial_3_8 NO_LOAD
#define vload_partial_3_9 NO_LOAD
#define vload_partial_3_10 NO_LOAD
#define vload_partial_3_11 NO_LOAD
#define vload_partial_3_12 NO_LOAD
#define vload_partial_3_13 NO_LOAD
#define vload_partial_3_14 NO_LOAD
#define vload_partial_3_15 NO_LOAD
#define vload_partial_3_16 NO_LOAD

#define vload_partial_4_0 NO_LOAD
#define vload_partial_4_1 vload_partial_1
#define vload_partial_4_2 vload_partial_2
#define vload_partial_4_3 vload_partial_3
#define vload_partial_4_4 vload_partial_4
#define vload_partial_4_5 NO_LOAD
#define vload_partial_4_6 NO_LOAD
#define vload_partial_4_7 NO_LOAD
#define vload_partial_4_8 NO_LOAD
#define vload_partial_4_9 NO_LOAD
#define vload_partial_4_10 NO_LOAD
#define vload_partial_4_11 NO_LOAD
#define vload_partial_4_12 NO_LOAD
#define vload_partial_4_13 NO_LOAD
#define vload_partial_4_14 NO_LOAD
#define vload_partial_4_15 NO_LOAD
#define vload_partial_4_16 NO_LOAD

#define vload_partial_8_0 NO_LOAD
#define vload_partial_8_1 vload_partial_1
#define vload_partial_8_2 vload_partial_2
#define vload_partial_8_3 vload_partial_3
#define vload_partial_8_4 vload_partial_4
#define vload_partial_8_5 vload_partial_5
#define vload_partial_8_6 vload_partial_6
#define vload_partial_8_7 vload_partial_7
#define vload_partial_8_8 vload_partial_8
#define vload_partial_8_9 NO_LOAD
#define vload_partial_8_10 NO_LOAD
#define vload_partial_8_11 NO_LOAD
#define vload_partial_8_12 NO_LOAD
#define vload_partial_8_13 NO_LOAD
#define vload_partial_8_14 NO_LOAD
#define vload_partial_8_15 NO_LOAD
#define vload_partial_8_16 NO_LOAD

#define vload_partial_16_0 NO_LOAD
#define vload_partial_16_1 vload_partial_1
#define vload_partial_16_2 vload_partial_2
#define vload_partial_16_3 vload_partial_3
#define vload_partial_16_4 vload_partial_4
#define vload_partial_16_5 vload_partial_5
#define vload_partial_16_6 vload_partial_6
#define vload_partial_16_7 vload_partial_7
#define vload_partial_16_8 vload_partial_8
#define vload_partial_16_9 vload_partial_9
#define vload_partial_16_10 vload_partial_10
#define vload_partial_16_11 vload_partial_11
#define vload_partial_16_12 vload_partial_12
#define vload_partial_16_13 vload_partial_13
#define vload_partial_16_14 vload_partial_14
#define vload_partial_16_15 vload_partial_15
#define vload_partial_16_16 vload_partial_16
577
578
/** Load the first n components of a vector: vload_partial_<n>(DATA, OFFSET, PTR).
 *
 * For n in {1, 2, 3, 4, 8, 16} a single vload<n>(OFFSET, PTR) is assigned to
 * the leading n components of DATA (to the whole of DATA for n = 16).
 * The other sizes are composed from two smaller loads:
 *  - 5, 6, 7:    vload_partial_4 into DATA.s0123, then 1, 2 or 3 components
 *                from PTR + 4
 *  - 9 .. 12:    vload_partial_8 into DATA.s01234567, then 1 to 4 components
 *                from PTR + 8
 *  - 13, 14, 15: vload_partial_8 into DATA.s01234567, then vload_partial_5/6/7
 *                applied to DATA.s89ABCDEF from PTR + 8
 *
 * Components beyond the first n are not written. The composed forms expand to
 * several statements, so the macros are meant to be used as statements; DATA
 * must remain an expression to which a component selector can be appended.
 *
 * @param[in,out] DATA   Destination vector
 * @param[in]     OFFSET Offset forwarded unchanged to every vload<n> call
 * @param[in]     PTR    Source address of the first component
 */
#define vload_partial_1(DATA, OFFSET, PTR) \
    DATA.s0 = vload1(OFFSET, PTR);

#define vload_partial_2(DATA, OFFSET, PTR) \
    DATA.s01 = vload2(OFFSET, PTR);

#define vload_partial_3(DATA, OFFSET, PTR) \
    DATA.s012 = vload3(OFFSET, PTR);

#define vload_partial_4(DATA, OFFSET, PTR) \
    DATA.s0123 = vload4(OFFSET, PTR);

#define vload_partial_5(DATA, OFFSET, PTR) \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    DATA.s4 = vload1(OFFSET, PTR + 4);

#define vload_partial_6(DATA, OFFSET, PTR) \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    vload_partial_2(DATA.s45, OFFSET, PTR + 4);

#define vload_partial_7(DATA, OFFSET, PTR) \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    vload_partial_3(DATA.s456, OFFSET, PTR + 4);

#define vload_partial_8(DATA, OFFSET, PTR) \
    DATA.s01234567 = vload8(OFFSET, PTR);

#define vload_partial_9(DATA, OFFSET, PTR) \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    DATA.s8 = vload1(OFFSET, PTR + 8);

#define vload_partial_10(DATA, OFFSET, PTR) \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_2(DATA.s89, OFFSET, PTR + 8);

#define vload_partial_11(DATA, OFFSET, PTR) \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_3(DATA.s89A, OFFSET, PTR + 8);

#define vload_partial_12(DATA, OFFSET, PTR) \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_4(DATA.s89AB, OFFSET, PTR + 8);

#define vload_partial_13(DATA, OFFSET, PTR) \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vload_partial_14(DATA, OFFSET, PTR) \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vload_partial_15(DATA, OFFSET, PTR) \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vload_partial_16(DATA, OFFSET, PTR) \
    DATA = vload16(OFFSET, PTR);
636
637
638
/** Pixel-unit constants: PIXEL_UNIT<N> equals N / 4 for vector sizes 4, 8 and 16. */
#define PIXEL_UNIT4  1
#define PIXEL_UNIT8  2
#define PIXEL_UNIT16 4

/** Map a vector size to its PIXEL_UNIT<size> constant.
 *
 * The _STR variant pastes its argument as written; the outer macro lets the
 * argument be macro-expanded first.
 */
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(n) PIXEL_UNIT##n
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(n) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(n)
646
647
/** Read 1, 2 or 4 pixels starting at (xc, yc) and advancing along x, packed into a float4/float8/float16.
 *  The expansions keep their trailing ';'.
 */
#define read_image2d_floatx1(image, xc, yc) (float4)(read_imagef(image, (int2)(xc, yc)));
#define read_image2d_floatx2(image, xc, yc) \
    (float8)(read_imagef(image, (int2)(xc, yc)), read_imagef(image, (int2)(xc + 1, yc)));
#define read_image2d_floatx4(image, xc, yc) \
    (float16)(read_imagef(image, (int2)(xc, yc)), read_imagef(image, (int2)(xc + 1, yc)), read_imagef(image, (int2)(xc + 2, yc)), read_imagef(image, (int2)(xc + 3, yc)));

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
/** Half-precision counterparts of the float readers (read_imageh). */
#define read_image2d_halfx1(image, xc, yc) (half4)(read_imageh(image, (int2)(xc, yc)));
#define read_image2d_halfx2(image, xc, yc) \
    (half8)(read_imageh(image, (int2)(xc, yc)), read_imageh(image, (int2)(xc + 1, yc)));
#define read_image2d_halfx4(image, xc, yc) \
    (half16)(read_imageh(image, (int2)(xc, yc)), read_imageh(image, (int2)(xc + 1, yc)), read_imageh(image, (int2)(xc + 2, yc)), read_imageh(image, (int2)(xc + 3, yc)));
#endif

/** Write 1, 2 or 4 pixels starting at (xc, yc) and advancing along x; each pixel takes four consecutive lanes of px.
 *  The expansions keep their trailing ';'.
 */
#define write_image2d_floatx1(image, xc, yc, px) (write_imagef(image, (int2)(xc, yc), px));
#define write_image2d_floatx2(image, xc, yc, px) \
    (write_imagef(image, (int2)(xc, yc), px.s0123), write_imagef(image, (int2)(xc + 1, yc), px.s4567));
#define write_image2d_floatx4(image, xc, yc, px) \
    (write_imagef(image, (int2)(xc, yc), px.s0123), write_imagef(image, (int2)(xc + 1, yc), px.s4567), write_imagef(image, (int2)(xc + 2, yc), px.s89AB), write_imagef(image, (int2)(xc + 3, yc), px.sCDEF));

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
/** Half-precision counterparts of the float writers (write_imageh). */
#define write_image2d_halfx1(image, xc, yc, px) (write_imageh(image, (int2)(xc, yc), px));
#define write_image2d_halfx2(image, xc, yc, px) \
    (write_imageh(image, (int2)(xc, yc), px.s0123), write_imageh(image, (int2)(xc + 1, yc), px.s4567));
#define write_image2d_halfx4(image, xc, yc, px) \
    (write_imageh(image, (int2)(xc, yc), px.s0123), write_imageh(image, (int2)(xc + 1, yc), px.s4567), write_imageh(image, (int2)(xc + 2, yc), px.s89AB), write_imageh(image, (int2)(xc + 3, yc), px.sCDEF));
#endif
667
668
/** Dispatch to read_image2d_<dt>x<npix>(image, xc, yc).
 *  The _STR variant pastes its arguments as written; the outer macro expands them first.
 */
#define READ_IMAGE2D_STR(dt, npix, image, xc, yc) read_image2d_##dt##x##npix(image, xc, yc)
#define READ_IMAGE2D(dt, npix, image, xc, yc) READ_IMAGE2D_STR(dt, npix, image, xc, yc)

/** Dispatch to write_image2d_<dt>x<npix>(image, xc, yc, px).
 *  The _STR variant pastes its arguments as written; the outer macro expands them first.
 */
#define WRITE_IMAGE2D_STR(dt, npix, image, xc, yc, px) write_image2d_##dt##x##npix(image, xc, yc, px)
#define WRITE_IMAGE2D(dt, npix, image, xc, yc, px) WRITE_IMAGE2D_STR(dt, npix, image, xc, yc, px)
675
/** Build the name vstore<n>; VSTORE expands its argument before pasting, VSTORE_STR does not. */
#define VSTORE_STR(n) vstore##n
#define VSTORE(n) VSTORE_STR(n)
678
/** "<type>1" spellings resolve to the plain scalar type, so names pasted as <type><size> also work for size 1. */
#define float1  float
#define half1   half
#define char1   char
#define uchar1  uchar
#define short1  short
#define ushort1 ushort
#define int1    int
#define uint1   uint
#define long1   long
#define ulong1  ulong
#define double1 double
690
/** Scalar stand-ins for the vloadn/vstoren family.
 *
 * vload1(OFFSET, PTR) yields the element at PTR + OFFSET.
 * vstore1(DATA, OFFSET, PTR) assigns DATA to the element at PTR + OFFSET.
 *
 * Every argument and the whole expansion are parenthesized so that compound
 * arguments (for example a conditional pointer expression) and use inside a
 * larger expression parse as intended.
 */
#define vload1(OFFSET, PTR) (*((OFFSET) + (PTR)))
#define vstore1(DATA, OFFSET, PTR) (*((OFFSET) + (PTR)) = (DATA))
693
694
/** Build the name vstore_partial_<width>_<count>.
 *  VSTORE_PARTIAL expands its arguments before pasting, VSTORE_PARTIAL_STR does not.
 */
#define VSTORE_PARTIAL_STR(width, count) vstore_partial_##width##_##count
#define VSTORE_PARTIAL(width, count) VSTORE_PARTIAL_STR(width, count)

/** Store that does nothing: discards its three arguments and expands to an empty block. */
#define NO_STORE(data, offs, ptr) { }
701
702
/** Partial-store dispatch for width 1: only a count of 1 stores (via vstore1); every other count is NO_STORE. */
#define vstore_partial_1_0  NO_STORE
#define vstore_partial_1_1  vstore1
#define vstore_partial_1_2  NO_STORE
#define vstore_partial_1_3  NO_STORE
#define vstore_partial_1_4  NO_STORE
#define vstore_partial_1_5  NO_STORE
#define vstore_partial_1_6  NO_STORE
#define vstore_partial_1_7  NO_STORE
#define vstore_partial_1_8  NO_STORE
#define vstore_partial_1_9  NO_STORE
#define vstore_partial_1_10 NO_STORE
#define vstore_partial_1_11 NO_STORE
#define vstore_partial_1_12 NO_STORE
#define vstore_partial_1_13 NO_STORE
#define vstore_partial_1_14 NO_STORE
#define vstore_partial_1_15 NO_STORE
#define vstore_partial_1_16 NO_STORE
720
/** Partial-store dispatch for width 2: counts 1-2 forward to vstore_partial_<count>; 0 and counts above 2 are NO_STORE. */
#define vstore_partial_2_0  NO_STORE
#define vstore_partial_2_1  vstore_partial_1
#define vstore_partial_2_2  vstore_partial_2
#define vstore_partial_2_3  NO_STORE
#define vstore_partial_2_4  NO_STORE
#define vstore_partial_2_5  NO_STORE
#define vstore_partial_2_6  NO_STORE
#define vstore_partial_2_7  NO_STORE
#define vstore_partial_2_8  NO_STORE
#define vstore_partial_2_9  NO_STORE
#define vstore_partial_2_10 NO_STORE
#define vstore_partial_2_11 NO_STORE
#define vstore_partial_2_12 NO_STORE
#define vstore_partial_2_13 NO_STORE
#define vstore_partial_2_14 NO_STORE
#define vstore_partial_2_15 NO_STORE
#define vstore_partial_2_16 NO_STORE
738
/** Partial-store dispatch for width 3: counts 1-3 forward to vstore_partial_<count>; 0 and counts above 3 are NO_STORE. */
#define vstore_partial_3_0  NO_STORE
#define vstore_partial_3_1  vstore_partial_1
#define vstore_partial_3_2  vstore_partial_2
#define vstore_partial_3_3  vstore_partial_3
#define vstore_partial_3_4  NO_STORE
#define vstore_partial_3_5  NO_STORE
#define vstore_partial_3_6  NO_STORE
#define vstore_partial_3_7  NO_STORE
#define vstore_partial_3_8  NO_STORE
#define vstore_partial_3_9  NO_STORE
#define vstore_partial_3_10 NO_STORE
#define vstore_partial_3_11 NO_STORE
#define vstore_partial_3_12 NO_STORE
#define vstore_partial_3_13 NO_STORE
#define vstore_partial_3_14 NO_STORE
#define vstore_partial_3_15 NO_STORE
#define vstore_partial_3_16 NO_STORE
756
/** Partial-store dispatch for width 4: counts 1-4 forward to vstore_partial_<count>; 0 and counts above 4 are NO_STORE. */
#define vstore_partial_4_0  NO_STORE
#define vstore_partial_4_1  vstore_partial_1
#define vstore_partial_4_2  vstore_partial_2
#define vstore_partial_4_3  vstore_partial_3
#define vstore_partial_4_4  vstore_partial_4
#define vstore_partial_4_5  NO_STORE
#define vstore_partial_4_6  NO_STORE
#define vstore_partial_4_7  NO_STORE
#define vstore_partial_4_8  NO_STORE
#define vstore_partial_4_9  NO_STORE
#define vstore_partial_4_10 NO_STORE
#define vstore_partial_4_11 NO_STORE
#define vstore_partial_4_12 NO_STORE
#define vstore_partial_4_13 NO_STORE
#define vstore_partial_4_14 NO_STORE
#define vstore_partial_4_15 NO_STORE
#define vstore_partial_4_16 NO_STORE
774
/** Partial-store dispatch for width 8: counts 1-8 forward to vstore_partial_<count>; 0 and counts above 8 are NO_STORE. */
#define vstore_partial_8_0  NO_STORE
#define vstore_partial_8_1  vstore_partial_1
#define vstore_partial_8_2  vstore_partial_2
#define vstore_partial_8_3  vstore_partial_3
#define vstore_partial_8_4  vstore_partial_4
#define vstore_partial_8_5  vstore_partial_5
#define vstore_partial_8_6  vstore_partial_6
#define vstore_partial_8_7  vstore_partial_7
#define vstore_partial_8_8  vstore_partial_8
#define vstore_partial_8_9  NO_STORE
#define vstore_partial_8_10 NO_STORE
#define vstore_partial_8_11 NO_STORE
#define vstore_partial_8_12 NO_STORE
#define vstore_partial_8_13 NO_STORE
#define vstore_partial_8_14 NO_STORE
#define vstore_partial_8_15 NO_STORE
#define vstore_partial_8_16 NO_STORE
792
/** Partial-store dispatch for width 16: counts 1-16 forward to vstore_partial_<count>; a count of 0 is NO_STORE. */
#define vstore_partial_16_0  NO_STORE
#define vstore_partial_16_1  vstore_partial_1
#define vstore_partial_16_2  vstore_partial_2
#define vstore_partial_16_3  vstore_partial_3
#define vstore_partial_16_4  vstore_partial_4
#define vstore_partial_16_5  vstore_partial_5
#define vstore_partial_16_6  vstore_partial_6
#define vstore_partial_16_7  vstore_partial_7
#define vstore_partial_16_8  vstore_partial_8
#define vstore_partial_16_9  vstore_partial_9
#define vstore_partial_16_10 vstore_partial_10
#define vstore_partial_16_11 vstore_partial_11
#define vstore_partial_16_12 vstore_partial_12
#define vstore_partial_16_13 vstore_partial_13
#define vstore_partial_16_14 vstore_partial_14
#define vstore_partial_16_15 vstore_partial_15
#define vstore_partial_16_16 vstore_partial_16
810
811
/** Partial vector stores: vstore_partial_<k>(vec, off, dst) writes the first k lanes of @p vec.
 *
 * Lane counts 1, 2, 3, 4, 8 and 16 map onto a single vstore<k>.
 * Lane counts 5-7 are a 4-lane store followed by a 1/2/3-lane store at dst + 4.
 * Lane counts 9-15 are an 8-lane store followed by a smaller partial store at dst + 8.
 * Each macro expands to complete statements, trailing ';' included.
 */
#define vstore_partial_1(vec, off, dst) \
    vstore1(vec.s0, off, dst);

#define vstore_partial_2(vec, off, dst) \
    vstore2(vec.s01, off, dst);

#define vstore_partial_3(vec, off, dst) \
    vstore3(vec.s012, off, dst);

#define vstore_partial_4(vec, off, dst) \
    vstore4(vec.s0123, off, dst);

#define vstore_partial_5(vec, off, dst)    \
    vstore_partial_4(vec.s0123, off, dst); \
    vstore1(vec.s4, off, dst + 4);

#define vstore_partial_6(vec, off, dst)    \
    vstore_partial_4(vec.s0123, off, dst); \
    vstore_partial_2(vec.s45, off, dst + 4);

#define vstore_partial_7(vec, off, dst)    \
    vstore_partial_4(vec.s0123, off, dst); \
    vstore_partial_3(vec.s456, off, dst + 4);

#define vstore_partial_8(vec, off, dst) \
    vstore8(vec.s01234567, off, dst);

#define vstore_partial_9(vec, off, dst)        \
    vstore_partial_8(vec.s01234567, off, dst); \
    vstore1(vec.s8, off, dst + 8);

#define vstore_partial_10(vec, off, dst)       \
    vstore_partial_8(vec.s01234567, off, dst); \
    vstore_partial_2(vec.s89, off, dst + 8);

#define vstore_partial_11(vec, off, dst)       \
    vstore_partial_8(vec.s01234567, off, dst); \
    vstore_partial_3(vec.s89A, off, dst + 8);

#define vstore_partial_12(vec, off, dst)       \
    vstore_partial_8(vec.s01234567, off, dst); \
    vstore_partial_4(vec.s89AB, off, dst + 8);

#define vstore_partial_13(vec, off, dst)       \
    vstore_partial_8(vec.s01234567, off, dst); \
    vstore_partial_5(vec.s89ABCDEF, off, dst + 8);

#define vstore_partial_14(vec, off, dst)       \
    vstore_partial_8(vec.s01234567, off, dst); \
    vstore_partial_6(vec.s89ABCDEF, off, dst + 8);

#define vstore_partial_15(vec, off, dst)       \
    vstore_partial_8(vec.s01234567, off, dst); \
    vstore_partial_7(vec.s89ABCDEF, off, dst + 8);

#define vstore_partial_16(vec, off, dst) \
    vstore16(vec, off, dst);
869
870
871
872
873
/** "_sat" spellings for floating-point destination types.
 *
 * Each convert_<type>_sat name is aliased to the plain convert_<type>, so
 * CONVERT_SAT can be used uniformly with float and half destinations.
 * The size-1 spellings map to the scalar conversion.
 */
#define convert_float_sat convert_float
#define convert_float1_sat convert_float
#define convert_float2_sat convert_float2
#define convert_float3_sat convert_float3
#define convert_float4_sat convert_float4
#define convert_float8_sat convert_float8
#define convert_float16_sat convert_float16
/* Fixed: previously aliased to convert_float, which produced a float instead of a half. */
#define convert_half_sat convert_half
#define convert_half1_sat convert_half
#define convert_half2_sat convert_half2
#define convert_half3_sat convert_half3
#define convert_half4_sat convert_half4
#define convert_half8_sat convert_half8
#define convert_half16_sat convert_half16
888
/** convert_<type>1 spellings forward to the scalar convert_<type>. */
#define convert_float1  convert_float
#define convert_half1   convert_half
#define convert_char1   convert_char
#define convert_uchar1  convert_uchar
#define convert_short1  convert_short
#define convert_ushort1 convert_ushort
#define convert_int1    convert_int
#define convert_uint1   convert_uint
#define convert_long1   convert_long
#define convert_ulong1  convert_ulong
#define convert_double1 convert_double

/** convert_<type>1_sat spellings forward to the scalar convert_<type>_sat.
 *  The uchar2..uchar16 entries map each name to itself.
 */
#define convert_char1_sat   convert_char_sat
#define convert_uchar1_sat  convert_uchar_sat
#define convert_uchar2_sat  convert_uchar2_sat
#define convert_uchar3_sat  convert_uchar3_sat
#define convert_uchar4_sat  convert_uchar4_sat
#define convert_uchar8_sat  convert_uchar8_sat
#define convert_uchar16_sat convert_uchar16_sat
#define convert_short1_sat  convert_short_sat
#define convert_ushort1_sat convert_ushort_sat
#define convert_int1_sat    convert_int_sat
#define convert_uint1_sat   convert_uint_sat
#define convert_long1_sat   convert_long_sat
#define convert_ulong1_sat  convert_ulong_sat
#define convert_double1_sat convert_double_sat
915
/** Build the type name <elem><width>. VEC_DATA_TYPE expands its arguments before pasting. */
#define VEC_DATA_TYPE_STR(elem, width) elem##width
#define VEC_DATA_TYPE(elem, width) VEC_DATA_TYPE_STR(elem, width)

/** Expands to (convert_<dst>((v))). CONVERT expands its arguments before pasting. */
#define CONVERT_STR(v, dst) (convert_##dst((v)))
#define CONVERT(v, dst) CONVERT_STR(v, dst)

/** Expands to (convert_<dst>_sat((v))). CONVERT_SAT expands its arguments before pasting. */
#define CONVERT_SAT_STR(v, dst) (convert_##dst##_sat((v)))
#define CONVERT_SAT(v, dst) CONVERT_SAT_STR(v, dst)

/** Expands to (convert_<dst>_sat_<mode>((v))). CONVERT_SAT_ROUND expands its arguments before pasting. */
#define CONVERT_SAT_ROUND_STR(v, dst, mode) (convert_##dst##_sat_##mode((v)))
#define CONVERT_SAT_ROUND(v, dst, mode) CONVERT_SAT_ROUND_STR(v, dst, mode)
927
/** Per-type table used by SELECT_VEC_DATA_TYPE.
 *  Integer types map to themselves; half maps to short<n> and float maps to int<n>.
 */
#define select_vec_dt_uchar(n)  uchar##n
#define select_vec_dt_char(n)   char##n
#define select_vec_dt_ushort(n) ushort##n
#define select_vec_dt_short(n)  short##n
#define select_vec_dt_half(n)   short##n
#define select_vec_dt_uint(n)   uint##n
#define select_vec_dt_int(n)    int##n
#define select_vec_dt_float(n)  int##n
#define select_vec_dt_ulong(n)  ulong##n
#define select_vec_dt_long(n)   long##n

/** Look up the table entry for <elem> at width <n>; SELECT_DATA_TYPE is the width-1 form. */
#define SELECT_VEC_DATA_TYPE_STR(elem, n) select_vec_dt_##elem(n)
#define SELECT_VEC_DATA_TYPE(elem, n) SELECT_VEC_DATA_TYPE_STR(elem, n)
#define SELECT_DATA_TYPE(elem) SELECT_VEC_DATA_TYPE_STR(elem, 1)
942
/** Per-type table used by SIGNED_INT_VEC_DATA_TYPE.
 *  Every entry is a signed integer type: char<n>, short<n>, int<n> or long<n>.
 */
#define signed_int_vec_dt_uchar(n)  char##n
#define signed_int_vec_dt_char(n)   char##n
#define signed_int_vec_dt_ushort(n) short##n
#define signed_int_vec_dt_short(n)  short##n
#define signed_int_vec_dt_half(n)   short##n
#define signed_int_vec_dt_uint(n)   int##n
#define signed_int_vec_dt_int(n)    int##n
#define signed_int_vec_dt_float(n)  int##n
#define signed_int_vec_dt_ulong(n)  long##n
#define signed_int_vec_dt_long(n)   long##n

/** Look up the table entry for <elem> at width <n>; SIGNED_INT_DATA_TYPE is the width-1 form. */
#define SIGNED_INT_VEC_DATA_TYPE_STR(elem, n) signed_int_vec_dt_##elem(n)
#define SIGNED_INT_VEC_DATA_TYPE(elem, n) SIGNED_INT_VEC_DATA_TYPE_STR(elem, n)
#define SIGNED_INT_DATA_TYPE(elem) SIGNED_INT_VEC_DATA_TYPE_STR(elem, 1)
957
/** Horizontal sum of the lanes of a vector of width 1, 2, 3, 4, 8 or 16.
 *
 * Lanes are added left to right (s0 + s1 + s2 + ...), the same evaluation
 * order the earlier nested definitions produced. The whole expansion is
 * parenthesized so the result can be used safely inside a larger expression
 * (e.g. a - SUM_REDUCE(v, 2)).
 *
 * @note The argument is evaluated once per lane; avoid side effects in it.
 */
#define sum_reduce_1(x) (x)
#define sum_reduce_2(x) ((x).s0 + (x).s1)
#define sum_reduce_3(x) ((x).s0 + (x).s1 + (x).s2)
#define sum_reduce_4(x) ((x).s0 + (x).s1 + (x).s2 + (x).s3)
#define sum_reduce_8(x) ((x).s0 + (x).s1 + (x).s2 + (x).s3 + (x).s4 + (x).s5 + (x).s6 + (x).s7)
#define sum_reduce_16(x) \
    ((x).s0 + (x).s1 + (x).s2 + (x).s3 + (x).s4 + (x).s5 + (x).s6 + (x).s7 + (x).s8 + (x).s9 + (x).sA + (x).sB + (x).sC + (x).sD + (x).sE + (x).sF)

/** Dispatch to sum_reduce_<size>; SUM_REDUCE expands its arguments before pasting. */
#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x)
#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)
967
/** Horizontal product of the lanes of a vector of width 1, 2, 3, 4, 8 or 16.
 *
 * Lanes are multiplied left to right (s0 * s1 * s2 * ...), the same
 * evaluation order the earlier nested definitions produced. The whole
 * expansion is parenthesized so the result can be used safely inside a
 * larger expression (e.g. a / PROD_REDUCE(v, 2)).
 *
 * @note The argument is evaluated once per lane; avoid side effects in it.
 */
#define prod_reduce_1(x) (x)
#define prod_reduce_2(x) ((x).s0 * (x).s1)
#define prod_reduce_3(x) ((x).s0 * (x).s1 * (x).s2)
#define prod_reduce_4(x) ((x).s0 * (x).s1 * (x).s2 * (x).s3)
#define prod_reduce_8(x) ((x).s0 * (x).s1 * (x).s2 * (x).s3 * (x).s4 * (x).s5 * (x).s6 * (x).s7)
#define prod_reduce_16(x) \
    ((x).s0 * (x).s1 * (x).s2 * (x).s3 * (x).s4 * (x).s5 * (x).s6 * (x).s7 * (x).s8 * (x).s9 * (x).sA * (x).sB * (x).sC * (x).sD * (x).sE * (x).sF)

/** Dispatch to prod_reduce_<size>; PROD_REDUCE expands its arguments before pasting. */
#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x)
#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)
977
/** Horizontal maximum of the lanes of a vector of width 1, 2, 3, 4, 8 or 16.
 *  Wider sizes split the vector in halves and combine the partial results with max().
 */
#define max_reduce_1(v) (v)
#define max_reduce_2(v) max(((v).s0), ((v).s1))
#define max_reduce_3(v) max(max_reduce_2((v).s01), ((v).s2))
#define max_reduce_4(v) max(max_reduce_2((v).s01), max_reduce_2((v).s23))
#define max_reduce_8(v) max(max_reduce_4((v).s0123), max_reduce_4((v).s4567))
#define max_reduce_16(v) max(max_reduce_8((v).s01234567), max_reduce_8((v).s89ABCDEF))

/** Dispatch to max_reduce_<width>; MAX_REDUCE expands its arguments before pasting. */
#define MAX_REDUCE_STR(v, width) max_reduce_##width(v)
#define MAX_REDUCE(v, width) MAX_REDUCE_STR(v, width)
987
/** Kernel-parameter list for a 1D buffer: <t>_ptr, stride/step for x, and the first-element byte offset. */
#define VECTOR_DECLARATION(t) \
    __global uchar *t##_ptr,  \
    uint t##_stride_x,        \
    uint t##_step_x,          \
    uint t##_offset_first_element_in_bytes

/** Kernel-parameter list for a 2D buffer: adds stride/step for y. */
#define IMAGE_DECLARATION(t) \
    __global uchar *t##_ptr, \
    uint t##_stride_x,       \
    uint t##_step_x,         \
    uint t##_stride_y,       \
    uint t##_step_y,         \
    uint t##_offset_first_element_in_bytes

/** Kernel-parameter list for a 3D buffer: adds stride/step for z. */
#define TENSOR3D_DECLARATION(t) \
    __global uchar *t##_ptr,    \
    uint t##_stride_x,          \
    uint t##_step_x,            \
    uint t##_stride_y,          \
    uint t##_step_y,            \
    uint t##_stride_z,          \
    uint t##_step_z,            \
    uint t##_offset_first_element_in_bytes

/** Kernel-parameter list for a 4D buffer: adds stride/step for w. */
#define TENSOR4D_DECLARATION(t) \
    __global uchar *t##_ptr,    \
    uint t##_stride_x,          \
    uint t##_step_x,            \
    uint t##_stride_y,          \
    uint t##_step_y,            \
    uint t##_stride_z,          \
    uint t##_step_z,            \
    uint t##_stride_w,          \
    uint t##_step_w,            \
    uint t##_offset_first_element_in_bytes

/** Kernel-parameter list for a 5D buffer: adds stride/step for v. */
#define TENSOR5D_DECLARATION(t) \
    __global uchar *t##_ptr,    \
    uint t##_stride_x,          \
    uint t##_step_x,            \
    uint t##_stride_y,          \
    uint t##_step_y,            \
    uint t##_stride_z,          \
    uint t##_step_z,            \
    uint t##_stride_w,          \
    uint t##_step_w,            \
    uint t##_stride_v,          \
    uint t##_step_v,            \
    uint t##_offset_first_element_in_bytes
1037
/** Build a Vector from the <t>_* kernel parameters via update_vector_workitem_ptr. */
#define CONVERT_TO_VECTOR_STRUCT(t) \
    update_vector_workitem_ptr(t##_ptr, t##_offset_first_element_in_bytes, t##_stride_x, t##_step_x)

/** Same as CONVERT_TO_VECTOR_STRUCT, but passes 0 as the x step. */
#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(t) \
    update_vector_workitem_ptr(t##_ptr, t##_offset_first_element_in_bytes, t##_stride_x, 0)

/** Build an Image from the <t>_* kernel parameters via update_image_workitem_ptr. */
#define CONVERT_TO_IMAGE_STRUCT(t) \
    update_image_workitem_ptr(t##_ptr, t##_offset_first_element_in_bytes, t##_stride_x, t##_step_x, t##_stride_y, t##_step_y)

/** Same as CONVERT_TO_IMAGE_STRUCT, but passes 0 as the x and y steps. */
#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(t) \
    update_image_workitem_ptr(t##_ptr, t##_offset_first_element_in_bytes, t##_stride_x, 0, t##_stride_y, 0)
1049
/** Build an Image from the <name>_* parameters of a 3D tensor via update_image_from_tensor3D_workitem_ptr.
 *  (A second, identical definition of this macro used to follow the NO_STEP variant; it was redundant and has been removed.)
 */
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)

/** Same as CONVERT_TENSOR3D_TO_IMAGE_STRUCT, but passes 0 as the x and y steps; the z step is still forwarded. */
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
1058
/** Build a Tensor3D from the <t>_* kernel parameters via update_tensor3D_workitem_ptr. */
#define CONVERT_TO_TENSOR3D_STRUCT(t)                                                                          \
    update_tensor3D_workitem_ptr(t##_ptr, t##_offset_first_element_in_bytes, t##_stride_x, t##_step_x,         \
                                 t##_stride_y, t##_step_y, t##_stride_z, t##_step_z)

/** Same as CONVERT_TO_TENSOR3D_STRUCT, but passes 0 as every step. */
#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(t) \
    update_tensor3D_workitem_ptr(t##_ptr, t##_offset_first_element_in_bytes, t##_stride_x, 0, t##_stride_y, 0, t##_stride_z, 0)

/** Build a Tensor4D from the <t>_* kernel parameters via update_tensor4D_workitem_ptr; mod is forwarded as mod_size. */
#define CONVERT_TO_TENSOR4D_STRUCT(t, mod)                                                                     \
    update_tensor4D_workitem_ptr(t##_ptr, t##_offset_first_element_in_bytes, t##_stride_x, t##_step_x,         \
                                 t##_stride_y, t##_step_y, t##_stride_z, t##_step_z, t##_stride_w, t##_step_w, \
                                 mod)

/** Same as CONVERT_TO_TENSOR4D_STRUCT, but passes 0 as every step. */
#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(t, mod) \
    update_tensor4D_workitem_ptr(t##_ptr, t##_offset_first_element_in_bytes, t##_stride_x, 0, t##_stride_y, 0, t##_stride_z, 0, t##_stride_w, 0, mod)

/** Build a Tensor3D from the <t>_* kernel parameters via tensor3D_ptr_no_update. */
#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(t)                                                            \
    tensor3D_ptr_no_update(t##_ptr, t##_offset_first_element_in_bytes, t##_stride_x, t##_step_x,               \
                           t##_stride_y, t##_step_y, t##_stride_z, t##_step_z)
1076
1077
1078typedef struct Vector
1079{
1080    __global uchar *ptr;
1081    int             offset_first_element_in_bytes;
1082    int             stride_x;
1083} Vector;
1084
1085
1086typedef struct Image
1087{
1088    __global uchar *ptr;
1089    int             offset_first_element_in_bytes;
1090    int             stride_x;
1091    int             stride_y;
1092} Image;
1093
1094
1095typedef struct Tensor3D
1096{
1097    __global uchar *ptr;
1098    int             offset_first_element_in_bytes;
1099    int             stride_x;
1100    int             stride_y;
1101    int             stride_z;
1102} Tensor3D;
1103
1104
1105typedef struct Tensor4D
1106{
1107    __global uchar *ptr;
1108    int             offset_first_element_in_bytes;
1109    int             stride_x;
1110    int             stride_y;
1111    int             stride_z;
1112    int             stride_w;
1113} Tensor4D;
1114
1115
1116inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
1117{
1118    Vector vector =
1119    {
1120        .ptr                           = ptr,
1121        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1122        .stride_x                      = stride_x,
1123    };
1124    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
1125    return vector;
1126}
1127
1128
1129inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
1130{
1131    Image img =
1132    {
1133        .ptr                           = ptr,
1134        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1135        .stride_x                      = stride_x,
1136        .stride_y                      = stride_y
1137    };
1138    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
1139    return img;
1140}
1141
1142
1143inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
1144{
1145    Image img =
1146    {
1147        .ptr                           = ptr,
1148        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1149        .stride_x                      = stride_x,
1150        .stride_y                      = stride_y
1151    };
1152    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
1153    return img;
1154}
1155
1156
1157inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
1158{
1159    Tensor3D tensor =
1160    {
1161        .ptr                           = ptr,
1162        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1163        .stride_x                      = stride_x,
1164        .stride_y                      = stride_y,
1165        .stride_z                      = stride_z
1166    };
1167    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
1168    return tensor;
1169}
1170
1171
1172inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
1173{
1174    Tensor3D tensor =
1175    {
1176        .ptr                           = ptr,
1177        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1178        .stride_x                      = stride_x,
1179        .stride_y                      = stride_y,
1180        .stride_z                      = stride_z
1181    };
1182    return tensor;
1183}
1184
1185inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
1186                                             uint step_w,
1187                                             uint mod_size)
1188{
1189    Tensor4D tensor =
1190    {
1191        .ptr                           = ptr,
1192        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1193        .stride_x                      = stride_x,
1194        .stride_y                      = stride_y,
1195        .stride_z                      = stride_z,
1196        .stride_w                      = stride_w
1197    };
1198
1199    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
1200    return tensor;
1201}
1202
1203
1204inline __global const uchar *vector_offset(const Vector *vec, int x)
1205{
1206    return vec->ptr + x * vec->stride_x;
1207}
1208
1209
1210inline __global uchar *offset(const Image *img, int x, int y)
1211{
1212    return img->ptr + x * img->stride_x + y * img->stride_y;
1213}
1214
1215
1216inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
1217{
1218    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
1219}
1220
1221
1222inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
1223{
1224    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
1225}
1226
1227
1228inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
1229{
1230    uint num_elements = width * height;
1231
1232    const uint z = index / num_elements;
1233
1234    index %= num_elements;
1235
1236    const uint y = index / width;
1237
1238    index %= width;
1239
1240    const uint x = index;
1241
1242    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes;
1243}
1244
1245#endif
1246
1247#ifndef ARM_COMPUTE_HELPERS_ASYMM_H
1248#define ARM_COMPUTE_HELPERS_ASYMM_H
1249
1250
1251#ifndef ARM_COMPUTE_HELPER_H
1252#define ARM_COMPUTE_HELPER_H
1253
1254
1255
1256
/** STORE_ROW_<m>: store rows 0..m-1 with VSTORE(n0).
 *
 * Row i stores the variable <row><i> (hex digit suffix 0-9, A-F) at offset 0 to
 * (__global dtype *)(dst + i * stride_y + <zoff><i>). Each macro first expands
 * the macro for m - 1 rows and then appends its own store.
 */
#define STORE_ROW_1(n0, dtype, row, dst, stride_y, zoff) \
    VSTORE(n0)(row##0, 0, (__global dtype *)(dst + 0 * stride_y + zoff##0));

#define STORE_ROW_2(n0, dtype, row, dst, stride_y, zoff) \
    STORE_ROW_1(n0, dtype, row, dst, stride_y, zoff)     \
    VSTORE(n0)(row##1, 0, (__global dtype *)(dst + 1 * stride_y + zoff##1));

#define STORE_ROW_3(n0, dtype, row, dst, stride_y, zoff) \
    STORE_ROW_2(n0, dtype, row, dst, stride_y, zoff)     \
    VSTORE(n0)(row##2, 0, (__global dtype *)(dst + 2 * stride_y + zoff##2));

#define STORE_ROW_4(n0, dtype, row, dst, stride_y, zoff) \
    STORE_ROW_3(n0, dtype, row, dst, stride_y, zoff)     \
    VSTORE(n0)(row##3, 0, (__global dtype *)(dst + 3 * stride_y + zoff##3));

#define STORE_ROW_5(n0, dtype, row, dst, stride_y, zoff) \
    STORE_ROW_4(n0, dtype, row, dst, stride_y, zoff)     \
    VSTORE(n0)(row##4, 0, (__global dtype *)(dst + 4 * stride_y + zoff##4));

#define STORE_ROW_6(n0, dtype, row, dst, stride_y, zoff) \
    STORE_ROW_5(n0, dtype, row, dst, stride_y, zoff)     \
    VSTORE(n0)(row##5, 0, (__global dtype *)(dst + 5 * stride_y + zoff##5));

#define STORE_ROW_7(n0, dtype, row, dst, stride_y, zoff) \
    STORE_ROW_6(n0, dtype, row, dst, stride_y, zoff)     \
    VSTORE(n0)(row##6, 0, (__global dtype *)(dst + 6 * stride_y + zoff##6));

#define STORE_ROW_8(n0, dtype, row, dst, stride_y, zoff) \
    STORE_ROW_7(n0, dtype, row, dst, stride_y, zoff)     \
    VSTORE(n0)(row##7, 0, (__global dtype *)(dst + 7 * stride_y + zoff##7));

#define STORE_ROW_9(n0, dtype, row, dst, stride_y, zoff) \
    STORE_ROW_8(n0, dtype, row, dst, stride_y, zoff)     \
    VSTORE(n0)(row##8, 0, (__global dtype *)(dst + 8 * stride_y + zoff##8));

#define STORE_ROW_10(n0, dtype, row, dst, stride_y, zoff) \
    STORE_ROW_9(n0, dtype, row, dst, stride_y, zoff)      \
    VSTORE(n0)(row##9, 0, (__global dtype *)(dst + 9 * stride_y + zoff##9));

#define STORE_ROW_11(n0, dtype, row, dst, stride_y, zoff) \
    STORE_ROW_10(n0, dtype, row, dst, stride_y, zoff)     \
    VSTORE(n0)(row##A, 0, (__global dtype *)(dst + 10 * stride_y + zoff##A));

#define STORE_ROW_12(n0, dtype, row, dst, stride_y, zoff) \
    STORE_ROW_11(n0, dtype, row, dst, stride_y, zoff)     \
    VSTORE(n0)(row##B, 0, (__global dtype *)(dst + 11 * stride_y + zoff##B));

#define STORE_ROW_13(n0, dtype, row, dst, stride_y, zoff) \
    STORE_ROW_12(n0, dtype, row, dst, stride_y, zoff)     \
    VSTORE(n0)(row##C, 0, (__global dtype *)(dst + 12 * stride_y + zoff##C));

#define STORE_ROW_14(n0, dtype, row, dst, stride_y, zoff) \
    STORE_ROW_13(n0, dtype, row, dst, stride_y, zoff)     \
    VSTORE(n0)(row##D, 0, (__global dtype *)(dst + 13 * stride_y + zoff##D));

#define STORE_ROW_15(n0, dtype, row, dst, stride_y, zoff) \
    STORE_ROW_14(n0, dtype, row, dst, stride_y, zoff)     \
    VSTORE(n0)(row##E, 0, (__global dtype *)(dst + 14 * stride_y + zoff##E));

#define STORE_ROW_16(n0, dtype, row, dst, stride_y, zoff) \
    STORE_ROW_15(n0, dtype, row, dst, stride_y, zoff)     \
    VSTORE(n0)(row##F, 0, (__global dtype *)(dst + 15 * stride_y + zoff##F));
1335
1336
1337
/* Converting row stores for rows 1..16 of a block.
 *
 * CONVERT_STORE_ROW_n first expands CONVERT_STORE_ROW_(n-1) and then stores
 * row n-1: BASENAME<i> is passed through
 * CONVERT_SAT(..., VEC_DATA_TYPE(DATA_TYPE, N0)) and the result is written
 * with VSTORE(N0) at offset 0 to PTR + i * STRIDE_Y + Z<i>, cast to a
 * __global DATA_TYPE pointer. The suffix <i> is a single hex digit, so rows
 * 10..15 are pasted as A..F.
 *
 * N0         vector width forwarded to VSTORE and VEC_DATA_TYPE
 * DATA_TYPE  destination element type
 * BASENAME   prefix of the per-row source variables
 * PTR        destination base address
 * STRIDE_Y   per-row step added to PTR (multiplied by the row index)
 * Z          prefix of the per-row extra offsets added to the address
 *
 * CONVERT_STORE_ROW_10 previously named its second parameter DATA while its
 * body referred to DATA_TYPE, so the caller's type argument was ignored; the
 * parameter is now DATA_TYPE like every other row.
 */
#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
1416
1417
1418
1419
/* Stores a block of ROWS rows by dispatching to STORE_ROW_<ROWS>.
 * STORE_BLOCK goes through STORE_BLOCK_STR so that a ROWS argument which is
 * itself a macro is expanded before it is pasted onto STORE_ROW_.
 * The remaining arguments are forwarded unchanged to STORE_ROW_<ROWS>.
 */
#define STORE_BLOCK_STR(ROWS, VEC, T, NAME, DST, STEP_Y, ZOFF) \
    STORE_ROW_##ROWS(VEC, T, NAME, DST, STEP_Y, ZOFF)
#define STORE_BLOCK(ROWS, VEC, T, NAME, DST, STEP_Y, ZOFF) \
    STORE_BLOCK_STR(ROWS, VEC, T, NAME, DST, STEP_Y, ZOFF)
1422
1423
1424
/* Converting counterpart of the block store: dispatches to
 * CONVERT_STORE_ROW_<ROWS>. CONVERT_STORE_BLOCK goes through
 * CONVERT_STORE_BLOCK_STR so that a macro ROWS argument is expanded before
 * it is pasted. The remaining arguments are forwarded unchanged.
 */
#define CONVERT_STORE_BLOCK_STR(ROWS, VEC, T, NAME, DST, STEP_Y, ZOFF) \
    CONVERT_STORE_ROW_##ROWS(VEC, T, NAME, DST, STEP_Y, ZOFF)
#define CONVERT_STORE_BLOCK(ROWS, VEC, T, NAME, DST, STEP_Y, ZOFF) \
    CONVERT_STORE_BLOCK_STR(ROWS, VEC, T, NAME, DST, STEP_Y, ZOFF)
1427
1428
1429
/* Partial-width row stores for rows 1..16 of a block.
 *
 * STORE_ROW_PARTIAL_n first expands STORE_ROW_PARTIAL_(n-1) and then stores
 * row n-1: NAME<i> is handed to VSTORE_PARTIAL(VEC, KEEP) together with
 * offset 0 and the address DST + i * STEP_Y + ZOFF<i>, cast to a
 * __global T pointer. The suffix <i> is a single hex digit, so rows 10..15
 * are pasted as A..F.
 *
 * VEC     vector width of the row variables (first VSTORE_PARTIAL argument)
 * KEEP    second VSTORE_PARTIAL argument (the store size)
 * T       element type used for the destination pointer cast
 * NAME    prefix of the per-row source variables
 * DST     destination base address
 * STEP_Y  per-row step added to DST (multiplied by the row index)
 * ZOFF    prefix of the per-row extra offsets added to the address
 */
#define STORE_ROW_PARTIAL_1(VEC, KEEP, T, NAME, DST, STEP_Y, ZOFF) \
    VSTORE_PARTIAL(VEC, KEEP)(NAME##0, 0, (__global T *)(DST + 0 * STEP_Y + ZOFF##0));

#define STORE_ROW_PARTIAL_2(VEC, KEEP, T, NAME, DST, STEP_Y, ZOFF) \
    STORE_ROW_PARTIAL_1(VEC, KEEP, T, NAME, DST, STEP_Y, ZOFF)     \
    VSTORE_PARTIAL(VEC, KEEP)(NAME##1, 0, (__global T *)(DST + 1 * STEP_Y + ZOFF##1));

#define STORE_ROW_PARTIAL_3(VEC, KEEP, T, NAME, DST, STEP_Y, ZOFF) \
    STORE_ROW_PARTIAL_2(VEC, KEEP, T, NAME, DST, STEP_Y, ZOFF)     \
    VSTORE_PARTIAL(VEC, KEEP)(NAME##2, 0, (__global T *)(DST + 2 * STEP_Y + ZOFF##2));

#define STORE_ROW_PARTIAL_4(VEC, KEEP, T, NAME, DST, STEP_Y, ZOFF) \
    STORE_ROW_PARTIAL_3(VEC, KEEP, T, NAME, DST, STEP_Y, ZOFF)     \
    VSTORE_PARTIAL(VEC, KEEP)(NAME##3, 0, (__global T *)(DST + 3 * STEP_Y + ZOFF##3));

#define STORE_ROW_PARTIAL_5(VEC, KEEP, T, NAME, DST, STEP_Y, ZOFF) \
    STORE_ROW_PARTIAL_4(VEC, KEEP, T, NAME, DST, STEP_Y, ZOFF)     \
    VSTORE_PARTIAL(VEC, KEEP)(NAME##4, 0, (__global T *)(DST + 4 * STEP_Y + ZOFF##4));

#define STORE_ROW_PARTIAL_6(VEC, KEEP, T, NAME, DST, STEP_Y, ZOFF) \
    STORE_ROW_PARTIAL_5(VEC, KEEP, T, NAME, DST, STEP_Y, ZOFF)     \
    VSTORE_PARTIAL(VEC, KEEP)(NAME##5, 0, (__global T *)(DST + 5 * STEP_Y + ZOFF##5));

#define STORE_ROW_PARTIAL_7(VEC, KEEP, T, NAME, DST, STEP_Y, ZOFF) \
    STORE_ROW_PARTIAL_6(VEC, KEEP, T, NAME, DST, STEP_Y, ZOFF)     \
    VSTORE_PARTIAL(VEC, KEEP)(NAME##6, 0, (__global T *)(DST + 6 * STEP_Y + ZOFF##6));

#define STORE_ROW_PARTIAL_8(VEC, KEEP, T, NAME, DST, STEP_Y, ZOFF) \
    STORE_ROW_PARTIAL_7(VEC, KEEP, T, NAME, DST, STEP_Y, ZOFF)     \
    VSTORE_PARTIAL(VEC, KEEP)(NAME##7, 0, (__global T *)(DST + 7 * STEP_Y + ZOFF##7));

#define STORE_ROW_PARTIAL_9(VEC, KEEP, T, NAME, DST, STEP_Y, ZOFF) \
    STORE_ROW_PARTIAL_8(VEC, KEEP, T, NAME, DST, STEP_Y, ZOFF)     \
    VSTORE_PARTIAL(VEC, KEEP)(NAME##8, 0, (__global T *)(DST + 8 * STEP_Y + ZOFF##8));

#define STORE_ROW_PARTIAL_10(VEC, KEEP, T, NAME, DST, STEP_Y, ZOFF) \
    STORE_ROW_PARTIAL_9(VEC, KEEP, T, NAME, DST, STEP_Y, ZOFF)      \
    VSTORE_PARTIAL(VEC, KEEP)(NAME##9, 0, (__global T *)(DST + 9 * STEP_Y + ZOFF##9));

#define STORE_ROW_PARTIAL_11(VEC, KEEP, T, NAME, DST, STEP_Y, ZOFF) \
    STORE_ROW_PARTIAL_10(VEC, KEEP, T, NAME, DST, STEP_Y, ZOFF)     \
    VSTORE_PARTIAL(VEC, KEEP)(NAME##A, 0, (__global T *)(DST + 10 * STEP_Y + ZOFF##A));

#define STORE_ROW_PARTIAL_12(VEC, KEEP, T, NAME, DST, STEP_Y, ZOFF) \
    STORE_ROW_PARTIAL_11(VEC, KEEP, T, NAME, DST, STEP_Y, ZOFF)     \
    VSTORE_PARTIAL(VEC, KEEP)(NAME##B, 0, (__global T *)(DST + 11 * STEP_Y + ZOFF##B));

#define STORE_ROW_PARTIAL_13(VEC, KEEP, T, NAME, DST, STEP_Y, ZOFF) \
    STORE_ROW_PARTIAL_12(VEC, KEEP, T, NAME, DST, STEP_Y, ZOFF)     \
    VSTORE_PARTIAL(VEC, KEEP)(NAME##C, 0, (__global T *)(DST + 12 * STEP_Y + ZOFF##C));

#define STORE_ROW_PARTIAL_14(VEC, KEEP, T, NAME, DST, STEP_Y, ZOFF) \
    STORE_ROW_PARTIAL_13(VEC, KEEP, T, NAME, DST, STEP_Y, ZOFF)     \
    VSTORE_PARTIAL(VEC, KEEP)(NAME##D, 0, (__global T *)(DST + 13 * STEP_Y + ZOFF##D));

#define STORE_ROW_PARTIAL_15(VEC, KEEP, T, NAME, DST, STEP_Y, ZOFF) \
    STORE_ROW_PARTIAL_14(VEC, KEEP, T, NAME, DST, STEP_Y, ZOFF)     \
    VSTORE_PARTIAL(VEC, KEEP)(NAME##E, 0, (__global T *)(DST + 14 * STEP_Y + ZOFF##E));

#define STORE_ROW_PARTIAL_16(VEC, KEEP, T, NAME, DST, STEP_Y, ZOFF) \
    STORE_ROW_PARTIAL_15(VEC, KEEP, T, NAME, DST, STEP_Y, ZOFF)     \
    VSTORE_PARTIAL(VEC, KEEP)(NAME##F, 0, (__global T *)(DST + 15 * STEP_Y + ZOFF##F));
1508
1509
1510
/* Stores ROWS_KEPT rows of a block, dispatching to
 * STORE_ROW_PARTIAL_<ROWS_KEPT>. Note the argument order change: the row
 * macro receives the vector width first and the store size second.
 * STORE_BLOCK_PARTIAL goes through STORE_BLOCK_PARTIAL_STR so that a macro
 * ROWS_KEPT argument is expanded before it is pasted.
 */
#define STORE_BLOCK_PARTIAL_STR(ROWS_KEPT, VEC_KEPT, VEC, T, NAME, DST, STEP_Y, ZOFF) \
    STORE_ROW_PARTIAL_##ROWS_KEPT(VEC, VEC_KEPT, T, NAME, DST, STEP_Y, ZOFF)
#define STORE_BLOCK_PARTIAL(ROWS_KEPT, VEC_KEPT, VEC, T, NAME, DST, STEP_Y, ZOFF) \
    STORE_BLOCK_PARTIAL_STR(ROWS_KEPT, VEC_KEPT, VEC, T, NAME, DST, STEP_Y, ZOFF)
1513
/* Stores a block, choosing at run time between full and partial sizes in
 * both directions.
 *
 *   PARTIAL_COND_X  PARTIAL_COND_Y  rows stored        columns stored
 *   false           false           M0                 N0
 *   false           true            PARTIAL_STORE_M0   N0
 *   true            false           M0                 PARTIAL_STORE_N0
 *   true            true            PARTIAL_STORE_M0   PARTIAL_STORE_N0
 *
 * Every branch expands STORE_BLOCK_PARTIAL with N0 as the vector width.
 * The selection is written as nested if/else so that each condition
 * argument is evaluated exactly once (X first, then Y); the previous
 * else-if chain could evaluate a condition expression more than once.
 * The expansion is a single if/else statement, as before.
 */
#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    if(PARTIAL_COND_X)                                                                                          \
    {                                                                                                           \
        if(PARTIAL_COND_Y)                                                                                      \
        {                                                                                                       \
            STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
        }                                                                                                       \
        else                                                                                                    \
        {                                                                                                       \
            STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);               \
        }                                                                                                       \
    }                                                                                                           \
    else                                                                                                        \
    {                                                                                                           \
        if(PARTIAL_COND_Y)                                                                                      \
        {                                                                                                       \
            STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);               \
        }                                                                                                       \
        else                                                                                                    \
        {                                                                                                       \
            STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                             \
        }                                                                                                       \
    }
1531
/* Stores ROWS rows, choosing the column count at run time: when EDGE_X is
 * true only VEC_LEFT elements per row are stored, otherwise the full VEC.
 * Both branches expand STORE_BLOCK_PARTIAL with VEC as the vector width.
 * The expansion is a single if/else statement.
 */
#define STORE_BLOCK_PARTIAL_IN_X(ROWS, VEC, T, NAME, DST, STEP_Y, ZOFF, VEC_LEFT, EDGE_X) \
    if(EDGE_X)                                                                  \
    {                                                                           \
        STORE_BLOCK_PARTIAL(ROWS, VEC_LEFT, VEC, T, NAME, DST, STEP_Y, ZOFF);   \
    }                                                                           \
    else                                                                        \
    {                                                                           \
        STORE_BLOCK_PARTIAL(ROWS, VEC, VEC, T, NAME, DST, STEP_Y, ZOFF);        \
    }
1541
/* Stores full-width rows, choosing the row count at run time: when EDGE_Y
 * is true only ROWS_LEFT rows are stored, otherwise all ROWS.
 * Both branches expand STORE_BLOCK_PARTIAL with VEC as both the store size
 * and the vector width. The expansion is a single if/else statement.
 */
#define STORE_BLOCK_PARTIAL_IN_Y(ROWS, VEC, T, NAME, DST, STEP_Y, ZOFF, ROWS_LEFT, EDGE_Y) \
    if(EDGE_Y)                                                                  \
    {                                                                           \
        STORE_BLOCK_PARTIAL(ROWS_LEFT, VEC, VEC, T, NAME, DST, STEP_Y, ZOFF);   \
    }                                                                           \
    else                                                                        \
    {                                                                           \
        STORE_BLOCK_PARTIAL(ROWS, VEC, VEC, T, NAME, DST, STEP_Y, ZOFF);        \
    }
1551
1552
#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)

/* STORE_BLOCK_BOUNDARY_AWARE is bound at preprocessing time, based on the
 * build-time values of PARTIAL_STORE_M0 and PARTIAL_STORE_N0. It is only
 * defined when both of those macros are defined. All four variants take the
 * same eleven arguments; each forwards only the ones its target needs.
 */
#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0

/* Both leftovers are zero: plain full store, the last four arguments are ignored. */
#define STORE_BLOCK_BOUNDARY_AWARE(ROWS, VEC, T, NAME, DST, STEP_Y, ZOFF, ROWS_LEFT, VEC_LEFT, EDGE_Y, EDGE_X) \
    STORE_BLOCK(ROWS, VEC, T, NAME, DST, STEP_Y, ZOFF)

#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0

/* Leftover rows only: run-time selection on EDGE_Y. */
#define STORE_BLOCK_BOUNDARY_AWARE(ROWS, VEC, T, NAME, DST, STEP_Y, ZOFF, ROWS_LEFT, VEC_LEFT, EDGE_Y, EDGE_X) \
    STORE_BLOCK_PARTIAL_IN_Y(ROWS, VEC, T, NAME, DST, STEP_Y, ZOFF, ROWS_LEFT, EDGE_Y)

#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0

/* Leftover columns only: run-time selection on EDGE_X. */
#define STORE_BLOCK_BOUNDARY_AWARE(ROWS, VEC, T, NAME, DST, STEP_Y, ZOFF, ROWS_LEFT, VEC_LEFT, EDGE_Y, EDGE_X) \
    STORE_BLOCK_PARTIAL_IN_X(ROWS, VEC, T, NAME, DST, STEP_Y, ZOFF, VEC_LEFT, EDGE_X)

#else

/* Every other combination: run-time selection on both conditions. */
#define STORE_BLOCK_BOUNDARY_AWARE(ROWS, VEC, T, NAME, DST, STEP_Y, ZOFF, ROWS_LEFT, VEC_LEFT, EDGE_Y, EDGE_X) \
    STORE_BLOCK_PARTIAL_IN_X_AND_Y(ROWS, VEC, T, NAME, DST, STEP_Y, ZOFF, ROWS_LEFT, VEC_LEFT, EDGE_Y, EDGE_X)

#endif

#endif
1579
1580
/* Computes a start row from an index y and a row count M0.
 *
 * When PARTIAL_STORE_M0 is defined at build time the result is
 *   max(0, y * M0 - ((M0 - PARTIAL_STORE_M0) % M0))
 * evaluated in int and cast to uint; otherwise it is simply y * M0 cast to
 * uint and the third argument is ignored.
 *
 * Every argument is parenthesized in the expansion so that compound
 * expressions (for example "a + b" passed as y) bind as a unit.
 */
#if defined(PARTIAL_STORE_M0)

#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(max(0, (int)((y) * (M0)) - (int)(((M0) - (PARTIAL_STORE_M0)) % (M0)))))
#else
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)((y) * (M0)))
#endif
1589
1590
1591
/* Stores a single vector, choosing between the full width and a leftover
 * width at run time. It forwards to STORE_BLOCK_PARTIAL_IN_X with a row
 * count of 1 and with 0 for both the row stride and the Z offset prefix.
 *
 * name      prefix of the source variable (row suffix 0 is appended downstream)
 * type      element type
 * dst       destination address
 * width     full vector width
 * tail      number of elements stored when use_tail is true
 * use_tail  run-time condition selecting the leftover store
 */
#define STORE_VECTOR_SELECT(name, type, dst, width, tail, use_tail) \
    STORE_BLOCK_PARTIAL_IN_X(1, width, type, name, dst, 0, 0, tail, use_tail)
1594
1595
/* Optional OpenCL extensions. Each pragma is emitted only when both the
 * ARM_COMPUTE_* macro and the matching extension macro are defined. */

/* cl_khr_fp16 */
1596#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
1597#pragma OPENCL EXTENSION cl_khr_fp16 : enable
1598#endif
1599
/* cl_arm_integer_dot_product_int8 */
1600#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
1601#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
1602#endif
1603
/* cl_arm_integer_dot_product_accumulate_int8 */
1604#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
1605#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
1606#endif
1607
/* cl_arm_printf, gated on ARM_COMPUTE_DEBUG_ENABLED */
1608#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
1609#pragma OPENCL EXTENSION cl_arm_printf : enable
1610#endif
1611
/* Numeric identifiers for the GPU architecture families named below. */
#define GPU_ARCH_MIDGARD   0x100
#define GPU_ARCH_BIFROST   0x200
#define GPU_ARCH_VALHALL   0x300
1615
1616
/* Pastes two tokens together. The arguments are pasted as written: a macro
 * passed as an argument is not expanded first. */
#define CONCAT(lhs, rhs) lhs##rhs

/* Expands to its argument unchanged (forces one more round of macro expansion). */
#define EXPAND(tokens) tokens

/* Limits value to the range [lo, hi] using max() followed by min(). */
#define CLAMP(value, lo, hi) min(max(value, lo), hi)
1624
1625
/* REV<n>(v): the n components of v in reverse order, written as a swizzle.
 * REV1 returns v itself. */
#define REV1(v)  ((v))
#define REV2(v)  ((v).s10)
#define REV3(v)  ((v).s210)
#define REV4(v)  ((v).s3210)
#define REV8(v)  ((v).s76543210)
#define REV16(v) ((v).sFEDCBA9876543210)

/* REVERSE(v, width) dispatches to REV<width>. It goes through REVERSE_STR
 * so that a macro width argument is expanded before it is pasted. */
#define REVERSE_STR(v, width) REV##width((v))
#define REVERSE(v, width) REVERSE_STR(v, width)
1637
1638
1639
/* ROT<n>_<k>(v): the n components of v rotated by k positions, written as a
 * swizzle in which component i of the result is component (i - k) mod n of
 * v. Rotating by 0 or by n returns v itself. */
#define ROT1_0(v) ((v))
#define ROT1_1(v) ((v))

#define ROT2_0(v) ((v))
#define ROT2_1(v) ((v).s10)
#define ROT2_2(v) ((v))

#define ROT3_0(v) ((v))
#define ROT3_1(v) ((v).s201)
#define ROT3_2(v) ((v).s120)
#define ROT3_3(v) ((v))

#define ROT4_0(v) ((v))
#define ROT4_1(v) ((v).s3012)
#define ROT4_2(v) ((v).s2301)
#define ROT4_3(v) ((v).s1230)
#define ROT4_4(v) ((v))

#define ROT8_0(v) ((v))
#define ROT8_1(v) ((v).s70123456)
#define ROT8_2(v) ((v).s67012345)
#define ROT8_3(v) ((v).s56701234)
#define ROT8_4(v) ((v).s45670123)
#define ROT8_5(v) ((v).s34567012)
#define ROT8_6(v) ((v).s23456701)
#define ROT8_7(v) ((v).s12345670)
#define ROT8_8(v) ((v))

#define ROT16_0(v)  ((v))
#define ROT16_1(v)  ((v).sF0123456789ABCDE)
#define ROT16_2(v)  ((v).sEF0123456789ABCD)
#define ROT16_3(v)  ((v).sDEF0123456789ABC)
#define ROT16_4(v)  ((v).sCDEF0123456789AB)
#define ROT16_5(v)  ((v).sBCDEF0123456789A)
#define ROT16_6(v)  ((v).sABCDEF0123456789)
#define ROT16_7(v)  ((v).s9ABCDEF012345678)
#define ROT16_8(v)  ((v).s89ABCDEF01234567)
#define ROT16_9(v)  ((v).s789ABCDEF0123456)
#define ROT16_10(v) ((v).s6789ABCDEF012345)
#define ROT16_11(v) ((v).s56789ABCDEF01234)
#define ROT16_12(v) ((v).s456789ABCDEF0123)
#define ROT16_13(v) ((v).s3456789ABCDEF012)
#define ROT16_14(v) ((v).s23456789ABCDEF01)
#define ROT16_15(v) ((v).s123456789ABCDEF0)
#define ROT16_16(v) ((v))

/* ROTATE(v, width, shift) dispatches to ROT<width>_<shift>. It goes through
 * ROTATE_STR so that macro width/shift arguments are expanded before they
 * are pasted. */
#define ROTATE_STR(v, width, shift) ROT##width##_##shift(v)
#define ROTATE(v, width, shift) ROTATE_STR(v, width, shift)
1690
1691
1692
/* V_OFFS<n>(T): a T<n> vector literal holding the values 0, 1, ..., n-1. */
#define V_OFFS1(T)  (T##1)(0)
#define V_OFFS2(T)  (T##2)(0, 1)
#define V_OFFS3(T)  (T##3)(0, 1, 2)
#define V_OFFS4(T)  (T##4)(0, 1, 2, 3)
#define V_OFFS8(T)  (T##8)(0, 1, 2, 3, 4, 5, 6, 7)
#define V_OFFS16(T) (T##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)

/* VEC_OFFS(T, width) dispatches to V_OFFS<width>. It goes through
 * VEC_OFFS_STR so that a macro width argument is expanded before it is
 * pasted. */
#define VEC_OFFS_STR(T, width) V_OFFS##width(T)
#define VEC_OFFS(T, width) VEC_OFFS_STR(T, width)
1704
1705
/* VLOAD(width) names the vload<width> function or macro. It goes through
 * VLOAD_STR so that a macro width argument is expanded before it is pasted. */
#define VLOAD_STR(width) vload##width
#define VLOAD(width) VLOAD_STR(width)

/* VLOAD_PARTIAL(width, count) names the vload_partial_<width>_<count> entry
 * of the partial-load table, again with the arguments expanded first. */
#define VLOAD_PARTIAL_STR(width, count) vload_partial_##width##_##count
#define VLOAD_PARTIAL(width, count) VLOAD_PARTIAL_STR(width, count)

/* Placeholder used for table entries that load nothing: expands to an empty
 * block and discards all three arguments. */
#define NO_LOAD(dst, offset, src) \
    {                             \
    }
1716
1717
/* Partial-load dispatch table: vload_partial_<N>_<K> selects the macro that
 * loads K elements into a vector of width N. Entries with K == 0 or K > N
 * map to NO_LOAD; the others map to vload_partial_<K>.
 *
 * NOTE(review): vload_partial_1_1 maps to vload1, which takes (OFFSET, PTR),
 * while every other non-empty entry maps to a three-argument
 * (DATA, OFFSET, PTR) macro -- confirm how the width-1 entry is invoked.
 */

/* Width 1 */
#define vload_partial_1_0   NO_LOAD
#define vload_partial_1_1   vload1
#define vload_partial_1_2   NO_LOAD
#define vload_partial_1_3   NO_LOAD
#define vload_partial_1_4   NO_LOAD
#define vload_partial_1_5   NO_LOAD
#define vload_partial_1_6   NO_LOAD
#define vload_partial_1_7   NO_LOAD
#define vload_partial_1_8   NO_LOAD
#define vload_partial_1_9   NO_LOAD
#define vload_partial_1_10  NO_LOAD
#define vload_partial_1_11  NO_LOAD
#define vload_partial_1_12  NO_LOAD
#define vload_partial_1_13  NO_LOAD
#define vload_partial_1_14  NO_LOAD
#define vload_partial_1_15  NO_LOAD
#define vload_partial_1_16  NO_LOAD

/* Width 2 */
#define vload_partial_2_0   NO_LOAD
#define vload_partial_2_1   vload_partial_1
#define vload_partial_2_2   vload_partial_2
#define vload_partial_2_3   NO_LOAD
#define vload_partial_2_4   NO_LOAD
#define vload_partial_2_5   NO_LOAD
#define vload_partial_2_6   NO_LOAD
#define vload_partial_2_7   NO_LOAD
#define vload_partial_2_8   NO_LOAD
#define vload_partial_2_9   NO_LOAD
#define vload_partial_2_10  NO_LOAD
#define vload_partial_2_11  NO_LOAD
#define vload_partial_2_12  NO_LOAD
#define vload_partial_2_13  NO_LOAD
#define vload_partial_2_14  NO_LOAD
#define vload_partial_2_15  NO_LOAD
#define vload_partial_2_16  NO_LOAD

/* Width 3 */
#define vload_partial_3_0   NO_LOAD
#define vload_partial_3_1   vload_partial_1
#define vload_partial_3_2   vload_partial_2
#define vload_partial_3_3   vload_partial_3
#define vload_partial_3_4   NO_LOAD
#define vload_partial_3_5   NO_LOAD
#define vload_partial_3_6   NO_LOAD
#define vload_partial_3_7   NO_LOAD
#define vload_partial_3_8   NO_LOAD
#define vload_partial_3_9   NO_LOAD
#define vload_partial_3_10  NO_LOAD
#define vload_partial_3_11  NO_LOAD
#define vload_partial_3_12  NO_LOAD
#define vload_partial_3_13  NO_LOAD
#define vload_partial_3_14  NO_LOAD
#define vload_partial_3_15  NO_LOAD
#define vload_partial_3_16  NO_LOAD

/* Width 4 */
#define vload_partial_4_0   NO_LOAD
#define vload_partial_4_1   vload_partial_1
#define vload_partial_4_2   vload_partial_2
#define vload_partial_4_3   vload_partial_3
#define vload_partial_4_4   vload_partial_4
#define vload_partial_4_5   NO_LOAD
#define vload_partial_4_6   NO_LOAD
#define vload_partial_4_7   NO_LOAD
#define vload_partial_4_8   NO_LOAD
#define vload_partial_4_9   NO_LOAD
#define vload_partial_4_10  NO_LOAD
#define vload_partial_4_11  NO_LOAD
#define vload_partial_4_12  NO_LOAD
#define vload_partial_4_13  NO_LOAD
#define vload_partial_4_14  NO_LOAD
#define vload_partial_4_15  NO_LOAD
#define vload_partial_4_16  NO_LOAD

/* Width 8 */
#define vload_partial_8_0   NO_LOAD
#define vload_partial_8_1   vload_partial_1
#define vload_partial_8_2   vload_partial_2
#define vload_partial_8_3   vload_partial_3
#define vload_partial_8_4   vload_partial_4
#define vload_partial_8_5   vload_partial_5
#define vload_partial_8_6   vload_partial_6
#define vload_partial_8_7   vload_partial_7
#define vload_partial_8_8   vload_partial_8
#define vload_partial_8_9   NO_LOAD
#define vload_partial_8_10  NO_LOAD
#define vload_partial_8_11  NO_LOAD
#define vload_partial_8_12  NO_LOAD
#define vload_partial_8_13  NO_LOAD
#define vload_partial_8_14  NO_LOAD
#define vload_partial_8_15  NO_LOAD
#define vload_partial_8_16  NO_LOAD

/* Width 16 */
#define vload_partial_16_0  NO_LOAD
#define vload_partial_16_1  vload_partial_1
#define vload_partial_16_2  vload_partial_2
#define vload_partial_16_3  vload_partial_3
#define vload_partial_16_4  vload_partial_4
#define vload_partial_16_5  vload_partial_5
#define vload_partial_16_6  vload_partial_6
#define vload_partial_16_7  vload_partial_7
#define vload_partial_16_8  vload_partial_8
#define vload_partial_16_9  vload_partial_9
#define vload_partial_16_10 vload_partial_10
#define vload_partial_16_11 vload_partial_11
#define vload_partial_16_12 vload_partial_12
#define vload_partial_16_13 vload_partial_13
#define vload_partial_16_14 vload_partial_14
#define vload_partial_16_15 vload_partial_15
#define vload_partial_16_16 vload_partial_16
1825
1826
/* vload_partial_<K>(DST, OFFS, SRC): loads K elements from SRC into the
 * first K components of DST. Counts 1, 2, 3, 4, 8 and 16 are a single
 * assignment from vload<K>; the other counts are split into a 4- or
 * 8-element head followed by a tail loaded from SRC + 4 or SRC + 8.
 *
 * NOTE(review): the head and the tail are given the same OFFS while the tail
 * also advances SRC by a fixed element count; this looks consistent only
 * for OFFS == 0 -- confirm against the callers.
 */
#define vload_partial_1(DST, OFFS, SRC) \
    DST.s0 = vload1(OFFS, SRC);

#define vload_partial_2(DST, OFFS, SRC) \
    DST.s01 = vload2(OFFS, SRC);

#define vload_partial_3(DST, OFFS, SRC) \
    DST.s012 = vload3(OFFS, SRC);

#define vload_partial_4(DST, OFFS, SRC) \
    DST.s0123 = vload4(OFFS, SRC);

#define vload_partial_5(DST, OFFS, SRC)    \
    vload_partial_4(DST.s0123, OFFS, SRC); \
    DST.s4 = vload1(OFFS, SRC + 4);

#define vload_partial_6(DST, OFFS, SRC)    \
    vload_partial_4(DST.s0123, OFFS, SRC); \
    vload_partial_2(DST.s45, OFFS, SRC + 4);

#define vload_partial_7(DST, OFFS, SRC)    \
    vload_partial_4(DST.s0123, OFFS, SRC); \
    vload_partial_3(DST.s456, OFFS, SRC + 4);

#define vload_partial_8(DST, OFFS, SRC) \
    DST.s01234567 = vload8(OFFS, SRC);

#define vload_partial_9(DST, OFFS, SRC)        \
    vload_partial_8(DST.s01234567, OFFS, SRC); \
    DST.s8 = vload1(OFFS, SRC + 8);

#define vload_partial_10(DST, OFFS, SRC)       \
    vload_partial_8(DST.s01234567, OFFS, SRC); \
    vload_partial_2(DST.s89, OFFS, SRC + 8);

#define vload_partial_11(DST, OFFS, SRC)       \
    vload_partial_8(DST.s01234567, OFFS, SRC); \
    vload_partial_3(DST.s89A, OFFS, SRC + 8);

#define vload_partial_12(DST, OFFS, SRC)       \
    vload_partial_8(DST.s01234567, OFFS, SRC); \
    vload_partial_4(DST.s89AB, OFFS, SRC + 8);

/* For 13..15 elements the tail macro is handed the whole upper half
 * (s89ABCDEF) and applies its own component selectors to it. */
#define vload_partial_13(DST, OFFS, SRC)       \
    vload_partial_8(DST.s01234567, OFFS, SRC); \
    vload_partial_5(DST.s89ABCDEF, OFFS, SRC + 8);

#define vload_partial_14(DST, OFFS, SRC)       \
    vload_partial_8(DST.s01234567, OFFS, SRC); \
    vload_partial_6(DST.s89ABCDEF, OFFS, SRC + 8);

#define vload_partial_15(DST, OFFS, SRC)       \
    vload_partial_8(DST.s01234567, OFFS, SRC); \
    vload_partial_7(DST.s89ABCDEF, OFFS, SRC + 8);

#define vload_partial_16(DST, OFFS, SRC) \
    DST = vload16(OFFS, SRC);
1884
1885
1886
/* PIXEL_UNIT<n>: n / 4 for the vector sizes 4, 8 and 16. */
#define PIXEL_UNIT4  1
#define PIXEL_UNIT8  2
#define PIXEL_UNIT16 4

/* Maps a vector size to its PIXEL_UNIT<size> value. The outer macro goes
 * through the _STR helper so that a macro size argument is expanded before
 * it is pasted. */
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(size) PIXEL_UNIT##size
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(size)
1894
1895
/* read_image2d_<type>x<k>(img, x_coord, y_coord): reads k consecutive
 * pixels from img, starting at (x_coord, y_coord) and stepping x by one,
 * and packs the 4-component results into one vector of 4 * k elements.
 *
 * The coordinate arguments are parenthesized in the expansion so that a
 * compound x_coord (for example a conditional expression) is evaluated as a
 * unit before the "+ 1/2/3" pixel step is applied.
 *
 * Each expansion ends with its own semicolon, exactly as before.
 * NOTE(review): callers may rely on that trailing semicolon -- confirm
 * before removing it.
 */
#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)((x_coord), (y_coord))));
#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)((x_coord), (y_coord))), read_imagef(img, (int2)((x_coord) + 1, (y_coord))));
#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)((x_coord), (y_coord))), read_imagef(img, (int2)((x_coord) + 1, (y_coord))), read_imagef(img, (int2)((x_coord) + 2, (y_coord))), read_imagef(img, (int2)((x_coord) + 3, (y_coord))));

/* Half-precision variants, available only when the build enables FP16 and
 * the cl_khr_fp16 extension macro is defined. */
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)((x_coord), (y_coord))));
#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)((x_coord), (y_coord))), read_imageh(img, (int2)((x_coord) + 1, (y_coord))));
#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)((x_coord), (y_coord))), read_imageh(img, (int2)((x_coord) + 1, (y_coord))), read_imageh(img, (int2)((x_coord) + 2, (y_coord))), read_imageh(img, (int2)((x_coord) + 3, (y_coord))));
#endif
1905
/* write_image2d_<type>x<k>(img, x_coord, y_coord, values): writes k
 * consecutive pixels to img, starting at (x_coord, y_coord) and stepping x
 * by one. Pixel j receives the j-th group of four components of values
 * (s0123, s4567, s89AB, sCDEF); the x1 form writes values as a whole.
 *
 * The coordinate and values arguments are parenthesized in the expansion so
 * that compound expressions (for example "*ptr" or "a + b" passed as values)
 * are evaluated as a unit before the component selector is applied.
 *
 * Each expansion ends with its own semicolon, exactly as before.
 */
#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)((x_coord), (y_coord)), (values)));
#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)((x_coord), (y_coord)), (values).s0123), write_imagef(img, (int2)((x_coord) + 1, (y_coord)), (values).s4567));
#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)((x_coord), (y_coord)), (values).s0123), write_imagef(img, (int2)((x_coord) + 1, (y_coord)), (values).s4567), write_imagef(img, (int2)((x_coord) + 2, (y_coord)), (values).s89AB), write_imagef(img, (int2)((x_coord) + 3, (y_coord)), (values).sCDEF));

/* Half-precision variants, available only when the build enables FP16 and
 * the cl_khr_fp16 extension macro is defined. */
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)((x_coord), (y_coord)), (values)));
#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)((x_coord), (y_coord)), (values).s0123), write_imageh(img, (int2)((x_coord) + 1, (y_coord)), (values).s4567));
#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)((x_coord), (y_coord)), (values).s0123), write_imageh(img, (int2)((x_coord) + 1, (y_coord)), (values).s4567), write_imageh(img, (int2)((x_coord) + 2, (y_coord)), (values).s89AB), write_imageh(img, (int2)((x_coord) + 3, (y_coord)), (values).sCDEF));
#endif
1915
1916
/* READ_IMAGE2D(T, UNITS, ...) dispatches to read_image2d_<T>x<UNITS>. It
 * goes through READ_IMAGE2D_STR so that macro T/UNITS arguments are
 * expanded before they are pasted. */
#define READ_IMAGE2D_STR(T, UNITS, img, x_coord, y_coord) \
    read_image2d_##T##x##UNITS(img, x_coord, y_coord)
#define READ_IMAGE2D(T, UNITS, img, x_coord, y_coord) \
    READ_IMAGE2D_STR(T, UNITS, img, x_coord, y_coord)

/* WRITE_IMAGE2D(T, UNITS, ...) dispatches to write_image2d_<T>x<UNITS> in
 * the same two-step way. */
#define WRITE_IMAGE2D_STR(T, UNITS, img, x_coord, y_coord, values) \
    write_image2d_##T##x##UNITS(img, x_coord, y_coord, values)
#define WRITE_IMAGE2D(T, UNITS, img, x_coord, y_coord, values) \
    WRITE_IMAGE2D_STR(T, UNITS, img, x_coord, y_coord, values)
1923
/* VSTORE(size) names the store routine vstore<size>; the indirection through
 * VSTORE_STR lets `size` itself be a macro that is expanded before pasting. */
#define VSTORE_STR(width) vstore##width
#define VSTORE(width) VSTORE_STR(width)
1926
/* "<type>1" spellings for scalars, so that names pasted as <type><size>
 * (see VEC_DATA_TYPE) also resolve when the size is 1. */
#define float1  float
#define half1   half
#define char1   char
#define uchar1  uchar
#define short1  short
#define ushort1 ushort
#define int1    int
#define uint1   uint
#define long1   long
#define ulong1  ulong
#define double1 double
1938
/* Scalar stand-ins for the vloadN / vstoreN naming scheme:
 *   vload1(OFFSET, PTR)        reads  the element at PTR + OFFSET
 *   vstore1(DATA, OFFSET, PTR) writes DATA to the element at PTR + OFFSET
 * Arguments and the whole expansion are parenthesised so that argument expressions
 * with low-precedence operators (e.g. `i & 1`, `c ? a : b`) and operators applied to
 * the result (e.g. a following `.s0` or `[k]`) bind as the caller wrote them. */
#define vload1(OFFSET, PTR) (*((OFFSET) + (PTR)))
#define vstore1(DATA, OFFSET, PTR) (*((OFFSET) + (PTR)) = (DATA))
1941
1942
/* VSTORE_PARTIAL(size, store_size) names the table entry vstore_partial_<size>_<store_size>.
 * The two-level form expands macro arguments before they are pasted into the name. */
#define VSTORE_PARTIAL_STR(vec_width, store_count) vstore_partial_##vec_width##_##store_count
#define VSTORE_PARTIAL(vec_width, store_count) VSTORE_PARTIAL_STR(vec_width, store_count)
1945
/* Placeholder for table entries that must not store anything: expands to an empty
 * compound statement and drops all three arguments without evaluating them. */
#define NO_STORE(unused_data, unused_offs, unused_ptr) {}
1949
1950
/* Dispatch row for scalars (size 1): only a store count of 1 writes anything;
 * every other count, including 0, resolves to NO_STORE. */
#define vstore_partial_1_0  NO_STORE
#define vstore_partial_1_1  vstore1
#define vstore_partial_1_2  NO_STORE
#define vstore_partial_1_3  NO_STORE
#define vstore_partial_1_4  NO_STORE
#define vstore_partial_1_5  NO_STORE
#define vstore_partial_1_6  NO_STORE
#define vstore_partial_1_7  NO_STORE
#define vstore_partial_1_8  NO_STORE
#define vstore_partial_1_9  NO_STORE
#define vstore_partial_1_10 NO_STORE
#define vstore_partial_1_11 NO_STORE
#define vstore_partial_1_12 NO_STORE
#define vstore_partial_1_13 NO_STORE
#define vstore_partial_1_14 NO_STORE
#define vstore_partial_1_15 NO_STORE
#define vstore_partial_1_16 NO_STORE
1968
/* Dispatch row for 2-element vectors: counts 1..2 forward to vstore_partial_<count>;
 * 0 and anything larger than the vector resolve to NO_STORE. */
#define vstore_partial_2_0  NO_STORE
#define vstore_partial_2_1  vstore_partial_1
#define vstore_partial_2_2  vstore_partial_2
#define vstore_partial_2_3  NO_STORE
#define vstore_partial_2_4  NO_STORE
#define vstore_partial_2_5  NO_STORE
#define vstore_partial_2_6  NO_STORE
#define vstore_partial_2_7  NO_STORE
#define vstore_partial_2_8  NO_STORE
#define vstore_partial_2_9  NO_STORE
#define vstore_partial_2_10 NO_STORE
#define vstore_partial_2_11 NO_STORE
#define vstore_partial_2_12 NO_STORE
#define vstore_partial_2_13 NO_STORE
#define vstore_partial_2_14 NO_STORE
#define vstore_partial_2_15 NO_STORE
#define vstore_partial_2_16 NO_STORE
1986
/* Dispatch row for 3-element vectors: counts 1..3 forward to vstore_partial_<count>;
 * 0 and anything larger than the vector resolve to NO_STORE. */
#define vstore_partial_3_0  NO_STORE
#define vstore_partial_3_1  vstore_partial_1
#define vstore_partial_3_2  vstore_partial_2
#define vstore_partial_3_3  vstore_partial_3
#define vstore_partial_3_4  NO_STORE
#define vstore_partial_3_5  NO_STORE
#define vstore_partial_3_6  NO_STORE
#define vstore_partial_3_7  NO_STORE
#define vstore_partial_3_8  NO_STORE
#define vstore_partial_3_9  NO_STORE
#define vstore_partial_3_10 NO_STORE
#define vstore_partial_3_11 NO_STORE
#define vstore_partial_3_12 NO_STORE
#define vstore_partial_3_13 NO_STORE
#define vstore_partial_3_14 NO_STORE
#define vstore_partial_3_15 NO_STORE
#define vstore_partial_3_16 NO_STORE
2004
/* Dispatch row for 4-element vectors: counts 1..4 forward to vstore_partial_<count>;
 * 0 and anything larger than the vector resolve to NO_STORE. */
#define vstore_partial_4_0  NO_STORE
#define vstore_partial_4_1  vstore_partial_1
#define vstore_partial_4_2  vstore_partial_2
#define vstore_partial_4_3  vstore_partial_3
#define vstore_partial_4_4  vstore_partial_4
#define vstore_partial_4_5  NO_STORE
#define vstore_partial_4_6  NO_STORE
#define vstore_partial_4_7  NO_STORE
#define vstore_partial_4_8  NO_STORE
#define vstore_partial_4_9  NO_STORE
#define vstore_partial_4_10 NO_STORE
#define vstore_partial_4_11 NO_STORE
#define vstore_partial_4_12 NO_STORE
#define vstore_partial_4_13 NO_STORE
#define vstore_partial_4_14 NO_STORE
#define vstore_partial_4_15 NO_STORE
#define vstore_partial_4_16 NO_STORE
2022
/* Dispatch row for 8-element vectors: counts 1..8 forward to vstore_partial_<count>;
 * 0 and anything larger than the vector resolve to NO_STORE. */
#define vstore_partial_8_0  NO_STORE
#define vstore_partial_8_1  vstore_partial_1
#define vstore_partial_8_2  vstore_partial_2
#define vstore_partial_8_3  vstore_partial_3
#define vstore_partial_8_4  vstore_partial_4
#define vstore_partial_8_5  vstore_partial_5
#define vstore_partial_8_6  vstore_partial_6
#define vstore_partial_8_7  vstore_partial_7
#define vstore_partial_8_8  vstore_partial_8
#define vstore_partial_8_9  NO_STORE
#define vstore_partial_8_10 NO_STORE
#define vstore_partial_8_11 NO_STORE
#define vstore_partial_8_12 NO_STORE
#define vstore_partial_8_13 NO_STORE
#define vstore_partial_8_14 NO_STORE
#define vstore_partial_8_15 NO_STORE
#define vstore_partial_8_16 NO_STORE
2040
/* Dispatch row for 16-element vectors: counts 1..16 forward to vstore_partial_<count>;
 * a count of 0 resolves to NO_STORE. */
#define vstore_partial_16_0  NO_STORE
#define vstore_partial_16_1  vstore_partial_1
#define vstore_partial_16_2  vstore_partial_2
#define vstore_partial_16_3  vstore_partial_3
#define vstore_partial_16_4  vstore_partial_4
#define vstore_partial_16_5  vstore_partial_5
#define vstore_partial_16_6  vstore_partial_6
#define vstore_partial_16_7  vstore_partial_7
#define vstore_partial_16_8  vstore_partial_8
#define vstore_partial_16_9  vstore_partial_9
#define vstore_partial_16_10 vstore_partial_10
#define vstore_partial_16_11 vstore_partial_11
#define vstore_partial_16_12 vstore_partial_12
#define vstore_partial_16_13 vstore_partial_13
#define vstore_partial_16_14 vstore_partial_14
#define vstore_partial_16_15 vstore_partial_15
#define vstore_partial_16_16 vstore_partial_16
2058
2059
/* vstore_partial_<n>(DATA, OFFSET, PTR): store the first <n> elements of DATA.
 *
 * Sizes that have a matching vstore<n> (1, 2, 3, 4, 8, 16) are a single store.
 * The other sizes are split into a head store plus a tail store at PTR + 4 or PTR + 8.
 *
 * DATA and PTR are parenthesised wherever a swizzle or a displacement is applied to
 * them, so argument expressions such as `a + b` or `cond ? p : q` keep their meaning.
 *
 * Notes for callers:
 *  - each expansion ends with its own semicolon, and the split sizes expand to two
 *    statements, so use these inside a braced block;
 *  - the tail store adds its displacement to PTR but forwards OFFSET unchanged. Since
 *    vstore<n> scales its offset by <n>, head and tail only line up when OFFSET is 0. */
#define vstore_partial_1(DATA, OFFSET, PTR) \
    vstore1((DATA).s0, OFFSET, PTR);

#define vstore_partial_2(DATA, OFFSET, PTR) \
    vstore2((DATA).s01, OFFSET, PTR);

#define vstore_partial_3(DATA, OFFSET, PTR) \
    vstore3((DATA).s012, OFFSET, PTR);

#define vstore_partial_4(DATA, OFFSET, PTR) \
    vstore4((DATA).s0123, OFFSET, PTR);

#define vstore_partial_5(DATA, OFFSET, PTR)      \
    vstore_partial_4((DATA).s0123, OFFSET, PTR); \
    vstore1((DATA).s4, OFFSET, (PTR) + 4);

#define vstore_partial_6(DATA, OFFSET, PTR)      \
    vstore_partial_4((DATA).s0123, OFFSET, PTR); \
    vstore_partial_2((DATA).s45, OFFSET, (PTR) + 4);

#define vstore_partial_7(DATA, OFFSET, PTR)      \
    vstore_partial_4((DATA).s0123, OFFSET, PTR); \
    vstore_partial_3((DATA).s456, OFFSET, (PTR) + 4);

#define vstore_partial_8(DATA, OFFSET, PTR) \
    vstore8((DATA).s01234567, OFFSET, PTR);

#define vstore_partial_9(DATA, OFFSET, PTR)          \
    vstore_partial_8((DATA).s01234567, OFFSET, PTR); \
    vstore1((DATA).s8, OFFSET, (PTR) + 8);

#define vstore_partial_10(DATA, OFFSET, PTR)         \
    vstore_partial_8((DATA).s01234567, OFFSET, PTR); \
    vstore_partial_2((DATA).s89, OFFSET, (PTR) + 8);

#define vstore_partial_11(DATA, OFFSET, PTR)         \
    vstore_partial_8((DATA).s01234567, OFFSET, PTR); \
    vstore_partial_3((DATA).s89a, OFFSET, (PTR) + 8);

#define vstore_partial_12(DATA, OFFSET, PTR)         \
    vstore_partial_8((DATA).s01234567, OFFSET, PTR); \
    vstore_partial_4((DATA).s89ab, OFFSET, (PTR) + 8);

/* For 13..15 the whole upper half is handed on; the nested macro keeps only the
 * leading 5, 6 or 7 elements of it. */
#define vstore_partial_13(DATA, OFFSET, PTR)         \
    vstore_partial_8((DATA).s01234567, OFFSET, PTR); \
    vstore_partial_5((DATA).s89abcdef, OFFSET, (PTR) + 8);

#define vstore_partial_14(DATA, OFFSET, PTR)         \
    vstore_partial_8((DATA).s01234567, OFFSET, PTR); \
    vstore_partial_6((DATA).s89abcdef, OFFSET, (PTR) + 8);

#define vstore_partial_15(DATA, OFFSET, PTR)         \
    vstore_partial_8((DATA).s01234567, OFFSET, PTR); \
    vstore_partial_7((DATA).s89abcdef, OFFSET, (PTR) + 8);

#define vstore_partial_16(DATA, OFFSET, PTR) \
    vstore16(DATA, OFFSET, PTR);
2117
2118
2119
2120
2121
/* "_sat" spellings for floating-point destinations, so that CONVERT_SAT can be used
 * with float/half types as well. OpenCL C defines no saturating conversion to a
 * floating-point type, hence every alias maps to the plain conversion of the same
 * type and width.
 * convert_half_sat now resolves to convert_half (it previously resolved to
 * convert_float, unlike convert_half1_sat and the rest of the half family). */
#define convert_float_sat   convert_float
#define convert_float1_sat  convert_float
#define convert_float2_sat  convert_float2
#define convert_float3_sat  convert_float3
#define convert_float4_sat  convert_float4
#define convert_float8_sat  convert_float8
#define convert_float16_sat convert_float16
#define convert_half_sat    convert_half
#define convert_half1_sat   convert_half
#define convert_half2_sat   convert_half2
#define convert_half3_sat   convert_half3
#define convert_half4_sat   convert_half4
#define convert_half8_sat   convert_half8
#define convert_half16_sat  convert_half16
2136
/* convert_<type>1 spellings for scalars, so that names pasted as convert_<type><size>
 * also resolve when the size is 1. */
#define convert_float1  convert_float
#define convert_half1   convert_half
#define convert_char1   convert_char
#define convert_uchar1  convert_uchar
#define convert_short1  convert_short
#define convert_ushort1 convert_ushort
#define convert_int1    convert_int
#define convert_uint1   convert_uint
#define convert_long1   convert_long
#define convert_ulong1  convert_ulong
#define convert_double1 convert_double
2148
/* convert_<type>1_sat spellings for scalars, so that CONVERT_SAT also works when the
 * pasted type name ends in 1. */
#define convert_char1_sat   convert_char_sat
#define convert_uchar1_sat  convert_uchar_sat
/* The uchar2..uchar16 entries expand to themselves; they are kept as-is and have no
 * effect on the resulting name. */
#define convert_uchar2_sat  convert_uchar2_sat
#define convert_uchar3_sat  convert_uchar3_sat
#define convert_uchar4_sat  convert_uchar4_sat
#define convert_uchar8_sat  convert_uchar8_sat
#define convert_uchar16_sat convert_uchar16_sat
#define convert_short1_sat  convert_short_sat
#define convert_ushort1_sat convert_ushort_sat
#define convert_int1_sat    convert_int_sat
#define convert_uint1_sat   convert_uint_sat
#define convert_long1_sat   convert_long_sat
#define convert_ulong1_sat  convert_ulong_sat
/* OpenCL C has no _sat conversion to a floating-point type, so the double alias maps
 * to the plain conversion (it previously named convert_double_sat). */
#define convert_double1_sat convert_double
2163
/* VEC_DATA_TYPE(type, size) yields the pasted type name <type><size> (e.g. float4).
 * The outer macro expands its arguments first; the _STR worker does the pasting. */
#define VEC_DATA_TYPE_STR(base, width) base##width
#define VEC_DATA_TYPE(base, width) VEC_DATA_TYPE_STR(base, width)
2166
/* Conversion dispatchers. Each public macro expands its arguments and hands them to a
 * _STR worker that pastes the name of the conversion routine:
 *   CONVERT(x, type)                  -> convert_<type>(x)
 *   CONVERT_SAT(x, type)              -> convert_<type>_sat(x)
 *   CONVERT_SAT_ROUND(x, type, round) -> convert_<type>_sat_<round>(x) */
#define CONVERT_STR(value, dst_type) (convert_##dst_type((value)))
#define CONVERT(value, dst_type) CONVERT_STR(value, dst_type)

#define CONVERT_SAT_STR(value, dst_type) (convert_##dst_type##_sat((value)))
#define CONVERT_SAT(value, dst_type) CONVERT_SAT_STR(value, dst_type)

#define CONVERT_SAT_ROUND_STR(value, dst_type, rounding) (convert_##dst_type##_sat_##rounding((value)))
#define CONVERT_SAT_ROUND(value, dst_type, rounding) CONVERT_SAT_ROUND_STR(value, dst_type, rounding)
2175
/* Type used as the mask operand of select() for a given element type.
 * Integer types map to themselves; half maps to short and float to int. */
#define select_vec_dt_uchar(n)  uchar##n
#define select_vec_dt_char(n)   char##n
#define select_vec_dt_ushort(n) ushort##n
#define select_vec_dt_short(n)  short##n
#define select_vec_dt_half(n)   short##n
#define select_vec_dt_uint(n)   uint##n
#define select_vec_dt_int(n)    int##n
#define select_vec_dt_float(n)  int##n
#define select_vec_dt_ulong(n)  ulong##n
#define select_vec_dt_long(n)   long##n

/* SELECT_VEC_DATA_TYPE(type, size) expands its arguments and picks the entry above;
 * SELECT_DATA_TYPE(type) is the same lookup with the size fixed to 1. */
#define SELECT_VEC_DATA_TYPE_STR(elem, n) select_vec_dt_##elem(n)
#define SELECT_VEC_DATA_TYPE(elem, n) SELECT_VEC_DATA_TYPE_STR(elem, n)
#define SELECT_DATA_TYPE(elem) SELECT_VEC_DATA_TYPE_STR(elem, 1)
2190
/* Signed integer type paired with each element type: unsigned types map to their
 * signed counterpart, half maps to short and float to int. */
#define signed_int_vec_dt_uchar(n)  char##n
#define signed_int_vec_dt_char(n)   char##n
#define signed_int_vec_dt_ushort(n) short##n
#define signed_int_vec_dt_short(n)  short##n
#define signed_int_vec_dt_half(n)   short##n
#define signed_int_vec_dt_uint(n)   int##n
#define signed_int_vec_dt_int(n)    int##n
#define signed_int_vec_dt_float(n)  int##n
#define signed_int_vec_dt_ulong(n)  long##n
#define signed_int_vec_dt_long(n)   long##n

/* SIGNED_INT_VEC_DATA_TYPE(type, size) expands its arguments and picks the entry above;
 * SIGNED_INT_DATA_TYPE(type) is the same lookup with the size fixed to 1. */
#define SIGNED_INT_VEC_DATA_TYPE_STR(elem, n) signed_int_vec_dt_##elem(n)
#define SIGNED_INT_VEC_DATA_TYPE(elem, n) SIGNED_INT_VEC_DATA_TYPE_STR(elem, n)
#define SIGNED_INT_DATA_TYPE(elem) SIGNED_INT_VEC_DATA_TYPE_STR(elem, 1)
2205
/* Horizontal sum of a 1/2/3/4/8/16-element vector.
 *
 * The sum_reduce_<n> helpers expand to a bare chain `s0 + s1 + ...`. They are left
 * unparenthesised on purpose: nesting them keeps one flat chain, so the additions are
 * still evaluated left to right exactly as before (this matters for floating point).
 * Use them through SUM_REDUCE, not directly inside a larger expression. */
#define sum_reduce_1(x) (x)
#define sum_reduce_2(x) ((x).s0) + ((x).s1)
#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2)
#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567)
#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF)

/* SUM_REDUCE(x, size): sum of the `size` elements of x. The result is wrapped in
 * parentheses so that `k * SUM_REDUCE(v, n)` or `SUM_REDUCE(v, n) / k` apply to the
 * whole sum and not just to its first or last term. */
#define SUM_REDUCE_STR(x, size) (sum_reduce_##size(x))
#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)
2215
/* Horizontal product of a 1/2/3/4/8/16-element vector.
 *
 * The prod_reduce_<n> helpers expand to a bare chain `s0 * s1 * ...`. They are left
 * unparenthesised on purpose: nesting them keeps one flat chain, so the
 * multiplications are still evaluated left to right exactly as before (this matters
 * for floating point). Use them through PROD_REDUCE, not directly inside a larger
 * expression. */
#define prod_reduce_1(x) (x)
#define prod_reduce_2(x) ((x).s0) * ((x).s1)
#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2)
#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23)
#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567)
#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF)

/* PROD_REDUCE(x, size): product of the `size` elements of x. The result is wrapped in
 * parentheses so that e.g. `k / PROD_REDUCE(v, n)` divides by the whole product. */
#define PROD_REDUCE_STR(x, size) (prod_reduce_##size(x))
#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)
2225
/* Horizontal maximum of a 1/2/3/4/8/16-element vector, built from pairwise max()
 * calls on the lower and upper halves. */
#define max_reduce_1(v) (v)
#define max_reduce_2(v) max(((v).s0), ((v).s1))
#define max_reduce_3(v) max(max_reduce_2((v).s01), ((v).s2))
#define max_reduce_4(v) max(max_reduce_2((v).s01), max_reduce_2((v).s23))
#define max_reduce_8(v) max(max_reduce_4((v).s0123), max_reduce_4((v).s4567))
#define max_reduce_16(v) max(max_reduce_8((v).s01234567), max_reduce_8((v).s89ABCDEF))

/* MAX_REDUCE(x, size): expands its arguments, then selects max_reduce_<size>. */
#define MAX_REDUCE_STR(v, width) max_reduce_##width(v)
#define MAX_REDUCE(v, width) MAX_REDUCE_STR(v, width)
2235
/** Parameter list describing a vector argument called `vec`:
 *  <vec>_ptr, <vec>_stride_x, <vec>_step_x and <vec>_offset_first_element_in_bytes.
 */
#define VECTOR_DECLARATION(vec) \
    __global uchar *vec##_ptr,  \
    uint vec##_stride_x,        \
    uint vec##_step_x,          \
    uint vec##_offset_first_element_in_bytes
2241
/** Parameter list describing a 2D argument called `image`: the pointer, a stride/step
 *  pair for X and Y, and <image>_offset_first_element_in_bytes.
 */
#define IMAGE_DECLARATION(image) \
    __global uchar *image##_ptr, \
    uint image##_stride_x,       \
    uint image##_step_x,         \
    uint image##_stride_y,       \
    uint image##_step_y,         \
    uint image##_offset_first_element_in_bytes
2249
/** Parameter list describing a 3D argument called `tensor`: the pointer, a stride/step
 *  pair for X, Y and Z, and <tensor>_offset_first_element_in_bytes.
 */
#define TENSOR3D_DECLARATION(tensor) \
    __global uchar *tensor##_ptr,    \
    uint tensor##_stride_x,          \
    uint tensor##_step_x,            \
    uint tensor##_stride_y,          \
    uint tensor##_step_y,            \
    uint tensor##_stride_z,          \
    uint tensor##_step_z,            \
    uint tensor##_offset_first_element_in_bytes
2259
/** Parameter list describing a 4D argument called `tensor`: the pointer, a stride/step
 *  pair for X, Y, Z and W, and <tensor>_offset_first_element_in_bytes.
 */
#define TENSOR4D_DECLARATION(tensor) \
    __global uchar *tensor##_ptr,    \
    uint tensor##_stride_x,          \
    uint tensor##_step_x,            \
    uint tensor##_stride_y,          \
    uint tensor##_step_y,            \
    uint tensor##_stride_z,          \
    uint tensor##_step_z,            \
    uint tensor##_stride_w,          \
    uint tensor##_step_w,            \
    uint tensor##_offset_first_element_in_bytes
2271
/** Parameter list describing a 5D argument called `tensor`: the pointer, a stride/step
 *  pair for X, Y, Z, W and V, and <tensor>_offset_first_element_in_bytes.
 */
#define TENSOR5D_DECLARATION(tensor) \
    __global uchar *tensor##_ptr,    \
    uint tensor##_stride_x,          \
    uint tensor##_step_x,            \
    uint tensor##_stride_y,          \
    uint tensor##_step_y,            \
    uint tensor##_stride_z,          \
    uint tensor##_step_z,            \
    uint tensor##_stride_w,          \
    uint tensor##_step_w,            \
    uint tensor##_stride_v,          \
    uint tensor##_step_v,            \
    uint tensor##_offset_first_element_in_bytes
2285
/* Build a Vector from the arguments declared by VECTOR_DECLARATION(vec).
 * The plain form forwards <vec>_step_x; the NO_STEP form passes 0 as the step. */
#define CONVERT_TO_VECTOR_STRUCT(vec) \
    update_vector_workitem_ptr(vec##_ptr, vec##_offset_first_element_in_bytes, vec##_stride_x, vec##_step_x)

#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(vec) \
    update_vector_workitem_ptr(vec##_ptr, vec##_offset_first_element_in_bytes, vec##_stride_x, 0)
2291
/* Build an Image from the arguments declared by IMAGE_DECLARATION(image).
 * The plain form forwards the X and Y steps; the NO_STEP form passes 0 for both. */
#define CONVERT_TO_IMAGE_STRUCT(image) \
    update_image_workitem_ptr(image##_ptr, image##_offset_first_element_in_bytes, image##_stride_x, image##_step_x, image##_stride_y, image##_step_y)

#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(image) \
    update_image_workitem_ptr(image##_ptr, image##_offset_first_element_in_bytes, image##_stride_x, 0, image##_stride_y, 0)
2297
/* Build an Image from the arguments declared by TENSOR3D_DECLARATION(name).
 * The plain form forwards the X, Y and Z steps. The NO_STEP form passes 0 for the
 * X and Y steps but still forwards <name>_step_z.
 * (CONVERT_TENSOR3D_TO_IMAGE_STRUCT used to be defined twice with identical text;
 * the redundant second copy has been dropped.) */
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)

#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
2306
/* Build a Tensor3D from the arguments declared by TENSOR3D_DECLARATION(tensor).
 * The plain form forwards the X, Y and Z steps; the NO_STEP form passes 0 for all three. */
#define CONVERT_TO_TENSOR3D_STRUCT(tensor) \
    update_tensor3D_workitem_ptr(tensor##_ptr, tensor##_offset_first_element_in_bytes, tensor##_stride_x, tensor##_step_x, tensor##_stride_y, tensor##_step_y, tensor##_stride_z, tensor##_step_z)

#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(tensor) \
    update_tensor3D_workitem_ptr(tensor##_ptr, tensor##_offset_first_element_in_bytes, tensor##_stride_x, 0, tensor##_stride_y, 0, tensor##_stride_z, 0)
2313
/* Build a Tensor4D from the arguments declared by TENSOR4D_DECLARATION(tensor).
 * `mod_size` is forwarded unchanged as the last argument of update_tensor4D_workitem_ptr.
 * The plain form forwards the X, Y, Z and W steps; the NO_STEP form passes 0 for all four. */
#define CONVERT_TO_TENSOR4D_STRUCT(tensor, mod_size) \
    update_tensor4D_workitem_ptr(tensor##_ptr, tensor##_offset_first_element_in_bytes, tensor##_stride_x, tensor##_step_x, tensor##_stride_y, tensor##_step_y, tensor##_stride_z, tensor##_step_z, tensor##_stride_w, tensor##_step_w, mod_size)

#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(tensor, mod_size) \
    update_tensor4D_workitem_ptr(tensor##_ptr, tensor##_offset_first_element_in_bytes, tensor##_stride_x, 0, tensor##_stride_y, 0, tensor##_stride_z, 0, tensor##_stride_w, 0, mod_size)
2320
/* Build a Tensor3D through tensor3D_ptr_no_update, forwarding the pointer, offset and
 * all stride/step arguments declared by TENSOR3D_DECLARATION(tensor). */
#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(tensor) \
    tensor3D_ptr_no_update(tensor##_ptr, tensor##_offset_first_element_in_bytes, tensor##_stride_x, tensor##_step_x, tensor##_stride_y, tensor##_step_y, tensor##_stride_z, tensor##_step_z)
2324
2325
2326typedef struct Vector
2327{
2328    __global uchar *ptr;
2329    int             offset_first_element_in_bytes;
2330    int             stride_x;
2331} Vector;
2332
2333
2334typedef struct Image
2335{
2336    __global uchar *ptr;
2337    int             offset_first_element_in_bytes;
2338    int             stride_x;
2339    int             stride_y;
2340} Image;
2341
2342
2343typedef struct Tensor3D
2344{
2345    __global uchar *ptr;
2346    int             offset_first_element_in_bytes;
2347    int             stride_x;
2348    int             stride_y;
2349    int             stride_z;
2350} Tensor3D;
2351
2352
2353typedef struct Tensor4D
2354{
2355    __global uchar *ptr;
2356    int             offset_first_element_in_bytes;
2357    int             stride_x;
2358    int             stride_y;
2359    int             stride_z;
2360    int             stride_w;
2361} Tensor4D;
2362
2363
2364inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
2365{
2366    Vector vector =
2367    {
2368        .ptr                           = ptr,
2369        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2370        .stride_x                      = stride_x,
2371    };
2372    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
2373    return vector;
2374}
2375
2376
2377inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
2378{
2379    Image img =
2380    {
2381        .ptr                           = ptr,
2382        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2383        .stride_x                      = stride_x,
2384        .stride_y                      = stride_y
2385    };
2386    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
2387    return img;
2388}
2389
2390
2391inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
2392{
2393    Image img =
2394    {
2395        .ptr                           = ptr,
2396        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2397        .stride_x                      = stride_x,
2398        .stride_y                      = stride_y
2399    };
2400    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
2401    return img;
2402}
2403
2404
2405inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
2406{
2407    Tensor3D tensor =
2408    {
2409        .ptr                           = ptr,
2410        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2411        .stride_x                      = stride_x,
2412        .stride_y                      = stride_y,
2413        .stride_z                      = stride_z
2414    };
2415    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
2416    return tensor;
2417}
2418
2419
2420inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
2421{
2422    Tensor3D tensor =
2423    {
2424        .ptr                           = ptr,
2425        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2426        .stride_x                      = stride_x,
2427        .stride_y                      = stride_y,
2428        .stride_z                      = stride_z
2429    };
2430    return tensor;
2431}
2432
2433inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
2434                                             uint step_w,
2435                                             uint mod_size)
2436{
2437    Tensor4D tensor =
2438    {
2439        .ptr                           = ptr,
2440        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2441        .stride_x                      = stride_x,
2442        .stride_y                      = stride_y,
2443        .stride_z                      = stride_z,
2444        .stride_w                      = stride_w
2445    };
2446
2447    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
2448    return tensor;
2449}
2450
2451
2452inline __global const uchar *vector_offset(const Vector *vec, int x)
2453{
2454    return vec->ptr + x * vec->stride_x;
2455}
2456
2457
2458inline __global uchar *offset(const Image *img, int x, int y)
2459{
2460    return img->ptr + x * img->stride_x + y * img->stride_y;
2461}
2462
2463
2464inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
2465{
2466    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
2467}
2468
2469
2470inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
2471{
2472    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
2473}
2474
2475
2476inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
2477{
2478    uint num_elements = width * height;
2479
2480    const uint z = index / num_elements;
2481
2482    index %= num_elements;
2483
2484    const uint y = index / width;
2485
2486    index %= width;
2487
2488    const uint x = index;
2489
2490    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes;
2491}
2492
2493#endif
2494
2495
/* CONVERT_DOWN_RTE(x, type) -> convert_<type>_rte(x), i.e. the conversion routine
 * carrying the _rte rounding suffix. The outer macro expands its arguments before the
 * _STR worker pastes the name. */
#define CONVERT_DOWN_RTE_STR(value, dst_type) (convert_##dst_type##_rte((value)))
#define CONVERT_DOWN_RTE(value, dst_type) CONVERT_DOWN_RTE_STR(value, dst_type)
2498
2499
2500inline uchar quantize_qasymm8(float input, float offset, float scale)
2501{
2502    float out_f32 = input / scale + offset;
2503    uchar res_u8  = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, int), uchar);
2504    return res_u8;
2505}
2506
2507
2508inline float dequantize_qasymm8(uchar input, float offset, float scale)
2509{
2510    return ((float)input - offset) * scale;
2511}
2512
2513
/** Dequantize one signed 8-bit value to float.
 *
 * @param input  Quantized value
 * @param offset Subtracted from the input before scaling
 * @param scale  Multiplier applied after the subtraction
 *
 * @return (input - offset) * scale
 */
inline float dequantize_qasymm8_signed(char input, float offset, float scale)
{
    const float centred = (float)input - offset;
    return centred * scale;
}
2518
2519
/* QUANTIZE_IMPL(type, size) defines
 *     <type><size> quantize_<type><size>(float<size> input, float offset, float scale)
 * which computes input / scale + offset per element, converts the result to int<size>
 * with CONVERT_DOWN_RTE and then to <type><size> with CONVERT_SAT. */
#define QUANTIZE_IMPL(dst_type, width)                                                                                                \
    inline VEC_DATA_TYPE(dst_type, width) quantize_##dst_type##width(VEC_DATA_TYPE(float, width) input, float offset, float scale)    \
    {                                                                                                                                 \
        const VEC_DATA_TYPE(float, width) scaled = input / (VEC_DATA_TYPE(float, width))(scale) + (VEC_DATA_TYPE(float, width))(offset); \
        return CONVERT_SAT(CONVERT_DOWN_RTE(scaled, VEC_DATA_TYPE(int, width)), VEC_DATA_TYPE(dst_type, width));                      \
    }
2529
2530
/* DEQUANTIZE_IMPL(type, size) defines
 *     float<size> dequantize_<type><size>(<type><size> input, float offset, float scale)
 * which converts the input to float<size> with CONVERT and returns
 * (converted - offset) * scale. */
#define DEQUANTIZE_IMPL(src_type, width)                                                                                             \
    inline VEC_DATA_TYPE(float, width) dequantize_##src_type##width(VEC_DATA_TYPE(src_type, width) input, float offset, float scale) \
    {                                                                                                                                \
        const VEC_DATA_TYPE(float, width) as_float = CONVERT(input, VEC_DATA_TYPE(float, width));                                    \
        return (as_float - offset) * scale;                                                                                          \
    }
2536
2537
/* ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) defines
 *     int<size> asymm_rounding_divide_by_POW2_<size>(int<size> x, int<size> exponent)
 * which shifts x right by `exponent` and adds one when the bits shifted out exceed a
 * threshold. The threshold is half of the mask of shifted-out bits, raised by one
 * for negative x. */
#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(width)                                                                                          \
    inline VEC_DATA_TYPE(int, width) asymm_rounding_divide_by_POW2_##width(VEC_DATA_TYPE(int, width) x, VEC_DATA_TYPE(int, width) exponent) \
    {                                                                                                                                      \
        const VEC_DATA_TYPE(int, width) zero = (VEC_DATA_TYPE(int, width))0;                                                               \
        const VEC_DATA_TYPE(int, width) one  = (VEC_DATA_TYPE(int, width))1;                                                               \
        const VEC_DATA_TYPE(int, width) remainder_mask = (one << exponent) - one;                                                          \
        const SELECT_VEC_DATA_TYPE(int, width) is_negative = (SELECT_VEC_DATA_TYPE(int, width))(x < 0);                                    \
        const VEC_DATA_TYPE(int, width) threshold = (remainder_mask >> 1) + select(zero, one, is_negative);                                \
        const SELECT_VEC_DATA_TYPE(int, width) round_up = (SELECT_VEC_DATA_TYPE(int, width))((x & remainder_mask) > threshold);            \
        return (x >> exponent) + select(zero, one, round_up);                                                                              \
    }
2551
2552
/* ASYMM_MULT_IMPL(size) defines
 *     int<size> asymm_mult<size>(int<size> a, int<size> b)
 * The product is formed in 64 bits, nudged by 2^30 (or 1 - 2^30 when the product is
 * negative) and divided by 2^31. Elements where a == b == INT_MIN are replaced by
 * INT_MAX. */
#define ASYMM_MULT_IMPL(width)                                                                                             \
    inline VEC_DATA_TYPE(int, width) asymm_mult##width(VEC_DATA_TYPE(int, width) a, VEC_DATA_TYPE(int, width) b)           \
    {                                                                                                                      \
        VEC_DATA_TYPE(int, width) overflow = (a == b) && (a == INT_MIN);                                                   \
        VEC_DATA_TYPE(long, width) a_wide  = convert_long##width(a);                                                       \
        VEC_DATA_TYPE(long, width) b_wide  = convert_long##width(b);                                                       \
        VEC_DATA_TYPE(long, width) product = a_wide * b_wide;                                                              \
        VEC_DATA_TYPE(long, width) nudge_pos = 1 << 30;                                                                    \
        VEC_DATA_TYPE(long, width) nudge_neg = 1 - (1 << 30);                                                              \
        VEC_DATA_TYPE(long, width) is_positive_or_zero = product >= 0;                                                     \
        VEC_DATA_TYPE(long, width) nudge = select(nudge_neg, nudge_pos, (SELECT_VEC_DATA_TYPE(long, width))(is_positive_or_zero)); \
        VEC_DATA_TYPE(long, width) two_pow_31 = 1ll << 31;                                                                 \
        VEC_DATA_TYPE(int, width) ab_x2_high32 = convert_int##width((product + nudge) / two_pow_31);                       \
        return select(ab_x2_high32, INT_MAX, (SELECT_VEC_DATA_TYPE(int, width))(overflow));                                \
    }
2579
2580
/* ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size) defines
 *     int<size> asymm_exp_on_interval_between_negative_one_quarter_and_0_excl<size>(int<size> a)
 * With x = a + 2^28 the function evaluates
 *     c + c * (x + (x^2 + (x^4 / 4 + x^3) * k) / 2)
 * using ASYMM_MULT for the products and ASYMM_ROUNDING_DIVIDE_BY_POW2 for the
 * divisions by 4 and 2, where c = 1895147668 and k = 715827883.
 * NOTE(review): with 31 fractional bits these look like exp(-1/8) and 1/3, making this
 * a 4th-order Taylor expansion around -1/8 - confirm against the reference. */
#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(width)                                                     \
    inline VEC_DATA_TYPE(int, width) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##width(VEC_DATA_TYPE(int, width) a) \
    {                                                                                                                                 \
        const VEC_DATA_TYPE(int, width) constant_term     = 1895147668;                                                               \
        const VEC_DATA_TYPE(int, width) constant_1_over_3 = 715827883;                                                                \
        const int k_fractional_bits = 31;                                                                                             \
        VEC_DATA_TYPE(int, width) x  = a + (1 << (k_fractional_bits - 3));                                                            \
        VEC_DATA_TYPE(int, width) x2 = ASYMM_MULT(x, x, width);                                                                       \
        VEC_DATA_TYPE(int, width) x3 = ASYMM_MULT(x2, x, width);                                                                      \
        VEC_DATA_TYPE(int, width) x4 = ASYMM_MULT(x2, x2, width);                                                                     \
        VEC_DATA_TYPE(int, width) x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, width);                                            \
        VEC_DATA_TYPE(int, width) poly_times_2 = ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, width) + x2;                         \
        VEC_DATA_TYPE(int, width) poly = ASYMM_ROUNDING_DIVIDE_BY_POW2(poly_times_2, 1, width);                                       \
        return constant_term + ASYMM_MULT(constant_term, x + poly, width);                                                            \
    }
2603
2604
/** Generates asymm_select_using_mask##size, a bitwise blend of two values.
 *
 * Every bit set in if_mask takes the corresponding bit of then_val; every
 * cleared bit takes the corresponding bit of else_val.
 *
 * @param[in] size Vector width suffix of the generated function.
 */
#define ASYMM_SELECT_USING_MASK_IMPL(size) \
    inline VEC_DATA_TYPE(int, size) asymm_select_using_mask##size(VEC_DATA_TYPE(int, size) if_mask, VEC_DATA_TYPE(int, size) then_val, VEC_DATA_TYPE(int, size) else_val) \
    { \
        const VEC_DATA_TYPE(int, size) kept_then = if_mask & then_val; \
        const VEC_DATA_TYPE(int, size) kept_else = ~if_mask & else_val; \
        /* The two operands never share a set bit, so XOR merges them. */ \
        return kept_then ^ kept_else; \
    }
2610
2611
/** Generates asymm_mask_if_zero##size.
 *
 * The generated function returns ~0 where a == 0 and 0 elsewhere.
 *
 * @param[in] size Vector width suffix of the generated function.
 */
#define ASYMM_MASK_IF_ZERO_IMPL(size) \
    inline VEC_DATA_TYPE(int, size) asymm_mask_if_zero##size(VEC_DATA_TYPE(int, size) a) \
    { \
        const VEC_DATA_TYPE(int, size) no_bits    = 0; \
        const VEC_DATA_TYPE(int, size) every_bit  = ~0; \
        return select(no_bits, every_bit, (SELECT_VEC_DATA_TYPE(int, size))(a == 0)); \
    }
2619
2620
/** Generates asymm_mask_if_non_zero##size.
 *
 * The generated function returns ~0 where a != 0 and 0 elsewhere.
 *
 * @param[in] size Vector width suffix of the generated function.
 */
#define ASYMM_MASK_IF_NON_ZERO_IMPL(size) \
    inline VEC_DATA_TYPE(int, size) asymm_mask_if_non_zero##size(VEC_DATA_TYPE(int, size) a) \
    { \
        const VEC_DATA_TYPE(int, size) no_bits    = 0; \
        const VEC_DATA_TYPE(int, size) every_bit  = ~0; \
        return select(no_bits, every_bit, (SELECT_VEC_DATA_TYPE(int, size))(a != 0)); \
    }
2628
/** Generates exp_barrel_shifter##size, one conditional multiply step.
 *
 * When k_integer_bits > exponent, result is multiplied by fp_multiplier (via
 * ASYMM_MULT) wherever bit (k_fractional_bits + exponent) of remainder is set.
 * In every other case result is returned unchanged.
 *
 * @param[in] size Vector width suffix of the generated function.
 */
#define EXP_BARREL_SHIFTER_IMPL(size) \
    inline VEC_DATA_TYPE(int, size) exp_barrel_shifter##size(VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \
    { \
        /* This step does not apply for the given integer-bit count. */ \
        if(k_integer_bits <= exponent) \
        { \
            return result; \
        } \
        \
        const int bit_index = k_fractional_bits + exponent; \
        const VEC_DATA_TYPE(int, size) bit_is_set = ASYMM_MASK_IF_NON_ZERO(remainder & (1 << bit_index), size); \
        const VEC_DATA_TYPE(int, size) multiplied = ASYMM_MULT(result, fp_multiplier, size); \
        return ASYMM_SELECT_USING_MASK(bit_is_set, multiplied, result, size); \
    }
2642
2643
/** Generates asymm_exp_on_negative_values##size.
 *
 * The input a, carrying k_integer_bits integer bits, is split into:
 *  - a part (a mod 1/4) - 1/4, evaluated by the polynomial in
 *    ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL, and
 *  - a remainder whose set bits each trigger one EXP_BARREL_SHIFTER multiply.
 * The multiplier used for exponent e is ~= exp(-2^e) * 2^31
 * (e.g. 1672461947 ~= exp(-1/4) * 2^31, 790015084 ~= exp(-1) * 2^31).
 * Lanes where a == 0 return INT_MAX. When k_integer_bits > 5, lanes where
 * a < -(1 << (k_fractional_bits + 5)) return 0.
 *
 * @param[in] size Vector width suffix of the generated function.
 */
#define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size) \
    inline VEC_DATA_TYPE(int, size) asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits) \
    { \
        const int frac_bits = 31 - k_integer_bits; \
        const VEC_DATA_TYPE(int, size) quarter            = 1 << (frac_bits - 2); \
        const VEC_DATA_TYPE(int, size) below_quarter_mask = quarter - 1; \
        /* Sub-quarter part of a, shifted down by one quarter. */ \
        const VEC_DATA_TYPE(int, size) frac_part = (a & below_quarter_mask) - quarter; \
        /* Drop the integer bits before handing the value to the polynomial. */ \
        const VEC_DATA_TYPE(int, size) frac_part_scaled = frac_part << k_integer_bits; \
        /* What is left of a; its set bits select the multiplies below. */ \
        const VEC_DATA_TYPE(int, size) whole_part = frac_part - a; \
        VEC_DATA_TYPE(int, size) \
        acc = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(frac_part_scaled, size); \
        \
        acc = EXP_BARREL_SHIFTER(acc, -2, 1672461947, k_integer_bits, frac_bits, whole_part, size); \
        acc = EXP_BARREL_SHIFTER(acc, -1, 1302514674, k_integer_bits, frac_bits, whole_part, size); \
        acc = EXP_BARREL_SHIFTER(acc, +0, 790015084, k_integer_bits, frac_bits, whole_part, size); \
        acc = EXP_BARREL_SHIFTER(acc, +1, 290630308, k_integer_bits, frac_bits, whole_part, size); \
        acc = EXP_BARREL_SHIFTER(acc, +2, 39332535, k_integer_bits, frac_bits, whole_part, size); \
        acc = EXP_BARREL_SHIFTER(acc, +3, 720401, k_integer_bits, frac_bits, whole_part, size); \
        acc = EXP_BARREL_SHIFTER(acc, +4, 242, k_integer_bits, frac_bits, whole_part, size); \
        \
        if(k_integer_bits > 5) \
        { \
            /* Inputs below this bound are flushed to zero. */ \
            const VEC_DATA_TYPE(int, size) lower_bound = -(1 << (frac_bits + 5)); \
            acc = ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(a < lower_bound, size), 0, acc, size); \
        } \
        \
        /* A zero input returns the largest representable value. */ \
        const VEC_DATA_TYPE(int, size) one_q0 = INT_MAX; \
        return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_ZERO(a, size), one_q0, acc, size); \
    }
2678
2679
/** Generates asymm_saturating_rounding_mult_by_pow2##size.
 *
 * The generated function multiplies x by 2^exponent:
 *  - exponent < 0 : delegates to ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent).
 *  - exponent >= 0: shifts left, replacing lanes above the threshold with
 *                   INT_MAX and lanes below -threshold with INT_MIN.
 * The threshold is 2^(31 - exponent) - 1.
 *
 * @param[in] size Vector width suffix of the generated function.
 */
#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size) \
    inline VEC_DATA_TYPE(int, size) asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \
    { \
        if(exponent < 0) \
        { \
            return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size); \
        } \
        \
        const VEC_DATA_TYPE(int, size) min = INT_MIN; \
        const VEC_DATA_TYPE(int, size) max = INT_MAX; \
        /* Computed in unsigned arithmetic: with exponent == 0 the signed form */ \
        /* (1 << 31) - 1 overflows int. The unsigned form yields INT_MAX there  */ \
        /* and the same value as before for every other exponent.              */ \
        const int threshold = (int)((1u << (31 - exponent)) - 1u); \
        VEC_DATA_TYPE(int, size) \
        positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size); \
        VEC_DATA_TYPE(int, size) \
        negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size); \
        VEC_DATA_TYPE(int, size) \
        result = x << exponent; \
        result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size); \
        result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size); \
        return result; \
    }
2701
2702
/** Generates asymm_rounding_half_sum##size.
 *
 * The generated function returns (a + b) / 2. The sum is formed in 64-bit
 * lanes and nudged by +1 (sum >= 0) or -1 (sum < 0) before the truncating
 * division, then converted back to int.
 *
 * @param[in] size Vector width suffix of the generated function.
 */
#define ASYMM_ROUNDING_HALF_SUM_IMPL(size) \
    inline VEC_DATA_TYPE(int, size) asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
    { \
        /* Widen first so that a + b cannot overflow. */ \
        const VEC_DATA_TYPE(long, size) wide_sum = convert_long##size(a) + convert_long##size(b); \
        const VEC_DATA_TYPE(long, size) nudge_up   = 1; \
        const VEC_DATA_TYPE(long, size) nudge_down = -1; \
        const VEC_DATA_TYPE(long, size) nudge = select(nudge_down, nudge_up, (SELECT_VEC_DATA_TYPE(long, size))(wide_sum >= 0)); \
        return convert_int##size((wide_sum + nudge) / 2); \
    }
2718
2719
/** Generates asymm_one_over_one_plus_x_for_x_in_0_1##size.
 *
 * With d = ASYMM_ROUNDING_HALF_SUM(a, INT_MAX), the generated function starts
 * from the estimate 48/17 - 32/17 * d (1515870810 ~= 48/17 * 2^29,
 * -1010580540 ~= -32/17 * 2^29) and refines it three times with
 *     x = x + x * (1 - d * x)
 * before doubling the result with saturation.
 * NOTE(review): the constants read as Q2.29 and a as Q0.31; confirm with callers.
 *
 * @param[in] size Vector width suffix of the generated function.
 */
#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(size) \
    inline VEC_DATA_TYPE(int, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(VEC_DATA_TYPE(int, size) a) \
    { \
        const VEC_DATA_TYPE(int, size) one_q0           = INT_MAX; \
        const VEC_DATA_TYPE(int, size) one_q2           = 1 << 29; \
        const VEC_DATA_TYPE(int, size) q2_48_over_17    = 1515870810; \
        const VEC_DATA_TYPE(int, size) q2_minus_32_over_17 = -1010580540; \
        const VEC_DATA_TYPE(int, size) half_denom = ASYMM_ROUNDING_HALF_SUM(a, one_q0, size); \
        VEC_DATA_TYPE(int, size) \
        estimate = q2_48_over_17 + ASYMM_MULT(half_denom, q2_minus_32_over_17, size); \
        for(int remaining = 3; remaining > 0; --remaining) \
        { \
            const VEC_DATA_TYPE(int, size) residual = one_q2 - ASYMM_MULT(half_denom, estimate, size); \
            const VEC_DATA_TYPE(int, size) correction = ASYMM_MULT(estimate, residual, size); \
            estimate = estimate + ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(correction, 2, size); \
        } \
        return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(estimate, 1, size); \
    }
2743
2744
/** Generates asymm_rescale##size.
 *
 * The generated function multiplies value by
 * 2^(src_integer_bits - dst_integer_bits) through
 * ASYMM_SATURATING_ROUNDING_MULT_BY_POW2.
 *
 * @param[in] size Vector width suffix of the generated function.
 */
#define ASYMM_RESCALE_IMPL(size) \
    inline VEC_DATA_TYPE(int, size) asymm_rescale##size(VEC_DATA_TYPE(int, size) value, int src_integer_bits, int dst_integer_bits) \
    { \
        return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, src_integer_bits - dst_integer_bits, size); \
    }
2751
/* Call-site macros that map (arguments..., size) onto the per-size functions
 * generated by the *_IMPL macros.
 *
 * Macros with a *_STR companion use two levels: the *_STR macro does the token
 * pasting, and the outer macro lets its arguments be macro-expanded first.
 * Macros without a companion paste `size` directly, so they must be given a
 * literal size token.
 */
#define QUANTIZE_STR(input, offset, scale, type, size) quantize_##type##size(input, offset, scale)
#define QUANTIZE(input, offset, scale, type, size) QUANTIZE_STR(input, offset, scale, type, size)
#define DEQUANTIZE_STR(input, offset, scale, type, size) dequantize_##type##size(input, offset, scale)
#define DEQUANTIZE(input, offset, scale, type, size) DEQUANTIZE_STR(input, offset, scale, type, size)

#define ASYMM_ROUNDING_DIVIDE_BY_POW2_STR(x, exponent, size) asymm_rounding_divide_by_POW2_##size(x, exponent)
#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) ASYMM_ROUNDING_DIVIDE_BY_POW2_STR(x, exponent, size)
#define ASYMM_MULT_STR(a, b, size) asymm_mult##size(a, b)
#define ASYMM_MULT(a, b, size) ASYMM_MULT_STR(a, b, size)
/* left_shift is negated before use, so callers pass a negative shift here.
 * x and left_shift are parenthesised so that compound expressions passed as
 * arguments keep their intended precedence.
 */
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size) \
    ASYMM_MULT((x) * ((VEC_DATA_TYPE(int, size))(1) << (-(left_shift))), quantized_multiplier, size)
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \
    ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size)
#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a)
#define ASYMM_SELECT_USING_MASK(if_mask, then_val, else_val, size) asymm_select_using_mask##size(if_mask, then_val, else_val)
#define ASYMM_MASK_IF_ZERO(a, size) asymm_mask_if_zero##size(a)
#define ASYMM_MASK_IF_NON_ZERO(a, size) asymm_mask_if_non_zero##size(a)
#define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder, size) exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder)
#define ASYMM_EXP_ON_NEGATIVE_VALUES_STR(a, k_integer_bits, size) asymm_exp_on_negative_values##size(a, k_integer_bits)
#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) ASYMM_EXP_ON_NEGATIVE_VALUES_STR(a, k_integer_bits, size)
#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(a)
#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size)
#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) asymm_saturating_rounding_mult_by_pow2##size(x, exponent)
#define ASYMM_ROUNDING_HALF_SUM(a, b, size) asymm_rounding_half_sum##size(a, b)
#define ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size) asymm_rescale##size(value, src_integer_bits, dst_integer_bits)
#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size)
2778
/** Generates multiply_by_quantized_multiplier##size.
 *
 * A positive shift is applied as a left shift (input * 2^shift) before the
 * ASYMM_MULT by qmul. A zero or negative shift is applied as a rounding right
 * shift by -shift after it.
 *
 * @param[in] size Vector width suffix of the generated function.
 */
#define MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size) \
    inline VEC_DATA_TYPE(int, size) multiply_by_quantized_multiplier##size(VEC_DATA_TYPE(int, size) input, int qmul, int shift) \
    { \
        int pre_shift  = 0; \
        int post_shift = 0; \
        if(shift > 0) \
        { \
            pre_shift = shift; \
        } \
        else \
        { \
            post_shift = -shift; \
        } \
        const VEC_DATA_TYPE(int, size) product = ASYMM_MULT(input * (1 << pre_shift), qmul, size); \
        return ASYMM_ROUNDING_DIVIDE_BY_POW2(product, post_shift, size); \
    }
/* Call-site wrapper; `size` is pasted directly, so it must be a literal token. */
#define MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, size) multiply_by_quantized_multiplier##size(input, qmul, shift)
2787
/* Instantiations of the *_IMPL generator macros.
 * Each line below expands to one inline function for the given element type
 * and/or vector width.
 */

/* quantize_<type><size>
 * NOTE(review): width 4 instantiates uchar/ushort/short/int, whereas widths
 * 1, 2, 3 and 8 instantiate uchar/char/uint/int - confirm that char4 and uint4
 * are not needed by any caller.
 */
2788QUANTIZE_IMPL(uchar, 1)
2789QUANTIZE_IMPL(char, 1)
2790QUANTIZE_IMPL(uint, 1)
2791QUANTIZE_IMPL(int, 1)
2792QUANTIZE_IMPL(uchar, 2)
2793QUANTIZE_IMPL(char, 2)
2794QUANTIZE_IMPL(uint, 2)
2795QUANTIZE_IMPL(int, 2)
2796QUANTIZE_IMPL(uchar, 3)
2797QUANTIZE_IMPL(char, 3)
2798QUANTIZE_IMPL(uint, 3)
2799QUANTIZE_IMPL(int, 3)
2800QUANTIZE_IMPL(uchar, 4)
2801QUANTIZE_IMPL(ushort, 4)
2802QUANTIZE_IMPL(short, 4)
2803QUANTIZE_IMPL(int, 4)
2804QUANTIZE_IMPL(uchar, 8)
2805QUANTIZE_IMPL(char, 8)
2806QUANTIZE_IMPL(uint, 8)
2807QUANTIZE_IMPL(int, 8)
2808QUANTIZE_IMPL(uchar, 16)
2809QUANTIZE_IMPL(char, 16)
2810QUANTIZE_IMPL(ushort, 16)
2811QUANTIZE_IMPL(short, 16)
2812QUANTIZE_IMPL(uint, 16)
2813QUANTIZE_IMPL(int, 16)
2814
/* dequantize_<type><size>; same type/width combinations as the quantize list. */
2815DEQUANTIZE_IMPL(uchar, 1)
2816DEQUANTIZE_IMPL(char, 1)
2817DEQUANTIZE_IMPL(uint, 1)
2818DEQUANTIZE_IMPL(int, 1)
2819DEQUANTIZE_IMPL(uchar, 2)
2820DEQUANTIZE_IMPL(char, 2)
2821DEQUANTIZE_IMPL(uint, 2)
2822DEQUANTIZE_IMPL(int, 2)
2823DEQUANTIZE_IMPL(uchar, 3)
2824DEQUANTIZE_IMPL(char, 3)
2825DEQUANTIZE_IMPL(uint, 3)
2826DEQUANTIZE_IMPL(int, 3)
2827DEQUANTIZE_IMPL(uchar, 4)
2828DEQUANTIZE_IMPL(ushort, 4)
2829DEQUANTIZE_IMPL(short, 4)
2830DEQUANTIZE_IMPL(int, 4)
2831DEQUANTIZE_IMPL(uchar, 8)
2832DEQUANTIZE_IMPL(char, 8)
2833DEQUANTIZE_IMPL(uint, 8)
2834DEQUANTIZE_IMPL(int, 8)
2835DEQUANTIZE_IMPL(uchar, 16)
2836DEQUANTIZE_IMPL(char, 16)
2837DEQUANTIZE_IMPL(ushort, 16)
2838DEQUANTIZE_IMPL(short, 16)
2839DEQUANTIZE_IMPL(uint, 16)
2840DEQUANTIZE_IMPL(int, 16)
2841
/* The integer fixed-point helpers below are generated for widths 1, 2, 3, 4, 8 and 16. */
2842ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(1)
2843ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(2)
2844ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(3)
2845ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(4)
2846ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(8)
2847ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(16)
2848
2849ASYMM_MULT_IMPL(1)
2850ASYMM_MULT_IMPL(2)
2851ASYMM_MULT_IMPL(3)
2852ASYMM_MULT_IMPL(4)
2853ASYMM_MULT_IMPL(8)
2854ASYMM_MULT_IMPL(16)
2855
2856ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(1)
2857ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(2)
2858ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(3)
2859ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(4)
2860ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(8)
2861ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(16)
2862
2863ASYMM_SELECT_USING_MASK_IMPL(1)
2864ASYMM_SELECT_USING_MASK_IMPL(2)
2865ASYMM_SELECT_USING_MASK_IMPL(3)
2866ASYMM_SELECT_USING_MASK_IMPL(4)
2867ASYMM_SELECT_USING_MASK_IMPL(8)
2868ASYMM_SELECT_USING_MASK_IMPL(16)
2869
2870ASYMM_MASK_IF_ZERO_IMPL(1)
2871ASYMM_MASK_IF_ZERO_IMPL(2)
2872ASYMM_MASK_IF_ZERO_IMPL(3)
2873ASYMM_MASK_IF_ZERO_IMPL(4)
2874ASYMM_MASK_IF_ZERO_IMPL(8)
2875ASYMM_MASK_IF_ZERO_IMPL(16)
2876
2877ASYMM_MASK_IF_NON_ZERO_IMPL(1)
2878ASYMM_MASK_IF_NON_ZERO_IMPL(2)
2879ASYMM_MASK_IF_NON_ZERO_IMPL(3)
2880ASYMM_MASK_IF_NON_ZERO_IMPL(4)
2881ASYMM_MASK_IF_NON_ZERO_IMPL(8)
2882ASYMM_MASK_IF_NON_ZERO_IMPL(16)
2883
2884EXP_BARREL_SHIFTER_IMPL(1)
2885EXP_BARREL_SHIFTER_IMPL(2)
2886EXP_BARREL_SHIFTER_IMPL(3)
2887EXP_BARREL_SHIFTER_IMPL(4)
2888EXP_BARREL_SHIFTER_IMPL(8)
2889EXP_BARREL_SHIFTER_IMPL(16)
2890
2891ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(1)
2892ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(2)
2893ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(3)
2894ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(4)
2895ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(8)
2896ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(16)
2897
2898ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(1)
2899ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(2)
2900ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(3)
2901ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(4)
2902ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(8)
2903ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(16)
2904
2905ASYMM_ROUNDING_HALF_SUM_IMPL(1)
2906ASYMM_ROUNDING_HALF_SUM_IMPL(2)
2907ASYMM_ROUNDING_HALF_SUM_IMPL(3)
2908ASYMM_ROUNDING_HALF_SUM_IMPL(4)
2909ASYMM_ROUNDING_HALF_SUM_IMPL(8)
2910ASYMM_ROUNDING_HALF_SUM_IMPL(16)
2911
2912ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(1)
2913ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(2)
2914ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(3)
2915ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(4)
2916ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(8)
2917ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(16)
2918
2919ASYMM_RESCALE_IMPL(1)
2920ASYMM_RESCALE_IMPL(2)
2921ASYMM_RESCALE_IMPL(3)
2922ASYMM_RESCALE_IMPL(4)
2923ASYMM_RESCALE_IMPL(8)
2924ASYMM_RESCALE_IMPL(16)
2925
2926MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(1)
2927MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(2)
2928MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(3)
2929MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(4)
2930MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(8)
2931MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(16)
2932
2933#endif
2934
2935
2936__kernel void direct_convolution_nchw(
2937    TENSOR3D_DECLARATION(src),
2938    TENSOR3D_DECLARATION(dst),
2939    TENSOR3D_DECLARATION(weights),
2940#ifdef HAS_BIAS
2941    VECTOR_DECLARATION(biases),
2942#endif
2943    unsigned int weights_stride_w)
2944{
2945    const int id0 = get_global_id(0);
2946    const int id1 = get_global_id(1);
2947    const int id2 = get_global_id(2);
2948
2949    const int x_coords = (id0 * STRIDE_X) - PAD_LEFT;
2950    const int y_coords = (id1 * STRIDE_Y) - PAD_TOP;
2951
2952    const int x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0) * sizeof(DATA_TYPE);
2953
2954    __global uchar *src_addr     = (__global uchar *)(src_ptr + src_offset_first_element_in_bytes);
2955    __global uchar *weights_addr = (__global uchar *)(weights_ptr + weights_offset_first_element_in_bytes + id2 * weights_stride_w);
2956    __global uchar *dst_addr     = (__global uchar *)dst_ptr + dst_offset_first_element_in_bytes + x_offs + id1 * dst_stride_y + id2 * dst_stride_z;
2957
2958#ifdef IS_QUANTIZED
2959    int acc_value = 0;
2960#else
2961    DATA_TYPE                 acc_value = 0;
2962#endif
2963    for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
2964    {
2965        for(int y = 0; y < WEI_HEIGHT; ++y)
2966        {
2967            for(int x = 0; x < WEI_WIDTH; ++x)
2968            {
2969                const int idx_x = (x_coords + x);
2970                const int idx_y = (y_coords + y);
2971                if((idx_x >= 0 && idx_x < SRC_WIDTH) && (idx_y >= 0 && idx_y < SRC_HEIGHT))
2972                {
2973                    const int weight_offset = x + (WEI_HEIGHT * y);
2974                    const int input_offset  = idx_x + SRC_WIDTH * idx_y;
2975#ifdef IS_QUANTIZED
2976                    int weight = convert_int(*((__global DATA_TYPE *)weights_addr + weight_offset));
2977                    int input  = convert_int(*((__global DATA_TYPE *)src_addr + input_offset));
2978                    acc_value += (input + INPUT_OFFSET) * (weight + WEIGHTS_OFFSET);
2979#else
2980                    DATA_TYPE weight    = *((__global DATA_TYPE *)weights_addr + weight_offset);
2981                    DATA_TYPE input     = *((__global DATA_TYPE *)src_addr + input_offset);
2982                    acc_value += input * weight;
2983#endif
2984                }
2985            }
2986        }
2987        src_addr += src_stride_z;
2988        weights_addr += weights_stride_z;
2989    }
2990
2991#ifdef HAS_BIAS
2992
2993    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
2994#ifdef IS_QUANTIZED
2995    int bias = *((__global int *)(vector_offset(&biases, id2)));
2996#else
2997    DATA_TYPE bias = *((__global DATA_TYPE *)(vector_offset(&biases, id2)));
2998#endif
2999    acc_value += bias;
3000
3001#endif
3002
3003#ifdef IS_QUANTIZED
3004
3005#if OUTPUT_SHIFT < 0
3006    acc_value = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc_value, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 1);
3007#else
3008    acc_value      = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(acc_value, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 1);
3009#endif
3010    acc_value = acc_value + OUTPUT_OFFSET;
3011#endif
3012
3013    *(__global DATA_TYPE *)dst_addr = CONVERT_SAT(acc_value, DATA_TYPE);
3014})"