1R"( 2 3 4#ifndef ARM_COMPUTE_HELPER_H 5#define ARM_COMPUTE_HELPER_H 6 7 8 9 10#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 11 VSTORE(N0) \ 12 (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 13 14#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 15 STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 16 VSTORE(N0) \ 17 (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 18 19#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 20 STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 21 VSTORE(N0) \ 22 (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 23 24#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 25 STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 26 VSTORE(N0) \ 27 (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 28 29#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 30 STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 31 VSTORE(N0) \ 32 (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 33 34#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 35 STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 36 VSTORE(N0) \ 37 (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 38 39#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 40 STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 41 VSTORE(N0) \ 42 (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 43 44#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 45 STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 46 VSTORE(N0) \ 47 (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 48 49#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 50 STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 51 VSTORE(N0) \ 52 (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 53 54#define 
STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 55 STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 56 VSTORE(N0) \ 57 (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 58 59#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 60 STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 61 VSTORE(N0) \ 62 (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 63 64#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 65 STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 66 VSTORE(N0) \ 67 (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 68 69#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 70 STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 71 VSTORE(N0) \ 72 (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 73 74#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 75 STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 76 VSTORE(N0) \ 77 (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 78 79#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 80 STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 81 VSTORE(N0) \ 82 (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 83 84#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 85 STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 86 VSTORE(N0) \ 87 (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 88 89 90 91#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 92 VSTORE(N0) \ 93 (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 94 95#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 96 CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 97 VSTORE(N0) \ 98 (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE 
*)(PTR + 1 * STRIDE_Y + Z##1)); 99 100#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 101 CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 102 VSTORE(N0) \ 103 (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 104 105#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 106 CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 107 VSTORE(N0) \ 108 (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 109 110#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 111 CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 112 VSTORE(N0) \ 113 (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 114 115#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 116 CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 117 VSTORE(N0) \ 118 (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 119 120#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 121 CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 122 VSTORE(N0) \ 123 (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 124 125#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 126 CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 127 VSTORE(N0) \ 128 (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 129 130#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 131 CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 132 VSTORE(N0) \ 133 (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * 
STRIDE_Y + Z##8)); 134 135#define CONVERT_STORE_ROW_10(N0, DATA, BASENAME, PTR, STRIDE_Y, Z) \ 136 CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 137 VSTORE(N0) \ 138 (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 139 140#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 141 CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 142 VSTORE(N0) \ 143 (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 144 145#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 146 CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 147 VSTORE(N0) \ 148 (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 149 150#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 151 CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 152 VSTORE(N0) \ 153 (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 154 155#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 156 CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 157 VSTORE(N0) \ 158 (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 159 160#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 161 CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 162 VSTORE(N0) \ 163 (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 164 165#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 166 CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 167 VSTORE(N0) \ 168 (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 
* STRIDE_Y + Z##F)); 169 170 171 172 173#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 174#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 175 176 177 178#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 179#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 180 181 182 183#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 184 VSTORE_PARTIAL(N0, STORE_N0) \ 185 (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 186 187#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 188 STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 189 VSTORE_PARTIAL(N0, STORE_N0) \ 190 (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 191 192#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 193 STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 194 VSTORE_PARTIAL(N0, STORE_N0) \ 195 (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 196 197#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 198 STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 199 VSTORE_PARTIAL(N0, STORE_N0) \ 200 (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 201 202#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 203 STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 204 VSTORE_PARTIAL(N0, STORE_N0) \ 205 (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 206 207#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, 
STRIDE_Y, Z) \ 208 STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 209 VSTORE_PARTIAL(N0, STORE_N0) \ 210 (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 211 212#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 213 STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 214 VSTORE_PARTIAL(N0, STORE_N0) \ 215 (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 216 217#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 218 STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 219 VSTORE_PARTIAL(N0, STORE_N0) \ 220 (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 221 222#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 223 STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 224 VSTORE_PARTIAL(N0, STORE_N0) \ 225 (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 226 227#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 228 STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 229 VSTORE_PARTIAL(N0, STORE_N0) \ 230 (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 231 232#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 233 STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 234 VSTORE_PARTIAL(N0, STORE_N0) \ 235 (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 236 237#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 238 STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 239 VSTORE_PARTIAL(N0, STORE_N0) \ 240 (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 241 242#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 243 STORE_ROW_PARTIAL_12(N0, 
STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 244 VSTORE_PARTIAL(N0, STORE_N0) \ 245 (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 246 247#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 248 STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 249 VSTORE_PARTIAL(N0, STORE_N0) \ 250 (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 251 252#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 253 STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 254 VSTORE_PARTIAL(N0, STORE_N0) \ 255 (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 256 257#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 258 STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 259 VSTORE_PARTIAL(N0, STORE_N0) \ 260 (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 261 262 263 264#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 265#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 266 267#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 268 if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \ 269 { \ 270 STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 271 } \ 272 else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \ 273 { \ 274 STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 275 } \ 276 else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \ 277 { \ 278 STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 279 } 
\ 280 else \ 281 { \ 282 STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 283 } 284 285#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \ 286 if(!(PARTIAL_COND_X)) \ 287 { \ 288 STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 289 } \ 290 else \ 291 { \ 292 STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 293 } 294 295#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \ 296 if(!(PARTIAL_COND_Y)) \ 297 { \ 298 STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 299 } \ 300 else \ 301 { \ 302 STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 303 } 304 305 306#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) 307 308 309#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 310 311#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 312 STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 313 314#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0 315 316#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 317 STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) 318 319#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0 320 321#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 322 STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) 323 324#else 325 326#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, 
PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 327 STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) 328 329#endif 330 331#endif 332 333 334#if defined(PARTIAL_STORE_M0) 335 336#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \ 337 ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0)))) 338#else 339#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \ 340 ((uint)(y * M0)) 341#endif 342 343 344 345#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \ 346 STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond) 347 348 349#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 350#pragma OPENCL EXTENSION cl_khr_fp16 : enable 351#endif 352 353#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) 354#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable 355#endif 356 357#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) 358#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable 359#endif 360 361#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf) 362#pragma OPENCL EXTENSION cl_arm_printf : enable 363#endif 364 365#define GPU_ARCH_MIDGARD 0x100 366#define GPU_ARCH_BIFROST 0x200 367#define GPU_ARCH_VALHALL 0x300 368 369 370#define CONCAT(a, b) a##b 371 372 373#define EXPAND(x) x 374 375 376#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val) 377 378 379#define REV1(x) ((x)) 380#define REV2(x) ((x).s10) 381#define REV3(x) ((x).s210) 382#define REV4(x) ((x).s3210) 383#define REV8(x) ((x).s76543210) 384#define REV16(x) ((x).sFEDCBA9876543210) 385 386 387 388#define REVERSE_STR(x, s) REV##s((x)) 389#define REVERSE(x, s) REVERSE_STR(x, s) 390 391 392 393#define ROT1_0(x) ((x)) 394#define ROT1_1(x) ((x)) 395 396#define ROT2_0(x) ((x)) 397#define 
ROT2_1(x) ((x).s10) 398#define ROT2_2(x) ((x)) 399 400#define ROT3_0(x) ((x)) 401#define ROT3_1(x) ((x).s201) 402#define ROT3_2(x) ((x).s120) 403#define ROT3_3(x) ((x)) 404 405#define ROT4_0(x) ((x)) 406#define ROT4_1(x) ((x).s3012) 407#define ROT4_2(x) ((x).s2301) 408#define ROT4_3(x) ((x).s1230) 409#define ROT4_4(x) ((x)) 410 411#define ROT8_0(x) ((x)) 412#define ROT8_1(x) ((x).s70123456) 413#define ROT8_2(x) ((x).s67012345) 414#define ROT8_3(x) ((x).s56701234) 415#define ROT8_4(x) ((x).s45670123) 416#define ROT8_5(x) ((x).s34567012) 417#define ROT8_6(x) ((x).s23456701) 418#define ROT8_7(x) ((x).s12345670) 419#define ROT8_8(x) ((x)) 420 421#define ROT16_0(x) ((x)) 422#define ROT16_1(x) ((x).sF0123456789ABCDE) 423#define ROT16_2(x) ((x).sEF0123456789ABCD) 424#define ROT16_3(x) ((x).sDEF0123456789ABC) 425#define ROT16_4(x) ((x).sCDEF0123456789AB) 426#define ROT16_5(x) ((x).sBCDEF0123456789A) 427#define ROT16_6(x) ((x).sABCDEF0123456789) 428#define ROT16_7(x) ((x).s9ABCDEF012345678) 429#define ROT16_8(x) ((x).s89ABCDEF01234567) 430#define ROT16_9(x) ((x).s789ABCDEF0123456) 431#define ROT16_10(x) ((x).s6789ABCDEF012345) 432#define ROT16_11(x) ((x).s56789ABCDEF01234) 433#define ROT16_12(x) ((x).s456789ABCDEF0123) 434#define ROT16_13(x) ((x).s3456789ABCDEF012) 435#define ROT16_14(x) ((x).s23456789ABCDEF01) 436#define ROT16_15(x) ((x).s123456789ABCDEF0) 437#define ROT16_16(x) ((x)) 438 439 440 441#define ROTATE_STR(x, s, n) ROT##s##_##n(x) 442#define ROTATE(x, s, n) ROTATE_STR(x, s, n) 443 444 445 446#define V_OFFS1(dt) (dt##1)(0) 447#define V_OFFS2(dt) (dt##2)(0, 1) 448#define V_OFFS3(dt) (dt##3)(0, 1, 2) 449#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3) 450#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7) 451#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) 452 453 454 455#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt) 456#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s) 457 458 459#define VLOAD_STR(size) vload##size 460#define VLOAD(size) 
VLOAD_STR(size) 461 462 463#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size 464#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size) 465 466#define NO_LOAD(data, offs, ptr) \ 467 { \ 468 } 469 470 471#define vload_partial_1_0 NO_LOAD 472#define vload_partial_1_1 vload1 473#define vload_partial_1_2 NO_LOAD 474#define vload_partial_1_3 NO_LOAD 475#define vload_partial_1_4 NO_LOAD 476#define vload_partial_1_5 NO_LOAD 477#define vload_partial_1_6 NO_LOAD 478#define vload_partial_1_7 NO_LOAD 479#define vload_partial_1_8 NO_LOAD 480#define vload_partial_1_9 NO_LOAD 481#define vload_partial_1_10 NO_LOAD 482#define vload_partial_1_11 NO_LOAD 483#define vload_partial_1_12 NO_LOAD 484#define vload_partial_1_13 NO_LOAD 485#define vload_partial_1_14 NO_LOAD 486#define vload_partial_1_15 NO_LOAD 487#define vload_partial_1_16 NO_LOAD 488 489#define vload_partial_2_0 NO_LOAD 490#define vload_partial_2_1 vload_partial_1 491#define vload_partial_2_2 vload_partial_2 492#define vload_partial_2_3 NO_LOAD 493#define vload_partial_2_4 NO_LOAD 494#define vload_partial_2_5 NO_LOAD 495#define vload_partial_2_6 NO_LOAD 496#define vload_partial_2_7 NO_LOAD 497#define vload_partial_2_8 NO_LOAD 498#define vload_partial_2_9 NO_LOAD 499#define vload_partial_2_10 NO_LOAD 500#define vload_partial_2_11 NO_LOAD 501#define vload_partial_2_12 NO_LOAD 502#define vload_partial_2_13 NO_LOAD 503#define vload_partial_2_14 NO_LOAD 504#define vload_partial_2_15 NO_LOAD 505#define vload_partial_2_16 NO_LOAD 506 507#define vload_partial_3_0 NO_LOAD 508#define vload_partial_3_1 vload_partial_1 509#define vload_partial_3_2 vload_partial_2 510#define vload_partial_3_3 vload_partial_3 511#define vload_partial_3_4 NO_LOAD 512#define vload_partial_3_5 NO_LOAD 513#define vload_partial_3_6 NO_LOAD 514#define vload_partial_3_7 NO_LOAD 515#define vload_partial_3_8 NO_LOAD 516#define vload_partial_3_9 NO_LOAD 517#define vload_partial_3_10 NO_LOAD 518#define 
vload_partial_3_11 NO_LOAD 519#define vload_partial_3_12 NO_LOAD 520#define vload_partial_3_13 NO_LOAD 521#define vload_partial_3_14 NO_LOAD 522#define vload_partial_3_15 NO_LOAD 523#define vload_partial_3_16 NO_LOAD 524 525#define vload_partial_4_0 NO_LOAD 526#define vload_partial_4_1 vload_partial_1 527#define vload_partial_4_2 vload_partial_2 528#define vload_partial_4_3 vload_partial_3 529#define vload_partial_4_4 vload_partial_4 530#define vload_partial_4_5 NO_LOAD 531#define vload_partial_4_6 NO_LOAD 532#define vload_partial_4_7 NO_LOAD 533#define vload_partial_4_8 NO_LOAD 534#define vload_partial_4_9 NO_LOAD 535#define vload_partial_4_10 NO_LOAD 536#define vload_partial_4_11 NO_LOAD 537#define vload_partial_4_12 NO_LOAD 538#define vload_partial_4_13 NO_LOAD 539#define vload_partial_4_14 NO_LOAD 540#define vload_partial_4_15 NO_LOAD 541#define vload_partial_4_16 NO_LOAD 542 543#define vload_partial_8_0 NO_LOAD 544#define vload_partial_8_1 vload_partial_1 545#define vload_partial_8_2 vload_partial_2 546#define vload_partial_8_3 vload_partial_3 547#define vload_partial_8_4 vload_partial_4 548#define vload_partial_8_5 vload_partial_5 549#define vload_partial_8_6 vload_partial_6 550#define vload_partial_8_7 vload_partial_7 551#define vload_partial_8_8 vload_partial_8 552#define vload_partial_8_9 NO_LOAD 553#define vload_partial_8_10 NO_LOAD 554#define vload_partial_8_11 NO_LOAD 555#define vload_partial_8_12 NO_LOAD 556#define vload_partial_8_13 NO_LOAD 557#define vload_partial_8_14 NO_LOAD 558#define vload_partial_8_15 NO_LOAD 559#define vload_partial_8_16 NO_LOAD 560 561#define vload_partial_16_0 NO_LOAD 562#define vload_partial_16_1 vload_partial_1 563#define vload_partial_16_2 vload_partial_2 564#define vload_partial_16_3 vload_partial_3 565#define vload_partial_16_4 vload_partial_4 566#define vload_partial_16_5 vload_partial_5 567#define vload_partial_16_6 vload_partial_6 568#define vload_partial_16_7 vload_partial_7 569#define vload_partial_16_8 
vload_partial_8 570#define vload_partial_16_9 vload_partial_9 571#define vload_partial_16_10 vload_partial_10 572#define vload_partial_16_11 vload_partial_11 573#define vload_partial_16_12 vload_partial_12 574#define vload_partial_16_13 vload_partial_13 575#define vload_partial_16_14 vload_partial_14 576#define vload_partial_16_15 vload_partial_15 577#define vload_partial_16_16 vload_partial_16 578 579 580#define vload_partial_1(DATA, OFFSET, PTR) \ 581 DATA.s0 = vload1(OFFSET, PTR); 582 583#define vload_partial_2(DATA, OFFSET, PTR) \ 584 DATA.s01 = vload2(OFFSET, PTR); 585 586#define vload_partial_3(DATA, OFFSET, PTR) \ 587 DATA.s012 = vload3(OFFSET, PTR); 588 589#define vload_partial_4(DATA, OFFSET, PTR) \ 590 DATA.s0123 = vload4(OFFSET, PTR); 591 592#define vload_partial_5(DATA, OFFSET, PTR) \ 593 vload_partial_4(DATA.s0123, OFFSET, PTR); \ 594 DATA.s4 = vload1(OFFSET, PTR + 4); 595 596#define vload_partial_6(DATA, OFFSET, PTR) \ 597 vload_partial_4(DATA.s0123, OFFSET, PTR); \ 598 vload_partial_2(DATA.s45, OFFSET, PTR + 4); 599 600#define vload_partial_7(DATA, OFFSET, PTR) \ 601 vload_partial_4(DATA.s0123, OFFSET, PTR); \ 602 vload_partial_3(DATA.s456, OFFSET, PTR + 4); 603 604#define vload_partial_8(DATA, OFFSET, PTR) \ 605 DATA.s01234567 = vload8(OFFSET, PTR); 606 607#define vload_partial_9(DATA, OFFSET, PTR) \ 608 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 609 DATA.s8 = vload1(OFFSET, PTR + 8); 610 611#define vload_partial_10(DATA, OFFSET, PTR) \ 612 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 613 vload_partial_2(DATA.s89, OFFSET, PTR + 8); 614 615#define vload_partial_11(DATA, OFFSET, PTR) \ 616 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 617 vload_partial_3(DATA.s89A, OFFSET, PTR + 8); 618 619#define vload_partial_12(DATA, OFFSET, PTR) \ 620 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 621 vload_partial_4(DATA.s89AB, OFFSET, PTR + 8); 622 623#define vload_partial_13(DATA, OFFSET, PTR) \ 624 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 625 
vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8); 626 627#define vload_partial_14(DATA, OFFSET, PTR) \ 628 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 629 vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8); 630 631#define vload_partial_15(DATA, OFFSET, PTR) \ 632 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 633 vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8); 634 635#define vload_partial_16(DATA, OFFSET, PTR) \ 636 DATA = vload16(OFFSET, PTR); 637 638 639 640#define PIXEL_UNIT4 1 641#define PIXEL_UNIT8 2 642#define PIXEL_UNIT16 4 643 644 645#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size 646#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) 647 648 649#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord))); 650#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord))); 651#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord))); 652 653#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 654#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord))); 655#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord))); 656#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord))); 657#endif 658 659#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values)); 660#define 
write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567)); 661#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF)); 662 663#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 664#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values)); 665#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567)); 666#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF)); 667#endif 668 669 670#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord) 671#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) 672 673 674#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values) 675#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) 676 677#define VSTORE_STR(size) vstore##size 678#define VSTORE(size) VSTORE_STR(size) 679 680#define float1 float 681#define half1 half 682#define char1 char 683#define uchar1 uchar 684#define short1 short 685#define ushort1 ushort 686#define int1 int 687#define uint1 uint 
/* Width-1 type names (continued): each maps to the scalar type. */
#define long1 long
#define ulong1 ulong
#define double1 double

/*
 * Scalar counterparts of vloadN / vstoreN. vload1 takes (OFFSET, PTR) and
 * dereferences OFFSET + PTR; vstore1 takes (DATA, OFFSET, PTR) and assigns
 * DATA to that location. Neither expansion is parenthesised as a whole.
 */
#define vload1(OFFSET, PTR) *(OFFSET + PTR)
#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA

/*
 * VSTORE_PARTIAL(size, store_size): the name
 * vstore_partial_<size>_<store_size>, looked up in the tables below and then
 * invoked as (DATA, OFFSET, PTR). Both arguments are macro-expanded before
 * pasting.
 */
#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)

/* Placeholder for table entries that store nothing: expands to an empty block. */
#define NO_STORE(data, offs, ptr) \
    { \
    }

/*
 * Dispatch tables vstore_partial_<size>_<store_size>. An entry maps to
 * vstore_partial_<store_size> when 1 <= store_size <= size (vstore1 for the
 * scalar 1_1 case) and to NO_STORE otherwise (store_size of 0, or larger
 * than size).
 */
#define vstore_partial_1_0 NO_STORE
#define vstore_partial_1_1 vstore1
#define vstore_partial_1_2 NO_STORE
#define vstore_partial_1_3 NO_STORE
#define vstore_partial_1_4 NO_STORE
#define vstore_partial_1_5 NO_STORE
#define vstore_partial_1_6 NO_STORE
#define vstore_partial_1_7 NO_STORE
#define vstore_partial_1_8 NO_STORE
#define vstore_partial_1_9 NO_STORE
#define vstore_partial_1_10 NO_STORE
#define vstore_partial_1_11 NO_STORE
#define vstore_partial_1_12 NO_STORE
#define vstore_partial_1_13 NO_STORE
#define vstore_partial_1_14 NO_STORE
#define vstore_partial_1_15 NO_STORE
#define vstore_partial_1_16 NO_STORE

#define vstore_partial_2_0 NO_STORE
#define vstore_partial_2_1 vstore_partial_1
#define vstore_partial_2_2 vstore_partial_2
#define vstore_partial_2_3 NO_STORE
#define vstore_partial_2_4 NO_STORE
#define vstore_partial_2_5 NO_STORE
#define vstore_partial_2_6 NO_STORE
#define vstore_partial_2_7 NO_STORE
#define vstore_partial_2_8 NO_STORE
#define vstore_partial_2_9 NO_STORE
#define vstore_partial_2_10 NO_STORE
#define vstore_partial_2_11 NO_STORE
#define vstore_partial_2_12 NO_STORE
#define vstore_partial_2_13 NO_STORE
#define vstore_partial_2_14 NO_STORE
#define vstore_partial_2_15 NO_STORE
#define vstore_partial_2_16 NO_STORE

#define vstore_partial_3_0 NO_STORE
#define vstore_partial_3_1 vstore_partial_1
#define vstore_partial_3_2 vstore_partial_2
#define vstore_partial_3_3 vstore_partial_3
#define vstore_partial_3_4 NO_STORE
#define vstore_partial_3_5 NO_STORE
#define vstore_partial_3_6 NO_STORE
#define vstore_partial_3_7 NO_STORE
#define vstore_partial_3_8 NO_STORE
#define vstore_partial_3_9 NO_STORE
#define vstore_partial_3_10 NO_STORE
#define vstore_partial_3_11 NO_STORE
#define vstore_partial_3_12 NO_STORE
#define vstore_partial_3_13 NO_STORE
#define vstore_partial_3_14 NO_STORE
#define vstore_partial_3_15 NO_STORE
#define vstore_partial_3_16 NO_STORE

#define vstore_partial_4_0 NO_STORE
#define vstore_partial_4_1 vstore_partial_1
#define vstore_partial_4_2 vstore_partial_2
#define vstore_partial_4_3 vstore_partial_3
#define vstore_partial_4_4 vstore_partial_4
#define vstore_partial_4_5 NO_STORE
#define vstore_partial_4_6 NO_STORE
#define vstore_partial_4_7 NO_STORE
#define vstore_partial_4_8 NO_STORE
#define vstore_partial_4_9 NO_STORE
#define vstore_partial_4_10 NO_STORE
#define vstore_partial_4_11 NO_STORE
#define vstore_partial_4_12 NO_STORE
#define vstore_partial_4_13 NO_STORE
#define vstore_partial_4_14 NO_STORE
#define vstore_partial_4_15 NO_STORE
#define vstore_partial_4_16 NO_STORE

#define vstore_partial_8_0 NO_STORE
#define vstore_partial_8_1 vstore_partial_1
#define vstore_partial_8_2 vstore_partial_2
#define vstore_partial_8_3 vstore_partial_3
#define vstore_partial_8_4 vstore_partial_4
#define vstore_partial_8_5 vstore_partial_5
#define vstore_partial_8_6 vstore_partial_6
#define vstore_partial_8_7 vstore_partial_7
#define vstore_partial_8_8 vstore_partial_8
#define vstore_partial_8_9 NO_STORE
#define vstore_partial_8_10 NO_STORE
#define vstore_partial_8_11 NO_STORE
#define vstore_partial_8_12 NO_STORE
#define vstore_partial_8_13 NO_STORE
#define vstore_partial_8_14 NO_STORE
#define vstore_partial_8_15 NO_STORE
#define vstore_partial_8_16 NO_STORE

#define vstore_partial_16_0 NO_STORE
#define vstore_partial_16_1 vstore_partial_1
#define vstore_partial_16_2 vstore_partial_2
#define vstore_partial_16_3 vstore_partial_3
#define vstore_partial_16_4 vstore_partial_4
#define vstore_partial_16_5 vstore_partial_5
#define vstore_partial_16_6 vstore_partial_6
#define vstore_partial_16_7 vstore_partial_7
#define vstore_partial_16_8 vstore_partial_8
#define vstore_partial_16_9 vstore_partial_9
#define vstore_partial_16_10 vstore_partial_10
#define vstore_partial_16_11 vstore_partial_11
#define vstore_partial_16_12 vstore_partial_12
#define vstore_partial_16_13 vstore_partial_13
#define vstore_partial_16_14 vstore_partial_14
#define vstore_partial_16_15 vstore_partial_15
#define vstore_partial_16_16 vstore_partial_16

/*
 * vstore_partial_k(DATA, OFFSET, PTR): store the first k elements of DATA.
 * Sizes 1, 2, 3, 4, 8 and 16 are a single store; the other sizes are built
 * from a 4- or 8-wide store followed by a smaller store at PTR + 4 or
 * PTR + 8. For k = 13..15 the upper half of DATA (.s89ABCDEF) is handed to
 * the 5/6/7-wide helper, which then stores only its first k - 8 elements.
 * Swizzles use uppercase hex digits, matching the vload_partial_k helpers.
 */
#define vstore_partial_1(DATA, OFFSET, PTR) \
    vstore1(DATA.s0, OFFSET, PTR);

#define vstore_partial_2(DATA, OFFSET, PTR) \
    vstore2(DATA.s01, OFFSET, PTR);

#define vstore_partial_3(DATA, OFFSET, PTR) \
    vstore3(DATA.s012, OFFSET, PTR);

#define vstore_partial_4(DATA, OFFSET, PTR) \
    vstore4(DATA.s0123, OFFSET, PTR);

#define vstore_partial_5(DATA, OFFSET, PTR) \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore1(DATA.s4, OFFSET, PTR + 4);

#define vstore_partial_6(DATA, OFFSET, PTR) \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore_partial_2(DATA.s45, OFFSET, PTR + 4);

#define vstore_partial_7(DATA, OFFSET, PTR) \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore_partial_3(DATA.s456, OFFSET, PTR + 4);

#define vstore_partial_8(DATA, OFFSET, PTR) \
    vstore8(DATA.s01234567, OFFSET, PTR);

#define vstore_partial_9(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore1(DATA.s8, OFFSET, PTR + 8);

#define vstore_partial_10(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_2(DATA.s89, OFFSET, PTR + 8);

#define vstore_partial_11(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_3(DATA.s89A, OFFSET, PTR + 8);

#define vstore_partial_12(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_4(DATA.s89AB, OFFSET, PTR + 8);

#define vstore_partial_13(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vstore_partial_14(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vstore_partial_15(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vstore_partial_16(DATA, OFFSET, PTR) \
    vstore16(DATA, OFFSET, PTR);

/*
 * Floating-point targets get _sat spellings that map to the plain
 * conversion of the same type and width, so a pasted convert_<type>_sat
 * name resolves for float and half as well.
 * convert_half_sat maps to convert_half, in line with every other half
 * entry (it previously mapped to convert_float).
 */
#define convert_float_sat convert_float
#define convert_float1_sat convert_float
#define convert_float2_sat convert_float2
#define convert_float3_sat convert_float3
#define convert_float4_sat convert_float4
#define convert_float8_sat convert_float8
#define convert_float16_sat convert_float16
#define convert_half_sat convert_half
#define convert_half1_sat convert_half
#define convert_half2_sat convert_half2
#define convert_half3_sat convert_half3
#define convert_half4_sat convert_half4
#define convert_half8_sat convert_half8
#define convert_half16_sat convert_half16

/* Width-1 conversion names map to the scalar conversion of the same type. */
#define convert_float1 convert_float
#define convert_half1 convert_half
#define convert_char1 convert_char
#define convert_uchar1 convert_uchar
#define convert_short1 convert_short
#define convert_ushort1 convert_ushort
#define convert_int1 convert_int
#define convert_uint1 convert_uint
#define convert_long1 convert_long
#define convert_ulong1 convert_ulong
#define convert_double1 convert_double

/* Width-1 saturating conversion names map to the scalar _sat conversion. */
#define convert_char1_sat convert_char_sat
/* Size-1 saturating conversions resolve to the scalar "_sat" conversion. */
#define convert_uchar1_sat convert_uchar_sat
/* The next five definitions map a name onto itself; they are kept verbatim. */
#define convert_uchar2_sat convert_uchar2_sat
#define convert_uchar3_sat convert_uchar3_sat
#define convert_uchar4_sat convert_uchar4_sat
#define convert_uchar8_sat convert_uchar8_sat
#define convert_uchar16_sat convert_uchar16_sat
#define convert_short1_sat convert_short_sat
#define convert_ushort1_sat convert_ushort_sat
#define convert_int1_sat convert_int_sat
#define convert_uint1_sat convert_uint_sat
#define convert_long1_sat convert_long_sat
#define convert_ulong1_sat convert_ulong_sat
#define convert_double1_sat convert_double_sat

/* VEC_DATA_TYPE(type, size) pastes type and size into one type name (e.g. float + 4 -> float4).
 * The *_STR level exists so that macro arguments are expanded before pasting. */
#define VEC_DATA_TYPE_STR(type, size) type##size
#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)

/* CONVERT(x, type) -> (convert_<type>((x))) */
#define CONVERT_STR(x, type) (convert_##type((x)))
#define CONVERT(x, type) CONVERT_STR(x, type)

/* CONVERT_SAT(x, type) -> (convert_<type>_sat((x))) */
#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)

/* CONVERT_SAT_ROUND(x, type, round) -> (convert_<type>_sat_<round>((x))) */
#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)

/* Per-type table behind SELECT_VEC_DATA_TYPE: integer types map to themselves,
 * half maps to short and float maps to int. */
#define select_vec_dt_uchar(size) uchar##size
#define select_vec_dt_char(size) char##size
#define select_vec_dt_ushort(size) ushort##size
#define select_vec_dt_short(size) short##size
#define select_vec_dt_half(size) short##size
#define select_vec_dt_uint(size) uint##size
#define select_vec_dt_int(size) int##size
#define select_vec_dt_float(size) int##size
#define select_vec_dt_ulong(size) ulong##size
#define select_vec_dt_long(size) long##size

#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)

/* Per-type table behind SIGNED_INT_VEC_DATA_TYPE: every entry maps to the signed
 * integer type listed on its right-hand side. */
#define signed_int_vec_dt_uchar(size) char##size
#define signed_int_vec_dt_char(size) char##size
#define signed_int_vec_dt_ushort(size) short##size
#define signed_int_vec_dt_short(size) short##size
#define signed_int_vec_dt_half(size) short##size
#define signed_int_vec_dt_uint(size) int##size
#define signed_int_vec_dt_int(size) int##size
#define signed_int_vec_dt_float(size) int##size
#define signed_int_vec_dt_ulong(size) long##size
#define signed_int_vec_dt_long(size) long##size

#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size)
#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size)
#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1)

/* Horizontal sum of the components of x. The per-size helpers expand to a flat
 * "a + b + c ..." chain, which is evaluated left to right. */
#define sum_reduce_1(x) (x)
#define sum_reduce_2(x) ((x).s0) + ((x).s1)
#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2)
#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567)
#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF)

/* The whole chain is parenthesized here so SUM_REDUCE(...) can be used as an
 * operand (e.g. 2 * SUM_REDUCE(v, 4)) without the surrounding operator capturing
 * only the first term. The helpers are deliberately left flat so the order in
 * which the components are added does not change. */
#define SUM_REDUCE_STR(x, size) (sum_reduce_##size(x))
#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)

/* Horizontal product of the components of x; same structure as the sum above. */
#define prod_reduce_1(x) (x)
#define prod_reduce_2(x) ((x).s0) * ((x).s1)
#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2)
#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23)
#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567)
#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF)

/* Parenthesized for the same reason as SUM_REDUCE_STR (e.g. n / PROD_REDUCE(v, 2)). */
#define PROD_REDUCE_STR(x, size) (prod_reduce_##size(x))
#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)

/* Horizontal maximum of the components of x, built from nested max() calls. */
#define max_reduce_1(x) (x)
#define max_reduce_2(x) max(((x).s0), ((x).s1))
#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
/* The replacement text of max_reduce_16 is on the following source line. */
#define max_reduce_16(x) 
max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))
/* The line above is the replacement text of max_reduce_16, whose "#define" sits on the preceding source line. */

/* MAX_REDUCE(x, size) -> max_reduce_<size>(x); two levels so that size is expanded before pasting. */
#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)

/* Kernel-parameter lists. Each macro expands to a comma-separated run of parameters
 * named after `name`: the data pointer, one (stride, step) pair per dimension, and
 * the byte offset of the first element. */
#define VECTOR_DECLARATION(name)                  \
    __global uchar *name##_ptr,                   \
    uint name##_stride_x, uint name##_step_x,     \
    uint name##_offset_first_element_in_bytes

#define IMAGE_DECLARATION(name)                   \
    __global uchar *name##_ptr,                   \
    uint name##_stride_x, uint name##_step_x,     \
    uint name##_stride_y, uint name##_step_y,     \
    uint name##_offset_first_element_in_bytes

#define TENSOR3D_DECLARATION(name)                \
    __global uchar *name##_ptr,                   \
    uint name##_stride_x, uint name##_step_x,     \
    uint name##_stride_y, uint name##_step_y,     \
    uint name##_stride_z, uint name##_step_z,     \
    uint name##_offset_first_element_in_bytes

#define TENSOR4D_DECLARATION(name)                \
    __global uchar *name##_ptr,                   \
    uint name##_stride_x, uint name##_step_x,     \
    uint name##_stride_y, uint name##_step_y,     \
    uint name##_stride_z, uint name##_step_z,     \
    uint name##_stride_w, uint name##_step_w,     \
    uint name##_offset_first_element_in_bytes

#define TENSOR5D_DECLARATION(name)                \
    __global uchar *name##_ptr,                   \
    uint name##_stride_x, uint name##_step_x,     \
    uint name##_stride_y, uint name##_step_y,     \
    uint name##_stride_z, uint name##_step_z,     \
    uint name##_stride_w, uint name##_step_w,     \
    uint name##_stride_v, uint name##_step_v,     \
    uint name##_offset_first_element_in_bytes

/* Build a Vector from the parameters declared by VECTOR_DECLARATION(name).
 * The _NO_STEP form passes 0 as the step. */
#define CONVERT_TO_VECTOR_STRUCT(name) \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)

#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)

/* The macro name and replacement text for the directive below are on the following source line. */
#define 
CONVERT_TO_IMAGE_STRUCT(name) \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
/* The two lines above are the name and replacement text of a macro whose "#define" sits on the preceding source line. */

/* CONVERT_*_STRUCT(name) macros forward the kernel parameters named after `name`
 * to the matching helper function below. The *_NO_STEP forms pass 0 for the steps
 * shown as 0 in their argument lists. */
#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)

/* Defined once; the original text repeated this identical definition a second time. */
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)

/* Note: only the x and y steps are zeroed here; step_z is still forwarded. */
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)

#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z)

#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)

#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)

#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)

#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name)                                                                                       \
    tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                           name##_stride_z, name##_step_z)

/** 1D buffer descriptor: data pointer, byte offset of the first element, x stride. */
typedef struct Vector
{
    __global uchar *ptr;
    int             offset_first_element_in_bytes;
    int             stride_x;
} Vector;

/** 2D buffer descriptor: as Vector, plus the y stride. */
typedef struct Image
{
    __global uchar *ptr;
    int             offset_first_element_in_bytes;
    int             stride_x;
    int             stride_y;
} Image;

/** 3D buffer descriptor: as Image, plus the z stride. */
typedef struct Tensor3D
{
    __global uchar *ptr;
    int             offset_first_element_in_bytes;
    int             stride_x;
    int             stride_y;
    int             stride_z;
} Tensor3D;

/** 4D buffer descriptor: as Tensor3D, plus the w stride. */
typedef struct Tensor4D
{
    __global uchar *ptr;
    int             offset_first_element_in_bytes;
    int             stride_x;
    int             stride_y;
    int             stride_z;
    int             stride_w;
} Tensor4D;

/** Build a Vector and advance its pointer by
 *  offset_first_element_in_bytes + get_global_id(0) * step_x.
 *
 * @return the populated Vector (ptr already advanced).
 */
inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
{
    Vector vector =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
    };
    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
    return vector;
}

/** Build an Image and advance its pointer by
 *  offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y.
 *
 * @return the populated Image (ptr already advanced).
 */
inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
{
    Image img =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
        .stride_y                      = stride_y
    };
    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
    return img;
}

/** Build an Image from 3D tensor parameters. The pointer is additionally advanced
 *  by get_global_id(2) * step_z; stride_z is accepted but not stored in the result.
 *
 * @return the populated Image (ptr already advanced).
 */
inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Image img =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
        .stride_y                      = stride_y
    };
    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
    return img;
}

/** Build a Tensor3D and advance its pointer by the first-element offset plus
 *  get_global_id(d) * step_d for d = 0, 1, 2.
 *
 * @return the populated Tensor3D (ptr already advanced).
 */
inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Tensor3D tensor =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
        .stride_y                      = stride_y,
        .stride_z                      = stride_z
    };
    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
    return tensor;
}

/** Build a Tensor3D without moving the pointer; the step arguments are unused.
 *
 * @return the populated Tensor3D (ptr equal to the input ptr).
 */
inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Tensor3D tensor =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
        .stride_y                      = stride_y,
        .stride_z                      = stride_z
    };
    return tensor;
}

/** Build a Tensor4D and advance its pointer. Global id 2 is split by mod_size:
 *  (id2 % mod_size) is scaled by step_z and (id2 / mod_size) by step_w.
 *
 * @return the populated Tensor4D (ptr already advanced).
 */
inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
                                             uint step_w,
                                             uint mod_size)
{
    Tensor4D tensor =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
        .stride_y                      = stride_y,
        .stride_z                      = stride_z,
        .stride_w                      = stride_w
    };

    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
    return tensor;
}

/** @return vec->ptr + x * vec->stride_x */
inline __global const uchar *vector_offset(const Vector *vec, int x)
{
    return vec->ptr + x * vec->stride_x;
}

/** @return img->ptr + x * img->stride_x + y * img->stride_y */
inline __global uchar *offset(const Image *img, int x, int y)
{
    return img->ptr + x * img->stride_x + y * img->stride_y;
}

/** @return tensor->ptr offset by x, y and z scaled by their respective strides */
inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
{
    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
}

/** @return tensor->ptr offset by x, y, z and w scaled by their respective strides */
inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
{
    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
}

/** Turn a linear element index into (x, y, z) using width and height, then into a pointer.
 *
 * Unlike the *_offset helpers above, offset_first_element_in_bytes is added here.
 * The depth parameter is not used by the computation.
 */
inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
{
    uint num_elements = width * height;

    const uint z = index / num_elements;

    index %= num_elements;

    const uint y = index / width;

    index %= width;

    const uint x = index;

    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes;
}

/* Closes a conditional opened before this span (presumably the ARM_COMPUTE_HELPER_H guard; confirm). */
#endif

/* VEC_SIZE falls back to N0 when only N0 is defined. */
#if defined(N0) && !defined(VEC_SIZE)
#define VEC_SIZE N0
#endif

#if defined(VEC_SIZE) && defined(DATA_TYPE)

/* Element-wise operators. The _X_POS_0 forms apply the operation as (x op y); the
 * _X_POS_1 forms swap the operands for the non-commutative operations.
 * Arguments and results are parenthesized so that compound expressions passed as
 * operands, or a macro used as a sub-expression, keep their intended grouping. */
#define ADD_X_POS_0(x, y) ((x) + (y))
#define SUB_X_POS_0(x, y) ((x) - (y))
#define MAX_X_POS_0(x, y) max(x, y)
#define MIN_X_POS_0(x, y) min(x, y)
#define SQUARED_DIFF_X_POS_0(x, y) (((x) - (y)) * ((x) - (y)))
#define POWER_X_POS_0(x, y) pow(x, y)
#if VEC_SIZE == 1
#define PRELU_X_POS_0(x, y) ((x) > 0 ? (x) : (x) * (y))
#else

#if defined(MIXED_PRECISION)
#define PRELU_X_POS_0(x, y) (select((y) * (x), (x), CONVERT(((x) > (DATA_TYPE_ACCUMULATOR)0), SELECT_VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, VEC_SIZE))))
#else
#define PRELU_X_POS_0(x, y) (select((y) * (x), (x), CONVERT(((x) > (DATA_TYPE)0), SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))))
#endif

#endif
#define DIV_X_POS_0(x, y) ((x) / (y))
#define AND_X_POS_0(x, y) (CONVERT(((x) && (y)), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)) & ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))1))
#define OR_X_POS_0(x, y) (CONVERT(((x) || (y)), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)) & ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))1))

#define ADD_X_POS_1(x, y) ADD_X_POS_0(x, y)
#define SUB_X_POS_1(x, y) ((y) - (x))
#define MAX_X_POS_1(x, y) MAX_X_POS_0(x, y)
#define MIN_X_POS_1(x, y) MIN_X_POS_0(x, y)
#define SQUARED_DIFF_X_POS_1(x, y) SQUARED_DIFF_X_POS_0(x, y)
#define POWER_X_POS_1(x, y) pow(y, x)
#if VEC_SIZE == 1
#define PRELU_X_POS_1(x, y) ((y) > 0 ? (y) : (y) * (x))
#else

#if defined(MIXED_PRECISION)
#define PRELU_X_POS_1(x, y) (select((x) * (y), (y), CONVERT(((y) > (DATA_TYPE_ACCUMULATOR)0), SELECT_VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, VEC_SIZE))))
#else
#define PRELU_X_POS_1(x, y) (select((x) * (y), (y), CONVERT(((y) > (DATA_TYPE)0), SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))))
#endif

#endif
#define DIV_X_POS_1(x, y) ((y) / (x))
#define AND_X_POS_1(x, y) AND_X_POS_0(x, y)
#define OR_X_POS_1(x, y) OR_X_POS_0(x, y)

/* Unsuffixed names default to the _X_POS_0 operand order. */
#define ADD(x, y) ADD_X_POS_0(x, y)
#define SUB(x, y) SUB_X_POS_0(x, y)
#define MAX(x, y) MAX_X_POS_0(x, y)
#define MIN(x, y) MIN_X_POS_0(x, y)
#define SQUARED_DIFF(x, y) SQUARED_DIFF_X_POS_0(x, y)
#define POWER(x, y) POWER_X_POS_0(x, y)
#define PRELU(x, y) PRELU_X_POS_0(x, y)
#define DIV(x, y) DIV_X_POS_0(x, y)
#define AND(x, y) AND_X_POS_0(x, y)
#define OR(x, y) OR_X_POS_0(x, y)

#endif

/* ELTWISE_OP_ROW_<n>(OP, OPERAND1, OPERAND2): for rows 0..n-1 (row suffixes 0-9, then A-F),
 * assign OPERAND1<row> = OP(OPERAND1<row>, OPERAND2<row>). Each macro first expands the
 * one for n-1 and then appends its own row. */
#define ELTWISE_OP_ROW_1(OP, OPERAND1, OPERAND2) \
    OPERAND1##0 = OP(OPERAND1##0, OPERAND2##0);

#define ELTWISE_OP_ROW_2(OP, OPERAND1, OPERAND2) \
    ELTWISE_OP_ROW_1(OP, OPERAND1, OPERAND2)     \
    OPERAND1##1 = OP(OPERAND1##1, OPERAND2##1);

#define ELTWISE_OP_ROW_3(OP, OPERAND1, OPERAND2) \
    ELTWISE_OP_ROW_2(OP, OPERAND1, OPERAND2)     \
    OPERAND1##2 = OP(OPERAND1##2, OPERAND2##2);

#define ELTWISE_OP_ROW_4(OP, OPERAND1, OPERAND2) \
    ELTWISE_OP_ROW_3(OP, OPERAND1, OPERAND2)     \
    OPERAND1##3 = OP(OPERAND1##3, OPERAND2##3);

#define ELTWISE_OP_ROW_5(OP, OPERAND1, OPERAND2) \
    ELTWISE_OP_ROW_4(OP, OPERAND1, OPERAND2)     \
    OPERAND1##4 = OP(OPERAND1##4, OPERAND2##4);

#define ELTWISE_OP_ROW_6(OP, OPERAND1, OPERAND2) \
    ELTWISE_OP_ROW_5(OP, OPERAND1, OPERAND2)     \
    OPERAND1##5 = OP(OPERAND1##5, OPERAND2##5);

#define ELTWISE_OP_ROW_7(OP, OPERAND1, OPERAND2) \
    ELTWISE_OP_ROW_6(OP, OPERAND1, OPERAND2)     \
    OPERAND1##6 = OP(OPERAND1##6, OPERAND2##6);

#define ELTWISE_OP_ROW_8(OP, OPERAND1, OPERAND2) \
    ELTWISE_OP_ROW_7(OP, OPERAND1, OPERAND2)     \
    OPERAND1##7 = OP(OPERAND1##7, OPERAND2##7);

#define ELTWISE_OP_ROW_9(OP, OPERAND1, OPERAND2) \
    ELTWISE_OP_ROW_8(OP, OPERAND1, OPERAND2)     \
    OPERAND1##8 = OP(OPERAND1##8, OPERAND2##8);

#define ELTWISE_OP_ROW_10(OP, OPERAND1, OPERAND2) \
    ELTWISE_OP_ROW_9(OP, OPERAND1, OPERAND2)      \
    OPERAND1##9 = OP(OPERAND1##9, OPERAND2##9);

#define ELTWISE_OP_ROW_11(OP, OPERAND1, OPERAND2) \
    ELTWISE_OP_ROW_10(OP, OPERAND1, OPERAND2)     \
    OPERAND1##A = OP(OPERAND1##A, OPERAND2##A);

#define ELTWISE_OP_ROW_12(OP, OPERAND1, OPERAND2) \
    ELTWISE_OP_ROW_11(OP, OPERAND1, OPERAND2)     \
    OPERAND1##B = OP(OPERAND1##B, OPERAND2##B);

#define ELTWISE_OP_ROW_13(OP, OPERAND1, OPERAND2) \
    ELTWISE_OP_ROW_12(OP, OPERAND1, OPERAND2)     \
    OPERAND1##C = OP(OPERAND1##C, OPERAND2##C);

#define ELTWISE_OP_ROW_14(OP, OPERAND1, OPERAND2) \
    ELTWISE_OP_ROW_13(OP, OPERAND1, OPERAND2)     \
    OPERAND1##D = OP(OPERAND1##D, OPERAND2##D);

#define ELTWISE_OP_ROW_15(OP, OPERAND1, OPERAND2) \
    ELTWISE_OP_ROW_14(OP, OPERAND1, OPERAND2)     \
    OPERAND1##E = OP(OPERAND1##E, OPERAND2##E);

#define ELTWISE_OP_ROW_16(OP, OPERAND1, OPERAND2) \
    ELTWISE_OP_ROW_15(OP, OPERAND1, OPERAND2)     \
    OPERAND1##F = OP(OPERAND1##F, OPERAND2##F);

/* ELTWISE_OP_BLOCK(OP, N, ...) selects ELTWISE_OP_ROW_<N>; two levels so N is expanded before pasting. */
#define ELTWISE_OP_BLOCK_STR(OP, N, OPERAND1, OPERAND2) ELTWISE_OP_ROW_##N(OP, OPERAND1, OPERAND2)
#define ELTWISE_OP_BLOCK(OP, N, OPERAND1, OPERAND2) ELTWISE_OP_BLOCK_STR(OP, N, OPERAND1, OPERAND2)

/* Broadcast variant: the same OPERAND2 (no row suffix) is applied to every row of OPERAND1. */
#define ELTWISE_OP_ROW_BROADCAST_1(OP, OPERAND1, OPERAND2) \
    OPERAND1##0 = OP(OPERAND1##0, OPERAND2);

#define ELTWISE_OP_ROW_BROADCAST_2(OP, OPERAND1, OPERAND2) \
    ELTWISE_OP_ROW_BROADCAST_1(OP, OPERAND1, OPERAND2)     \
    OPERAND1##1 = OP(OPERAND1##1, OPERAND2);

/* The macro name and replacement text for the directive below are on the following source line. */
#define 
ELTWISE_OP_ROW_BROADCAST_3(OP, OPERAND1, OPERAND2) \
    ELTWISE_OP_ROW_BROADCAST_2(OP, OPERAND1, OPERAND2) \
    OPERAND1##2 = OP(OPERAND1##2, OPERAND2);
/* The three lines above are the name and replacement text of a macro whose "#define" sits on the preceding source line. */

/* ELTWISE_OP_ROW_BROADCAST_<n>: for rows 0..n-1 (suffixes 0-9, then A-F) assign
 * OPERAND1<row> = OP(OPERAND1<row>, OPERAND2), reusing the same OPERAND2 for every row.
 * Each macro expands its predecessor and appends one row. */
#define ELTWISE_OP_ROW_BROADCAST_4(OP, OPERAND1, OPERAND2) \
    ELTWISE_OP_ROW_BROADCAST_3(OP, OPERAND1, OPERAND2) \
    OPERAND1##3 = OP(OPERAND1##3, OPERAND2);

#define ELTWISE_OP_ROW_BROADCAST_5(OP, OPERAND1, OPERAND2) \
    ELTWISE_OP_ROW_BROADCAST_4(OP, OPERAND1, OPERAND2) \
    OPERAND1##4 = OP(OPERAND1##4, OPERAND2);

#define ELTWISE_OP_ROW_BROADCAST_6(OP, OPERAND1, OPERAND2) \
    ELTWISE_OP_ROW_BROADCAST_5(OP, OPERAND1, OPERAND2) \
    OPERAND1##5 = OP(OPERAND1##5, OPERAND2);

#define ELTWISE_OP_ROW_BROADCAST_7(OP, OPERAND1, OPERAND2) \
    ELTWISE_OP_ROW_BROADCAST_6(OP, OPERAND1, OPERAND2) \
    OPERAND1##6 = OP(OPERAND1##6, OPERAND2);

#define ELTWISE_OP_ROW_BROADCAST_8(OP, OPERAND1, OPERAND2) \
    ELTWISE_OP_ROW_BROADCAST_7(OP, OPERAND1, OPERAND2) \
    OPERAND1##7 = OP(OPERAND1##7, OPERAND2);

#define ELTWISE_OP_ROW_BROADCAST_9(OP, OPERAND1, OPERAND2) \
    ELTWISE_OP_ROW_BROADCAST_8(OP, OPERAND1, OPERAND2) \
    OPERAND1##8 = OP(OPERAND1##8, OPERAND2);

#define ELTWISE_OP_ROW_BROADCAST_10(OP, OPERAND1, OPERAND2) \
    ELTWISE_OP_ROW_BROADCAST_9(OP, OPERAND1, OPERAND2) \
    OPERAND1##9 = OP(OPERAND1##9, OPERAND2);

#define ELTWISE_OP_ROW_BROADCAST_11(OP, OPERAND1, OPERAND2) \
    ELTWISE_OP_ROW_BROADCAST_10(OP, OPERAND1, OPERAND2) \
    OPERAND1##A = OP(OPERAND1##A, OPERAND2);

#define ELTWISE_OP_ROW_BROADCAST_12(OP, OPERAND1, OPERAND2) \
    ELTWISE_OP_ROW_BROADCAST_11(OP, OPERAND1, OPERAND2) \
    OPERAND1##B = OP(OPERAND1##B, OPERAND2);

#define ELTWISE_OP_ROW_BROADCAST_13(OP, OPERAND1, OPERAND2) \
    ELTWISE_OP_ROW_BROADCAST_12(OP, OPERAND1, OPERAND2) \
    OPERAND1##C = OP(OPERAND1##C, OPERAND2);

#define ELTWISE_OP_ROW_BROADCAST_14(OP, OPERAND1, OPERAND2) \
    ELTWISE_OP_ROW_BROADCAST_13(OP, OPERAND1, OPERAND2) \
    OPERAND1##D = OP(OPERAND1##D, OPERAND2);

#define ELTWISE_OP_ROW_BROADCAST_15(OP, OPERAND1, OPERAND2) \
    ELTWISE_OP_ROW_BROADCAST_14(OP, OPERAND1, OPERAND2) \
    OPERAND1##E = OP(OPERAND1##E, OPERAND2);

#define ELTWISE_OP_ROW_BROADCAST_16(OP, OPERAND1, OPERAND2) \
    ELTWISE_OP_ROW_BROADCAST_15(OP, OPERAND1, OPERAND2) \
    OPERAND1##F = OP(OPERAND1##F, OPERAND2);

/* ELTWISE_OP_BLOCK_BROADCAST(OP, N, ...) selects ELTWISE_OP_ROW_BROADCAST_<N>; two levels so N is expanded before pasting. */
#define ELTWISE_OP_BLOCK_BROADCAST_STR(OP, N, OPERAND1, OPERAND2) ELTWISE_OP_ROW_BROADCAST_##N(OP, OPERAND1, OPERAND2)
#define ELTWISE_OP_BLOCK_BROADCAST(OP, N, OPERAND1, OPERAND2) ELTWISE_OP_BLOCK_BROADCAST_STR(OP, N, OPERAND1, OPERAND2)

/* Include guard; the matching #endif lies beyond this span. */
#ifndef ARM_COMPUTE_HELPER_H
#define ARM_COMPUTE_HELPER_H

/* STORE_ROW_<m>(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z): for rows 0..m-1 (suffixes 0-9,
 * then A-F) issue VSTORE(N0) of BASENAME<row> at offset 0 to
 * (__global DATA_TYPE *)(PTR + row * STRIDE_Y + Z<row>). Each macro expands its
 * predecessor and appends one store. */
#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

/* The macro name and replacement text for the directive below are on the following source line. */
#define 
STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)                                              \
    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
/* The lines above are the name and replacement text of a macro whose "#define" sits on the preceding source line. */

#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));

/* CONVERT_STORE_ROW_<m>(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z): same layout as STORE_ROW_<m>,
 * but each row is first passed through CONVERT_SAT(..., VEC_DATA_TYPE(DATA_TYPE, N0)).
 * Rows use suffixes 0-9, then A-F; each macro expands its predecessor and appends one store. */
#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

/* The second parameter was previously spelled DATA while the body referred to DATA_TYPE,
 * so the caller's type argument was dropped for rows 0..9 and whatever DATA_TYPE meant at
 * the point of use was substituted instead. The parameter is now named DATA_TYPE like
 * every sibling macro. */
#define CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));

/* STORE_BLOCK(M0, ...) selects STORE_ROW_<M0>; two levels so M0 is expanded before pasting. */
#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

/* CONVERT_STORE_BLOCK(M0, ...) selects CONVERT_STORE_ROW_<M0> in the same way. */
#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

/* STORE_ROW_PARTIAL_<m>: like STORE_ROW_<m>, but stores through VSTORE_PARTIAL(N0, STORE_N0). */
#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

/* The definition below continues on the following source line. */
#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, 
STRIDE_Y, Z) \ 1644 VSTORE_PARTIAL(N0, STORE_N0) \ 1645 (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 1646 1647#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1648 STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1649 VSTORE_PARTIAL(N0, STORE_N0) \ 1650 (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 1651 1652#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1653 STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1654 VSTORE_PARTIAL(N0, STORE_N0) \ 1655 (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 1656 1657#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1658 STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1659 VSTORE_PARTIAL(N0, STORE_N0) \ 1660 (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 1661 1662#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1663 STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1664 VSTORE_PARTIAL(N0, STORE_N0) \ 1665 (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 1666 1667#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1668 STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1669 VSTORE_PARTIAL(N0, STORE_N0) \ 1670 (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 1671 1672#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1673 STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1674 VSTORE_PARTIAL(N0, STORE_N0) \ 1675 (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 1676 1677#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1678 STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1679 
VSTORE_PARTIAL(N0, STORE_N0) \ 1680 (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 1681 1682#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1683 STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1684 VSTORE_PARTIAL(N0, STORE_N0) \ 1685 (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 1686 1687#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1688 STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1689 VSTORE_PARTIAL(N0, STORE_N0) \ 1690 (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 1691 1692#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1693 STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1694 VSTORE_PARTIAL(N0, STORE_N0) \ 1695 (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 1696 1697#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1698 STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1699 VSTORE_PARTIAL(N0, STORE_N0) \ 1700 (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 1701 1702#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1703 STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1704 VSTORE_PARTIAL(N0, STORE_N0) \ 1705 (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 1706 1707#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1708 STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1709 VSTORE_PARTIAL(N0, STORE_N0) \ 1710 (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 1711 1712#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1713 STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1714 
VSTORE_PARTIAL(N0, STORE_N0) \ 1715 (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 1716 1717 1718 1719#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 1720#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 1721 1722#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 1723 if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \ 1724 { \ 1725 STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 1726 } \ 1727 else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \ 1728 { \ 1729 STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 1730 } \ 1731 else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \ 1732 { \ 1733 STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 1734 } \ 1735 else \ 1736 { \ 1737 STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 1738 } 1739 1740#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \ 1741 if(!(PARTIAL_COND_X)) \ 1742 { \ 1743 STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 1744 } \ 1745 else \ 1746 { \ 1747 STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 1748 } 1749 1750#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \ 1751 if(!(PARTIAL_COND_Y)) \ 1752 { \ 1753 STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 1754 } \ 1755 else \ 1756 { \ 1757 STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 
1758 } 1759 1760 1761#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) 1762 1763 1764#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 1765 1766#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 1767 STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 1768 1769#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0 1770 1771#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 1772 STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) 1773 1774#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0 1775 1776#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 1777 STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) 1778 1779#else 1780 1781#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 1782 STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) 1783 1784#endif 1785 1786#endif 1787 1788 1789#if defined(PARTIAL_STORE_M0) 1790 1791#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \ 1792 ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0)))) 1793#else 1794#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \ 1795 ((uint)(y * M0)) 1796#endif 1797 1798 1799 1800#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \ 1801 STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond) 1802 1803 1804#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 1805#pragma OPENCL EXTENSION cl_khr_fp16 : 
enable 1806#endif 1807 1808#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) 1809#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable 1810#endif 1811 1812#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) 1813#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable 1814#endif 1815 1816#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf) 1817#pragma OPENCL EXTENSION cl_arm_printf : enable 1818#endif 1819 1820#define GPU_ARCH_MIDGARD 0x100 1821#define GPU_ARCH_BIFROST 0x200 1822#define GPU_ARCH_VALHALL 0x300 1823 1824 1825#define CONCAT(a, b) a##b 1826 1827 1828#define EXPAND(x) x 1829 1830 1831#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val) 1832 1833 1834#define REV1(x) ((x)) 1835#define REV2(x) ((x).s10) 1836#define REV3(x) ((x).s210) 1837#define REV4(x) ((x).s3210) 1838#define REV8(x) ((x).s76543210) 1839#define REV16(x) ((x).sFEDCBA9876543210) 1840 1841 1842 1843#define REVERSE_STR(x, s) REV##s((x)) 1844#define REVERSE(x, s) REVERSE_STR(x, s) 1845 1846 1847 1848#define ROT1_0(x) ((x)) 1849#define ROT1_1(x) ((x)) 1850 1851#define ROT2_0(x) ((x)) 1852#define ROT2_1(x) ((x).s10) 1853#define ROT2_2(x) ((x)) 1854 1855#define ROT3_0(x) ((x)) 1856#define ROT3_1(x) ((x).s201) 1857#define ROT3_2(x) ((x).s120) 1858#define ROT3_3(x) ((x)) 1859 1860#define ROT4_0(x) ((x)) 1861#define ROT4_1(x) ((x).s3012) 1862#define ROT4_2(x) ((x).s2301) 1863#define ROT4_3(x) ((x).s1230) 1864#define ROT4_4(x) ((x)) 1865 1866#define ROT8_0(x) ((x)) 1867#define ROT8_1(x) ((x).s70123456) 1868#define ROT8_2(x) ((x).s67012345) 1869#define ROT8_3(x) ((x).s56701234) 1870#define ROT8_4(x) ((x).s45670123) 1871#define ROT8_5(x) ((x).s34567012) 1872#define ROT8_6(x) ((x).s23456701) 1873#define ROT8_7(x) ((x).s12345670) 1874#define ROT8_8(x) ((x)) 1875 1876#define ROT16_0(x) ((x)) 1877#define ROT16_1(x) ((x).sF0123456789ABCDE) 1878#define ROT16_2(x) 
((x).sEF0123456789ABCD) 1879#define ROT16_3(x) ((x).sDEF0123456789ABC) 1880#define ROT16_4(x) ((x).sCDEF0123456789AB) 1881#define ROT16_5(x) ((x).sBCDEF0123456789A) 1882#define ROT16_6(x) ((x).sABCDEF0123456789) 1883#define ROT16_7(x) ((x).s9ABCDEF012345678) 1884#define ROT16_8(x) ((x).s89ABCDEF01234567) 1885#define ROT16_9(x) ((x).s789ABCDEF0123456) 1886#define ROT16_10(x) ((x).s6789ABCDEF012345) 1887#define ROT16_11(x) ((x).s56789ABCDEF01234) 1888#define ROT16_12(x) ((x).s456789ABCDEF0123) 1889#define ROT16_13(x) ((x).s3456789ABCDEF012) 1890#define ROT16_14(x) ((x).s23456789ABCDEF01) 1891#define ROT16_15(x) ((x).s123456789ABCDEF0) 1892#define ROT16_16(x) ((x)) 1893 1894 1895 1896#define ROTATE_STR(x, s, n) ROT##s##_##n(x) 1897#define ROTATE(x, s, n) ROTATE_STR(x, s, n) 1898 1899 1900 1901#define V_OFFS1(dt) (dt##1)(0) 1902#define V_OFFS2(dt) (dt##2)(0, 1) 1903#define V_OFFS3(dt) (dt##3)(0, 1, 2) 1904#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3) 1905#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7) 1906#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) 1907 1908 1909 1910#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt) 1911#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s) 1912 1913 1914#define VLOAD_STR(size) vload##size 1915#define VLOAD(size) VLOAD_STR(size) 1916 1917 1918#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size 1919#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size) 1920 1921#define NO_LOAD(data, offs, ptr) \ 1922 { \ 1923 } 1924 1925 1926#define vload_partial_1_0 NO_LOAD 1927#define vload_partial_1_1 vload1 1928#define vload_partial_1_2 NO_LOAD 1929#define vload_partial_1_3 NO_LOAD 1930#define vload_partial_1_4 NO_LOAD 1931#define vload_partial_1_5 NO_LOAD 1932#define vload_partial_1_6 NO_LOAD 1933#define vload_partial_1_7 NO_LOAD 1934#define vload_partial_1_8 NO_LOAD 1935#define vload_partial_1_9 NO_LOAD 1936#define vload_partial_1_10 NO_LOAD 1937#define vload_partial_1_11 NO_LOAD 
/* Remaining entries of the vload_partial_1_<load_size> table: a one-element
 * vector cannot take 12..16 elements, so they map to NO_LOAD. */
#define vload_partial_1_12 NO_LOAD
#define vload_partial_1_13 NO_LOAD
#define vload_partial_1_14 NO_LOAD
#define vload_partial_1_15 NO_LOAD
#define vload_partial_1_16 NO_LOAD

/*
 * vload_partial_<size>_<load_size> for size = 2, 3, 4, 8, 16.
 * An entry maps to vload_partial_<load_size> when 1 <= load_size <= size and
 * to NO_LOAD otherwise (load_size == 0 or load_size > size).
 */
#define vload_partial_2_0 NO_LOAD
#define vload_partial_2_1 vload_partial_1
#define vload_partial_2_2 vload_partial_2
#define vload_partial_2_3 NO_LOAD
#define vload_partial_2_4 NO_LOAD
#define vload_partial_2_5 NO_LOAD
#define vload_partial_2_6 NO_LOAD
#define vload_partial_2_7 NO_LOAD
#define vload_partial_2_8 NO_LOAD
#define vload_partial_2_9 NO_LOAD
#define vload_partial_2_10 NO_LOAD
#define vload_partial_2_11 NO_LOAD
#define vload_partial_2_12 NO_LOAD
#define vload_partial_2_13 NO_LOAD
#define vload_partial_2_14 NO_LOAD
#define vload_partial_2_15 NO_LOAD
#define vload_partial_2_16 NO_LOAD

#define vload_partial_3_0 NO_LOAD
#define vload_partial_3_1 vload_partial_1
#define vload_partial_3_2 vload_partial_2
#define vload_partial_3_3 vload_partial_3
#define vload_partial_3_4 NO_LOAD
#define vload_partial_3_5 NO_LOAD
#define vload_partial_3_6 NO_LOAD
#define vload_partial_3_7 NO_LOAD
#define vload_partial_3_8 NO_LOAD
#define vload_partial_3_9 NO_LOAD
#define vload_partial_3_10 NO_LOAD
#define vload_partial_3_11 NO_LOAD
#define vload_partial_3_12 NO_LOAD
#define vload_partial_3_13 NO_LOAD
#define vload_partial_3_14 NO_LOAD
#define vload_partial_3_15 NO_LOAD
#define vload_partial_3_16 NO_LOAD

#define vload_partial_4_0 NO_LOAD
#define vload_partial_4_1 vload_partial_1
#define vload_partial_4_2 vload_partial_2
#define vload_partial_4_3 vload_partial_3
#define vload_partial_4_4 vload_partial_4
#define vload_partial_4_5 NO_LOAD
#define vload_partial_4_6 NO_LOAD
#define vload_partial_4_7 NO_LOAD
#define vload_partial_4_8 NO_LOAD
#define vload_partial_4_9 NO_LOAD
#define vload_partial_4_10 NO_LOAD
#define vload_partial_4_11 NO_LOAD
#define vload_partial_4_12 NO_LOAD
#define vload_partial_4_13 NO_LOAD
#define vload_partial_4_14 NO_LOAD
#define vload_partial_4_15 NO_LOAD
#define vload_partial_4_16 NO_LOAD

#define vload_partial_8_0 NO_LOAD
#define vload_partial_8_1 vload_partial_1
#define vload_partial_8_2 vload_partial_2
#define vload_partial_8_3 vload_partial_3
#define vload_partial_8_4 vload_partial_4
#define vload_partial_8_5 vload_partial_5
#define vload_partial_8_6 vload_partial_6
#define vload_partial_8_7 vload_partial_7
#define vload_partial_8_8 vload_partial_8
#define vload_partial_8_9 NO_LOAD
#define vload_partial_8_10 NO_LOAD
#define vload_partial_8_11 NO_LOAD
#define vload_partial_8_12 NO_LOAD
#define vload_partial_8_13 NO_LOAD
#define vload_partial_8_14 NO_LOAD
#define vload_partial_8_15 NO_LOAD
#define vload_partial_8_16 NO_LOAD

#define vload_partial_16_0 NO_LOAD
#define vload_partial_16_1 vload_partial_1
#define vload_partial_16_2 vload_partial_2
#define vload_partial_16_3 vload_partial_3
#define vload_partial_16_4 vload_partial_4
#define vload_partial_16_5 vload_partial_5
#define vload_partial_16_6 vload_partial_6
#define vload_partial_16_7 vload_partial_7
#define vload_partial_16_8 vload_partial_8
#define vload_partial_16_9 vload_partial_9
#define vload_partial_16_10 vload_partial_10
#define vload_partial_16_11 vload_partial_11
#define vload_partial_16_12 vload_partial_12
#define vload_partial_16_13 vload_partial_13
#define vload_partial_16_14 vload_partial_14
#define vload_partial_16_15 vload_partial_15
#define vload_partial_16_16 vload_partial_16

/*
 * vload_partial_k(DATA, OFFSET, PTR): fills the first k components of DATA.
 * k = 1, 2, 3, 4, 8, 16 are a single vload; every other k is a 4- or 8-wide
 * load followed by a smaller load for the tail, read from PTR + 4 or PTR + 8.
 * Each expansion already ends in a semicolon.
 * NOTE(review): OFFSET is forwarded unchanged to loads of different widths,
 * so head and tail only line up for OFFSET == 0 - confirm callers pass 0.
 */
#define vload_partial_1(DATA, OFFSET, PTR) \
    DATA.s0 = vload1(OFFSET, PTR);

#define vload_partial_2(DATA, OFFSET, PTR) \
    DATA.s01 = vload2(OFFSET, PTR);

#define vload_partial_3(DATA, OFFSET, PTR) \
    DATA.s012 = vload3(OFFSET, PTR);

#define vload_partial_4(DATA, OFFSET, PTR) \
    DATA.s0123 = vload4(OFFSET, PTR);

#define vload_partial_5(DATA, OFFSET, PTR) \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    DATA.s4 = vload1(OFFSET, PTR + 4);

#define vload_partial_6(DATA, OFFSET, PTR) \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    vload_partial_2(DATA.s45, OFFSET, PTR + 4);

#define vload_partial_7(DATA, OFFSET, PTR) \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    vload_partial_3(DATA.s456, OFFSET, PTR + 4);

#define vload_partial_8(DATA, OFFSET, PTR) \
    DATA.s01234567 = vload8(OFFSET, PTR);

#define vload_partial_9(DATA, OFFSET, PTR) \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    DATA.s8 = vload1(OFFSET, PTR + 8);

#define vload_partial_10(DATA, OFFSET, PTR) \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_2(DATA.s89, OFFSET, PTR + 8);

#define vload_partial_11(DATA, OFFSET, PTR) \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_3(DATA.s89A, OFFSET, PTR + 8);

#define vload_partial_12(DATA, OFFSET, PTR) \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_4(DATA.s89AB, OFFSET, PTR + 8);

#define vload_partial_13(DATA, OFFSET, PTR) \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vload_partial_14(DATA, OFFSET, PTR) \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vload_partial_15(DATA, OFFSET, PTR) \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vload_partial_16(DATA, OFFSET, PTR) \
    DATA = vload16(OFFSET, PTR);

/* PIXEL_UNIT<vec_size>: vec_size divided by four for vec_size = 4, 8, 16. */
#define PIXEL_UNIT4 1
#define PIXEL_UNIT8 2
#define PIXEL_UNIT16 4

/* CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size): resolves to the
 * PIXEL_UNIT<vec_size> constant after expanding vec_size. */
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)

/*
 * read_image2d_floatx<n>(img, x_coord, y_coord), n = 1, 2, 4
 * Reads n consecutive texels along x with read_imagef, starting at
 * (x_coord, y_coord), and packs them into a float4 / float8 / float16.
 * Each expansion carries its own trailing semicolon, and x_coord is not
 * parenthesised before the + k is applied.
 */
#define read_image2d_floatx1(img, x_coord, y_coord) \
    (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
#define read_image2d_floatx2(img, x_coord, y_coord) \
    (float8)(read_imagef(img, (int2)(x_coord, y_coord)), \
             read_imagef(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_floatx4(img, x_coord, y_coord) \
    (float16)(read_imagef(img, (int2)(x_coord, y_coord)), \
              read_imagef(img, (int2)(x_coord + 1, y_coord)), \
              read_imagef(img, (int2)(x_coord + 2, y_coord)), \
              read_imagef(img, (int2)(x_coord + 3, y_coord)));

/* Half-precision readers (read_imageh), available only with fp16 support. */
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define read_image2d_halfx1(img, x_coord, y_coord) \
    (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
#define read_image2d_halfx2(img, x_coord, y_coord) \
    (half8)(read_imageh(img, (int2)(x_coord, y_coord)), \
            read_imageh(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_halfx4(img, x_coord, y_coord) \
    (half16)(read_imageh(img, (int2)(x_coord, y_coord)), \
             read_imageh(img, (int2)(x_coord + 1, y_coord)), \
             read_imageh(img, (int2)(x_coord + 2, y_coord)), \
             read_imageh(img, (int2)(x_coord + 3, y_coord)));
#endif

/*
 * write_image2d_floatx<n>(img, x_coord, y_coord, values), n = 1, 2, 4
 * Writes values four components at a time with write_imagef to n consecutive
 * texels along x. The writes are joined by the comma operator inside one
 * parenthesised expression, followed by a trailing semicolon.
 */
#define write_image2d_floatx1(img, x_coord, y_coord, values) \
    (write_imagef(img, (int2)(x_coord, y_coord), values));
#define write_image2d_floatx2(img, x_coord, y_coord, values) \
    (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), \
     write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567));
#define write_image2d_floatx4(img, x_coord, y_coord, values) \
    (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), \
     write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), \
     write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), \
     write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF));

/* Half-precision writers (write_imageh), available only with fp16 support. */
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define write_image2d_halfx1(img, x_coord, y_coord, values) \
    (write_imageh(img, (int2)(x_coord, y_coord), values));
#define write_image2d_halfx2(img, x_coord, y_coord, values) \
    (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), \
     write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567));
#define write_image2d_halfx4(img, x_coord, y_coord, values) \
    (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), \
     write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), \
     write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), \
     write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
#endif

/* READ_IMAGE2D(data_type, n0, ...): dispatches to
 * read_image2d_<data_type>x<n0> after expanding data_type and n0. */
#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)

/* WRITE_IMAGE2D(data_type, n0, ...): dispatches to
 * write_image2d_<data_type>x<n0> after expanding data_type and n0. */
#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)

/* VSTORE(size): name of the vstore<size> routine. */
#define VSTORE_STR(size) vstore##size
#define VSTORE(size) VSTORE_STR(size)

/* Width-1 aliases so that <type>##1 resolves to the scalar type. */
#define float1 float
#define half1 half
#define char1 char
#define uchar1 uchar
#define short1 short
#define ushort1 ushort
#define int1 int
#define uint1 uint
#define long1 long
#define ulong1 ulong
#define double1 double

/* Scalar counterparts of vloadN / vstoreN: a plain dereference of
 * OFFSET + PTR. Neither argument nor the assignment is parenthesised. */
#define vload1(OFFSET, PTR) *(OFFSET + PTR)
#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA

/* VSTORE_PARTIAL(size, store_size): name of the table entry
 * vstore_partial_<size>_<store_size>. */
#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)

/* Placeholder for unsupported size combinations: expands to an empty block. */
#define NO_STORE(data, offs, ptr) \
    { \
    }

/* First entry of the size-1 store table: storing 0 elements does nothing. */
#define vstore_partial_1_0 NO_STORE
/* Rest of the vstore_partial_1_<store_size> table: one element goes straight
 * through vstore1, anything larger than the vector is a no-op. */
#define vstore_partial_1_1 vstore1
#define vstore_partial_1_2 NO_STORE
#define vstore_partial_1_3 NO_STORE
#define vstore_partial_1_4 NO_STORE
#define vstore_partial_1_5 NO_STORE
#define vstore_partial_1_6 NO_STORE
#define vstore_partial_1_7 NO_STORE
#define vstore_partial_1_8 NO_STORE
#define vstore_partial_1_9 NO_STORE
#define vstore_partial_1_10 NO_STORE
#define vstore_partial_1_11 NO_STORE
#define vstore_partial_1_12 NO_STORE
#define vstore_partial_1_13 NO_STORE
#define vstore_partial_1_14 NO_STORE
#define vstore_partial_1_15 NO_STORE
#define vstore_partial_1_16 NO_STORE

/*
 * vstore_partial_<size>_<store_size> for size = 2, 3, 4, 8, 16.
 * An entry maps to vstore_partial_<store_size> when 1 <= store_size <= size
 * and to NO_STORE otherwise (store_size == 0 or store_size > size).
 */
#define vstore_partial_2_0 NO_STORE
#define vstore_partial_2_1 vstore_partial_1
#define vstore_partial_2_2 vstore_partial_2
#define vstore_partial_2_3 NO_STORE
#define vstore_partial_2_4 NO_STORE
#define vstore_partial_2_5 NO_STORE
#define vstore_partial_2_6 NO_STORE
#define vstore_partial_2_7 NO_STORE
#define vstore_partial_2_8 NO_STORE
#define vstore_partial_2_9 NO_STORE
#define vstore_partial_2_10 NO_STORE
#define vstore_partial_2_11 NO_STORE
#define vstore_partial_2_12 NO_STORE
#define vstore_partial_2_13 NO_STORE
#define vstore_partial_2_14 NO_STORE
#define vstore_partial_2_15 NO_STORE
#define vstore_partial_2_16 NO_STORE

#define vstore_partial_3_0 NO_STORE
#define vstore_partial_3_1 vstore_partial_1
#define vstore_partial_3_2 vstore_partial_2
#define vstore_partial_3_3 vstore_partial_3
#define vstore_partial_3_4 NO_STORE
#define vstore_partial_3_5 NO_STORE
#define vstore_partial_3_6 NO_STORE
#define vstore_partial_3_7 NO_STORE
#define vstore_partial_3_8 NO_STORE
#define vstore_partial_3_9 NO_STORE
#define vstore_partial_3_10 NO_STORE
#define vstore_partial_3_11 NO_STORE
#define vstore_partial_3_12 NO_STORE
#define vstore_partial_3_13 NO_STORE
#define vstore_partial_3_14 NO_STORE
#define vstore_partial_3_15 NO_STORE
#define vstore_partial_3_16 NO_STORE

#define vstore_partial_4_0 NO_STORE
#define vstore_partial_4_1 vstore_partial_1
#define vstore_partial_4_2 vstore_partial_2
#define vstore_partial_4_3 vstore_partial_3
#define vstore_partial_4_4 vstore_partial_4
#define vstore_partial_4_5 NO_STORE
#define vstore_partial_4_6 NO_STORE
#define vstore_partial_4_7 NO_STORE
#define vstore_partial_4_8 NO_STORE
#define vstore_partial_4_9 NO_STORE
#define vstore_partial_4_10 NO_STORE
#define vstore_partial_4_11 NO_STORE
#define vstore_partial_4_12 NO_STORE
#define vstore_partial_4_13 NO_STORE
#define vstore_partial_4_14 NO_STORE
#define vstore_partial_4_15 NO_STORE
#define vstore_partial_4_16 NO_STORE

#define vstore_partial_8_0 NO_STORE
#define vstore_partial_8_1 vstore_partial_1
#define vstore_partial_8_2 vstore_partial_2
#define vstore_partial_8_3 vstore_partial_3
#define vstore_partial_8_4 vstore_partial_4
#define vstore_partial_8_5 vstore_partial_5
#define vstore_partial_8_6 vstore_partial_6
#define vstore_partial_8_7 vstore_partial_7
#define vstore_partial_8_8 vstore_partial_8
#define vstore_partial_8_9 NO_STORE
#define vstore_partial_8_10 NO_STORE
#define vstore_partial_8_11 NO_STORE
#define vstore_partial_8_12 NO_STORE
#define vstore_partial_8_13 NO_STORE
#define vstore_partial_8_14 NO_STORE
#define vstore_partial_8_15 NO_STORE
#define vstore_partial_8_16 NO_STORE

#define vstore_partial_16_0 NO_STORE
#define vstore_partial_16_1 vstore_partial_1
#define vstore_partial_16_2 vstore_partial_2
#define vstore_partial_16_3 vstore_partial_3
#define vstore_partial_16_4 vstore_partial_4
#define vstore_partial_16_5 vstore_partial_5
#define vstore_partial_16_6 vstore_partial_6
#define vstore_partial_16_7 vstore_partial_7
#define vstore_partial_16_8 vstore_partial_8
#define vstore_partial_16_9 vstore_partial_9
#define vstore_partial_16_10 vstore_partial_10
#define vstore_partial_16_11 vstore_partial_11
#define vstore_partial_16_12 vstore_partial_12
#define vstore_partial_16_13 vstore_partial_13
#define vstore_partial_16_14 vstore_partial_14
#define vstore_partial_16_15 vstore_partial_15
#define vstore_partial_16_16 vstore_partial_16

/*
 * vstore_partial_k(DATA, OFFSET, PTR): writes the first k components of DATA.
 * k = 1, 2, 3, 4, 8, 16 are a single store; every other k is a 4- or 8-wide
 * store followed by a smaller store for the tail, written at PTR + 4 or
 * PTR + 8. Each expansion already ends in a semicolon.
 * NOTE(review): OFFSET is forwarded unchanged to stores of different widths,
 * so head and tail only line up for OFFSET == 0 - confirm callers pass 0.
 */
#define vstore_partial_1(DATA, OFFSET, PTR) \
    vstore1(DATA.s0, OFFSET, PTR);

#define vstore_partial_2(DATA, OFFSET, PTR) \
    vstore2(DATA.s01, OFFSET, PTR);

#define vstore_partial_3(DATA, OFFSET, PTR) \
    vstore3(DATA.s012, OFFSET, PTR);

#define vstore_partial_4(DATA, OFFSET, PTR) \
    vstore4(DATA.s0123, OFFSET, PTR);

#define vstore_partial_5(DATA, OFFSET, PTR) \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore1(DATA.s4, OFFSET, PTR + 4);

#define vstore_partial_6(DATA, OFFSET, PTR) \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore_partial_2(DATA.s45, OFFSET, PTR + 4);

#define vstore_partial_7(DATA, OFFSET, PTR) \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore_partial_3(DATA.s456, OFFSET, PTR + 4);

#define vstore_partial_8(DATA, OFFSET, PTR) \
    vstore8(DATA.s01234567, OFFSET, PTR);

#define vstore_partial_9(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore1(DATA.s8, OFFSET, PTR + 8);

#define vstore_partial_10(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_2(DATA.s89, OFFSET, PTR + 8);

#define vstore_partial_11(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_3(DATA.s89a, OFFSET, PTR + 8);

#define vstore_partial_12(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8);

#define vstore_partial_13(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_14(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_15(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_16(DATA, OFFSET, PTR) \
    vstore16(DATA, OFFSET, PTR);

/* Saturating conversions to floating-point types are aliased to the plain
 * conversion of the same destination type, so CONVERT_SAT can be used
 * uniformly for float and half destinations. */
#define convert_float_sat convert_float
#define convert_float1_sat convert_float
#define convert_float2_sat convert_float2
#define convert_float3_sat convert_float3
#define convert_float4_sat convert_float4
#define convert_float8_sat convert_float8
#define convert_float16_sat convert_float16
/* Scalar half maps to convert_half so that the result type is half, in line
 * with convert_half1_sat ... convert_half16_sat below. */
#define convert_half_sat convert_half
#define convert_half1_sat convert_half
#define convert_half2_sat convert_half2
#define convert_half3_sat convert_half3
#define convert_half4_sat convert_half4
#define convert_half8_sat convert_half8
#define convert_half16_sat convert_half16

/* Width-1 aliases: convert_<type>1 resolves to the scalar conversion. */
#define convert_float1 convert_float
#define convert_half1 convert_half
#define convert_char1 convert_char
#define convert_uchar1 convert_uchar
#define convert_short1 convert_short
#define convert_ushort1 convert_ushort
#define convert_int1 convert_int
#define convert_uint1 convert_uint
#define convert_long1 convert_long
#define convert_ulong1 convert_ulong
#define convert_double1 convert_double

/* Width-1 aliases for the saturating conversions. The convert_uchar<N>_sat
 * lines define each name as itself, which leaves the expansion unchanged.
 * NOTE(review): convert_double1_sat maps to convert_double_sat, which is not
 * aliased in this section - confirm it is provided elsewhere or unused. */
#define convert_char1_sat convert_char_sat
#define convert_uchar1_sat convert_uchar_sat
#define convert_uchar2_sat convert_uchar2_sat
#define convert_uchar3_sat convert_uchar3_sat
#define convert_uchar4_sat convert_uchar4_sat
#define convert_uchar8_sat convert_uchar8_sat
#define convert_uchar16_sat convert_uchar16_sat
#define convert_short1_sat convert_short_sat
#define convert_ushort1_sat convert_ushort_sat
#define convert_int1_sat convert_int_sat
#define convert_uint1_sat convert_uint_sat
#define convert_long1_sat convert_long_sat
#define convert_ulong1_sat convert_ulong_sat
#define convert_double1_sat convert_double_sat

/* VEC_DATA_TYPE(type, size): the token type<size>, e.g. float4. */
#define VEC_DATA_TYPE_STR(type, size) type##size
#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)

/* CONVERT(x, type): (convert_<type>((x))). */
#define CONVERT_STR(x, type) (convert_##type((x)))
#define CONVERT(x, type) CONVERT_STR(x, type)

/* CONVERT_SAT(x, type): (convert_<type>_sat((x))). */
#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)

/* CONVERT_SAT_ROUND(x, type, round): (convert_<type>_sat_<round>((x))). */
#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)

/* select_vec_dt_<type>(size): integer types map to themselves, half maps to
 * short<size> and float maps to int<size>. */
#define select_vec_dt_uchar(size) uchar##size
#define select_vec_dt_char(size) char##size
#define select_vec_dt_ushort(size) ushort##size
#define select_vec_dt_short(size) short##size
#define select_vec_dt_half(size) short##size
#define select_vec_dt_uint(size) uint##size
#define select_vec_dt_int(size) int##size
#define select_vec_dt_float(size) int##size
#define select_vec_dt_ulong(size) ulong##size
#define select_vec_dt_long(size) long##size

/* SELECT_VEC_DATA_TYPE(type, size) dispatches to select_vec_dt_<type>;
 * SELECT_DATA_TYPE(type) is the size-1 case. */
#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)

/* signed_int_vec_dt_<type>(size): signed integer vector type named for each
 * listed element type (char for 8-bit, short for 16-bit, int for 32-bit). */
#define signed_int_vec_dt_uchar(size) char##size
#define signed_int_vec_dt_char(size) char##size
#define signed_int_vec_dt_ushort(size) short##size
#define signed_int_vec_dt_short(size) short##size
#define signed_int_vec_dt_half(size) short##size
#define signed_int_vec_dt_uint(size) int##size
#define signed_int_vec_dt_int(size) int##size
#define signed_int_vec_dt_float(size) int##size
#define signed_int_vec_dt_ulong(size) long##size
#define signed_int_vec_dt_long(size) long##size

/** Look up the signed_int_vec_dt_<type> table for (type, size). */
#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size)
#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size)
#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1)

/** Sum of all lanes of a vector of the given size.
 *
 * The per-size helpers expand to a flat chain of '+' terms and are left
 * untouched so the evaluation order of existing expansions does not change.
 */
#define sum_reduce_1(x) (x)
#define sum_reduce_2(x) ((x).s0) + ((x).s1)
#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2)
#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567)
#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF)

/* Fixed: the expansion is now wrapped in parentheses, so that e.g.
 * a * SUM_REDUCE(v, 2) multiplies the whole sum instead of only the first lane. */
#define SUM_REDUCE_STR(x, size) (sum_reduce_##size(x))
#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)

/** Product of all lanes of a vector of the given size (flat chain of '*' terms). */
#define prod_reduce_1(x) (x)
#define prod_reduce_2(x) ((x).s0) * ((x).s1)
#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2)
#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23)
#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567)
#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF)

/* Fixed: wrapped in parentheses so the product binds as one operand,
 * e.g. a / PROD_REDUCE(v, 2) divides by the whole product. */
#define PROD_REDUCE_STR(x, size) (prod_reduce_##size(x))
#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)

/** Maximum over all lanes of a vector of the given size, built from nested max() calls. */
#define max_reduce_1(x) (x)
#define max_reduce_2(x) max(((x).s0), ((x).s1))
#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))

#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)

/** Kernel parameter list for a 1D buffer called name:
 *  base pointer, stride/step in x, and the offset of the first element in bytes. */
#define VECTOR_DECLARATION(name) \
    __global uchar *name##_ptr, \
    uint name##_stride_x, \
    uint name##_step_x, \
    uint name##_offset_first_element_in_bytes

/** Kernel parameter list for a 2D buffer called name (adds stride/step in y). */
#define IMAGE_DECLARATION(name) \
    __global uchar *name##_ptr, \
    uint name##_stride_x, \
    uint name##_step_x, \
    uint name##_stride_y, \
    uint name##_step_y, \
    uint name##_offset_first_element_in_bytes

/** Kernel parameter list for a 3D buffer called name (adds stride/step in z). */
#define TENSOR3D_DECLARATION(name) \
    __global uchar *name##_ptr, \
    uint name##_stride_x, \
    uint name##_step_x, \
    uint name##_stride_y, \
    uint name##_step_y, \
    uint name##_stride_z, \
    uint name##_step_z, \
    uint name##_offset_first_element_in_bytes

/** Kernel parameter list for a 4D buffer called name (adds stride/step in w). */
#define TENSOR4D_DECLARATION(name) \
    __global uchar *name##_ptr, \
    uint name##_stride_x, \
    uint name##_step_x, \
    uint name##_stride_y, \
    uint name##_step_y, \
    uint name##_stride_z, \
    uint name##_step_z, \
    uint name##_stride_w, \
    uint name##_step_w, \
    uint name##_offset_first_element_in_bytes

/** Kernel parameter list for a 5D buffer called name (adds stride/step in v). */
#define TENSOR5D_DECLARATION(name) \
    __global uchar *name##_ptr, \
    uint name##_stride_x, \
    uint name##_step_x, \
    uint name##_stride_y, \
    uint name##_step_y, \
    uint name##_stride_z, \
    uint name##_step_z, \
    uint name##_stride_w, \
    uint name##_step_w, \
    uint name##_stride_v, \
    uint name##_step_v, \
    uint name##_offset_first_element_in_bytes

/** Build a Vector from the parameters declared by VECTOR_DECLARATION(name). */
#define CONVERT_TO_VECTOR_STRUCT(name) \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)

/** Same as CONVERT_TO_VECTOR_STRUCT but passes 0 as the step. */
#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)

/** Build an Image from the parameters declared by IMAGE_DECLARATION(name). */
#define CONVERT_TO_IMAGE_STRUCT(name) \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)

/** Same as CONVERT_TO_IMAGE_STRUCT but passes 0 for both steps. */
#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)

/** Build an Image from 3D tensor parameters.
 *  (A second, identical definition of this macro that followed the NO_STEP
 *  variant was dropped.) */
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)

/** Same as CONVERT_TENSOR3D_TO_IMAGE_STRUCT but passes 0 for the x and y steps; the z step is kept. */
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)

/** Build a Tensor3D from the parameters declared by TENSOR3D_DECLARATION(name). */
#define CONVERT_TO_TENSOR3D_STRUCT(name) \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z)

/** Same as CONVERT_TO_TENSOR3D_STRUCT but passes 0 for all three steps. */
#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)

/** Build a Tensor4D; mod_size is forwarded to update_tensor4D_workitem_ptr. */
#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)

/** Same as CONVERT_TO_TENSOR4D_STRUCT but passes 0 for all four steps. */
#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)

/** Build a Tensor3D whose pointer is left at the base address (see tensor3D_ptr_no_update). */
#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name) \
    tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                           name##_stride_z, name##_step_z)

/** Pointer plus addressing information for a 1D buffer. */
typedef struct Vector
{
    __global uchar *ptr;
    int             offset_first_element_in_bytes;
    int             stride_x;
} Vector;

/** Pointer plus addressing information for a 2D buffer. */
typedef struct Image
{
    __global uchar *ptr;
    int             offset_first_element_in_bytes;
    int             stride_x;
    int             stride_y;
} Image;

/** Pointer plus addressing information for a 3D buffer. */
typedef struct Tensor3D
{
    __global uchar *ptr;
    int             offset_first_element_in_bytes;
    int             stride_x;
    int             stride_y;
    int             stride_z;
} Tensor3D;

/** Pointer plus addressing information for a 4D buffer. */
typedef struct Tensor4D
{
    __global uchar *ptr;
    int             offset_first_element_in_bytes;
    int             stride_x;
    int             stride_y;
    int             stride_z;
    int             stride_w;
} Tensor4D;

/** Wrap the arguments in a Vector and advance its pointer for the current work-item.
 *
 * The pointer is moved by offset_first_element_in_bytes + get_global_id(0) * step_x.
 *
 * @return The populated Vector.
 */
inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
{
    Vector vector =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
    };
    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
    return vector;
}

/** Wrap the arguments in an Image and advance its pointer for the current work-item.
 *
 * The pointer is moved by the first-element offset plus
 * get_global_id(0) * step_x + get_global_id(1) * step_y.
 *
 * @return The populated Image.
 */
inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
{
    Image img =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
        .stride_y                      = stride_y
    };
    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
    return img;
}

/** Wrap 3D tensor arguments in an Image and advance its pointer for the current work-item.
 *
 * stride_z is accepted but not stored; step_z contributes get_global_id(2) * step_z
 * to the pointer advance.
 *
 * @return The populated Image.
 */
inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Image img =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
        .stride_y                      = stride_y
    };
    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
    return img;
}

/** Wrap the arguments in a Tensor3D and advance its pointer for the current work-item.
 *
 * The pointer is moved by the first-element offset plus
 * get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z.
 *
 * @return The populated Tensor3D.
 */
inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Tensor3D tensor =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
        .stride_y                      = stride_y,
        .stride_z                      = stride_z
    };
    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
    return tensor;
}

/** Wrap the arguments in a Tensor3D without touching the pointer.
 *
 * The step_* arguments are accepted for signature parity but are not used.
 *
 * @return The populated Tensor3D, with ptr equal to the ptr argument.
 */
inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Tensor3D tensor =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
        .stride_y                      = stride_y,
        .stride_z                      = stride_z
    };
    return tensor;
}

/** Wrap the arguments in a Tensor4D and advance its pointer for the current work-item.
 *
 * get_global_id(2) is split by mod_size: the remainder scales step_z and the
 * quotient scales step_w.
 *
 * @return The populated Tensor4D.
 */
inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
                                             uint step_w,
                                             uint mod_size)
{
    Tensor4D tensor =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
        .stride_y                      = stride_y,
        .stride_z                      = stride_z,
        .stride_w                      = stride_w
    };

    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
    return tensor;
}

/** Address of the element x strides away from vec->ptr. */
inline __global const uchar *vector_offset(const Vector *vec, int x)
{
    return vec->ptr + x * vec->stride_x;
}

/** Address of the element at (x, y) relative to img->ptr. */
inline __global uchar *offset(const Image *img, int x, int y)
{
    return img->ptr + x * img->stride_x + y * img->stride_y;
}

/** Address of the element at (x, y, z) relative to tensor->ptr. */
inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
{
    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
}

/** Address of the element at (x, y, z, w) relative to tensor->ptr. */
inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
{
    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
}

/** Turn a linear element index into an address inside a 3D tensor.
 *
 * The index is decomposed into (x, y, z) using width and height; depth is not
 * used by the computation. The first-element offset is added to the result.
 */
inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
{
    uint num_elements = width * height;

    const uint z = index / num_elements;

    index %= num_elements;

    const uint y = index / width;

    index %= width;

    const uint x = index;

    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes;
}

#endif

/** MLA(a, b, c) computes b * c + a; fma() is used when GPU_ARCH equals GPU_ARCH_BIFROST. */
#if GPU_ARCH == GPU_ARCH_BIFROST
#define MLA(a, b, c) (fma(c, b, a))
#else
#define MLA(a, b, c) ((b) * (c) + (a))
#endif

/* Activation expressions. All of them share one parameter list so that
 * ACT_OP() can select them by name. x, A_VAL and B_VAL are parenthesized
 * wherever they are operands of an operator or cast, so compound arguments
 * such as "a - b" expand correctly. */

/** Hard swish: x * min(max(x + 3, 0), 6) * 0.166666667 */
#define hard_swish_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((x) * ((min(max(((x) + (DATA_TYPE)3.0), (DATA_TYPE)0.0), (DATA_TYPE)6.0)) * (DATA_TYPE)0.166666667))

/** Logistic: 1 / (1 + exp(-x)) */
#define logistic_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((DATA_TYPE)1.0 / ((DATA_TYPE)1.0 + exp(-(x))))

/** Scaled tanh: A_VAL * tanh(B_VAL * x) */
#define tanh_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((DATA_TYPE)(A_VAL) * tanh((DATA_TYPE)(B_VAL) * (x)))

/** ReLU: max(0, x) */
#define relu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (max((DATA_TYPE)0.0, x))

/** Bounded ReLU: min(A_VAL, max(0, x)) */
#define brelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (min((DATA_TYPE)(A_VAL), max((DATA_TYPE)0.0, x)))


/* Remaining activation expressions. x, A_VAL and B_VAL are parenthesized
 * wherever they are operands of an operator or cast, so compound arguments
 * such as "a - b" expand correctly. */

/** Lower/upper bounded ReLU: min(max(x, B_VAL), A_VAL) */
#define lu_brelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (min(max(x, (DATA_TYPE)(B_VAL)), (DATA_TYPE)(A_VAL)))

/** Leaky ReLU: min(x, 0) * A_VAL + max(x, 0) */
#define lrelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((min(x, (DATA_TYPE)0.0) * (DATA_TYPE)(A_VAL)) + max(x, (DATA_TYPE)0.0))

/** Soft ReLU: log(1 + exp(x)) */
#define srelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (log((DATA_TYPE)1.0 + exp(x)))

/** ELU: x where x >= 0, otherwise A_VAL * (exp(x) - 1) */
#define elu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (select(((DATA_TYPE)(A_VAL) * (exp(x) - (DATA_TYPE)1.0)), x, (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))isgreaterequal(x, (DATA_TYPE)0.0)))

/** Absolute value: fabs(x) */
#define abs_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (fabs(x))

/** Square: x * x */
#define square_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((x) * (x))

/** Square root: sqrt(x) */
#define sqrt_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (sqrt(x))

/** Linear: MLA(B_VAL, A_VAL, x) */
#define linear_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (MLA((DATA_TYPE)(B_VAL), (DATA_TYPE)(A_VAL), x))

/** GELU: x * 0.5 * (1 + erf(x / 1.41421356237)) */
#define gelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((x) * (DATA_TYPE)0.5 * ((DATA_TYPE)1.0 + erf((x) / (DATA_TYPE)1.41421356237)))

/** Identity: x */
#define identity_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x)

/** Select the <op>_op expression by name; ACTIVATION adds one expansion level
 *  so that op may itself be a macro. */
#define ACT_OP(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) op##_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL)

#define ACTIVATION(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ACT_OP(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL)

/* NOTE(review): the guard name below is the same one used at the top of this
 * file; confirm whether this second copy of the helpers is ever compiled. */
#ifndef ARM_COMPUTE_HELPER_H
#define ARM_COMPUTE_HELPER_H

/** STORE_ROW_n stores rows 0..n-1 of a block.
 *
 * Row i is the variable BASENAME##i, written with VSTORE(N0) to
 * PTR + i * STRIDE_Y + Z##i. Rows 10..15 use the suffixes A..F.
 * Each macro emits the previous one followed by one extra row.
 */
#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));

/** CONVERT_STORE_ROW_n is STORE_ROW_n with each row first passed through
 *  CONVERT_SAT(row, VEC_DATA_TYPE(DATA_TYPE, N0)). */
#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

/* Fixed: the second parameter was spelled DATA, so every DATA_TYPE in the body
 * (including the one forwarded to rows 0..8) referred to whatever DATA_TYPE
 * meant at the call site rather than to the argument passed in. */
#define CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));

/** Store an M0-row block: dispatches to STORE_ROW_<M0>. */
#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

/** Convert and store an M0-row block: dispatches to CONVERT_STORE_ROW_<M0>. */
#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

/** STORE_ROW_PARTIAL_n mirrors STORE_ROW_n but writes each row with
 *  VSTORE_PARTIAL(N0, STORE_N0) instead of VSTORE(N0). */
#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));

/** Store STORE_M0 rows with STORE_N0 of N0 elements each: dispatches to STORE_ROW_PARTIAL_<STORE_M0>. */
#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

/** Runtime choice between the four full/partial combinations: PARTIAL_COND_Y
 *  selects PARTIAL_STORE_M0 rows, PARTIAL_COND_X selects PARTIAL_STORE_N0 elements. */
#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \
    { \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \
    { \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \
    { \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else \
    { \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }

/** Runtime choice in x only: full rows when PARTIAL_COND_X is false, PARTIAL_STORE_N0 elements otherwise. */
#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X)) \
    { \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else \
    { \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }

/** Runtime choice in y only: M0 rows when PARTIAL_COND_Y is false, PARTIAL_STORE_M0 rows otherwise. */
#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
    if(!(PARTIAL_COND_Y)) \
    { \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else \
    { \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }

/* STORE_BLOCK_BOUNDARY_AWARE is bound at preprocessing time to the cheapest
 * variant allowed by the PARTIAL_STORE_M0 / PARTIAL_STORE_N0 build values. */
#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)

#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)

#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)

#else

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)

#endif

#endif

/** First row handled by the work-item whose row-block index is y.
 *
 * When PARTIAL_STORE_M0 is defined, the start row y * M0 is pulled back by
 * (M0 - PARTIAL_STORE_M0) % M0 and clamped at 0; otherwise it is y * M0.
 * Parameters are parenthesized so that compound arguments (e.g. "gid + 1")
 * are multiplied and subtracted as a whole.
 */
#if defined(PARTIAL_STORE_M0)

#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(max(0, (int)((y) * (M0)) - (int)(((M0) - (PARTIAL_STORE_M0)) % (M0)))))
#else
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)((y) * (M0)))
#endif

/** Store one vector, writing only leftover elements when cond is true.
 *  Expands to STORE_BLOCK_PARTIAL_IN_X with a single row, a stride of 0 and a Z prefix of 0. */
#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)

/* Optional OpenCL extensions: each one is enabled only when both the build
 * flag and the extension's own macro are defined. */
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
#endif

#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
#endif

#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
#pragma OPENCL EXTENSION cl_arm_printf : enable
#endif

/** Values that GPU_ARCH is compared against. */
#define GPU_ARCH_MIDGARD 0x100
#define GPU_ARCH_BIFROST 0x200
#define GPU_ARCH_VALHALL 0x300

/** Paste two tokens together. */
#define CONCAT(a, b) a##b

/** Expand to the argument itself (forces one more macro expansion pass). */
#define EXPAND(x) x

/** Clamp x to [min_val, max_val] using min(max(...)). */
#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)

/** REVn(x): the n lanes of x in reverse order. */
#define REV1(x) ((x))
#define REV2(x) ((x).s10)
#define REV3(x) ((x).s210)
#define REV4(x) ((x).s3210)
#define REV8(x) ((x).s76543210)
#define REV16(x) ((x).sFEDCBA9876543210)

/** Reverse a vector x of size s: dispatches to REV<s>. */
#define REVERSE_STR(x, s) REV##s((x))
#define REVERSE(x, s) REVERSE_STR(x, s)

/** ROT<s>_<n>(x): the s lanes of x rotated by n positions, so that lane 0 of x
 *  ends up in lane n of the result. */
#define ROT1_0(x) ((x))
#define ROT1_1(x) ((x))

#define ROT2_0(x) ((x))
#define ROT2_1(x) ((x).s10)
#define ROT2_2(x) ((x))

#define ROT3_0(x) ((x))
#define ROT3_1(x) ((x).s201)
#define ROT3_2(x) ((x).s120)
#define ROT3_3(x)   ((x))

/* Rotation table for 4-lane vectors: ROT4_<n> places lane 0 of x in lane n. */
#define ROT4_0(x)   ((x))
#define ROT4_1(x)   ((x).s3012)
#define ROT4_2(x)   ((x).s2301)
#define ROT4_3(x)   ((x).s1230)
#define ROT4_4(x)   ((x))

/* Rotation table for 8-lane vectors. */
#define ROT8_0(x)   ((x))
#define ROT8_1(x)   ((x).s70123456)
#define ROT8_2(x)   ((x).s67012345)
#define ROT8_3(x)   ((x).s56701234)
#define ROT8_4(x)   ((x).s45670123)
#define ROT8_5(x)   ((x).s34567012)
#define ROT8_6(x)   ((x).s23456701)
#define ROT8_7(x)   ((x).s12345670)
#define ROT8_8(x)   ((x))

/* Rotation table for 16-lane vectors. */
#define ROT16_0(x)  ((x))
#define ROT16_1(x)  ((x).sF0123456789ABCDE)
#define ROT16_2(x)  ((x).sEF0123456789ABCD)
#define ROT16_3(x)  ((x).sDEF0123456789ABC)
#define ROT16_4(x)  ((x).sCDEF0123456789AB)
#define ROT16_5(x)  ((x).sBCDEF0123456789A)
#define ROT16_6(x)  ((x).sABCDEF0123456789)
#define ROT16_7(x)  ((x).s9ABCDEF012345678)
#define ROT16_8(x)  ((x).s89ABCDEF01234567)
#define ROT16_9(x)  ((x).s789ABCDEF0123456)
#define ROT16_10(x) ((x).s6789ABCDEF012345)
#define ROT16_11(x) ((x).s56789ABCDEF01234)
#define ROT16_12(x) ((x).s456789ABCDEF0123)
#define ROT16_13(x) ((x).s3456789ABCDEF012)
#define ROT16_14(x) ((x).s23456789ABCDEF01)
#define ROT16_15(x) ((x).s123456789ABCDEF0)
#define ROT16_16(x) ((x))

/* ROTATE(x, s, n): pick entry ROT<s>_<n> from the tables above. The _STR level
 * exists so that s and n are macro-expanded before being pasted. */
#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
#define ROTATE(x, s, n)     ROTATE_STR(x, s, n)

/* V_OFFS<s>(dt): a dt vector of size s holding the lane indices 0 .. s-1. */
#define V_OFFS1(dt)  (dt##1)(0)
#define V_OFFS2(dt)  (dt##2)(0, 1)
#define V_OFFS3(dt)  (dt##3)(0, 1, 2)
#define V_OFFS4(dt)  (dt##4)(0, 1, 2, 3)
#define V_OFFS8(dt)  (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)

/* VEC_OFFS(dt, s): dispatch to V_OFFS<s>. */
#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
#define VEC_OFFS(dt, s)     VEC_OFFS_STR(dt, s)

/* VLOAD(size): name of the vload<size> function. */
#define VLOAD_STR(size) vload##size
#define VLOAD(size)     VLOAD_STR(size)

/* VLOAD_PARTIAL(size, load_size): name of the table entry
 * vload_partial_<size>_<load_size> defined below. */
#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size
#define VLOAD_PARTIAL(size, load_size)     VLOAD_PARTIAL_STR(size, load_size)

/* Placeholder used by the table for combinations that load nothing:
 * expands to an empty compound statement. */
#define NO_LOAD(data, offs, ptr) \
    { \
    }

/* vload_partial_<size>_<load_size> table. Entries with load_size of 0 or
 * greater than size map to NO_LOAD. */

/* size == 1 */
#define vload_partial_1_0  NO_LOAD
#define vload_partial_1_1  vload1
#define vload_partial_1_2  NO_LOAD
#define vload_partial_1_3  NO_LOAD
#define vload_partial_1_4  NO_LOAD
#define vload_partial_1_5  NO_LOAD
#define vload_partial_1_6  NO_LOAD
#define vload_partial_1_7  NO_LOAD
#define vload_partial_1_8  NO_LOAD
#define vload_partial_1_9  NO_LOAD
#define vload_partial_1_10 NO_LOAD
#define vload_partial_1_11 NO_LOAD
#define vload_partial_1_12 NO_LOAD
#define vload_partial_1_13 NO_LOAD
#define vload_partial_1_14 NO_LOAD
#define vload_partial_1_15 NO_LOAD
#define vload_partial_1_16 NO_LOAD

/* size == 2 */
#define vload_partial_2_0  NO_LOAD
#define vload_partial_2_1  vload_partial_1
#define vload_partial_2_2  vload_partial_2
#define vload_partial_2_3  NO_LOAD
#define vload_partial_2_4  NO_LOAD
#define vload_partial_2_5  NO_LOAD
#define vload_partial_2_6  NO_LOAD
#define vload_partial_2_7  NO_LOAD
#define vload_partial_2_8  NO_LOAD
#define vload_partial_2_9  NO_LOAD
#define vload_partial_2_10 NO_LOAD
#define vload_partial_2_11 NO_LOAD
#define vload_partial_2_12 NO_LOAD
#define vload_partial_2_13 NO_LOAD
#define vload_partial_2_14 NO_LOAD
#define vload_partial_2_15 NO_LOAD
#define vload_partial_2_16 NO_LOAD

/* size == 3 (first part of the row set) */
#define vload_partial_3_0  NO_LOAD
#define vload_partial_3_1  vload_partial_1
#define vload_partial_3_2  vload_partial_2
#define vload_partial_3_3  vload_partial_3
#define vload_partial_3_4  NO_LOAD
#define vload_partial_3_5  NO_LOAD
#define vload_partial_3_6  NO_LOAD
#define vload_partial_3_7  NO_LOAD
#define vload_partial_3_8  NO_LOAD
#define vload_partial_3_9  NO_LOAD
#define vload_partial_3_10 NO_LOAD
#define vload_partial_3_11 NO_LOAD
/* Partial-load dispatch table, size 3 (remaining entries).
 * Each vload_partial_<size>_<load_size> entry names the macro that loads
 * load_size elements, or NO_LOAD when the combination is not supported.
 */
#define vload_partial_3_12 NO_LOAD
#define vload_partial_3_13 NO_LOAD
#define vload_partial_3_14 NO_LOAD
#define vload_partial_3_15 NO_LOAD
#define vload_partial_3_16 NO_LOAD

/* Dispatch table, size 4. */
#define vload_partial_4_0 NO_LOAD
#define vload_partial_4_1 vload_partial_1
#define vload_partial_4_2 vload_partial_2
#define vload_partial_4_3 vload_partial_3
#define vload_partial_4_4 vload_partial_4
#define vload_partial_4_5 NO_LOAD
#define vload_partial_4_6 NO_LOAD
#define vload_partial_4_7 NO_LOAD
#define vload_partial_4_8 NO_LOAD
#define vload_partial_4_9 NO_LOAD
#define vload_partial_4_10 NO_LOAD
#define vload_partial_4_11 NO_LOAD
#define vload_partial_4_12 NO_LOAD
#define vload_partial_4_13 NO_LOAD
#define vload_partial_4_14 NO_LOAD
#define vload_partial_4_15 NO_LOAD
#define vload_partial_4_16 NO_LOAD

/* Dispatch table, size 8. */
#define vload_partial_8_0 NO_LOAD
#define vload_partial_8_1 vload_partial_1
#define vload_partial_8_2 vload_partial_2
#define vload_partial_8_3 vload_partial_3
#define vload_partial_8_4 vload_partial_4
#define vload_partial_8_5 vload_partial_5
#define vload_partial_8_6 vload_partial_6
#define vload_partial_8_7 vload_partial_7
#define vload_partial_8_8 vload_partial_8
#define vload_partial_8_9 NO_LOAD
#define vload_partial_8_10 NO_LOAD
#define vload_partial_8_11 NO_LOAD
#define vload_partial_8_12 NO_LOAD
#define vload_partial_8_13 NO_LOAD
#define vload_partial_8_14 NO_LOAD
#define vload_partial_8_15 NO_LOAD
#define vload_partial_8_16 NO_LOAD

/* Dispatch table, size 16. */
#define vload_partial_16_0 NO_LOAD
#define vload_partial_16_1 vload_partial_1
#define vload_partial_16_2 vload_partial_2
#define vload_partial_16_3 vload_partial_3
#define vload_partial_16_4 vload_partial_4
#define vload_partial_16_5 vload_partial_5
#define vload_partial_16_6 vload_partial_6
#define vload_partial_16_7 vload_partial_7
#define vload_partial_16_8 vload_partial_8
#define vload_partial_16_9 vload_partial_9
#define vload_partial_16_10 vload_partial_10
#define vload_partial_16_11 vload_partial_11
#define vload_partial_16_12 vload_partial_12
#define vload_partial_16_13 vload_partial_13
#define vload_partial_16_14 vload_partial_14
#define vload_partial_16_15 vload_partial_15
#define vload_partial_16_16 vload_partial_16

/* vload_partial_N(DATA, OFFSET, PTR): load N elements from PTR into the first N
 * components of DATA. Sizes without a matching vloadN built-in are composed from
 * smaller loads; the upper part is read from PTR + 4 or PTR + 8.
 * Every expansion ends with a semicolon, so each macro is a complete statement
 * (or statement sequence) by itself.
 */
#define vload_partial_1(DATA, OFFSET, PTR) \
    DATA.s0 = vload1(OFFSET, PTR);

#define vload_partial_2(DATA, OFFSET, PTR) \
    DATA.s01 = vload2(OFFSET, PTR);

#define vload_partial_3(DATA, OFFSET, PTR) \
    DATA.s012 = vload3(OFFSET, PTR);

#define vload_partial_4(DATA, OFFSET, PTR) \
    DATA.s0123 = vload4(OFFSET, PTR);

#define vload_partial_5(DATA, OFFSET, PTR) \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    DATA.s4 = vload1(OFFSET, PTR + 4);

#define vload_partial_6(DATA, OFFSET, PTR) \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    vload_partial_2(DATA.s45, OFFSET, PTR + 4);

#define vload_partial_7(DATA, OFFSET, PTR) \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    vload_partial_3(DATA.s456, OFFSET, PTR + 4);

#define vload_partial_8(DATA, OFFSET, PTR) \
    DATA.s01234567 = vload8(OFFSET, PTR);

#define vload_partial_9(DATA, OFFSET, PTR) \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    DATA.s8 = vload1(OFFSET, PTR + 8);

#define vload_partial_10(DATA, OFFSET, PTR) \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_2(DATA.s89, OFFSET, PTR + 8);

#define vload_partial_11(DATA, OFFSET, PTR) \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_3(DATA.s89A, OFFSET, PTR + 8);

#define vload_partial_12(DATA, OFFSET, PTR) \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_4(DATA.s89AB, OFFSET, PTR + 8);

/* For 13 .. 15 the whole upper half is handed to the 5/6/7-element macro,
 * which in turn selects the components it needs from that half.
 */
#define vload_partial_13(DATA, OFFSET, PTR) \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vload_partial_14(DATA, OFFSET, PTR) \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vload_partial_15(DATA, OFFSET, PTR) \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vload_partial_16(DATA, OFFSET, PTR) \
    DATA = vload16(OFFSET, PTR);

/* PIXEL_UNITn: number of 4-component image pixels needed to hold n elements. */
#define PIXEL_UNIT4 1
#define PIXEL_UNIT8 2
#define PIXEL_UNIT16 4

/** Convert a vector size (4, 8 or 16) into its pixel-unit count (arguments expanded first). */
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)

/* read_image2d_<type>x<n>(img, x_coord, y_coord): read n horizontally adjacent
 * pixels starting at (x_coord, y_coord) and pack them into one vector.
 * Each expansion carries its own trailing semicolon.
 */
#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
#endif /* defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) */
/* write_image2d_<type>x<n>(img, x_coord, y_coord, values): write n horizontally
 * adjacent pixels starting at (x_coord, y_coord), taking four components of
 * values per pixel. Each expansion carries its own trailing semicolon.
 */
#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values));
#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567));
#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF));

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values));
#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567));
#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
#endif /* defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) */

/** Read n0 pixels of type data_type from img; dispatches to read_image2d_<data_type>x<n0>
 *  after expanding the arguments.
 */
#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)

/** Write n0 pixels of type data_type to img; dispatches to write_image2d_<data_type>x<n0>
 *  after expanding the arguments.
 */
#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)

/** Name of the vstore built-in (or the vstore1 macro) for the given vector size. */
#define VSTORE_STR(size) vstore##size
#define VSTORE(size) VSTORE_STR(size)

/* Size-1 vector type names map to the scalar type (list continues after this section). */
#define float1 float
/* Size-1 vector type names map to the scalar type, so that type##size pasting
 * also works for size 1.
 */
#define half1 half
#define char1 char
#define uchar1 uchar
#define short1 short
#define ushort1 ushort
#define int1 int
#define uint1 uint
#define long1 long
#define ulong1 ulong
#define double1 double

/** Scalar counterpart of vloadn: the element at PTR + OFFSET.
 *
 * Arguments and the whole expansion are parenthesised so that expression
 * arguments and surrounding operators cannot change how it is parsed.
 * The result is still an lvalue.
 */
#define vload1(OFFSET, PTR) (*((OFFSET) + (PTR)))

/** Scalar counterpart of vstoren: assign DATA to the element at PTR + OFFSET.
 *
 * Parenthesised for the same reason as vload1.
 */
#define vstore1(DATA, OFFSET, PTR) (*((OFFSET) + (PTR)) = (DATA))

/** Select the partial-store macro vstore_partial_<size>_<store_size>.
 *
 * The selected macro is invoked as (DATA, OFFSET, PTR) and stores the first
 * store_size components of DATA. Combinations marked NO_STORE in the table
 * below expand to an empty block.
 */
#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)

/** Placeholder for unsupported size / store_size combinations: stores nothing. */
#define NO_STORE(data, offs, ptr) \
    { \
    }

/* Dispatch table, size 1 (scalar). */
#define vstore_partial_1_0 NO_STORE
#define vstore_partial_1_1 vstore1
#define vstore_partial_1_2 NO_STORE
#define vstore_partial_1_3 NO_STORE
#define vstore_partial_1_4 NO_STORE
#define vstore_partial_1_5 NO_STORE
#define vstore_partial_1_6 NO_STORE
#define vstore_partial_1_7 NO_STORE
#define vstore_partial_1_8 NO_STORE
#define vstore_partial_1_9 NO_STORE
#define vstore_partial_1_10 NO_STORE
#define vstore_partial_1_11 NO_STORE
#define vstore_partial_1_12 NO_STORE
#define vstore_partial_1_13 NO_STORE
#define vstore_partial_1_14 NO_STORE
#define vstore_partial_1_15 NO_STORE
#define vstore_partial_1_16 NO_STORE

/* Dispatch table, size 2. */
#define vstore_partial_2_0 NO_STORE
#define vstore_partial_2_1 vstore_partial_1
#define vstore_partial_2_2 vstore_partial_2
#define vstore_partial_2_3 NO_STORE
#define vstore_partial_2_4 NO_STORE
#define vstore_partial_2_5 NO_STORE
#define vstore_partial_2_6 NO_STORE
#define vstore_partial_2_7 NO_STORE
#define vstore_partial_2_8 NO_STORE
#define vstore_partial_2_9 NO_STORE
#define vstore_partial_2_10 NO_STORE
#define vstore_partial_2_11 NO_STORE
#define vstore_partial_2_12 NO_STORE
#define vstore_partial_2_13 NO_STORE
#define vstore_partial_2_14 NO_STORE
#define vstore_partial_2_15 NO_STORE
#define vstore_partial_2_16 NO_STORE

/* Partial-store dispatch table, size 3.
 * Each vstore_partial_<size>_<store_size> entry names the macro that stores
 * store_size elements, or NO_STORE when the combination is not supported.
 */
#define vstore_partial_3_0 NO_STORE
#define vstore_partial_3_1 vstore_partial_1
#define vstore_partial_3_2 vstore_partial_2
#define vstore_partial_3_3 vstore_partial_3
#define vstore_partial_3_4 NO_STORE
#define vstore_partial_3_5 NO_STORE
#define vstore_partial_3_6 NO_STORE
#define vstore_partial_3_7 NO_STORE
#define vstore_partial_3_8 NO_STORE
#define vstore_partial_3_9 NO_STORE
#define vstore_partial_3_10 NO_STORE
#define vstore_partial_3_11 NO_STORE
#define vstore_partial_3_12 NO_STORE
#define vstore_partial_3_13 NO_STORE
#define vstore_partial_3_14 NO_STORE
#define vstore_partial_3_15 NO_STORE
#define vstore_partial_3_16 NO_STORE

/* Dispatch table, size 4. */
#define vstore_partial_4_0 NO_STORE
#define vstore_partial_4_1 vstore_partial_1
#define vstore_partial_4_2 vstore_partial_2
#define vstore_partial_4_3 vstore_partial_3
#define vstore_partial_4_4 vstore_partial_4
#define vstore_partial_4_5 NO_STORE
#define vstore_partial_4_6 NO_STORE
#define vstore_partial_4_7 NO_STORE
#define vstore_partial_4_8 NO_STORE
#define vstore_partial_4_9 NO_STORE
#define vstore_partial_4_10 NO_STORE
#define vstore_partial_4_11 NO_STORE
#define vstore_partial_4_12 NO_STORE
#define vstore_partial_4_13 NO_STORE
#define vstore_partial_4_14 NO_STORE
#define vstore_partial_4_15 NO_STORE
#define vstore_partial_4_16 NO_STORE

/* Dispatch table, size 8 (entries 12 .. 16 follow this section). */
#define vstore_partial_8_0 NO_STORE
#define vstore_partial_8_1 vstore_partial_1
#define vstore_partial_8_2 vstore_partial_2
#define vstore_partial_8_3 vstore_partial_3
#define vstore_partial_8_4 vstore_partial_4
#define vstore_partial_8_5 vstore_partial_5
#define vstore_partial_8_6 vstore_partial_6
#define vstore_partial_8_7 vstore_partial_7
#define vstore_partial_8_8 vstore_partial_8
#define vstore_partial_8_9 NO_STORE
#define vstore_partial_8_10 NO_STORE
#define vstore_partial_8_11 NO_STORE
/* Partial-store dispatch table, size 8 (remaining entries). */
#define vstore_partial_8_12 NO_STORE
#define vstore_partial_8_13 NO_STORE
#define vstore_partial_8_14 NO_STORE
#define vstore_partial_8_15 NO_STORE
#define vstore_partial_8_16 NO_STORE

/* Dispatch table, size 16. */
#define vstore_partial_16_0 NO_STORE
#define vstore_partial_16_1 vstore_partial_1
#define vstore_partial_16_2 vstore_partial_2
#define vstore_partial_16_3 vstore_partial_3
#define vstore_partial_16_4 vstore_partial_4
#define vstore_partial_16_5 vstore_partial_5
#define vstore_partial_16_6 vstore_partial_6
#define vstore_partial_16_7 vstore_partial_7
#define vstore_partial_16_8 vstore_partial_8
#define vstore_partial_16_9 vstore_partial_9
#define vstore_partial_16_10 vstore_partial_10
#define vstore_partial_16_11 vstore_partial_11
#define vstore_partial_16_12 vstore_partial_12
#define vstore_partial_16_13 vstore_partial_13
#define vstore_partial_16_14 vstore_partial_14
#define vstore_partial_16_15 vstore_partial_15
#define vstore_partial_16_16 vstore_partial_16

/* vstore_partial_N(DATA, OFFSET, PTR): store the first N components of DATA to PTR.
 * Sizes without a matching vstoreN built-in are composed from smaller stores;
 * the upper part is written at PTR + 4 or PTR + 8.
 * Every expansion ends with a semicolon, so each macro is a complete statement
 * (or statement sequence) by itself.
 */
#define vstore_partial_1(DATA, OFFSET, PTR) \
    vstore1(DATA.s0, OFFSET, PTR);

#define vstore_partial_2(DATA, OFFSET, PTR) \
    vstore2(DATA.s01, OFFSET, PTR);

#define vstore_partial_3(DATA, OFFSET, PTR) \
    vstore3(DATA.s012, OFFSET, PTR);

#define vstore_partial_4(DATA, OFFSET, PTR) \
    vstore4(DATA.s0123, OFFSET, PTR);

#define vstore_partial_5(DATA, OFFSET, PTR) \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore1(DATA.s4, OFFSET, PTR + 4);

#define vstore_partial_6(DATA, OFFSET, PTR) \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore_partial_2(DATA.s45, OFFSET, PTR + 4);

#define vstore_partial_7(DATA, OFFSET, PTR) \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore_partial_3(DATA.s456, OFFSET, PTR + 4);

#define vstore_partial_8(DATA, OFFSET, PTR) \
    vstore8(DATA.s01234567, OFFSET, PTR);

#define vstore_partial_9(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore1(DATA.s8, OFFSET, PTR + 8);

#define vstore_partial_10(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_2(DATA.s89, OFFSET, PTR + 8);

/* Hexadecimal component indices are written in upper case, matching the load macros;
 * OpenCL C accepts either case.
 */
#define vstore_partial_11(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_3(DATA.s89A, OFFSET, PTR + 8);

#define vstore_partial_12(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_4(DATA.s89AB, OFFSET, PTR + 8);

#define vstore_partial_13(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vstore_partial_14(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vstore_partial_15(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vstore_partial_16(DATA, OFFSET, PTR) \
    vstore16(DATA, OFFSET, PTR);

/* Saturating conversions to float / half map to the plain conversion. */
#define convert_float_sat convert_float
#define convert_float1_sat convert_float
#define convert_float2_sat convert_float2
#define convert_float3_sat convert_float3
#define convert_float4_sat convert_float4
#define convert_float8_sat convert_float8
#define convert_float16_sat convert_float16
/* NOTE(review): convert_half_sat maps to convert_float while convert_half1_sat maps to
 * convert_half - looks like a copy-paste slip; confirm the intent before changing it.
 */
#define convert_half_sat convert_float
#define convert_half1_sat convert_half
#define convert_half2_sat convert_half2
#define convert_half3_sat convert_half3
#define convert_half4_sat convert_half4
#define convert_half8_sat convert_half8
#define convert_half16_sat convert_half16

/* Size-1 conversion names map to the scalar conversion. */
#define convert_float1 convert_float
#define convert_half1 convert_half
#define convert_char1 convert_char
#define convert_uchar1 convert_uchar
#define convert_short1 convert_short
#define convert_ushort1 convert_ushort
#define convert_int1 convert_int
#define convert_uint1 convert_uint
#define convert_long1 convert_long
#define convert_ulong1 convert_ulong
#define convert_double1 convert_double

/* Size-1 saturating conversion names map to the scalar saturating conversion.
 * The convert_ucharN_sat entries for N > 1 expand to themselves.
 */
#define convert_char1_sat convert_char_sat
#define convert_uchar1_sat convert_uchar_sat
#define convert_uchar2_sat convert_uchar2_sat
#define convert_uchar3_sat convert_uchar3_sat
#define convert_uchar4_sat convert_uchar4_sat
#define convert_uchar8_sat convert_uchar8_sat
#define convert_uchar16_sat convert_uchar16_sat
#define convert_short1_sat convert_short_sat
#define convert_ushort1_sat convert_ushort_sat
#define convert_int1_sat convert_int_sat
#define convert_uint1_sat convert_uint_sat
#define convert_long1_sat convert_long_sat
#define convert_ulong1_sat convert_ulong_sat
#define convert_double1_sat convert_double_sat

/** Vector type name built from an element type and a size (arguments expanded first). */
#define VEC_DATA_TYPE_STR(type, size) type##size
#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)

/** Convert x to the given type through convert_<type>. */
#define CONVERT_STR(x, type) (convert_##type((x)))
#define CONVERT(x, type) CONVERT_STR(x, type)

/** Convert x to the given type through convert_<type>_sat. */
#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)

/** Convert x to the given type through convert_<type>_sat_<round>. */
#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)

/* select_vec_dt_<type>(size): integer vector type with the same element width as <type>
 * (half maps to short, float maps to int). The entry for long follows this section.
 */
#define select_vec_dt_uchar(size) uchar##size
#define select_vec_dt_char(size) char##size
#define select_vec_dt_ushort(size) ushort##size
#define select_vec_dt_short(size) short##size
#define select_vec_dt_half(size) short##size
#define select_vec_dt_uint(size) uint##size
#define select_vec_dt_int(size) int##size
#define select_vec_dt_float(size) int##size
#define select_vec_dt_ulong(size) ulong##size
/* Last entry of the select_vec_dt_<type>(size) family started above this section. */
#define select_vec_dt_long(size) long##size

/** Integer vector (or, for SELECT_DATA_TYPE, size-1) type selected by select_vec_dt_<type>. */
#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)

/* signed_int_vec_dt_<type>(size): signed integer vector type with the same element
 * width as <type> (half maps to short, float maps to int).
 */
#define signed_int_vec_dt_uchar(size) char##size
#define signed_int_vec_dt_char(size) char##size
#define signed_int_vec_dt_ushort(size) short##size
#define signed_int_vec_dt_short(size) short##size
#define signed_int_vec_dt_half(size) short##size
#define signed_int_vec_dt_uint(size) int##size
#define signed_int_vec_dt_int(size) int##size
#define signed_int_vec_dt_float(size) int##size
#define signed_int_vec_dt_ulong(size) long##size
#define signed_int_vec_dt_long(size) long##size

/** Signed integer vector (or, for SIGNED_INT_DATA_TYPE, size-1) type for the given element type. */
#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size)
#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size)
#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1)

/* sum_reduce_N(x): sum of the N components of x.
 * The terms are added left to right from component 0 upwards and the whole
 * expansion is parenthesised, so the macro can be used safely inside a larger
 * expression (for example k * SUM_REDUCE(v, 4)).
 * The argument x is evaluated once per component.
 */
#define sum_reduce_1(x) (x)
#define sum_reduce_2(x) (((x).s0) + ((x).s1))
#define sum_reduce_3(x) (((x).s0) + ((x).s1) + ((x).s2))
#define sum_reduce_4(x) (((x).s0) + ((x).s1) + ((x).s2) + ((x).s3))
#define sum_reduce_8(x) \
    (((x).s0) + ((x).s1) + ((x).s2) + ((x).s3) + ((x).s4) + ((x).s5) + ((x).s6) + ((x).s7))
#define sum_reduce_16(x) \
    (((x).s0) + ((x).s1) + ((x).s2) + ((x).s3) + ((x).s4) + ((x).s5) + ((x).s6) + ((x).s7) + \
     ((x).s8) + ((x).s9) + ((x).sA) + ((x).sB) + ((x).sC) + ((x).sD) + ((x).sE) + ((x).sF))

/** Sum of all components of vector x of the given size (arguments expanded first). */
#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x)
#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)

/* prod_reduce_N(x): product of the N components of x.
 * Same structure as sum_reduce_N: left-to-right evaluation, fully parenthesised,
 * x evaluated once per component.
 */
#define prod_reduce_1(x) (x)
#define prod_reduce_2(x) (((x).s0) * ((x).s1))
#define prod_reduce_3(x) (((x).s0) * ((x).s1) * ((x).s2))
#define prod_reduce_4(x) (((x).s0) * ((x).s1) * ((x).s2) * ((x).s3))
#define prod_reduce_8(x) \
    (((x).s0) * ((x).s1) * ((x).s2) * ((x).s3) * ((x).s4) * ((x).s5) * ((x).s6) * ((x).s7))
#define prod_reduce_16(x) \
    (((x).s0) * ((x).s1) * ((x).s2) * ((x).s3) * ((x).s4) * ((x).s5) * ((x).s6) * ((x).s7) * \
     ((x).s8) * ((x).s9) * ((x).sA) * ((x).sB) * ((x).sC) * ((x).sD) * ((x).sE) * ((x).sF))

/** Product of all components of vector x of the given size (arguments expanded first). */
#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x)
#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)

/* max_reduce_N(x): maximum of the N components of x, built from nested max calls. */
#define max_reduce_1(x) (x)
#define max_reduce_2(x) max(((x).s0), ((x).s1))
#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))

/** Maximum over all components of vector x of the given size (arguments expanded first). */
#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)

/* Kernel-parameter lists. Each macro expands to the parameters that describe one
 * buffer called name: its base pointer, one stride / step pair per dimension and
 * the offset of the first element in bytes.
 */
#define VECTOR_DECLARATION(name) \
    __global uchar *name##_ptr, \
    uint name##_stride_x, \
    uint name##_step_x, \
    uint name##_offset_first_element_in_bytes

#define IMAGE_DECLARATION(name) \
    __global uchar *name##_ptr, \
    uint name##_stride_x, \
    uint name##_step_x, \
    uint name##_stride_y, \
    uint name##_step_y, \
    uint name##_offset_first_element_in_bytes

#define TENSOR3D_DECLARATION(name) \
    __global uchar *name##_ptr, \
    uint name##_stride_x, \
    uint name##_step_x, \
    uint name##_stride_y, \
    uint name##_step_y, \
    uint name##_stride_z, \
    uint name##_step_z, \
    uint name##_offset_first_element_in_bytes

#define TENSOR4D_DECLARATION(name) \
    __global uchar *name##_ptr, \
    uint name##_stride_x, \
    uint name##_step_x, \
    uint name##_stride_y, \
    uint name##_step_y, \
    uint name##_stride_z, \
    uint name##_step_z, \
    uint name##_stride_w, \
    uint name##_step_w, \
    uint name##_offset_first_element_in_bytes

#define TENSOR5D_DECLARATION(name) \
    __global uchar *name##_ptr, \
    uint name##_stride_x, \
    uint name##_step_x, \
    uint name##_stride_y, \
    uint name##_step_y, \
    uint name##_stride_z, \
    uint name##_step_z, \
    uint name##_stride_w, \
    uint name##_step_w, \
    uint name##_stride_v, \
    uint name##_step_v, \
    uint name##_offset_first_element_in_bytes

/* Struct builders. Each macro forwards the parameters declared by the matching
 * *_DECLARATION macro to one of the inline helpers defined below. The _NO_STEP
 * variants pass 0 for the steps, as spelled out in each expansion.
 */
#define CONVERT_TO_VECTOR_STRUCT(name) \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)

#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)

#define CONVERT_TO_IMAGE_STRUCT(name) \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)

#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)

/* Defined once; the original text repeated this identical definition a second time. */
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)

/* Only the x and y steps are replaced by 0 here; the z step is still forwarded. */
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)

#define CONVERT_TO_TENSOR3D_STRUCT(name) \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z)

#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)

#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)

#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)

#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name) \
    tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                           name##_stride_z, name##_step_z)

/** Pointer, first-element offset and x stride of a 1D buffer. */
typedef struct Vector
{
    __global uchar *ptr;
    int             offset_first_element_in_bytes;
    int             stride_x;
} Vector;

/** Pointer, first-element offset and x / y strides of a 2D buffer. */
typedef struct Image
{
    __global uchar *ptr;
    int             offset_first_element_in_bytes;
    int             stride_x;
    int             stride_y;
} Image;

/** Pointer, first-element offset and x / y / z strides of a 3D buffer. */
typedef struct Tensor3D
{
    __global uchar *ptr;
    int             offset_first_element_in_bytes;
    int             stride_x;
    int             stride_y;
    int             stride_z;
} Tensor3D;

/** Pointer, first-element offset and x / y / z / w strides of a 4D buffer. */
typedef struct Tensor4D
{
    __global uchar *ptr;
    int             offset_first_element_in_bytes;
    int             stride_x;
    int             stride_y;
    int             stride_z;
    int             stride_w;
} Tensor4D;

/** Build a Vector whose pointer is advanced to the current work-item.
 *
 * @param[in] ptr                           Base pointer of the buffer
 * @param[in] offset_first_element_in_bytes Offset of the first element, in bytes
 * @param[in] stride_x                      Stride along x, stored in the result
 * @param[in] step_x                        Pointer advance per unit of get_global_id(0)
 *
 * @return Vector with ptr = ptr + offset + get_global_id(0) * step_x
 */
inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
{
    Vector vector =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
    };
    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
    return vector;
}

/** Build an Image whose pointer is advanced to the current work-item.
 *
 * @param[in] ptr                           Base pointer of the buffer
 * @param[in] offset_first_element_in_bytes Offset of the first element, in bytes
 * @param[in] stride_x                      Stride along x, stored in the result
 * @param[in] step_x                        Pointer advance per unit of get_global_id(0)
 * @param[in] stride_y                      Stride along y, stored in the result
 * @param[in] step_y                        Pointer advance per unit of get_global_id(1)
 *
 * @return Image with ptr advanced by the offset and both work-item steps
 */
inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
{
    Image img =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
        .stride_y                      = stride_y
    };
    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
    return img;
}

/** Build an Image from 3D tensor parameters, advancing the pointer along all three dimensions.
 *
 * The z stride is accepted but not stored, because Image has no z stride field.
 *
 * @param[in] ptr                           Base pointer of the buffer
 * @param[in] offset_first_element_in_bytes Offset of the first element, in bytes
 * @param[in] stride_x                      Stride along x, stored in the result
 * @param[in] step_x                        Pointer advance per unit of get_global_id(0)
 * @param[in] stride_y                      Stride along y, stored in the result
 * @param[in] step_y                        Pointer advance per unit of get_global_id(1)
 * @param[in] stride_z                      Stride along z (unused)
 * @param[in] step_z                        Pointer advance per unit of get_global_id(2)
 *
 * @return Image with ptr advanced by the offset and the three work-item steps
 */
inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Image img =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
        .stride_y                      = stride_y
    };
    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
    return img;
}

/** Build a Tensor3D whose pointer is advanced to the current work-item.
 *
 * @param[in] ptr                           Base pointer of the buffer
 * @param[in] offset_first_element_in_bytes Offset of the first element, in bytes
 * @param[in] stride_x                      Stride along x, stored in the result
 * @param[in] step_x                        Pointer advance per unit of get_global_id(0)
 * @param[in] stride_y                      Stride along y, stored in the result
 * @param[in] step_y                        Pointer advance per unit of get_global_id(1)
 * @param[in] stride_z                      Stride along z, stored in the result
 * @param[in] step_z                        Pointer advance per unit of get_global_id(2)
 *
 * @return Tensor3D with ptr advanced by the offset and the three work-item steps
 */
inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Tensor3D tensor =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
        .stride_y                      = stride_y,
        .stride_z                      = stride_z
    };
    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
    return tensor;
}

/** Build a Tensor3D without moving the pointer.
 *
 * The step parameters are accepted but unused; ptr is stored exactly as given.
 *
 * @return Tensor3D holding ptr, the offset and the three strides
 */
inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Tensor3D tensor =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
        .stride_y                      = stride_y,
        .stride_z                      = stride_z
    };
    return tensor;
}

/** Build a Tensor4D whose pointer is advanced to the current work-item.
 *
 * get_global_id(2) carries two indices: its remainder modulo mod_size is
 * multiplied by step_z and its quotient by mod_size is multiplied by step_w.
 *
 * @param[in] ptr                           Base pointer of the buffer
 * @param[in] offset_first_element_in_bytes Offset of the first element, in bytes
 * @param[in] stride_x                      Stride along x, stored in the result
 * @param[in] step_x                        Pointer advance per unit of get_global_id(0)
 * @param[in] stride_y                      Stride along y, stored in the result
 * @param[in] step_y                        Pointer advance per unit of get_global_id(1)
 * @param[in] stride_z                      Stride along z, stored in the result
 * @param[in] step_z                        Pointer advance per unit of get_global_id(2) % mod_size
 * @param[in] stride_w                      Stride along w, stored in the result
 * @param[in] step_w                        Pointer advance per unit of get_global_id(2) / mod_size
 * @param[in] mod_size                      Divisor used to split get_global_id(2); used as a
 *                                          divisor without any check, so it must not be 0
 *
 * @return Tensor4D with ptr advanced by the offset and the four work-item steps
 */
inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
                                             uint step_w,
                                             uint mod_size)
{
    Tensor4D tensor =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
        .stride_y                      = stride_y,
        .stride_z                      = stride_z,
        .stride_w                      = stride_w
    };

    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
    return tensor;
}

/** Address of element x of a Vector, relative to its current pointer. */
inline __global const uchar *vector_offset(const Vector *vec, int x)
{
    return vec->ptr + x * vec->stride_x;
}

/** Address of element (x, y) of an Image, relative to its current pointer. */
inline __global uchar *offset(const Image *img, int x, int y)
{
    return img->ptr + x * img->stride_x + y * img->stride_y;
}

/** Address of element (x, y, z) of a Tensor3D, relative to its current pointer. */
inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
{
    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
}

/** Address of element (x, y, z, w) of a Tensor4D, relative to its current pointer. */
inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
{
    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
}

/** Convert a linear element index into an address inside a Tensor3D.
 *
 * The index is split into z = index / (width * height), y = remainder / width and
 * x = remainder % width. The first-element offset stored in the tensor is added
 * to the result.
 *
 * @param[in] tensor Tensor to address
 * @param[in] width  Extent along x, in elements
 * @param[in] height Extent along y, in elements
 * @param[in] depth  Extent along z (unused)
 * @param[in] index  Linear element index
 *
 * @return Address of the selected element
 */
inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
{
    uint num_elements = width * height;

    const uint z = index / num_elements;

    index %= num_elements;

    const uint y = index / width;

    index %= width;

    const uint x = index;

    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes;
}

#endif /* presumably closes the ARM_COMPUTE_HELPER_H include guard opened at the top of the file */

/** Select n0 consecutive components of x starting at component offset.
 *  Dispatches to scalar_access_<offset>_<n0> after expanding the arguments.
 */
#define SCALAR_ACCESS_STR(offset, n0, x) scalar_access_##offset##_##n0(x)
#define SCALAR_ACCESS(offset, n0, x) SCALAR_ACCESS_STR(offset, n0, x)

/* Offset 0. */
#define scalar_access_0_1(x) ((x).s0)
#define scalar_access_0_2(x) ((x).s01)
#define scalar_access_0_3(x) ((x).s012)
#define scalar_access_0_4(x) ((x).s0123)
#define scalar_access_0_8(x) ((x).s01234567)
#define scalar_access_0_16(x) ((x).s0123456789ABCDEF)

/* Offset 1. */
#define scalar_access_1_1(x) ((x).s1)
#define scalar_access_1_2(x) ((x).s12)
#define scalar_access_1_3(x) ((x).s123)
#define scalar_access_1_4(x) ((x).s1234)
#define scalar_access_1_8(x) ((x).s12345678)

/* Offset 2. */
#define scalar_access_2_1(x) ((x).s2)
#define scalar_access_2_2(x) ((x).s23)
#define scalar_access_2_3(x) ((x).s234)
#define scalar_access_2_4(x) ((x).s2345)
#define scalar_access_2_8(x) ((x).s23456789)

/* Offset 3. */
#define scalar_access_3_1(x) ((x).s3)
#define scalar_access_3_2(x) ((x).s34)
#define scalar_access_3_3(x) ((x).s345)
#define scalar_access_3_4(x) ((x).s3456)
#define scalar_access_3_8(x) ((x).s3456789A)

/* Offset 4. */
#define scalar_access_4_1(x) ((x).s4)
#define scalar_access_4_2(x) ((x).s45)
#define scalar_access_4_3(x) ((x).s456)
#define scalar_access_4_4(x) ((x).s4567)
#define scalar_access_4_8(x) ((x).s456789AB)

/* Offset 8. */
#define scalar_access_8_1(x) ((x).s8)
#define scalar_access_8_2(x) ((x).s89)
#define scalar_access_8_3(x) ((x).s89A)
#define scalar_access_8_4(x) ((x).s89AB)
#define scalar_access_8_8(x) ((x).s89ABCDEF)

/* Offset 12. */
#define scalar_access_12_1(x) ((x).sC)
#define scalar_access_12_2(x) ((x).sCD)
#define scalar_access_12_3(x) ((x).sCDE)
#define scalar_access_12_4(x) ((x).sCDEF)

/* Offset 16.
 * NOTE(review): this selects component F (index 15) although the name says offset 16 -
 * confirm whether that is intended.
 */
#define scalar_access_16_1(x) ((x).sF)

/* LOAD_TENSOR_ROW_n: load n rows of N0 elements each.
 * Row k is read from PTR + k * STRIDE_Y + Z##k, reinterpreted as a pointer to
 * DATA_TYPE, and written into the N0 components of BASENAME##k that start at
 * component COL_OFFSET. Rows 10 and above use the suffixes A, B, C, ...
 * Each macro first expands the macro for n - 1 rows and then adds one row.
 */
#define LOAD_TENSOR_ROW_0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    ({})

#define LOAD_TENSOR_ROW_1(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##0) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define LOAD_TENSOR_ROW_2(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_1(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##1) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define LOAD_TENSOR_ROW_3(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_2(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##2) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define LOAD_TENSOR_ROW_4(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_3(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##3) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define LOAD_TENSOR_ROW_5(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_4(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##4) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define LOAD_TENSOR_ROW_6(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_5(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##5) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define LOAD_TENSOR_ROW_7(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_6(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##6) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define LOAD_TENSOR_ROW_8(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_7(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##7) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define LOAD_TENSOR_ROW_9(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_8(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##8) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define LOAD_TENSOR_ROW_10(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_9(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##9) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define LOAD_TENSOR_ROW_11(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_10(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##A) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define LOAD_TENSOR_ROW_12(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_11(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##B) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define LOAD_TENSOR_ROW_13(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_12(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##C) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define LOAD_TENSOR_ROW_14(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_13(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##D) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

/* The definition below continues past the end of this section; its visible part is reproduced unchanged. */
#define LOAD_TENSOR_ROW_15(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_14(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##E) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define LOAD_TENSOR_ROW_16(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_15(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##F) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));

/** LOAD_TENSOR(M0, N0, ...): dispatch to LOAD_TENSOR_ROW_<M0>.
 *
 * Takes 8 arguments: M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z.
 * The _STR level performs the token pasting so that M0 is macro-expanded first.
 */
#define LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) LOAD_TENSOR_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)
#define LOAD_TENSOR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)

/** LOAD_TENSOR_M0X<N0>: load M0 rows of N0 columns into the row variables a##0, a##1, ...
 *
 * N0 values of 1, 2, 3, 4, 8 and 16 are loaded with a single LOAD_TENSOR call at
 * column offset 0. Every other width is split into pieces of 8, 4 and a remainder
 * of 1-3 columns. Each follow-up piece advances input_ptr by
 * (columns already loaded) * sizeof(DATA_TYPE) and passes that same column count as
 * COL_OFFSET, so the piece lands in the matching components of each row vector.
 * LOAD_TENSOR_M0X0 loads nothing and expands to ({}).
 *
 * The column offset is a separate argument of LOAD_TENSOR: it must be written as
 * "input_ptr, 0", otherwise LOAD_TENSOR receives 7 arguments instead of 8.
 */
#define LOAD_TENSOR_M0X0(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    ({})

#define LOAD_TENSOR_M0X1(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);

#define LOAD_TENSOR_M0X2(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);

#define LOAD_TENSOR_M0X3(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);

#define LOAD_TENSOR_M0X4(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);

#define LOAD_TENSOR_M0X5(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \
    LOAD_TENSOR(M0, 1, DATA_TYPE, a, input_ptr + 4 * sizeof(DATA_TYPE), 4, src_stride_y, zin);

#define LOAD_TENSOR_M0X6(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \
    LOAD_TENSOR(M0, 2, DATA_TYPE, a, input_ptr + 4 * sizeof(DATA_TYPE), 4, src_stride_y, zin);

#define LOAD_TENSOR_M0X7(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \
    LOAD_TENSOR(M0, 3, DATA_TYPE, a, input_ptr + 4 * sizeof(DATA_TYPE), 4, src_stride_y, zin);

#define LOAD_TENSOR_M0X8(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);

#define LOAD_TENSOR_M0X9(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \
    LOAD_TENSOR(M0, 1, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);

#define LOAD_TENSOR_M0X10(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \
    LOAD_TENSOR(M0, 2, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);

#define LOAD_TENSOR_M0X11(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \
    LOAD_TENSOR(M0, 3, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);

#define LOAD_TENSOR_M0X12(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \
    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);

#define LOAD_TENSOR_M0X13(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \
    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); \
    LOAD_TENSOR(M0, 1, DATA_TYPE, a, input_ptr + 12 * sizeof(DATA_TYPE), 12, src_stride_y, zin);

#define LOAD_TENSOR_M0X14(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \
    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); \
    LOAD_TENSOR(M0, 2, DATA_TYPE, a, input_ptr + 12 * sizeof(DATA_TYPE), 12, src_stride_y, zin);

#define LOAD_TENSOR_M0X15(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \
    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); \
    LOAD_TENSOR(M0, 3, DATA_TYPE, a, input_ptr + 12 * sizeof(DATA_TYPE), 12, src_stride_y, zin);

#define LOAD_TENSOR_M0X16(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);

/** LOAD_TENSOR_M0XN0(M0, N0, ...): dispatch to LOAD_TENSOR_M0X<N0>.
 *
 * The _STR level performs the token pasting so that N0 is macro-expanded first.
 */
#define LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) LOAD_TENSOR_M0X##N0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define LOAD_TENSOR_M0XN0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

/** LOAD_ROW_<M>: declare and load M row vectors BASENAME##0 .. BASENAME##<M-1>.
 *
 * Row i is declared as VEC_DATA_TYPE(DATA_TYPE, N0) and initialised with VLOAD(N0)
 * from (PTR + OFFSET + i * STRIDE_Y + Z##i), cast to a __global DATA_TYPE pointer.
 * Each macro first expands its predecessor. Row suffixes are single hexadecimal
 * digits: 0-9, then A-F for rows 10-15.
 */
#define LOAD_ROW_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##0 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y + Z##0));

#define LOAD_ROW_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##1 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y + Z##1));

#define LOAD_ROW_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##2 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y + Z##2));

#define LOAD_ROW_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##3 = \
VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y + Z##3)); 4220 4221#define LOAD_ROW_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 4222 LOAD_ROW_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 4223 VEC_DATA_TYPE(DATA_TYPE, N0) \ 4224 BASENAME##4 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y + Z##4)); 4225 4226#define LOAD_ROW_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 4227 LOAD_ROW_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 4228 VEC_DATA_TYPE(DATA_TYPE, N0) \ 4229 BASENAME##5 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y + Z##5)); 4230 4231#define LOAD_ROW_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 4232 LOAD_ROW_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 4233 VEC_DATA_TYPE(DATA_TYPE, N0) \ 4234 BASENAME##6 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y + Z##6)); 4235 4236#define LOAD_ROW_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 4237 LOAD_ROW_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 4238 VEC_DATA_TYPE(DATA_TYPE, N0) \ 4239 BASENAME##7 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y + Z##7)); 4240 4241#define LOAD_ROW_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 4242 LOAD_ROW_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 4243 VEC_DATA_TYPE(DATA_TYPE, N0) \ 4244 BASENAME##8 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y + Z##8)); 4245 4246#define LOAD_ROW_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 4247 LOAD_ROW_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 4248 VEC_DATA_TYPE(DATA_TYPE, N0) \ 4249 BASENAME##9 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y + Z##9)); 4250 4251#define LOAD_ROW_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 4252 LOAD_ROW_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 4253 VEC_DATA_TYPE(DATA_TYPE, N0) \ 4254 BASENAME##A = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 
OFFSET + 10 * STRIDE_Y + Z##A)); 4255 4256#define LOAD_ROW_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 4257 LOAD_ROW_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 4258 VEC_DATA_TYPE(DATA_TYPE, N0) \ 4259 BASENAME##B = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y + Z##B)); 4260 4261#define LOAD_ROW_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 4262 LOAD_ROW_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 4263 VEC_DATA_TYPE(DATA_TYPE, N0) \ 4264 BASENAME##C = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y + Z##C)); 4265 4266#define LOAD_ROW_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 4267 LOAD_ROW_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 4268 VEC_DATA_TYPE(DATA_TYPE, N0) \ 4269 BASENAME##D = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y + Z##D)); 4270 4271#define LOAD_ROW_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 4272 LOAD_ROW_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 4273 VEC_DATA_TYPE(DATA_TYPE, N0) \ 4274 BASENAME##E = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y + Z##E)); 4275 4276#define LOAD_ROW_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 4277 LOAD_ROW_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 4278 VEC_DATA_TYPE(DATA_TYPE, N0) \ 4279 BASENAME##F = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y + Z##F)); 4280 4281 4282 4283 4284#define LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) 4285#define LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) 4286 4287 4288 4289#define LOAD_ROW_PARTIAL_1(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 4290 VLOAD_PARTIAL(N0, LOAD_N0) \ 4291 (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y + Z##0)); 4292 4293#define 
LOAD_ROW_PARTIAL_2(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 4294 LOAD_ROW_PARTIAL_1(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 4295 VLOAD_PARTIAL(N0, LOAD_N0) \ 4296 (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y + Z##1)); 4297 4298#define LOAD_ROW_PARTIAL_3(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 4299 LOAD_ROW_PARTIAL_2(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 4300 VLOAD_PARTIAL(N0, LOAD_N0) \ 4301 (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y + Z##2)); 4302 4303#define LOAD_ROW_PARTIAL_4(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 4304 LOAD_ROW_PARTIAL_3(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 4305 VLOAD_PARTIAL(N0, LOAD_N0) \ 4306 (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y + Z##3)); 4307 4308#define LOAD_ROW_PARTIAL_5(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 4309 LOAD_ROW_PARTIAL_4(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 4310 VLOAD_PARTIAL(N0, LOAD_N0) \ 4311 (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y + Z##4)); 4312 4313#define LOAD_ROW_PARTIAL_6(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 4314 LOAD_ROW_PARTIAL_5(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 4315 VLOAD_PARTIAL(N0, LOAD_N0) \ 4316 (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y + Z##5)); 4317 4318#define LOAD_ROW_PARTIAL_7(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 4319 LOAD_ROW_PARTIAL_6(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 4320 VLOAD_PARTIAL(N0, LOAD_N0) \ 4321 (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y + Z##6)); 4322 4323#define LOAD_ROW_PARTIAL_8(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 4324 LOAD_ROW_PARTIAL_7(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 4325 VLOAD_PARTIAL(N0, 
LOAD_N0) \ 4326 (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y + Z##7)); 4327 4328#define LOAD_ROW_PARTIAL_9(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 4329 LOAD_ROW_PARTIAL_8(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 4330 VLOAD_PARTIAL(N0, LOAD_N0) \ 4331 (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y + Z##8)); 4332 4333#define LOAD_ROW_PARTIAL_10(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 4334 LOAD_ROW_PARTIAL_9(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 4335 VLOAD_PARTIAL(N0, LOAD_N0) \ 4336 (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y + Z##9)); 4337 4338#define LOAD_ROW_PARTIAL_11(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 4339 LOAD_ROW_PARTIAL_10(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 4340 VLOAD_PARTIAL(N0, LOAD_N0) \ 4341 (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y + Z##A)); 4342 4343#define LOAD_ROW_PARTIAL_12(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 4344 LOAD_ROW_PARTIAL_11(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 4345 VLOAD_PARTIAL(N0, LOAD_N0) \ 4346 (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y + Z##B)); 4347 4348#define LOAD_ROW_PARTIAL_13(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 4349 LOAD_ROW_PARTIAL_12(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 4350 VLOAD_PARTIAL(N0, LOAD_N0) \ 4351 (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y + Z##C)); 4352 4353#define LOAD_ROW_PARTIAL_14(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 4354 LOAD_ROW_PARTIAL_13(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 4355 VLOAD_PARTIAL(N0, LOAD_N0) \ 4356 (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y + Z##D)); 4357 4358#define LOAD_ROW_PARTIAL_15(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, 
STRIDE_Y, Z) \ 4359 LOAD_ROW_PARTIAL_14(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 4360 VLOAD_PARTIAL(N0, LOAD_N0) \ 4361 (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y + Z##E)); 4362 4363#define LOAD_ROW_PARTIAL_16(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 4364 LOAD_ROW_PARTIAL_15(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 4365 VLOAD_PARTIAL(N0, LOAD_N0) \ 4366 (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y + Z##F)); 4367 4368 4369 4370#define LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_ROW_PARTIAL_##LOAD_M0(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) 4371#define LOAD_BLOCK_PARTIAL(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) 4372 4373#define LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 4374 if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \ 4375 { \ 4376 LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ 4377 } \ 4378 else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \ 4379 { \ 4380 LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ 4381 } \ 4382 else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \ 4383 { \ 4384 LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ 4385 } \ 4386 else \ 4387 { \ 4388 LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ 4389 } 4390 4391#define LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \ 4392 if(!(PARTIAL_COND_X)) \ 4393 { \ 4394 LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ 4395 } \ 
4396 else \ 4397 { \ 4398 LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ 4399 } 4400 4401#define LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \ 4402 if(!(PARTIAL_COND_Y)) \ 4403 { \ 4404 LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ 4405 } \ 4406 else \ 4407 { \ 4408 LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ 4409 } 4410 4411 4412#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 4413 4414#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 4415 LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) 4416 4417#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0 4418 4419#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 4420 REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \ 4421 LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) 4422 4423#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0 4424 4425#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 4426 REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \ 4427 LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) 4428 4429#else 4430 4431#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 4432 REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \ 4433 LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, 
BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) 4434 4435#endif 4436 4437 4438#define LOAD_TEXTURE2D_ROW_1(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 4439 BASENAME##0 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 0 * X_STEP_ROW), (Y_COORD + 0 * Y_STEP_ROW)) 4440 4441#define LOAD_TEXTURE2D_ROW_2(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 4442 LOAD_TEXTURE2D_ROW_1(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 4443 BASENAME##1 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 1 * X_STEP_ROW), (Y_COORD + 1 * Y_STEP_ROW)) 4444 4445#define LOAD_TEXTURE2D_ROW_3(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 4446 LOAD_TEXTURE2D_ROW_2(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 4447 BASENAME##2 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 2 * X_STEP_ROW), (Y_COORD + 2 * Y_STEP_ROW)) 4448 4449#define LOAD_TEXTURE2D_ROW_4(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 4450 LOAD_TEXTURE2D_ROW_3(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 4451 BASENAME##3 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 3 * X_STEP_ROW), (Y_COORD + 3 * Y_STEP_ROW)) 4452 4453#define LOAD_TEXTURE2D_ROW_5(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 4454 LOAD_TEXTURE2D_ROW_4(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 4455 BASENAME##4 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 4 * X_STEP_ROW), (Y_COORD + 4 * Y_STEP_ROW)) 4456 4457#define LOAD_TEXTURE2D_ROW_6(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 4458 LOAD_TEXTURE2D_ROW_5(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 4459 BASENAME##5 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 5 * X_STEP_ROW), (Y_COORD + 5 * Y_STEP_ROW)) 4460 4461#define LOAD_TEXTURE2D_ROW_7(N0, DATA_TYPE, 
BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 4462 LOAD_TEXTURE2D_ROW_6(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 4463 BASENAME##6 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 6 * X_STEP_ROW), (Y_COORD + 6 * Y_STEP_ROW)) 4464 4465#define LOAD_TEXTURE2D_ROW_8(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 4466 LOAD_TEXTURE2D_ROW_7(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 4467 BASENAME##7 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 7 * X_STEP_ROW), (Y_COORD + 7 * Y_STEP_ROW)) 4468 4469#define LOAD_TEXTURE2D_ROW_9(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 4470 LOAD_TEXTURE2D_ROW_8(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 4471 BASENAME##8 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 8 * X_STEP_ROW), (Y_COORD + 8 * Y_STEP_ROW)) 4472 4473#define LOAD_TEXTURE2D_ROW_10(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 4474 LOAD_TEXTURE2D_ROW_9(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 4475 BASENAME##9 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 9 * X_STEP_ROW), (Y_COORD + 9 * Y_STEP_ROW)) 4476 4477#define LOAD_TEXTURE2D_ROW_11(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 4478 LOAD_TEXTURE2D_ROW_10(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 4479 BASENAME##A = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 10 * X_STEP_ROW), (Y_COORD + 10 * Y_STEP_ROW)) 4480 4481#define LOAD_TEXTURE2D_ROW_12(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 4482 LOAD_TEXTURE2D_ROW_11(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 4483 BASENAME##B = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 11 * X_STEP_ROW), (Y_COORD + 11 * Y_STEP_ROW)) 4484 4485#define LOAD_TEXTURE2D_ROW_13(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 4486 
LOAD_TEXTURE2D_ROW_12(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 4487 BASENAME##C = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 12 * X_STEP_ROW), (Y_COORD + 12 * Y_STEP_ROW)) 4488 4489#define LOAD_TEXTURE2D_ROW_14(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 4490 LOAD_TEXTURE2D_ROW_13(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 4491 BASENAME##D = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 13 * X_STEP_ROW), (Y_COORD + 13 * Y_STEP_ROW)) 4492 4493#define LOAD_TEXTURE2D_ROW_15(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 4494 LOAD_TEXTURE2D_ROW_14(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 4495 BASENAME##E = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 14 * X_STEP_ROW), (Y_COORD + 14 * Y_STEP_ROW)) 4496 4497#define LOAD_TEXTURE2D_ROW_16(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 4498 LOAD_TEXTURE2D_ROW_15(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 4499 BASENAME##F = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 15 * X_STEP_ROW), (Y_COORD + 15 * Y_STEP_ROW)) 4500 4501 4502 4503#define LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) LOAD_TEXTURE2D_ROW_##M0(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) 4504#define LOAD_TEXTURE2D(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) 4505 4506 4507 4508#define LOAD_ROW_INDIRECT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 4509 VEC_DATA_TYPE(DATA_TYPE, N0) \ 4510 BASENAME##0; \ 4511 if(Y_MASK##0 != 0) \ 4512 BASENAME##0 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##0 * STRIDE_Y)); \ 4513 else \ 4514 BASENAME##0 = 0; 4515 4516#define LOAD_ROW_INDIRECT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, 
Y_MASK) \ 4517 LOAD_ROW_INDIRECT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 4518 VEC_DATA_TYPE(DATA_TYPE, N0) \ 4519 BASENAME##1; \ 4520 if(Y_MASK##1 != 0) \ 4521 BASENAME##1 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##1 * STRIDE_Y)); \ 4522 else \ 4523 BASENAME##1 = 0; 4524 4525#define LOAD_ROW_INDIRECT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 4526 LOAD_ROW_INDIRECT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 4527 VEC_DATA_TYPE(DATA_TYPE, N0) \ 4528 BASENAME##2; \ 4529 if(Y_MASK##2 != 0) \ 4530 BASENAME##2 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##2 * STRIDE_Y)); \ 4531 else \ 4532 BASENAME##2 = 0; 4533 4534#define LOAD_ROW_INDIRECT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 4535 LOAD_ROW_INDIRECT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 4536 VEC_DATA_TYPE(DATA_TYPE, N0) \ 4537 BASENAME##3; \ 4538 if(Y_MASK##3 != 0) \ 4539 BASENAME##3 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##3 * STRIDE_Y)); \ 4540 else \ 4541 BASENAME##3 = 0; 4542 4543#define LOAD_ROW_INDIRECT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 4544 LOAD_ROW_INDIRECT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 4545 VEC_DATA_TYPE(DATA_TYPE, N0) \ 4546 BASENAME##4; \ 4547 if(Y_MASK##4 != 0) \ 4548 BASENAME##4 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##4 * STRIDE_Y)); \ 4549 else \ 4550 BASENAME##4 = 0; 4551 4552#define LOAD_ROW_INDIRECT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 4553 LOAD_ROW_INDIRECT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 4554 VEC_DATA_TYPE(DATA_TYPE, N0) \ 4555 BASENAME##5; \ 4556 if(Y_MASK##5 != 0) \ 4557 BASENAME##5 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##5 * STRIDE_Y)); \ 4558 else \ 4559 BASENAME##5 = 0; 4560 4561#define LOAD_ROW_INDIRECT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 4562 LOAD_ROW_INDIRECT_6(N0, 
DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 4563 VEC_DATA_TYPE(DATA_TYPE, N0) \ 4564 BASENAME##6; \ 4565 if(Y_MASK##6 != 0) \ 4566 BASENAME##6 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##6 * STRIDE_Y)); \ 4567 else \ 4568 BASENAME##6 = 0; 4569 4570#define LOAD_ROW_INDIRECT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 4571 LOAD_ROW_INDIRECT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 4572 VEC_DATA_TYPE(DATA_TYPE, N0) \ 4573 BASENAME##7; \ 4574 if(Y_MASK##7 != 0) \ 4575 BASENAME##7 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##7 * STRIDE_Y)); \ 4576 else \ 4577 BASENAME##7 = 0; 4578 4579#define LOAD_ROW_INDIRECT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 4580 LOAD_ROW_INDIRECT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 4581 VEC_DATA_TYPE(DATA_TYPE, N0) \ 4582 BASENAME##8; \ 4583 if(Y_MASK##8 != 0) \ 4584 BASENAME##8 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##8 * STRIDE_Y)); \ 4585 else \ 4586 BASENAME##8 = 0; 4587 4588#define LOAD_ROW_INDIRECT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 4589 LOAD_ROW_INDIRECT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 4590 VEC_DATA_TYPE(DATA_TYPE, N0) \ 4591 BASENAME##9; \ 4592 if(Y_MASK##9 != 0) \ 4593 BASENAME##9 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##9 * STRIDE_Y)); \ 4594 else \ 4595 BASENAME##9 = 0; 4596 4597#define LOAD_ROW_INDIRECT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 4598 LOAD_ROW_INDIRECT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 4599 VEC_DATA_TYPE(DATA_TYPE, N0) \ 4600 BASENAME##A; \ 4601 if(Y_MASK##A != 0) \ 4602 BASENAME##A = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##A * STRIDE_Y)); \ 4603 else \ 4604 BASENAME##A = 0; 4605 4606#define LOAD_ROW_INDIRECT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 4607 LOAD_ROW_INDIRECT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, 
STRIDE_Y, Y, Y_MASK) \ 4608 VEC_DATA_TYPE(DATA_TYPE, N0) \ 4609 BASENAME##B; \ 4610 if(Y_MASK##B != 0) \ 4611 BASENAME##B = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##B * STRIDE_Y)); \ 4612 else \ 4613 BASENAME##B = 0; 4614 4615#define LOAD_ROW_INDIRECT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 4616 LOAD_ROW_INDIRECT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 4617 VEC_DATA_TYPE(DATA_TYPE, N0) \ 4618 BASENAME##C; \ 4619 if(Y_MASK##C != 0) \ 4620 BASENAME##C = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##C * STRIDE_Y)); \ 4621 else \ 4622 BASENAME##C = 0; 4623 4624#define LOAD_ROW_INDIRECT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 4625 LOAD_ROW_INDIRECT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 4626 VEC_DATA_TYPE(DATA_TYPE, N0) \ 4627 BASENAME##D; \ 4628 if(Y_MASK##D != 0) \ 4629 BASENAME##D = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##D * STRIDE_Y)); \ 4630 else \ 4631 BASENAME##D = 0; 4632 4633#define LOAD_ROW_INDIRECT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 4634 LOAD_ROW_INDIRECT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 4635 VEC_DATA_TYPE(DATA_TYPE, N0) \ 4636 BASENAME##E; \ 4637 if(Y_MASK##E != 0) \ 4638 BASENAME##E = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##E * STRIDE_Y)); \ 4639 else \ 4640 BASENAME##E = 0; 4641 4642#define LOAD_ROW_INDIRECT_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 4643 LOAD_ROW_INDIRECT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 4644 VEC_DATA_TYPE(DATA_TYPE, N0) \ 4645 BASENAME##F; \ 4646 if(Y_MASK##F != 0) \ 4647 BASENAME##F = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##F * STRIDE_Y)); \ 4648 else \ 4649 BASENAME##F = 0; 4650 4651 4652#define LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) LOAD_ROW_INDIRECT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) 
4653#define LOAD_BLOCK_INDIRECT(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) 4654 4655 4656#define LOAD_ELEMENT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 4657 VEC_DATA_TYPE(DATA_TYPE, N0) \ 4658 BASENAME##0 = *((__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y)); 4659 4660#define LOAD_ELEMENT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 4661 LOAD_ELEMENT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 4662 VEC_DATA_TYPE(DATA_TYPE, N0) \ 4663 BASENAME##1 = *((__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y)); 4664 4665#define LOAD_ELEMENT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 4666 LOAD_ELEMENT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 4667 VEC_DATA_TYPE(DATA_TYPE, N0) \ 4668 BASENAME##2 = *((__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y)); 4669 4670#define LOAD_ELEMENT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 4671 LOAD_ELEMENT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 4672 VEC_DATA_TYPE(DATA_TYPE, N0) \ 4673 BASENAME##3 = *((__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y)); 4674 4675#define LOAD_ELEMENT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 4676 LOAD_ELEMENT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 4677 VEC_DATA_TYPE(DATA_TYPE, N0) \ 4678 BASENAME##4 = *((__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y)); 4679 4680#define LOAD_ELEMENT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 4681 LOAD_ELEMENT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 4682 VEC_DATA_TYPE(DATA_TYPE, N0) \ 4683 BASENAME##5 = *((__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y)); 4684 4685#define LOAD_ELEMENT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 4686 LOAD_ELEMENT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 4687 VEC_DATA_TYPE(DATA_TYPE, N0) \ 4688 BASENAME##6 = *((__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y)); 4689 4690#define LOAD_ELEMENT_8(N0, 
DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 4691 LOAD_ELEMENT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 4692 VEC_DATA_TYPE(DATA_TYPE, N0) \ 4693 BASENAME##7 = *((__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y)); 4694 4695#define LOAD_ELEMENT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 4696 LOAD_ELEMENT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 4697 VEC_DATA_TYPE(DATA_TYPE, N0) \ 4698 BASENAME##8 = *((__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y)); 4699 4700#define LOAD_ELEMENT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 4701 LOAD_ELEMENT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 4702 VEC_DATA_TYPE(DATA_TYPE, N0) \ 4703 BASENAME##9 = *((__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y)); 4704 4705#define LOAD_ELEMENT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 4706 LOAD_ELEMENT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 4707 VEC_DATA_TYPE(DATA_TYPE, N0) \ 4708 BASENAME##A = *((__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y)); 4709 4710#define LOAD_ELEMENT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 4711 LOAD_ELEMENT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 4712 VEC_DATA_TYPE(DATA_TYPE, N0) \ 4713 BASENAME##B = *((__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y)); 4714 4715#define LOAD_ELEMENT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 4716 LOAD_ELEMENT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 4717 VEC_DATA_TYPE(DATA_TYPE, N0) \ 4718 BASENAME##C = *((__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y)); 4719 4720#define LOAD_ELEMENT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 4721 LOAD_ELEMENT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 4722 VEC_DATA_TYPE(DATA_TYPE, N0) \ 4723 BASENAME##D = *((__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y)); 4724 4725#define LOAD_ELEMENT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 4726 LOAD_ELEMENT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 4727 
VEC_DATA_TYPE(DATA_TYPE, N0) \ 4728 BASENAME##E = *((__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y)); 4729 4730#define LOAD_ELEMENT_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 4731 LOAD_ELEMENT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 4732 VEC_DATA_TYPE(DATA_TYPE, N0) \ 4733 BASENAME##F = *((__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y)); 4734 4735 4736 4737 4738#define LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) LOAD_ELEMENT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) 4739#define LOAD_SCALAR_AS_VECTOR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) 4740 4741 4742 4743#define CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 4744 Z##0 = (0 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ 4745 Z##0 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##0); \ 4746 Z##0 *= (CROSS_PLANE_PAD * STRIDE_Y); 4747 4748#define CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 4749 CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 4750 Z##1 = (1 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ 4751 Z##1 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##1); \ 4752 Z##1 *= (CROSS_PLANE_PAD * STRIDE_Y); 4753 4754#define CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 4755 CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 4756 Z##2 = (2 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ 4757 Z##2 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##2); \ 4758 Z##2 *= (CROSS_PLANE_PAD * STRIDE_Y); 4759 4760#define CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 4761 CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 4762 
Z##3 = (3 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ 4763 Z##3 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##3); \ 4764 Z##3 *= (CROSS_PLANE_PAD * STRIDE_Y); 4765 4766#define CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 4767 CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 4768 Z##4 = (4 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ 4769 Z##4 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##4); \ 4770 Z##4 *= (CROSS_PLANE_PAD * STRIDE_Y); 4771 4772#define CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 4773 CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 4774 Z##5 = (5 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ 4775 Z##5 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##5); \ 4776 Z##5 *= (CROSS_PLANE_PAD * STRIDE_Y); 4777 4778#define CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 4779 CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 4780 Z##6 = (6 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ 4781 Z##6 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##6); \ 4782 Z##6 *= (CROSS_PLANE_PAD * STRIDE_Y); 4783 4784#define CALCULATE_Z_OFFSET_8(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 4785 CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 4786 Z##7 = (7 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ 4787 Z##7 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##7); \ 4788 Z##7 *= (CROSS_PLANE_PAD * STRIDE_Y); 4789 4790 4791 4792 4793#define CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) CALCULATE_Z_OFFSET_##M0(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) 4794#define CALCULATE_Z_OFFSET(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, 
DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) 4795 4796 4797 4798#define SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) \ 4799 BASENAME##0 *= (DATA_TYPE)SCALE; 4800 4801#define SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE) \ 4802 SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) \ 4803 BASENAME##1 *= (DATA_TYPE)SCALE; 4804 4805#define SCALE_ROW_3(DATA_TYPE, BASENAME, SCALE) \ 4806 SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE) \ 4807 BASENAME##2 *= (DATA_TYPE)SCALE; 4808 4809#define SCALE_ROW_4(DATA_TYPE, BASENAME, SCALE) \ 4810 SCALE_ROW_3(DATA_TYPE, BASENAME, SCALE) \ 4811 BASENAME##3 *= (DATA_TYPE)SCALE; 4812 4813#define SCALE_ROW_5(DATA_TYPE, BASENAME, SCALE) \ 4814 SCALE_ROW_4(DATA_TYPE, BASENAME, SCALE) \ 4815 BASENAME##4 *= (DATA_TYPE)SCALE; 4816 4817#define SCALE_ROW_6(DATA_TYPE, BASENAME, SCALE) \ 4818 SCALE_ROW_5(DATA_TYPE, BASENAME, SCALE) \ 4819 BASENAME##5 *= (DATA_TYPE)SCALE; 4820 4821#define SCALE_ROW_7(DATA_TYPE, BASENAME, SCALE) \ 4822 SCALE_ROW_6(DATA_TYPE, BASENAME, SCALE) \ 4823 BASENAME##6 *= (DATA_TYPE)SCALE; 4824 4825#define SCALE_ROW_8(DATA_TYPE, BASENAME, SCALE) \ 4826 SCALE_ROW_7(DATA_TYPE, BASENAME, SCALE) \ 4827 BASENAME##7 *= (DATA_TYPE)SCALE; 4828 4829#define SCALE_ROW_9(DATA_TYPE, BASENAME, SCALE) \ 4830 SCALE_ROW_8(DATA_TYPE, BASENAME, SCALE) \ 4831 BASENAME##8 *= (DATA_TYPE)SCALE; 4832 4833#define SCALE_ROW_10(DATA_TYPE, BASENAME, SCALE) \ 4834 SCALE_ROW_9(DATA_TYPE, BASENAME, SCALE) \ 4835 BASENAME##9 *= (DATA_TYPE)SCALE; 4836 4837#define SCALE_ROW_11(DATA_TYPE, BASENAME, SCALE) \ 4838 SCALE_ROW_10(DATA_TYPE, BASENAME, SCALE) \ 4839 BASENAME##A *= (DATA_TYPE)SCALE; 4840 4841#define SCALE_ROW_12(DATA_TYPE, BASENAME, SCALE) \ 4842 SCALE_ROW_11(DATA_TYPE, BASENAME, SCALE) \ 4843 BASENAME##B *= (DATA_TYPE)SCALE; 4844 4845#define SCALE_ROW_13(DATA_TYPE, BASENAME, SCALE) \ 4846 SCALE_ROW_12(DATA_TYPE, BASENAME, SCALE) \ 4847 BASENAME##C *= (DATA_TYPE)SCALE; 4848 4849#define 
SCALE_ROW_14(DATA_TYPE, BASENAME, SCALE) \ 4850 SCALE_ROW_13(DATA_TYPE, BASENAME, SCALE) \ 4851 BASENAME##D *= (DATA_TYPE)SCALE; 4852 4853#define SCALE_ROW_15(DATA_TYPE, BASENAME, SCALE) \ 4854 SCALE_ROW_14(DATA_TYPE, BASENAME, SCALE) \ 4855 BASENAME##E *= (DATA_TYPE)SCALE; 4856 4857#define SCALE_ROW_16(DATA_TYPE, BASENAME, SCALE) \ 4858 SCALE_ROW_15(DATA_TYPE, BASENAME, SCALE) \ 4859 BASENAME##F *= (DATA_TYPE)SCALE; 4860 4861 4862 4863#define SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE) SCALE_ROW_##N(DATA_TYPE, BASENAME, SCALE) 4864#define SCALE_BLOCK(N, DATA_TYPE, BASENAME, SCALE) SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE) 4865 4866 4867 4868#define COLUMN_VECTOR1(IDX_COL, BASENAME, X, TYPE) \ 4869 TYPE BASENAME##IDX_COL = (TYPE)((X##0).s##IDX_COL); 4870#define COLUMN_VECTOR2(IDX_COL, BASENAME, X, TYPE) \ 4871 VEC_DATA_TYPE(TYPE, 2) \ 4872 BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0).s##IDX_COL, (X##1).s##IDX_COL); 4873#define COLUMN_VECTOR3(IDX_COL, BASENAME, X, TYPE) \ 4874 VEC_DATA_TYPE(TYPE, 3) \ 4875 BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 3))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL); 4876#define COLUMN_VECTOR4(IDX_COL, BASENAME, X, TYPE) \ 4877 VEC_DATA_TYPE(TYPE, 4) \ 4878 BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 4))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL); 4879#define COLUMN_VECTOR8(IDX_COL, BASENAME, X, TYPE) \ 4880 VEC_DATA_TYPE(TYPE, 8) \ 4881 BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL); 4882#define COLUMN_VECTOR16(IDX_COL, BASENAME, X, TYPE) \ 4883 VEC_DATA_TYPE(TYPE, 16) \ 4884 BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL, (X##8).s##IDX_COL, (X##9).s##IDX_COL, (X##A).s##IDX_COL, 
(X##B).s##IDX_COL, (X##C).s##IDX_COL, (X##D).s##IDX_COL, (X##E).s##IDX_COL, (X##F).s##IDX_COL); 4885 4886 4887 4888#define COLUMN_VECTOR_SCALAR1(IDX_COL, BASENAME, X, TYPE) \ 4889 TYPE BASENAME##IDX_COL = (TYPE)((X##0)); 4890#define COLUMN_VECTOR_SCALAR2(IDX_COL, BASENAME, X, TYPE) \ 4891 VEC_DATA_TYPE(TYPE, 2) \ 4892 BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0), (X##1)); 4893#define COLUMN_VECTOR_SCALAR3(IDX_COL, BASENAME, X, TYPE) \ 4894 VEC_DATA_TYPE(TYPE, 3) \ 4895 BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 3))((X##0), (X##1), (X##2)); 4896#define COLUMN_VECTOR_SCALAR4(IDX_COL, BASENAME, X, TYPE) \ 4897 VEC_DATA_TYPE(TYPE, 4) \ 4898 BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 4))((X##0), (X##1), (X##2), (X##3)); 4899#define COLUMN_VECTOR_SCALAR8(IDX_COL, BASENAME, X, TYPE) \ 4900 VEC_DATA_TYPE(TYPE, 8) \ 4901 BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7)); 4902#define COLUMN_VECTOR_SCALAR16(IDX_COL, BASENAME, X, TYPE) \ 4903 VEC_DATA_TYPE(TYPE, 16) \ 4904 BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7), (X##8), (X##9), (X##A), (X##B), (X##C), (X##D), (X##E), (X##F)); 4905 4906 4907 4908#define TRANSPOSE_K0X1(K0, BASENAME, BS, TYPE) \ 4909 COLUMN_VECTOR_SCALAR(K0, 0, BASENAME, BS, TYPE); 4910#define TRANSPOSE_K0X2(K0, BASENAME, BS, TYPE) \ 4911 COLUMN_VECTOR(K0, 0, BASENAME, BS, TYPE); \ 4912 COLUMN_VECTOR(K0, 1, BASENAME, BS, TYPE); 4913#define TRANSPOSE_K0X3(K0, BASENAME, BS, TYPE) \ 4914 TRANSPOSE_K0X2(K0, BASENAME, BS, TYPE); \ 4915 COLUMN_VECTOR(K0, 2, BASENAME, BS, TYPE); 4916#define TRANSPOSE_K0X4(K0, BASENAME, BS, TYPE) \ 4917 TRANSPOSE_K0X3(K0, BASENAME, BS, TYPE); \ 4918 COLUMN_VECTOR(K0, 3, BASENAME, BS, TYPE); 4919#define TRANSPOSE_K0X8(K0, BASENAME, BS, TYPE) \ 4920 TRANSPOSE_K0X4(K0, BASENAME, BS, TYPE); \ 4921 COLUMN_VECTOR(K0, 4, BASENAME, BS, TYPE); \ 4922 COLUMN_VECTOR(K0, 5, BASENAME, BS, TYPE); \ 4923 COLUMN_VECTOR(K0, 
6, BASENAME, BS, TYPE); \ 4924 COLUMN_VECTOR(K0, 7, BASENAME, BS, TYPE); 4925#define TRANSPOSE_K0X16(K0, BASENAME, BS, TYPE) \ 4926 TRANSPOSE_K0X8(K0, BASENAME, BS, TYPE); \ 4927 COLUMN_VECTOR(K0, 8, BASENAME, BS, TYPE); \ 4928 COLUMN_VECTOR(K0, 9, BASENAME, BS, TYPE); \ 4929 COLUMN_VECTOR(K0, A, BASENAME, BS, TYPE); \ 4930 COLUMN_VECTOR(K0, B, BASENAME, BS, TYPE); \ 4931 COLUMN_VECTOR(K0, C, BASENAME, BS, TYPE); \ 4932 COLUMN_VECTOR(K0, D, BASENAME, BS, TYPE); \ 4933 COLUMN_VECTOR(K0, E, BASENAME, BS, TYPE); \ 4934 COLUMN_VECTOR(K0, F, BASENAME, BS, TYPE); 4935 4936 4937 4938 4939#define COLUMN_VECTOR(K0, IDX_COL, BASENAME, BS, TYPE) \ 4940 CONCAT(COLUMN_VECTOR, K0) \ 4941 (IDX_COL, BASENAME, BS, TYPE); 4942 4943 4944#define COLUMN_VECTOR_SCALAR(K0, IDX_COL, BASENAME, BS, TYPE) \ 4945 CONCAT(COLUMN_VECTOR_SCALAR, K0) \ 4946 (IDX_COL, BASENAME, BS, TYPE); 4947 4948 4949#define TRANSPOSE_K0XN0(K0, N0, BASENAME, BS, TYPE) \ 4950 CONCAT(TRANSPOSE_K0X, N0) \ 4951 (K0, BASENAME, BS, TYPE); 4952 4953 4954#define ADD_ROW_1(BASENAME, BIAS) \ 4955 BASENAME##0 += BIAS##0; 4956 4957#define ADD_ROW_2(BASENAME, BIAS) \ 4958 ADD_ROW_1(BASENAME, BIAS) \ 4959 BASENAME##1 += BIAS##1; 4960 4961#define ADD_ROW_3(BASENAME, BIAS) \ 4962 ADD_ROW_2(BASENAME, BIAS) \ 4963 BASENAME##2 += BIAS##2; 4964 4965#define ADD_ROW_4(BASENAME, BIAS) \ 4966 ADD_ROW_3(BASENAME, BIAS) \ 4967 BASENAME##3 += BIAS##3; 4968 4969#define ADD_ROW_5(BASENAME, BIAS) \ 4970 ADD_ROW_4(BASENAME, BIAS) \ 4971 BASENAME##4 += BIAS##4; 4972 4973#define ADD_ROW_6(BASENAME, BIAS) \ 4974 ADD_ROW_5(BASENAME, BIAS) \ 4975 BASENAME##5 += BIAS##5; 4976 4977#define ADD_ROW_7(BASENAME, BIAS) \ 4978 ADD_ROW_6(BASENAME, BIAS) \ 4979 BASENAME##6 += BIAS##6; 4980 4981#define ADD_ROW_8(BASENAME, BIAS) \ 4982 ADD_ROW_7(BASENAME, BIAS) \ 4983 BASENAME##7 += BIAS##7; 4984 4985#define ADD_ROW_9(BASENAME, BIAS) \ 4986 ADD_ROW_8(BASENAME, BIAS) \ 4987 BASENAME##8 += BIAS##8; 4988 4989#define ADD_ROW_10(BASENAME, BIAS) \ 4990 
ADD_ROW_9(BASENAME, BIAS) \ 4991 BASENAME##9 += BIAS##9; 4992 4993#define ADD_ROW_11(BASENAME, BIAS) \ 4994 ADD_ROW_10(BASENAME, BIAS) \ 4995 BASENAME##A += BIAS##A; 4996 4997#define ADD_ROW_12(BASENAME, BIAS) \ 4998 ADD_ROW_11(BASENAME, BIAS) \ 4999 BASENAME##B += BIAS##B; 5000 5001#define ADD_ROW_13(BASENAME, BIAS) \ 5002 ADD_ROW_12(BASENAME, BIAS) \ 5003 BASENAME##C += BIAS##C; 5004 5005#define ADD_ROW_14(BASENAME, BIAS) \ 5006 ADD_ROW_13(BASENAME, BIAS) \ 5007 BASENAME##D += BIAS##D; 5008 5009#define ADD_ROW_15(BASENAME, BIAS) \ 5010 ADD_ROW_14(BASENAME, BIAS) \ 5011 BASENAME##E += BIAS##E; 5012 5013#define ADD_ROW_16(BASENAME, BIAS) \ 5014 ADD_ROW_15(BASENAME, BIAS) \ 5015 BASENAME##F += BIAS##F; 5016 5017 5018 5019 5020#define ADD_BLOCK_STR(N, BASENAME, BIAS) ADD_ROW_##N(BASENAME, BIAS) 5021#define ADD_BLOCK(N, BASENAME, BIAS) ADD_BLOCK_STR(N, BASENAME, BIAS) 5022 5023 5024 5025#define ADD_ROW_BROADCAST_1(BASENAME, BIAS) \ 5026 BASENAME##0 += BIAS; 5027 5028#define ADD_ROW_BROADCAST_2(BASENAME, BIAS) \ 5029 ADD_ROW_BROADCAST_1(BASENAME, BIAS) \ 5030 BASENAME##1 += BIAS; 5031 5032#define ADD_ROW_BROADCAST_3(BASENAME, BIAS) \ 5033 ADD_ROW_BROADCAST_2(BASENAME, BIAS) \ 5034 BASENAME##2 += BIAS; 5035 5036#define ADD_ROW_BROADCAST_4(BASENAME, BIAS) \ 5037 ADD_ROW_BROADCAST_3(BASENAME, BIAS) \ 5038 BASENAME##3 += BIAS; 5039 5040#define ADD_ROW_BROADCAST_5(BASENAME, BIAS) \ 5041 ADD_ROW_BROADCAST_4(BASENAME, BIAS) \ 5042 BASENAME##4 += BIAS; 5043 5044#define ADD_ROW_BROADCAST_6(BASENAME, BIAS) \ 5045 ADD_ROW_BROADCAST_5(BASENAME, BIAS) \ 5046 BASENAME##5 += BIAS; 5047 5048#define ADD_ROW_BROADCAST_7(BASENAME, BIAS) \ 5049 ADD_ROW_BROADCAST_6(BASENAME, BIAS) \ 5050 BASENAME##6 += BIAS; 5051 5052#define ADD_ROW_BROADCAST_8(BASENAME, BIAS) \ 5053 ADD_ROW_BROADCAST_7(BASENAME, BIAS) \ 5054 BASENAME##7 += BIAS; 5055 5056#define ADD_ROW_BROADCAST_9(BASENAME, BIAS) \ 5057 ADD_ROW_BROADCAST_8(BASENAME, BIAS) \ 5058 BASENAME##8 += BIAS; 5059 5060#define 
ADD_ROW_BROADCAST_10(BASENAME, BIAS) \ 5061 ADD_ROW_BROADCAST_9(BASENAME, BIAS) \ 5062 BASENAME##9 += BIAS; 5063 5064#define ADD_ROW_BROADCAST_11(BASENAME, BIAS) \ 5065 ADD_ROW_BROADCAST_10(BASENAME, BIAS) \ 5066 BASENAME##A += BIAS; 5067 5068#define ADD_ROW_BROADCAST_12(BASENAME, BIAS) \ 5069 ADD_ROW_BROADCAST_11(BASENAME, BIAS) \ 5070 BASENAME##B += BIAS; 5071 5072#define ADD_ROW_BROADCAST_13(BASENAME, BIAS) \ 5073 ADD_ROW_BROADCAST_12(BASENAME, BIAS) \ 5074 BASENAME##C += BIAS; 5075 5076#define ADD_ROW_BROADCAST_14(BASENAME, BIAS) \ 5077 ADD_ROW_BROADCAST_13(BASENAME, BIAS) \ 5078 BASENAME##D += BIAS; 5079 5080#define ADD_ROW_BROADCAST_15(BASENAME, BIAS) \ 5081 ADD_ROW_BROADCAST_14(BASENAME, BIAS) \ 5082 BASENAME##E += BIAS; 5083 5084#define ADD_ROW_BROADCAST_16(BASENAME, BIAS) \ 5085 ADD_ROW_BROADCAST_15(BASENAME, BIAS) \ 5086 BASENAME##F += BIAS; 5087 5088 5089#define ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS) ADD_ROW_BROADCAST_##N(BASENAME, BIAS) 5090#define ADD_BLOCK_BROADCAST(N, BASENAME, BIAS) ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS) 5091 5092 5093 5094#define ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 5095 BASENAME##0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##0, A_VAL, B_VAL); 5096 5097#define ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 5098 ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 5099 BASENAME##1 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##1, A_VAL, B_VAL); 5100 5101#define ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 5102 ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 5103 BASENAME##2 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##2, A_VAL, B_VAL); 5104 5105#define ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 5106 ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, 
B_VAL) \ 5107 BASENAME##3 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##3, A_VAL, B_VAL); 5108 5109#define ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 5110 ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 5111 BASENAME##4 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##4, A_VAL, B_VAL); 5112 5113#define ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 5114 ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 5115 BASENAME##5 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##5, A_VAL, B_VAL); 5116 5117#define ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 5118 ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 5119 BASENAME##6 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##6, A_VAL, B_VAL); 5120 5121#define ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 5122 ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 5123 BASENAME##7 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##7, A_VAL, B_VAL); 5124 5125#define ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 5126 ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 5127 BASENAME##8 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##8, A_VAL, B_VAL); 5128 5129#define ACTIVATION_ROW_10(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 5130 ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 5131 BASENAME##9 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##9, A_VAL, B_VAL); 5132 5133#define ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 5134 ACTIVATION_ROW_10(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 5135 BASENAME##A = 
ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##A, A_VAL, B_VAL); 5136 5137#define ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 5138 ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 5139 BASENAME##B = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##B, A_VAL, B_VAL); 5140 5141#define ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 5142 ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 5143 BASENAME##C = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##C, A_VAL, B_VAL); 5144 5145#define ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 5146 ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 5147 BASENAME##D = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##D, A_VAL, B_VAL); 5148 5149#define ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 5150 ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 5151 BASENAME##E = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##E, A_VAL, B_VAL); 5152 5153#define ACTIVATION_ROW_16(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 5154 ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 5155 BASENAME##F = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##F, A_VAL, B_VAL); 5156 5157 5158 5159#define ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) ACTIVATION_ROW_##N(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) 5160#define ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) 5161 5162 5163 5164#define CONVERT_ROW_1(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 5165 VEC_DATA_TYPE(DATA_TYPE, N) \ 5166 
BASENAME_DST##0 = CONVERT(BASENAME_SRC##0, VEC_DATA_TYPE(DATA_TYPE, N)); 5167 5168#define CONVERT_ROW_2(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 5169 CONVERT_ROW_1(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 5170 VEC_DATA_TYPE(DATA_TYPE, N) \ 5171 BASENAME_DST##1 = CONVERT(BASENAME_SRC##1, VEC_DATA_TYPE(DATA_TYPE, N)); 5172 5173#define CONVERT_ROW_3(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 5174 CONVERT_ROW_2(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 5175 VEC_DATA_TYPE(DATA_TYPE, N) \ 5176 BASENAME_DST##2 = CONVERT(BASENAME_SRC##2, VEC_DATA_TYPE(DATA_TYPE, N)); 5177 5178#define CONVERT_ROW_4(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 5179 CONVERT_ROW_3(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 5180 VEC_DATA_TYPE(DATA_TYPE, N) \ 5181 BASENAME_DST##3 = CONVERT(BASENAME_SRC##3, VEC_DATA_TYPE(DATA_TYPE, N)); 5182 5183#define CONVERT_ROW_5(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 5184 CONVERT_ROW_4(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 5185 VEC_DATA_TYPE(DATA_TYPE, N) \ 5186 BASENAME_DST##4 = CONVERT(BASENAME_SRC##4, VEC_DATA_TYPE(DATA_TYPE, N)); 5187 5188#define CONVERT_ROW_6(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 5189 CONVERT_ROW_5(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 5190 VEC_DATA_TYPE(DATA_TYPE, N) \ 5191 BASENAME_DST##5 = CONVERT(BASENAME_SRC##5, VEC_DATA_TYPE(DATA_TYPE, N)); 5192 5193#define CONVERT_ROW_7(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 5194 CONVERT_ROW_6(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 5195 VEC_DATA_TYPE(DATA_TYPE, N) \ 5196 BASENAME_DST##6 = CONVERT(BASENAME_SRC##6, VEC_DATA_TYPE(DATA_TYPE, N)); 5197 5198#define CONVERT_ROW_8(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 5199 CONVERT_ROW_7(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 5200 VEC_DATA_TYPE(DATA_TYPE, N) \ 5201 BASENAME_DST##7 = CONVERT(BASENAME_SRC##7, VEC_DATA_TYPE(DATA_TYPE, N)); 5202 5203#define CONVERT_ROW_9(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 5204 CONVERT_ROW_8(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 5205 VEC_DATA_TYPE(DATA_TYPE, 
N) \ 5206 BASENAME_DST##8 = CONVERT(BASENAME_SRC##8, VEC_DATA_TYPE(DATA_TYPE, N)); 5207 5208#define CONVERT_ROW_10(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 5209 CONVERT_ROW_9(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 5210 VEC_DATA_TYPE(DATA_TYPE, N) \ 5211 BASENAME_DST##9 = CONVERT(BASENAME_SRC##9, VEC_DATA_TYPE(DATA_TYPE, N)); 5212 5213#define CONVERT_ROW_11(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 5214 CONVERT_ROW_10(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 5215 VEC_DATA_TYPE(DATA_TYPE, N) \ 5216 BASENAME_DST##A = CONVERT(BASENAME_SRC##A, VEC_DATA_TYPE(DATA_TYPE, N)); 5217 5218#define CONVERT_ROW_12(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 5219 CONVERT_ROW_11(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 5220 VEC_DATA_TYPE(DATA_TYPE, N) \ 5221 BASENAME_DST##B = CONVERT(BASENAME_SRC##B, VEC_DATA_TYPE(DATA_TYPE, N)); 5222 5223#define CONVERT_ROW_13(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 5224 CONVERT_ROW_12(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 5225 VEC_DATA_TYPE(DATA_TYPE, N) \ 5226 BASENAME_DST##C = CONVERT(BASENAME_SRC##C, VEC_DATA_TYPE(DATA_TYPE, N)); 5227 5228#define CONVERT_ROW_14(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 5229 CONVERT_ROW_13(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 5230 VEC_DATA_TYPE(DATA_TYPE, N) \ 5231 BASENAME_DST##D = CONVERT(BASENAME_SRC##D, VEC_DATA_TYPE(DATA_TYPE, N)); 5232 5233#define CONVERT_ROW_15(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 5234 CONVERT_ROW_14(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 5235 VEC_DATA_TYPE(DATA_TYPE, N) \ 5236 BASENAME_DST##E = CONVERT(BASENAME_SRC##E, VEC_DATA_TYPE(DATA_TYPE, N)); 5237 5238#define CONVERT_ROW_16(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 5239 CONVERT_ROW_15(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 5240 VEC_DATA_TYPE(DATA_TYPE, N) \ 5241 BASENAME_DST##F = CONVERT(BASENAME_SRC##F, VEC_DATA_TYPE(DATA_TYPE, N)); 5242 5243 5244 5245#define CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_ROW_##M(N, DATA_TYPE, BASENAME_SRC, 
BASENAME_DST) 5246#define CONVERT_BLOCK(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) 5247 5248 5249 5250 5251#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5252 VSTORE(N0) \ 5253 (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 5254 5255#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5256 STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5257 VSTORE(N0) \ 5258 (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 5259 5260#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5261 STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5262 VSTORE(N0) \ 5263 (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 5264 5265#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5266 STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5267 VSTORE(N0) \ 5268 (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 5269 5270#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5271 STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5272 VSTORE(N0) \ 5273 (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 5274 5275#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5276 STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5277 VSTORE(N0) \ 5278 (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 5279 5280#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5281 STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5282 VSTORE(N0) \ 5283 (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 5284 5285#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5286 STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5287 VSTORE(N0) \ 5288 (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 5289 5290#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 
5291 STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5292 VSTORE(N0) \ 5293 (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 5294 5295#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5296 STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5297 VSTORE(N0) \ 5298 (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 5299 5300#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5301 STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5302 VSTORE(N0) \ 5303 (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 5304 5305#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5306 STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5307 VSTORE(N0) \ 5308 (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 5309 5310#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5311 STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5312 VSTORE(N0) \ 5313 (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 5314 5315#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5316 STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5317 VSTORE(N0) \ 5318 (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 5319 5320#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5321 STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5322 VSTORE(N0) \ 5323 (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 5324 5325#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5326 STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5327 VSTORE(N0) \ 5328 (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 5329 5330 5331 5332#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5333 VSTORE(N0) \ 5334 (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 
/* CONVERT_STORE_ROW_n(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
 *
 * Emits n consecutive store statements, one per row of a block whose row
 * variables are named BASENAME0 .. BASENAME9, BASENAMEA .. BASENAMEF
 * (rows 10-15 use the hexadecimal suffixes A-F).
 * Row i is passed through CONVERT_SAT to VEC_DATA_TYPE(DATA_TYPE, N0) and
 * written with VSTORE(N0) at (__global DATA_TYPE *)(PTR + i * STRIDE_Y + Z##i).
 * Each macro first expands the macro for n - 1 rows and then appends its own
 * row, so every level must forward the same six parameters unchanged.
 *
 *   N0        - vector width handed to VSTORE and VEC_DATA_TYPE
 *   DATA_TYPE - element type of the destination buffer
 *   BASENAME  - common prefix of the row variables
 *   PTR       - base address of the destination
 *   STRIDE_Y  - offset added per row (multiplied by the row index)
 *   Z         - prefix of the per-row extra offsets Z0 .. ZF
 */
#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

/* The second parameter must be spelled DATA_TYPE: the body and the nested
 * CONVERT_STORE_ROW_9 expansion refer to it by that name. With any other
 * parameter name the caller's type would be silently dropped for rows 0-9. */
#define CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));

/* STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
 *
 * Dispatches to STORE_ROW_<M0>, i.e. stores M0 rows of width N0.
 * The extra _STR level exists so that M0 is macro-expanded before it is
 * pasted onto STORE_ROW_ with the ## operator.
 */
#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

/* CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
 *
 * Same dispatch as STORE_BLOCK, but targets CONVERT_STORE_ROW_<M0> so every
 * row goes through CONVERT_SAT before being stored.
 */
#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

/* STORE_ROW_PARTIAL_n(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
 *
 * Emits n store statements, one per row BASENAME0 .. (hex suffix of n-1),
 * each issued through VSTORE_PARTIAL(N0, STORE_N0) at
 * (__global DATA_TYPE *)(PTR + i * STRIDE_Y + Z##i).
 * N0 is the width of the row vectors and STORE_N0 the width forwarded to
 * VSTORE_PARTIAL (presumably the number of leading elements actually
 * written; confirm against the VSTORE_PARTIAL definition).
 * Each macro expands the one for n - 1 rows and appends its own row.
 */
#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

/* STORE_ROW_PARTIAL_12 .. STORE_ROW_PARTIAL_16
 *
 * Continuation of the STORE_ROW_PARTIAL_n chain: each macro expands the one
 * for n - 1 rows and appends a VSTORE_PARTIAL(N0, STORE_N0) statement for
 * its own row (rows 11-15 use the suffixes B-F) at
 * (__global DATA_TYPE *)(PTR + i * STRIDE_Y + Z##i).
 */
#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));

/* STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
 *
 * Dispatches to STORE_ROW_PARTIAL_<STORE_M0>. Note the argument order
 * changes on the way down: the row macros take (N0, STORE_N0, ...).
 * The _STR level lets STORE_M0 be macro-expanded before token pasting.
 */
#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

/* STORE_BLOCK_PARTIAL_IN_X_AND_Y
 *
 * Run-time selection between four store shapes:
 *   neither condition set        -> M0               rows, N0               columns
 *   only PARTIAL_COND_Y set      -> PARTIAL_STORE_M0 rows, N0               columns
 *   only PARTIAL_COND_X set      -> M0               rows, PARTIAL_STORE_N0 columns
 *   both conditions set          -> PARTIAL_STORE_M0 rows, PARTIAL_STORE_N0 columns
 * The expansion is a bare if / else-if chain, not wrapped in do-while.
 */
#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \
    { \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \
    { \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \
    { \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else \
    { \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }

/* STORE_BLOCK_PARTIAL_IN_X
 *
 * Stores M0 rows; the column count is N0 unless PARTIAL_COND_X is true, in
 * which case it is PARTIAL_STORE_N0.
 */
#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X)) \
    { \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else \
    { \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }

/* STORE_BLOCK_PARTIAL_IN_Y
 *
 * Stores N0 columns; the row count is M0 unless PARTIAL_COND_Y is true, in
 * which case it is PARTIAL_STORE_M0.
 */
#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
    if(!(PARTIAL_COND_Y)) \
    { \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else \
    { \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }

/* STORE_BLOCK_BOUNDARY_AWARE
 *
 * Defined only when both PARTIAL_STORE_M0 and PARTIAL_STORE_N0 are defined.
 * The variant is chosen at preprocessing time from their values:
 *   M0 == 0 and N0 == 0 -> plain STORE_BLOCK (no run-time check)
 *   M0 >  0 and N0 == 0 -> STORE_BLOCK_PARTIAL_IN_Y
 *   M0 == 0 and N0 >  0 -> STORE_BLOCK_PARTIAL_IN_X
 *   otherwise           -> STORE_BLOCK_PARTIAL_IN_X_AND_Y
 * All variants share one parameter list so call sites do not change.
 */
#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)

#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)

#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)

#else

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)

#endif

#endif

/* COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0)
 *
 * Evaluates to an unsigned row index.
 * With PARTIAL_STORE_M0 defined the result is
 *   max(0, y * M0 - ((M0 - PARTIAL_STORE_M0) % M0))
 * computed in int and cast to uint; otherwise it is simply y * M0.
 * Every argument use is parenthesized so that expression arguments
 * (for example a sum passed as y) keep their intended precedence.
 */
#if defined(PARTIAL_STORE_M0)

#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(max(0, (int)((y) * (M0)) - (int)(((M0) - (PARTIAL_STORE_M0)) % (M0)))))
#else
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)((y) * (M0)))
#endif

/* STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond)
 *
 * Single-row wrapper around STORE_BLOCK_PARTIAL_IN_X with M0 = 1,
 * STRIDE_Y = 0 and Z = 0: stores vec_size elements of basename0, or
 * leftover elements when cond is true.
 */
#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)

/* MIXED_PRECISION_ACTIVATION_BLOCK
 *
 * Forwards to ACTIVATION_BLOCK, passing DATA_TYPE_ACCUMULATOR as the data
 * type when MIXED_PRECISION is defined and DATA_TYPE otherwise.
 */
#if defined(MIXED_PRECISION)
#define MIXED_PRECISION_ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL, DATA_TYPE_ACCUMULATOR) \
    ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, BASENAME, A_VAL, B_VAL);
#else
#define MIXED_PRECISION_ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL, DATA_TYPE_ACCUMULATOR) \
    ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL);
#endif

/* MIXED_PRECISION_ELTWISE_OP_BLOCK
 *
 * With MIXED_PRECISION defined, OPERAND2 is first run through
 * CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, ...) into CONVERTED_OPERAND2,
 * which is then combined with OPERAND1 by ELTWISE_OP_BLOCK.
 * Without it, ELTWISE_OP_BLOCK is applied to OPERAND1 and OPERAND2 directly
 * and the trailing parameters are unused.
 */
#if defined(MIXED_PRECISION)
#define MIXED_PRECISION_ELTWISE_OP_BLOCK(OP, M0, N0, OPERAND1, OPERAND2, DATA_TYPE_ACCUMULATOR, CONVERTED_OPERAND2) \
    CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, OPERAND2, CONVERTED_OPERAND2); \
    ELTWISE_OP_BLOCK(OP, M0, OPERAND1, CONVERTED_OPERAND2);
#else
#define MIXED_PRECISION_ELTWISE_OP_BLOCK(OP, M0, N0, OPERAND1, OPERAND2, DATA_TYPE_ACCUMULATOR, CONVERTED_OPERAND2) \
    ELTWISE_OP_BLOCK(OP, M0, OPERAND1, OPERAND2);
#endif

/* MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST
 *
 * Broadcast variant: only row 0 of the second operand is used.
 * With MIXED_PRECISION defined a single row is converted via
 * CONVERT_BLOCK(1, N0, ...) and CONVERTED_OPERAND2##0 is handed to
 * ELTWISE_OP_BLOCK_BROADCAST; otherwise OPERAND2##0 is used as is.
 */
#if defined(MIXED_PRECISION)
#define MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(OP, M0, N0, OPERAND1, OPERAND2, DATA_TYPE_ACCUMULATOR, CONVERTED_OPERAND2) \
    CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, OPERAND2, CONVERTED_OPERAND2); \
    ELTWISE_OP_BLOCK_BROADCAST(OP, M0, OPERAND1, CONVERTED_OPERAND2##0);
#else
#define MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(OP, M0, N0, OPERAND1, OPERAND2, DATA_TYPE_ACCUMULATOR, CONVERTED_OPERAND2) \
    ELTWISE_OP_BLOCK_BROADCAST(OP, M0, OPERAND1, OPERAND2##0);
#endif

/* MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE
 *
 * With MIXED_PRECISION defined, the block BASENAME is passed through
 * CONVERT_BLOCK(M0, N0, DATA_TYPE, ...) into BASENAME_LP, and BASENAME_LP is
 * what STORE_BLOCK_BOUNDARY_AWARE stores. Otherwise BASENAME is stored
 * directly and BASENAME_LP is unused.
 */
#if defined(MIXED_PRECISION)
#define MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X, BASENAME_LP) \
    CONVERT_BLOCK(M0, N0, DATA_TYPE, BASENAME, BASENAME_LP); \
    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME_LP, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X);
#else
#define MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X, BASENAME_LP) \
    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X);
#endif

)"