1R"( 2 3/* 4 * Copyright (c) 2016-2020 Arm Limited. 5 * 6 * SPDX-License-Identifier: MIT 7 * 8 * Permission is hereby granted, free of charge, to any person obtaining a copy 9 * of this software and associated documentation files (the "Software"), to 10 * deal in the Software without restriction, including without limitation the 11 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 12 * sell copies of the Software, and to permit persons to whom the Software is 13 * furnished to do so, subject to the following conditions: 14 * 15 * The above copyright notice and this permission notice shall be included in all 16 * copies or substantial portions of the Software. 17 * 18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 * SOFTWARE. 25 */ 26#ifndef ARM_COMPUTE_HELPER_H 27#define ARM_COMPUTE_HELPER_H 28 29/* 30 * Copyright (c) 2020 Arm Limited. 31 * 32 * SPDX-License-Identifier: MIT 33 * 34 * Permission is hereby granted, free of charge, to any person obtaining a copy 35 * of this software and associated documentation files (the "Software"), to 36 * deal in the Software without restriction, including without limitation the 37 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 38 * sell copies of the Software, and to permit persons to whom the Software is 39 * furnished to do so, subject to the following conditions: 40 * 41 * The above copyright notice and this permission notice shall be included in all 42 * copies or substantial portions of the Software. 43 * 44 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 45 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 46 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 47 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 48 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 49 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 50 * SOFTWARE. 51 */ 52 53/** Store the 0 to (n-1)th rows of the given variables 54 * @name STORE_ROW_n 55 * 56 * @param[in] N0 The width of the passed in vector. Supported: 1, 2, 3, 4, 8, 16 57 * @param[in] DATA_TYPE The data type of the vectors 58 * @param[in] BASENAME The basename of the variables 59 * @param[in] PTR The base pointer 60 * @param[in] STRIDE_Y The stride value in y-axis direction 61 * @param[in] Z The offset in z-axis direction 62 * @{ 63 */ 64#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 65 VSTORE(N0) \ 66 (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 67 68#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 69 STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 70 VSTORE(N0) \ 71 (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 72 73#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 74 STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 75 VSTORE(N0) \ 76 (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 77 78#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 79 STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 80 VSTORE(N0) \ 81 (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 82 83#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 84 STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 85 VSTORE(N0) \ 86 (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 87 88#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 89 STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 90 VSTORE(N0) \ 91 (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 92 93#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 94 STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 95 VSTORE(N0) \ 96 (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 97 98#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 99 STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 100 VSTORE(N0) \ 101 (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 102 103#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 104 STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 105 VSTORE(N0) \ 106 (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 107 108#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 109 STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 110 VSTORE(N0) \ 111 (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 112 113#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 114 STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 115 VSTORE(N0) \ 116 (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 117 118#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 119 STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 120 VSTORE(N0) \ 121 (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 122 123#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 124 STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 125 VSTORE(N0) \ 126 (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 127 128#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 129 STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 130 VSTORE(N0) \ 131 (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 132 133#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 134 STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 135 VSTORE(N0) \ 136 (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 137 138#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 139 STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 140 VSTORE(N0) \ 141 (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 142/** @} */ // end of groupd STORE_ROW_n 143 144/** Convert and store the 0th to (n-1)th rows of the given variables 145 * @name CONVERT_STORE_ROW_n 146 * 147 * @param[in] N0 The size of the vectors 148 * @param[in] DATA_TYPE The data type of the vectors 149 * @param[in] BASENAME The basename of the variables 150 * @param[in] PTR The base pointer 151 * @param[in] STRIDE_Y The stride value in y-axis direction 152 * @param[in] Z The offset in z-axis direction 153 * @{ 154 */ 155#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 156 VSTORE(N0) \ 157 (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 158 159#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 160 CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 161 VSTORE(N0) \ 162 (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 163 164#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 165 CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 166 VSTORE(N0) \ 167 (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 168 169#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 170 CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 171 VSTORE(N0) \ 172 (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 173 174#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 175 CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 176 VSTORE(N0) \ 177 (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 178 179#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 180 CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 181 VSTORE(N0) \ 182 (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 183 184#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 185 CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 186 VSTORE(N0) \ 187 (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 188 189#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 190 CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 191 VSTORE(N0) \ 192 (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 193 194#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 195 CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 196 VSTORE(N0) \ 197 (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 198 199#define CONVERT_STORE_ROW_10(N0, DATA, BASENAME, PTR, STRIDE_Y, Z) \ 200 CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 201 VSTORE(N0) \ 202 (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 203 204#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 205 CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 206 VSTORE(N0) \ 207 (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 208 209#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 210 CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 211 VSTORE(N0) \ 212 (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 213 214#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 215 CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 216 VSTORE(N0) \ 217 (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 218 219#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 220 CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 221 VSTORE(N0) \ 222 (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 223 224#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 225 CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 226 VSTORE(N0) \ 227 (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 228 229#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 230 CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 231 VSTORE(N0) \ 232 (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 233 234/** @} */ // end of groupd CONVERT_STORE_ROW_n 235 236/** Store a block of the given size M0xN0 237 * @name STORE_BLOCK 238 * 239 * Supported cases are M0=1,2,3,...,16 and N0=2,3,4,8,16. 240 * The data to store is expected to have consecutive names for each row. 241 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2. 242 * The Z offset is expected to have consecutive names. 243 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2. 244 * 245 * @param[in] M0 The number of rows to store 246 * @param[in] N0 The size of each vector 247 * @param[in] DATA_TYPE The data type of the vectors 248 * @param[in] BASENAME The basename of the variables 249 * @param[in] PTR The base pointer 250 * @param[in] STRIDE_Y The stride value in y-axis direction 251 * @param[in] Z The offset in z-axis direction 252 * @{ 253 */ 254#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 255#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 256/** @} */ // end of group STORE_BLOCK 257 258/** Convert and store a block of the given size M0xN0 259 * @name CONVERT_STORE_BLOCK 260 * 261 * Supported cases are M0=1,2,3,...,16 and N0=2,3,4,8,16. 262 * The data to store is expected to have consecutive names for each row. 263 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2. 264 * The Z offset is expected to have consecutive names. 265 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2. 266 * 267 * @param[in] M0 The number of rows to store 268 * @param[in] N0 The size of each vector 269 * @param[in] DATA_TYPE The data type of the vectors 270 * @param[in] BASENAME The basename of the variables 271 * @param[in] PTR The base pointer 272 * @param[in] STRIDE_Y The stride value in y-axis direction 273 * @param[in] Z The offset in z-axis direction 274 * @{ 275 */ 276#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 277#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 278/** @} */ // end of group CONVERT_STORE_BLOCK 279 280/** Partially store the 0 to (n-1)th rows of the given variables 281 * @name STORE_ROW_PARTIAL_n 282 * Within each row, store the lower @p STORE_N0 elements of vectors of width @p N0 283 * 284 * @note in case @p STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty. 285 * 286 * @param[in] N0 The width of the passed in vector. Supported: 1, 2, 3, 4, 8, 16 287 * @param[in] STORE_N0 The **lower** size of the vectors to store. Supported: [1-16 and <= @p N0 288 * @param[in] DATA_TYPE The data type of the vectors 289 * @param[in] BASENAME The basename of the variables 290 * @param[in] PTR The base pointer 291 * @param[in] STRIDE_Y The stride value in y-axis direction 292 * @param[in] Z The offset in z-axis direction 293 * @{ 294 */ 295#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 296 VSTORE_PARTIAL(N0, STORE_N0) \ 297 (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 298 299#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 300 STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 301 VSTORE_PARTIAL(N0, STORE_N0) \ 302 (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 303 304#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 305 STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 306 VSTORE_PARTIAL(N0, STORE_N0) \ 307 (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 308 309#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 310 STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 311 VSTORE_PARTIAL(N0, STORE_N0) \ 312 (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 313 314#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 315 STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 316 VSTORE_PARTIAL(N0, STORE_N0) \ 317 (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 318 319#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 320 STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 321 VSTORE_PARTIAL(N0, STORE_N0) \ 322 (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 323 324#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 325 STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 326 VSTORE_PARTIAL(N0, STORE_N0) \ 327 (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 328 329#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 330 STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 331 VSTORE_PARTIAL(N0, STORE_N0) \ 332 (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 333 334#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 335 STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 336 VSTORE_PARTIAL(N0, STORE_N0) \ 337 (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 338 339#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 340 STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 341 VSTORE_PARTIAL(N0, STORE_N0) \ 342 (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 343 344#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 345 STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 346 VSTORE_PARTIAL(N0, STORE_N0) \ 347 (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 348 349#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 350 STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 351 VSTORE_PARTIAL(N0, STORE_N0) \ 352 (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 353 354#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 355 STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 356 VSTORE_PARTIAL(N0, STORE_N0) \ 357 (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 358 359#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 360 STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 361 VSTORE_PARTIAL(N0, STORE_N0) \ 362 (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 363 364#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 365 STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 366 VSTORE_PARTIAL(N0, STORE_N0) \ 367 (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 368 369#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 370 STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 371 VSTORE_PARTIAL(N0, STORE_N0) \ 372 (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 373/** @} */ // end of groupd STORE_ROW_PARTIAL_n 374 375/** Partially store a block of the given size STORE_M0xSTORE_N0 376 * @name STORE_BLOCK_PARTIAL 377 * 378 * @note The vector width @p N0 is also required for correct partial storing behaviour. 379 * @note in case @p STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty. 380 * 381 * The data to store is expected to have consecutive names for each row. 382 * E.g., for STORE_M0=3 and basename=c, the expected names are c0, c1 and c2. 383 * The Z offset is expected to have consecutive names. 384 * E.g., for STORE_M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2. 385 * 386 * @param[in] STORE_M0 The number of rows to store. Supported: 1-16 387 * @param[in] STORE_N0 The lower number of elements of vectors to store. Supported: 1-16 and <= @p N0 388 * @param[in] N0 The size of each vector. Supported: 1, 2, 3, 4, 8, 16 389 * @param[in] DATA_TYPE The data type of the vectors 390 * @param[in] BASENAME The basename of the variables 391 * @param[in] PTR The base pointer 392 * @param[in] STRIDE_Y The stride value in y-axis direction 393 * @param[in] Z The offset in z-axis direction 394 * @{ 395 */ 396#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 397#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 398/** Store a block that can be partial in both x and y dimensions 399 * 400 * @note in cases @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty. 401 * 402 * The data to store is expected to have consecutive names for each row. 403 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2. 404 * The Z offset is expected to have consecutive names. 405 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2. 406 * 407 * @param[in] M0 The number of rows to store, for non-partial blocks. Supported: 1-16 408 * @param[in] N0 The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16 409 * @param[in] DATA_TYPE The data type of the vectors 410 * @param[in] BASENAME The basename of the variables 411 * @param[in] PTR The base pointer 412 * @param[in] STRIDE_Y The stride value in y-axis direction 413 * @param[in] Z The offset in z-axis direction 414 * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported range: [1, @p M0) 415 * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported range: [1, @p N0) 416 * @param[in] PARTIAL_COND_Y Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0. 417 * @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0. 418 */ 419#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 420 if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \ 421 { \ 422 STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 423 } \ 424 else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \ 425 { \ 426 STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 427 } \ 428 else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \ 429 { \ 430 STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 431 } \ 432 else \ 433 { \ 434 STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 435 } 436/** Store a block that can only be partial in x but not y. 437 * 438 * @note in case @p N0 or @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty. 439 * 440 * The data to store is expected to have consecutive names for each row. 441 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2. 442 * The Z offset is expected to have consecutive names. 443 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2. 444 * 445 * @param[in] M0 The number of rows to store, for non-partial blocks. Supported: 1-16 446 * @param[in] N0 The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16 447 * @param[in] DATA_TYPE The data type of the vectors 448 * @param[in] BASENAME The basename of the variables 449 * @param[in] PTR The base pointer 450 * @param[in] STRIDE_Y The stride value in y-axis direction 451 * @param[in] Z The offset in z-axis direction 452 * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported range: [1, @p N0) 453 * @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0. 454 */ 455#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \ 456 if(!(PARTIAL_COND_X)) \ 457 { \ 458 STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 459 } \ 460 else \ 461 { \ 462 STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 463 } 464/** Store a block that can only be partial in y but not x. 465 * 466 * @note in case @p N0 or @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty. 467 * 468 * The data to store is expected to have consecutive names for each row. 469 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2. 470 * The Z offset is expected to have consecutive names. 471 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2. 472 * 473 * @param[in] M0 The number of rows to store, for non-partial blocks. Supported: 1-16 474 * @param[in] N0 The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16 475 * @param[in] DATA_TYPE The data type of the vectors 476 * @param[in] BASENAME The basename of the variables 477 * @param[in] PTR The base pointer 478 * @param[in] STRIDE_Y The stride value in y-axis direction 479 * @param[in] Z The offset in z-axis direction 480 * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported range: [1, @p M0) 481 * @param[in] PARTIAL_COND_Y Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0. 482 */ 483#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \ 484 if(!(PARTIAL_COND_Y)) \ 485 { \ 486 STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 487 } \ 488 else \ 489 { \ 490 STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 491 } 492/** @} */ // end of group STORE_BLOCK_PARTIAL 493 494#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) 495 496/** Boundary-aware GEMM block store 497 * @name STORE_BLOCK_BOUNDARY_AWARE 498 * This macro assumes the following schemes to achieve boundary-awareness: 499 * - Overlapping load in Y axis from lhs tensor. This implies lhs has no padding along y dim. 500 * - Non-Overlapping(normal) load from rhs tensor. This imples rhs can have paddings. 501 * - Overlapping load in Y axis from bias tensor. This implies rhs has no padding along y dim. 502 * The macro then ensures that the dst tensor can be stored without any paddings in both x and y dim. 503 * 504 * In the y dimension, we place the partial blocks **at the beginning** while in the x dimension, we place the partial 505 * blocks **at the end**. 506 * Say, the dst tensor is of shape MxN and we have M0 and N0 as the block size, this is how we define "partial blocks"/ 507 * "boundary block" (we use the 2 terms "partial blocks" and "boundary blocks" interchangeably) and its various parameters: 508 * 509 * *--x--> x == 0 x == 1 510 * | |<------------------------------N-------------------------->| 511 * y |<--------------N0------------->|<----PARTIAL_STORE_N0----->| 512 * | -------------############################################################# 513 * * | | |...............................|...........................| 514 * y == 0 | PAR_..._M0 |......Boundary block in y......|.Boundary block in x and y.| 515 * | | |...............................|...........................| 516 * M --############################################################# 517 * | | | |...........................| 518 * y == 1 | M0 | Non-boundary block |....Boundary block in x....| 519 * | | | |...........................| 520 * |------------############################################################# 521 * 522 * Then @p PARTIAL_STORE_M0 = M % M0 and @p PARTIAL_STORE_N0 = N % N0 523 * 524 * @note in cases @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty. 525 * 526 * It automatically detects if a giving M,N,M0,N0 combination can yield partial blocks in either X and Y dimension, 527 * and select corresponding store methods such that the boundary detection logic is only added when needed. 528 * 529 * The data to store is expected to have consecutive names for each row. 530 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2. 531 * The Z offset is expected to have consecutive names. 532 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2. 533 * 534 * @param[in] M0 The number of rows to store, for non-partial blocks. Supported: 1-16 535 * @param[in] N0 The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16 536 * @param[in] DATA_TYPE The data type of the vectors 537 * @param[in] BASENAME The basename of the variables 538 * @param[in] PTR The base pointer 539 * @param[in] STRIDE_Y The stride value in y-axis direction 540 * @param[in] Z The offset in z-axis direction 541 * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported: [0, @p M0) 542 * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported: [0, @p N0) 543 * @param[in] PARTIAL_COND_Y Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0. 544 * @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0. 545 * @{ 546 */ 547#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 548// Case1: No partial blocks in either x or y 549#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 550 STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 551 552#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0 553// Case2: Partial blocks in y 554#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 555 STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) 556 557#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0 558// Case3: Partial blocks in x 559#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 560 STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) 561 562#else // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 563// Case4: Partial blocks in both x and y 564#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 565 STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) 566 567#endif // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 568 569#endif // defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) 570/** @} */ // end of group STORE_BLOCK_BOUNDARY_AWARE 571 572#if defined(PARTIAL_STORE_M0) 573/** Compute the start m0 row (LHS, BIAS and DST) in a boundary-aware way so as to avoid padding 574 * @name COMPUTE_M0_START_ROW 575 * If there're any partial blocks in y dimension, they are placed at the beginning of the rows. 576 * This shift amount is added to all rows such that the partial block (at the beginning) overlaps with the subsequent 577 * blocks in the y dimension to avoid any padding. 578 * EG: M0=4, PARTIAL_STORE_M0=1: 579 * | Non-overlapping | +M0_ROW_SHIFT (Overlapping) 580 * block 0 (partial)| start row = 0 | start row = 0 581 * block 1 (full) | start row = 4 | start row = 1 582 * block 2 (full) | start row = 8 | start row = 5 583 * 584 * @param[in] y Global id of current block in y. 585 * @param[in] M0 The number of rows to store, for non-partial blocks. Supported: 1-16 586 * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported: [0, @p M0) 587 * @{ 588 */ 589#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \ 590 ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0)))) 591#else // defined(PARTIAL_STORE_M0) 592#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \ 593 ((uint)(y * M0)) 594#endif // defined(PARTIAL_STORE_M0) 595/** @} */ // end of group COMPUTE_M0_START_ROW 596 597/** Store a vector that can only be partial in x. 598 * 599 * @note in case @p vec_size or @p leftover != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty. 600 * 601 * The data to store is expected to end in a 0. 602 * E.g., for basename=c, the expected name is c0. 603 * 604 * @param[in] basename The name of the variable without trailing 0 605 * @param[in] data_type The data type of the vector 606 * @param[in] ptr The base pointer 607 * @param[in] vec_size The vector size if cond = false. Supported: 1, 2, 3, 4, 8, 16 608 * @param[in] leftover The vector size if cond = true. Supported range: [1, @p vec_size0) 609 * @param[in] cond Condition to select either vec_size0 or vec_size1 610 * @{ 611 */ 612#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \ 613 STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond) 614/** @} */ // end of group STORE_VECTOR_SELECT 615 616#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 617#pragma OPENCL EXTENSION cl_khr_fp16 : enable 618#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 619 620#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) 621#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable 622#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) 623 624#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) 625#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable 626#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) 627 628#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf) 629#pragma OPENCL EXTENSION cl_arm_printf : enable 630#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf) 631 632#define GPU_ARCH_MIDGARD 0x100 633#define GPU_ARCH_BIFROST 0x200 634 635/** Concatenate two inputs. 636 * 637 * @param[in] a The first input to be concatenated 638 * @param[in] b The second input to be concatenated 639 * 640 * @return The concatenated output 641 */ 642#define CONCAT(a, b) a##b 643 644/** Expand the given vector 645 * 646 * @param[in] x The vector to be expanded 647 * 648 * @return The expanded output 649 */ 650#define EXPAND(x) x 651 652/** Clamp the given value between an upper and lower bound. 653 * 654 * @param[in] x The value to be clamped 655 * @param[in] min_val The lower bound 656 * @param[in] max_val The upper bound 657 * 658 * @return The clamped value. 659 */ 660#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val) 661 662/** REVn reverses the given vector whose size is n. 663 * @name REVn 664 * 665 * @param[in] x The vector to be reversed 666 * 667 * @return The reversed vector 668 * @{ 669 */ 670#define REV1(x) ((x)) 671#define REV2(x) ((x).s10) 672#define REV3(x) ((x).s210) 673#define REV4(x) ((x).s3210) 674#define REV8(x) ((x).s76543210) 675#define REV16(x) ((x).sFEDCBA9876543210) 676/** @} */ // end of group REVn 677 678/** Reverse the given vector. 679 * @name REVERSE 680 * 681 * @param[in] x The vector to be reversed 682 * @param[in] s The size of the vector 683 * 684 * @return The reversed vector 685 * @{ 686 */ 687#define REVERSE_STR(x, s) REV##s((x)) 688#define REVERSE(x, s) REVERSE_STR(x, s) 689/** @} */ // end of group REVERSE 690 691/** Circular-right-shift (rotate-right) the vector of size s by the amount of n. 692 * @name ROTs_n 693 * 694 * @param[in] x The vector to be shifted 695 * 696 * @return The shifted vector 697 * @{ 698 */ 699#define ROT1_0(x) ((x)) 700 701#define ROT2_0(x) ((x)) 702#define ROT2_1(x) ((x).s10) 703 704#define ROT3_0(x) ((x)) 705#define ROT3_1(x) ((x).s201) 706#define ROT3_2(x) ((x).s120) 707 708#define ROT4_0(x) ((x)) 709#define ROT4_1(x) ((x).s3012) 710#define ROT4_2(x) ((x).s2301) 711#define ROT4_3(x) ((x).s1230) 712 713#define ROT8_0(x) ((x)) 714#define ROT8_1(x) ((x).s70123456) 715#define ROT8_2(x) ((x).s67012345) 716#define ROT8_3(x) ((x).s56701234) 717#define ROT8_4(x) ((x).s45670123) 718#define ROT8_5(x) ((x).s34567012) 719#define ROT8_6(x) ((x).s23456701) 720#define ROT8_7(x) ((x).s12345670) 721 722#define ROT16_0(x) ((x)) 723#define ROT16_1(x) ((x).sF0123456789ABCDE) 724#define ROT16_2(x) ((x).sEF0123456789ABCD) 725#define ROT16_3(x) ((x).sDEF0123456789ABC) 726#define ROT16_4(x) ((x).sCDEF0123456789AB) 727#define ROT16_5(x) ((x).sBCDEF0123456789A) 728#define ROT16_6(x) ((x).sABCDEF0123456789) 729#define ROT16_7(x) ((x).s9ABCDEF012345678) 730#define ROT16_8(x) ((x).s89ABCDEF01234567) 731#define ROT16_9(x) ((x).s789ABCDEF0123456) 732#define ROT16_10(x) ((x).s6789ABCDEF012345) 733#define ROT16_11(x) ((x).s56789ABCDEF01234) 734#define ROT16_12(x) ((x).s456789ABCDEF0123) 735#define ROT16_13(x) ((x).s3456789ABCDEF012) 736#define ROT16_14(x) ((x).s23456789ABCDEF01) 737#define ROT16_15(x) ((x).s123456789ABCDEF0) 738/** @} */ // end of group ROTs_n 739 740/** Circular-right-shift (rotate-right) the given vector by the given amount. 741 * @name ROTATE 742 * 743 * @param[in] x The vector to be shifted 744 * @param[in] s The size of the vector 745 * @param[in] n The amount to be shifted 746 * 747 * @return The shifted vector 748 * @{ 749 */ 750#define ROTATE_STR(x, s, n) ROT##s##_##n(x) 751#define ROTATE(x, s, n) ROTATE_STR(x, s, n) 752/** @} */ // end of group ROTATE 753 754/** Creates a vector of size n filled with offset values corresponding to the location of each element. 755 * @name V_OFFSn 756 * 757 * @param[in] dt The data type of the output vector 758 * 759 * @return The vector filled with offset values 760 * @{ 761 */ 762#define V_OFFS1(dt) (dt##1)(0) 763#define V_OFFS2(dt) (dt##2)(0, 1) 764#define V_OFFS3(dt) (dt##3)(0, 1, 2) 765#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3) 766#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7) 767#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) 768/** @} */ // end of group V_OFFSn 769 770/** Create a vector filled with offset values corresponding to the location of each element. 771 * @name VEC_OFFS 772 * 773 * @param[in] dt The data type of the output vector 774 * @param[in] s The size of the output vector 775 * 776 * @return The vector filled with offset values 777 * @{ 778 */ 779#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt) 780#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s) 781/** @} */ // end of group VEC_OFFS 782 783#define VLOAD_STR(size) vload##size 784#define VLOAD(size) VLOAD_STR(size) 785 786#define PIXEL_UNIT4 1 787#define PIXEL_UNIT8 2 788#define PIXEL_UNIT16 4 789 790/** Utility macro to convert a vector size in pixel unit. 791 * 792 * @name CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT 793 * 794 * @param[in] vec_size Vector size. Only 4,8 and 16 is supported 795 * 796 * @return The pixel unit (number of pixels) 797 * @{ 798 */ 799#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size 800#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) 801/** @} */ // end of group CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT 802 803#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord))); 804#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord))); 805#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord))); 806 807#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 808#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord))); 809#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord))); 810#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord))); 811#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 812 813/** Utility macro to read a 2D OpenCL image object. 814 * 815 * @note Coordinates are not normalized 816 * 817 * @param[in] data_type Data type 818 * @param[in] n0 Number of pixel to read. Only 1,2 and 4 is supported 819 * @param[in] img OpenCL image object 820 * @param[in] x_coord The x coordinate for the top-left pixel 821 * @param[in] y_coord The y coordinate for the top-left pixel 822 * 823 * @return Pixels from the 2D OpenCL image object 824 * @{ 825 */ 826#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord) 827#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) 828 829#define VSTORE_STR(size) vstore##size 830#define VSTORE(size) VSTORE_STR(size) 831 832#define float1 float 833#define half1 half 834#define char1 char 835#define uchar1 uchar 836#define short1 short 837#define ushort1 ushort 838#define int1 int 839#define uint1 uint 840#define long1 long 841#define ulong1 ulong 842#define double1 double 843 844#define vload1(OFFSET, PTR) *(OFFSET + PTR) 845#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA 846 847/** Extended partial vstore that correctly handles scalar values as well. 848 * Store the **lower** 0 to (n-1)th elements of the given vector while minimising the amount of vstore ops 849 * @name VSTORE_PARTIAL 850 * 851 * @note With this macro, the passed data can be both a vector and a scalar 852 * @note @p store_size needs to be <= @p size 853 * eg 1: Valid 854 * VSTORE_PARTIAL(16, 15) ...; 855 * eg 2: Invalid 856 * VSTORE_PARTIAL(4, 7) ...; 857 * 858 * @param[in] size The width of @p DATA. Supported values: 1(scalar), 2, 3, 4, 8, 16 859 * @param[in] store_size The number of lower elements to store. Supported values: 1-16, but has to be <= @p size 860 * @{ 861 */ 862#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size 863#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size) 864 865#define NO_STORE(data, offs, ptr) \ 866 { \ 867 } 868 869// Size == 1 (scalar) 870#define vstore_partial_1_0 NO_STORE 871#define vstore_partial_1_1 vstore1 872#define vstore_partial_1_2 NO_STORE 873#define vstore_partial_1_3 NO_STORE 874#define vstore_partial_1_4 NO_STORE 875#define vstore_partial_1_5 NO_STORE 876#define vstore_partial_1_6 NO_STORE 877#define vstore_partial_1_7 NO_STORE 878#define vstore_partial_1_8 NO_STORE 879#define vstore_partial_1_9 NO_STORE 880#define vstore_partial_1_10 NO_STORE 881#define vstore_partial_1_11 NO_STORE 882#define vstore_partial_1_12 NO_STORE 883#define vstore_partial_1_13 NO_STORE 884#define vstore_partial_1_14 NO_STORE 885#define vstore_partial_1_15 NO_STORE 886#define vstore_partial_1_16 NO_STORE 887// Size == 2 888#define vstore_partial_2_0 NO_STORE 889#define vstore_partial_2_1 vstore_partial_1 890#define vstore_partial_2_2 vstore_partial_2 891#define vstore_partial_2_3 NO_STORE 892#define vstore_partial_2_4 NO_STORE 893#define vstore_partial_2_5 NO_STORE 894#define vstore_partial_2_6 NO_STORE 895#define vstore_partial_2_7 NO_STORE 896#define vstore_partial_2_8 NO_STORE 897#define vstore_partial_2_9 NO_STORE 898#define vstore_partial_2_10 NO_STORE 899#define vstore_partial_2_11 NO_STORE 900#define vstore_partial_2_12 NO_STORE 901#define vstore_partial_2_13 NO_STORE 902#define vstore_partial_2_14 NO_STORE 903#define vstore_partial_2_15 NO_STORE 904#define vstore_partial_2_16 NO_STORE 905// Size == 3 906#define vstore_partial_3_0 NO_STORE 907#define vstore_partial_3_1 vstore_partial_1 908#define vstore_partial_3_2 vstore_partial_2 909#define vstore_partial_3_3 vstore_partial_3 910#define vstore_partial_3_4 NO_STORE 911#define vstore_partial_3_5 NO_STORE 912#define vstore_partial_3_6 NO_STORE 913#define vstore_partial_3_7 NO_STORE 914#define vstore_partial_3_8 NO_STORE 915#define vstore_partial_3_9 NO_STORE 916#define vstore_partial_3_10 NO_STORE 917#define vstore_partial_3_11 NO_STORE 918#define vstore_partial_3_12 NO_STORE 919#define vstore_partial_3_13 NO_STORE 920#define vstore_partial_3_14 NO_STORE 921#define vstore_partial_3_15 NO_STORE 922#define vstore_partial_3_16 NO_STORE 923// Size == 4 924#define vstore_partial_4_0 NO_STORE 925#define vstore_partial_4_1 vstore_partial_1 926#define vstore_partial_4_2 vstore_partial_2 927#define vstore_partial_4_3 vstore_partial_3 928#define vstore_partial_4_4 vstore_partial_4 929#define vstore_partial_4_5 NO_STORE 930#define vstore_partial_4_6 NO_STORE 931#define vstore_partial_4_7 NO_STORE 932#define vstore_partial_4_8 NO_STORE 933#define vstore_partial_4_9 NO_STORE 934#define vstore_partial_4_10 NO_STORE 935#define vstore_partial_4_11 NO_STORE 936#define vstore_partial_4_12 NO_STORE 937#define vstore_partial_4_13 NO_STORE 938#define vstore_partial_4_14 NO_STORE 939#define vstore_partial_4_15 NO_STORE 940#define vstore_partial_4_16 NO_STORE 941// Size == 8 942#define vstore_partial_8_0 NO_STORE 943#define vstore_partial_8_1 vstore_partial_1 944#define vstore_partial_8_2 vstore_partial_2 945#define vstore_partial_8_3 vstore_partial_3 946#define vstore_partial_8_4 vstore_partial_4 947#define vstore_partial_8_5 vstore_partial_5 948#define vstore_partial_8_6 vstore_partial_6 949#define vstore_partial_8_7 vstore_partial_7 950#define vstore_partial_8_8 vstore_partial_8 951#define vstore_partial_8_9 NO_STORE 952#define vstore_partial_8_10 NO_STORE 953#define vstore_partial_8_11 NO_STORE 954#define vstore_partial_8_12 NO_STORE 955#define vstore_partial_8_13 NO_STORE 956#define vstore_partial_8_14 NO_STORE 957#define vstore_partial_8_15 NO_STORE 958#define vstore_partial_8_16 NO_STORE 959// Size == 16 960#define vstore_partial_16_0 NO_STORE 961#define vstore_partial_16_1 vstore_partial_1 962#define vstore_partial_16_2 vstore_partial_2 963#define vstore_partial_16_3 vstore_partial_3 964#define vstore_partial_16_4 vstore_partial_4 965#define vstore_partial_16_5 vstore_partial_5 966#define vstore_partial_16_6 vstore_partial_6 967#define vstore_partial_16_7 vstore_partial_7 968#define vstore_partial_16_8 vstore_partial_8 969#define vstore_partial_16_9 vstore_partial_9 970#define vstore_partial_16_10 vstore_partial_10 971#define vstore_partial_16_11 vstore_partial_11 972#define vstore_partial_16_12 vstore_partial_12 973#define vstore_partial_16_13 vstore_partial_13 974#define vstore_partial_16_14 vstore_partial_14 975#define vstore_partial_16_15 vstore_partial_15 976#define vstore_partial_16_16 vstore_partial_16 977 978/** Partial vstore. Store the **lower** 0 to (n-1)th elements of the given vector while minimising the amount of vstore ops 979 * @name vstore_partial_n 980 * 981 * @note @p DATA needs to be a vector not a scalar 982 * @note n needs to be <= the vector width of the input variable @p DATA 983 * eg 1: Valid 984 * vstore_partial_15(var:float16, 0, 0xabcd); 985 * eg 2: Invalid 986 * vstore_partial_7(var:float4, 0, 0xabcd); 987 * 988 * @note in cases n == 1, 2, 3, 4, 8, 16, no extra vstore is invoked, thus there's no performance penalty. 989 * 990 * @param[in] DATA The name of the variable 991 * @param[in] OFFSET Offset in n 992 * @param[in] PTR The base pointer 993 * @{ 994 */ 995#define vstore_partial_1(DATA, OFFSET, PTR) \ 996 vstore1(DATA.s0, OFFSET, PTR); 997 998#define vstore_partial_2(DATA, OFFSET, PTR) \ 999 vstore2(DATA.s01, OFFSET, PTR); 1000 1001#define vstore_partial_3(DATA, OFFSET, PTR) \ 1002 vstore3(DATA.s012, OFFSET, PTR); 1003 1004#define vstore_partial_4(DATA, OFFSET, PTR) \ 1005 vstore4(DATA.s0123, OFFSET, PTR); 1006 1007#define vstore_partial_5(DATA, OFFSET, PTR) \ 1008 vstore_partial_4(DATA.s0123, OFFSET, PTR); \ 1009 vstore1(DATA.s4, OFFSET, PTR + 4); 1010 1011#define vstore_partial_6(DATA, OFFSET, PTR) \ 1012 vstore_partial_4(DATA.s0123, OFFSET, PTR); \ 1013 vstore_partial_2(DATA.s45, OFFSET, PTR + 4); 1014 1015#define vstore_partial_7(DATA, OFFSET, PTR) \ 1016 vstore_partial_4(DATA.s0123, OFFSET, PTR); \ 1017 vstore_partial_3(DATA.s456, OFFSET, PTR + 4); 1018 1019#define vstore_partial_8(DATA, OFFSET, PTR) \ 1020 vstore8(DATA.s01234567, OFFSET, PTR); 1021 1022#define vstore_partial_9(DATA, OFFSET, PTR) \ 1023 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 1024 vstore1(DATA.s8, OFFSET, PTR + 8); 1025 1026#define vstore_partial_10(DATA, OFFSET, PTR) \ 1027 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 1028 vstore_partial_2(DATA.s89, OFFSET, PTR + 8); 1029 1030#define vstore_partial_11(DATA, OFFSET, PTR) \ 1031 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 1032 vstore_partial_3(DATA.s89a, OFFSET, PTR + 8); 1033 1034#define vstore_partial_12(DATA, OFFSET, PTR) \ 1035 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 1036 vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8); 1037 1038#define vstore_partial_13(DATA, OFFSET, PTR) \ 1039 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 1040 vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8); 1041 1042#define vstore_partial_14(DATA, OFFSET, PTR) \ 1043 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 1044 vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8); 1045 1046#define vstore_partial_15(DATA, OFFSET, PTR) \ 1047 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 1048 vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8); 1049 1050#define vstore_partial_16(DATA, OFFSET, PTR) \ 1051 vstore16(DATA, OFFSET, PTR); 1052/** @} */ // end of groupd vstore_partial_n 1053/** @} */ // end of groupd VSTORE_PARTIAL 1054 1055// Convert built-in functions with _sat modifier are not supported in floating point so we create defines 1056// without _sat to overcome this issue 1057#define convert_float_sat convert_float 1058#define convert_float1_sat convert_float 1059#define convert_float2_sat convert_float2 1060#define convert_float3_sat convert_float3 1061#define convert_float4_sat convert_float4 1062#define convert_float8_sat convert_float8 1063#define convert_float16_sat convert_float16 1064#define convert_half_sat convert_float 1065#define convert_half1_sat convert_half 1066#define convert_half2_sat convert_half2 1067#define convert_half3_sat convert_half3 1068#define convert_half4_sat convert_half4 1069#define convert_half8_sat convert_half8 1070#define convert_half16_sat convert_half16 1071 1072#define convert_float1 convert_float 1073#define convert_half1 convert_half 1074#define convert_char1 convert_char 1075#define convert_uchar1 convert_uchar 1076#define convert_short1 convert_short 1077#define convert_ushort1 convert_ushort 1078#define convert_int1 convert_int 1079#define convert_uint1 convert_uint 1080#define convert_long1 convert_long 1081#define convert_ulong1 convert_ulong 1082#define convert_double1 convert_double 1083 1084#define convert_char1_sat convert_char_sat 1085#define convert_uchar1_sat convert_uchar_sat 1086#define convert_short1_sat convert_short_sat 1087#define convert_ushort1_sat convert_ushort_sat 1088#define convert_int1_sat convert_int_sat 1089#define convert_uint1_sat convert_uint_sat 1090#define convert_long1_sat convert_long_sat 1091#define convert_ulong1_sat convert_ulong_sat 1092#define convert_double1_sat convert_double_sat 1093 1094#define VEC_DATA_TYPE_STR(type, size) type##size 1095#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size) 1096 1097#define CONVERT_STR(x, type) (convert_##type((x))) 1098#define CONVERT(x, type) CONVERT_STR(x, type) 1099 1100#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x))) 1101#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type) 1102 1103#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x))) 1104#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round) 1105 1106#define select_vec_dt_uchar(size) uchar##size 1107#define select_vec_dt_char(size) char##size 1108#define select_vec_dt_ushort(size) ushort##size 1109#define select_vec_dt_short(size) short##size 1110#define select_vec_dt_half(size) short##size 1111#define select_vec_dt_uint(size) uint##size 1112#define select_vec_dt_int(size) int##size 1113#define select_vec_dt_float(size) int##size 1114#define select_vec_dt_ulong(size) ulong##size 1115#define select_vec_dt_long(size) long##size 1116 1117#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size) 1118#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size) 1119#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1) 1120 1121#define sum_reduce_1(x) (x) 1122#define sum_reduce_2(x) ((x).s0) + ((x).s1) 1123#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2) 1124#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23) 1125#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567) 1126#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF) 1127 1128#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x) 1129#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size) 1130 1131#define max_reduce_1(x) (x) 1132#define max_reduce_2(x) max(((x).s0), ((x).s1)) 1133#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2)) 1134#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23)) 1135#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567)) 1136#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF)) 1137 1138#define MAX_REDUCE_STR(x, size) max_reduce_##size(x) 1139#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size) 1140 1141#define VECTOR_DECLARATION(name) \ 1142 __global uchar *name##_ptr, \ 1143 uint name##_stride_x, \ 1144 uint name##_step_x, \ 1145 uint name##_offset_first_element_in_bytes 1146 1147#define IMAGE_DECLARATION(name) \ 1148 __global uchar *name##_ptr, \ 1149 uint name##_stride_x, \ 1150 uint name##_step_x, \ 1151 uint name##_stride_y, \ 1152 uint name##_step_y, \ 1153 uint name##_offset_first_element_in_bytes 1154 1155#define TENSOR3D_DECLARATION(name) \ 1156 __global uchar *name##_ptr, \ 1157 uint name##_stride_x, \ 1158 uint name##_step_x, \ 1159 uint name##_stride_y, \ 1160 uint name##_step_y, \ 1161 uint name##_stride_z, \ 1162 uint name##_step_z, \ 1163 uint name##_offset_first_element_in_bytes 1164 1165#define TENSOR4D_DECLARATION(name) \ 1166 __global uchar *name##_ptr, \ 1167 uint name##_stride_x, \ 1168 uint name##_step_x, \ 1169 uint name##_stride_y, \ 1170 uint name##_step_y, \ 1171 uint name##_stride_z, \ 1172 uint name##_step_z, \ 1173 uint name##_stride_w, \ 1174 uint name##_step_w, \ 1175 uint name##_offset_first_element_in_bytes 1176 1177#define CONVERT_TO_VECTOR_STRUCT(name) \ 1178 update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x) 1179 1180#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \ 1181 update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0) 1182 1183#define CONVERT_TO_IMAGE_STRUCT(name) \ 1184 update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y) 1185 1186#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \ 1187 update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0) 1188 1189#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ 1190 update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) 1191 1192#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \ 1193 update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z) 1194 1195#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ 1196 update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) 1197 1198#define CONVERT_TO_TENSOR3D_STRUCT(name) \ 1199 update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 1200 name##_stride_z, name##_step_z) 1201 1202#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \ 1203 update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0) 1204 1205#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \ 1206 update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 1207 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size) 1208 1209#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \ 1210 update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size) 1211 1212#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name) \ 1213 tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 1214 name##_stride_z, name##_step_z) 1215 1216/** Structure to hold Vector information */ 1217typedef struct Vector 1218{ 1219 __global uchar *ptr; /**< Pointer to the starting postion of the buffer */ 1220 int offset_first_element_in_bytes; /**< The offset of the first element in the source image */ 1221 int stride_x; /**< Stride of the image in X dimension (in bytes) */ 1222} Vector; 1223 1224/** Structure to hold Image information */ 1225typedef struct Image 1226{ 1227 __global uchar *ptr; /**< Pointer to the starting postion of the buffer */ 1228 int offset_first_element_in_bytes; /**< The offset of the first element in the source image */ 1229 int stride_x; /**< Stride of the image in X dimension (in bytes) */ 1230 int stride_y; /**< Stride of the image in Y dimension (in bytes) */ 1231} Image; 1232 1233/** Structure to hold 3D tensor information */ 1234typedef struct Tensor3D 1235{ 1236 __global uchar *ptr; /**< Pointer to the starting postion of the buffer */ 1237 int offset_first_element_in_bytes; /**< The offset of the first element in the source image */ 1238 int stride_x; /**< Stride of the image in X dimension (in bytes) */ 1239 int stride_y; /**< Stride of the image in Y dimension (in bytes) */ 1240 int stride_z; /**< Stride of the image in Z dimension (in bytes) */ 1241} Tensor3D; 1242 1243/** Structure to hold 4D tensor information */ 1244typedef struct Tensor4D 1245{ 1246 __global uchar *ptr; /**< Pointer to the starting postion of the buffer */ 1247 int offset_first_element_in_bytes; /**< The offset of the first element in the source image */ 1248 int stride_x; /**< Stride of the image in X dimension (in bytes) */ 1249 int stride_y; /**< Stride of the image in Y dimension (in bytes) */ 1250 int stride_z; /**< Stride of the image in Z dimension (in bytes) */ 1251 int stride_w; /**< Stride of the image in W dimension (in bytes) */ 1252} Tensor4D; 1253 1254/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data. 1255 * 1256 * @param[in] ptr Pointer to the starting postion of the buffer 1257 * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector 1258 * @param[in] stride_x Stride of the vector in X dimension (in bytes) 1259 * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes) 1260 * 1261 * @return An image object 1262 */ 1263inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x) 1264{ 1265 Vector vector = 1266 { 1267 .ptr = ptr, 1268 .offset_first_element_in_bytes = offset_first_element_in_bytes, 1269 .stride_x = stride_x, 1270 }; 1271 vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x; 1272 return vector; 1273} 1274 1275/** Wrap image information into an Image structure, and make the pointer point at this workitem's data. 1276 * 1277 * @param[in] ptr Pointer to the starting postion of the buffer 1278 * @param[in] offset_first_element_in_bytes The offset of the first element in the source image 1279 * @param[in] stride_x Stride of the image in X dimension (in bytes) 1280 * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes) 1281 * @param[in] stride_y Stride of the image in Y dimension (in bytes) 1282 * @param[in] step_y stride_y * number of elements along Y processed per workitem(in bytes) 1283 * 1284 * @return An image object 1285 */ 1286inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y) 1287{ 1288 Image img = 1289 { 1290 .ptr = ptr, 1291 .offset_first_element_in_bytes = offset_first_element_in_bytes, 1292 .stride_x = stride_x, 1293 .stride_y = stride_y 1294 }; 1295 img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y; 1296 return img; 1297} 1298 1299/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data. 1300 * 1301 * @param[in] ptr Pointer to the starting postion of the buffer 1302 * @param[in] offset_first_element_in_bytes The offset of the first element in the source image 1303 * @param[in] stride_x Stride of the image in X dimension (in bytes) 1304 * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes) 1305 * @param[in] stride_y Stride of the image in Y dimension (in bytes) 1306 * @param[in] step_y stride_y * number of elements along Y processed per workitem(in bytes) 1307 * @param[in] stride_z Stride of the image in Z dimension (in bytes) 1308 * @param[in] step_z stride_z * number of elements along Z processed per workitem(in bytes) 1309 * 1310 * @return A 3D tensor object 1311 */ 1312inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 1313{ 1314 Image img = 1315 { 1316 .ptr = ptr, 1317 .offset_first_element_in_bytes = offset_first_element_in_bytes, 1318 .stride_x = stride_x, 1319 .stride_y = stride_y 1320 }; 1321 img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z; 1322 return img; 1323} 1324 1325/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data. 1326 * 1327 * @param[in] ptr Pointer to the starting postion of the buffer 1328 * @param[in] offset_first_element_in_bytes The offset of the first element in the source image 1329 * @param[in] stride_x Stride of the image in X dimension (in bytes) 1330 * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes) 1331 * @param[in] stride_y Stride of the image in Y dimension (in bytes) 1332 * @param[in] step_y stride_y * number of elements along Y processed per workitem(in bytes) 1333 * @param[in] stride_z Stride of the image in Z dimension (in bytes) 1334 * @param[in] step_z stride_z * number of elements along Z processed per workitem(in bytes) 1335 * 1336 * @return A 3D tensor object 1337 */ 1338inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 1339{ 1340 Tensor3D tensor = 1341 { 1342 .ptr = ptr, 1343 .offset_first_element_in_bytes = offset_first_element_in_bytes, 1344 .stride_x = stride_x, 1345 .stride_y = stride_y, 1346 .stride_z = stride_z 1347 }; 1348 tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z; 1349 return tensor; 1350} 1351 1352/** Wrap 3D tensor information into an tensor structure. 1353 * 1354 * @param[in] ptr Pointer to the starting postion of the buffer 1355 * @param[in] offset_first_element_in_bytes The offset of the first element in the source image 1356 * @param[in] stride_x Stride of the image in X dimension (in bytes) 1357 * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes) 1358 * @param[in] stride_y Stride of the image in Y dimension (in bytes) 1359 * @param[in] step_y stride_y * number of elements along Y processed per workitem(in bytes) 1360 * @param[in] stride_z Stride of the image in Z dimension (in bytes) 1361 * @param[in] step_z stride_z * number of elements along Z processed per workitem(in bytes) 1362 * 1363 * @return A 3D tensor object 1364 */ 1365inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 1366{ 1367 Tensor3D tensor = 1368 { 1369 .ptr = ptr, 1370 .offset_first_element_in_bytes = offset_first_element_in_bytes, 1371 .stride_x = stride_x, 1372 .stride_y = stride_y, 1373 .stride_z = stride_z 1374 }; 1375 return tensor; 1376} 1377 1378inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w, 1379 uint step_w, 1380 uint mod_size) 1381{ 1382 Tensor4D tensor = 1383 { 1384 .ptr = ptr, 1385 .offset_first_element_in_bytes = offset_first_element_in_bytes, 1386 .stride_x = stride_x, 1387 .stride_y = stride_y, 1388 .stride_z = stride_z, 1389 .stride_w = stride_w 1390 }; 1391 1392 tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w; 1393 return tensor; 1394} 1395 1396/** Get the pointer position of a Vector 1397 * 1398 * @param[in] vec Pointer to the starting position of the buffer 1399 * @param[in] x Relative X position 1400 */ 1401inline __global const uchar *vector_offset(const Vector *vec, int x) 1402{ 1403 return vec->ptr + x * vec->stride_x; 1404} 1405 1406/** Get the pointer position of a Image 1407 * 1408 * @param[in] img Pointer to the starting position of the buffer 1409 * @param[in] x Relative X position 1410 * @param[in] y Relative Y position 1411 */ 1412inline __global uchar *offset(const Image *img, int x, int y) 1413{ 1414 return img->ptr + x * img->stride_x + y * img->stride_y; 1415} 1416 1417/** Get the pointer position of a Tensor3D 1418 * 1419 * @param[in] tensor Pointer to the starting position of the buffer 1420 * @param[in] x Relative X position 1421 * @param[in] y Relative Y position 1422 * @param[in] z Relative Z position 1423 */ 1424inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z) 1425{ 1426 return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z; 1427} 1428 1429/** Get the pointer position of a Tensor4D 1430 * 1431 * @param[in] tensor Pointer to the starting position of the buffer 1432 * @param[in] x Relative X position 1433 * @param[in] y Relative Y position 1434 * @param[in] z Relative Z position 1435 * @param[in] w Relative W position 1436 */ 1437inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w) 1438{ 1439 return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w; 1440} 1441 1442/** Get the offset for a given linear index of a Tensor3D 1443 * 1444 * @param[in] tensor Pointer to the starting position of the buffer 1445 * @param[in] width Width of the input tensor 1446 * @param[in] height Height of the input tensor 1447 * @param[in] depth Depth of the input tensor 1448 * @param[in] index Linear index 1449 */ 1450inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index) 1451{ 1452 uint num_elements = width * height; 1453 1454 const uint z = index / num_elements; 1455 1456 index %= num_elements; 1457 1458 const uint y = index / width; 1459 1460 index %= width; 1461 1462 const uint x = index; 1463 1464 return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes; 1465} 1466 1467#endif // _HELPER_H 1468 1469)"