1R"( 2 3/* 4 * Copyright (c) 2016-2020 Arm Limited. 5 * 6 * SPDX-License-Identifier: MIT 7 * 8 * Permission is hereby granted, free of charge, to any person obtaining a copy 9 * of this software and associated documentation files (the "Software"), to 10 * deal in the Software without restriction, including without limitation the 11 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 12 * sell copies of the Software, and to permit persons to whom the Software is 13 * furnished to do so, subject to the following conditions: 14 * 15 * The above copyright notice and this permission notice shall be included in all 16 * copies or substantial portions of the Software. 17 * 18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 * SOFTWARE. 25 */ 26/* 27 * Copyright (c) 2019-2020 Arm Limited. 28 * 29 * SPDX-License-Identifier: MIT 30 * 31 * Permission is hereby granted, free of charge, to any person obtaining a copy 32 * of this software and associated documentation files (the "Software"), to 33 * deal in the Software without restriction, including without limitation the 34 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 35 * sell copies of the Software, and to permit persons to whom the Software is 36 * furnished to do so, subject to the following conditions: 37 * 38 * The above copyright notice and this permission notice shall be included in all 39 * copies or substantial portions of the Software. 
40 * 41 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 42 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 43 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 44 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 45 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 46 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 47 * SOFTWARE. 48 */ 49 50/* 51 * Copyright (c) 2016-2020 Arm Limited. 52 * 53 * SPDX-License-Identifier: MIT 54 * 55 * Permission is hereby granted, free of charge, to any person obtaining a copy 56 * of this software and associated documentation files (the "Software"), to 57 * deal in the Software without restriction, including without limitation the 58 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 59 * sell copies of the Software, and to permit persons to whom the Software is 60 * furnished to do so, subject to the following conditions: 61 * 62 * The above copyright notice and this permission notice shall be included in all 63 * copies or substantial portions of the Software. 64 * 65 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 66 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 67 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 68 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 69 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 70 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 71 * SOFTWARE. 72 */ 73#ifndef ARM_COMPUTE_HELPER_H 74#define ARM_COMPUTE_HELPER_H 75 76/* 77 * Copyright (c) 2020 Arm Limited. 
78 * 79 * SPDX-License-Identifier: MIT 80 * 81 * Permission is hereby granted, free of charge, to any person obtaining a copy 82 * of this software and associated documentation files (the "Software"), to 83 * deal in the Software without restriction, including without limitation the 84 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 85 * sell copies of the Software, and to permit persons to whom the Software is 86 * furnished to do so, subject to the following conditions: 87 * 88 * The above copyright notice and this permission notice shall be included in all 89 * copies or substantial portions of the Software. 90 * 91 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 92 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 93 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 94 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 95 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 96 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 97 * SOFTWARE. 98 */ 99 100/** Store the 0 to (n-1)th rows of the given variables 101 * @name STORE_ROW_n 102 * 103 * @param[in] N0 The width of the passed in vector. 
Supported: 1, 2, 3, 4, 8, 16
 * @param[in] DATA_TYPE The data type of the vectors
 * @param[in] BASENAME  The basename of the variables
 * @param[in] PTR       The base pointer
 * @param[in] STRIDE_Y  The stride value in y-axis direction
 * @param[in] Z         The offset in z-axis direction
 * @{
 */
#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)                                                 \
    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

/* Each STORE_ROW_n expands STORE_ROW_(n-1) and then stores row (n-1),
 * so STORE_ROW_n emits n vstores for rows 0..n-1. Rows 10..15 use the
 * hex-style suffixes A..F on BASENAME and Z. */
#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
    VSTORE(N0)                                                  \
    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
/** @} */ // end of group STORE_ROW_n

/** Convert and store the 0th to (n-1)th rows of the given variables
 * @name CONVERT_STORE_ROW_n
 *
 *
@param[in] N0 The size of the vectors 195 * @param[in] DATA_TYPE The data type of the vectors 196 * @param[in] BASENAME The basename of the variables 197 * @param[in] PTR The base pointer 198 * @param[in] STRIDE_Y The stride value in y-axis direction 199 * @param[in] Z The offset in z-axis direction 200 * @{ 201 */ 202#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 203 VSTORE(N0) \ 204 (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 205 206#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 207 CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 208 VSTORE(N0) \ 209 (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 210 211#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 212 CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 213 VSTORE(N0) \ 214 (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 215 216#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 217 CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 218 VSTORE(N0) \ 219 (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 220 221#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 222 CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 223 VSTORE(N0) \ 224 (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 225 226#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 227 CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 228 VSTORE(N0) \ 229 (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 230 231#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, 
BASENAME, PTR, STRIDE_Y, Z) \ 232 CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 233 VSTORE(N0) \ 234 (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 235 236#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 237 CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 238 VSTORE(N0) \ 239 (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 240 241#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 242 CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 243 VSTORE(N0) \ 244 (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 245 246#define CONVERT_STORE_ROW_10(N0, DATA, BASENAME, PTR, STRIDE_Y, Z) \ 247 CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 248 VSTORE(N0) \ 249 (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 250 251#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 252 CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 253 VSTORE(N0) \ 254 (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 255 256#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 257 CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 258 VSTORE(N0) \ 259 (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 260 261#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 262 CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 263 VSTORE(N0) \ 264 (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 265 266#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, 
BASENAME, PTR, STRIDE_Y, Z) \ 267 CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 268 VSTORE(N0) \ 269 (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 270 271#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 272 CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 273 VSTORE(N0) \ 274 (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 275 276#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 277 CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 278 VSTORE(N0) \ 279 (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 280 281/** @} */ // end of groupd CONVERT_STORE_ROW_n 282 283/** Store a block of the given size M0xN0 284 * @name STORE_BLOCK 285 * 286 * Supported cases are M0=1,2,3,...,16 and N0=2,3,4,8,16. 287 * The data to store is expected to have consecutive names for each row. 288 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2. 289 * The Z offset is expected to have consecutive names. 290 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2. 
 *
 * @param[in] M0        The number of rows to store
 * @param[in] N0        The size of each vector
 * @param[in] DATA_TYPE The data type of the vectors
 * @param[in] BASENAME  The basename of the variables
 * @param[in] PTR       The base pointer
 * @param[in] STRIDE_Y  The stride value in y-axis direction
 * @param[in] Z         The offset in z-axis direction
 * @{
 */
/* Two-level expansion: the outer macro forces M0 to be fully expanded before
 * it is token-pasted onto STORE_ROW_ in the inner macro. */
#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
/** @} */ // end of group STORE_BLOCK

/** Convert and store a block of the given size M0xN0
 * @name CONVERT_STORE_BLOCK
 *
 * Supported cases are M0=1,2,3,...,16 and N0=2,3,4,8,16.
 * The data to store is expected to have consecutive names for each row.
 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
 * The Z offset is expected to have consecutive names.
 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
 *
 * @param[in] M0        The number of rows to store
 * @param[in] N0        The size of each vector
 * @param[in] DATA_TYPE The data type of the vectors
 * @param[in] BASENAME  The basename of the variables
 * @param[in] PTR       The base pointer
 * @param[in] STRIDE_Y  The stride value in y-axis direction
 * @param[in] Z         The offset in z-axis direction
 * @{
 */
/* Same two-level expansion trick as STORE_BLOCK, dispatching on M0. */
#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
/** @} */ // end of group CONVERT_STORE_BLOCK

/** Partially store the 0 to (n-1)th rows of the given variables
 * @name STORE_ROW_PARTIAL_n
 * Within each row, store the lower @p STORE_N0 elements of vectors of width @p N0
 *
 * @note in case @p STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
 *
 * @param[in] N0       The width of the passed in vector. Supported: 1, 2, 3, 4, 8, 16
 * @param[in] STORE_N0 The **lower** size of the vectors to store.
Supported: 1-16 and <= @p N0
 * @param[in] DATA_TYPE The data type of the vectors
 * @param[in] BASENAME  The basename of the variables
 * @param[in] PTR       The base pointer
 * @param[in] STRIDE_Y  The stride value in y-axis direction
 * @param[in] Z         The offset in z-axis direction
 * @{
 */
#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

/* Each STORE_ROW_PARTIAL_n expands STORE_ROW_PARTIAL_(n-1) and then partially
 * stores row (n-1): only the lower STORE_N0 elements of each N0-wide vector.
 * Rows 10..15 use the hex-style suffixes A..F on BASENAME and Z. */
#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
/** @} */ // end of group STORE_ROW_PARTIAL_n

/** Partially store a block of the given size STORE_M0xSTORE_N0
 * @name STORE_BLOCK_PARTIAL
 *
 * @note The vector width @p N0 is also required for correct partial storing behaviour.
 * @note in case @p STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
 *
 * The data to store is expected to have consecutive names for each row.
 * E.g., for STORE_M0=3 and basename=c, the expected names are c0, c1 and c2.
 * The Z offset is expected to have consecutive names.
 * E.g., for STORE_M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
 *
 * @param[in] STORE_M0 The number of rows to store. Supported: 1-16
 * @param[in] STORE_N0 The lower number of elements of vectors to store. Supported: 1-16 and <= @p N0
 * @param[in] N0       The size of each vector.
Supported: 1, 2, 3, 4, 8, 16
 * @param[in] DATA_TYPE The data type of the vectors
 * @param[in] BASENAME  The basename of the variables
 * @param[in] PTR       The base pointer
 * @param[in] STRIDE_Y  The stride value in y-axis direction
 * @param[in] Z         The offset in z-axis direction
 * @{
 */
/* Two-level expansion so STORE_M0 is fully expanded before token-pasting. */
#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
/** Store a block that can be partial in both x and y dimensions
 *
 * @note in cases @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
 *
 * The data to store is expected to have consecutive names for each row.
 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
 * The Z offset is expected to have consecutive names.
 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
 *
 * @param[in] M0               The number of rows to store, for non-partial blocks. Supported: 1-16
 * @param[in] N0               The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
 * @param[in] DATA_TYPE        The data type of the vectors
 * @param[in] BASENAME         The basename of the variables
 * @param[in] PTR              The base pointer
 * @param[in] STRIDE_Y         The stride value in y-axis direction
 * @param[in] Z                The offset in z-axis direction
 * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported range: [1, @p M0)
 * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported range: [1, @p N0)
 * @param[in] PARTIAL_COND_Y   Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0.
 * @param[in] PARTIAL_COND_X   Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0.
 */
/* Dispatch on the two run-time conditions: full block, partial in y only,
 * partial in x only, or partial in both. */
#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y))                                                                                                           \
    {                                                                                                                                                    \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                                          \
    }                                                                                                                                                    \
    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X))                                                                                                       \
    {                                                                                                                                                    \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                            \
    }                                                                                                                                                    \
    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X))                                                                                                       \
    {                                                                                                                                                    \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                            \
    }                                                                                                                                                    \
    else                                                                                                                                                 \
    {                                                                                                                                                    \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                              \
    }
/** Store a block that can only be partial in x but not y.
 *
 * @note in case @p N0 or @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
 *
 * The data to store is expected to have consecutive names for each row.
 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
 * The Z offset is expected to have consecutive names.
 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
 *
 * @param[in] M0               The number of rows to store, for non-partial blocks. Supported: 1-16
 * @param[in] N0               The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
 * @param[in] DATA_TYPE        The data type of the vectors
 * @param[in] BASENAME         The basename of the variables
 * @param[in] PTR              The base pointer
 * @param[in] STRIDE_Y         The stride value in y-axis direction
 * @param[in] Z                The offset in z-axis direction
 * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported range: [1, @p N0)
 * @param[in] PARTIAL_COND_X   Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0.
 */
#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X))                                                                                         \
    {                                                                                                             \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
    }                                                                                                             \
    else                                                                                                          \
    {                                                                                                             \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
    }
/** Store a block that can only be partial in y but not x.
 *
 * @note in case @p N0 or @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
 *
 * The data to store is expected to have consecutive names for each row.
 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
 * The Z offset is expected to have consecutive names.
 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
 *
 * @param[in] M0               The number of rows to store, for non-partial blocks. Supported: 1-16
 * @param[in] N0               The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
 * @param[in] DATA_TYPE        The data type of the vectors
 * @param[in] BASENAME         The basename of the variables
 * @param[in] PTR              The base pointer
 * @param[in] STRIDE_Y         The stride value in y-axis direction
 * @param[in] Z                The offset in z-axis direction
 * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported range: [1, @p M0)
 * @param[in] PARTIAL_COND_Y   Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0.
 */
#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
    if(!(PARTIAL_COND_Y))                                                                                         \
    {                                                                                                             \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
    }                                                                                                             \
    else                                                                                                          \
    {                                                                                                             \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
    }
/** @} */ // end of group STORE_BLOCK_PARTIAL

#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)

/** Boundary-aware GEMM block store
 * @name STORE_BLOCK_BOUNDARY_AWARE
 * This macro assumes the following schemes to achieve boundary-awareness:
 * - Overlapping load in Y axis from lhs tensor. This implies lhs has no padding along y dim.
 * - Non-Overlapping(normal) load from rhs tensor. This implies rhs can have paddings.
 * - Overlapping load in Y axis from bias tensor. This implies rhs has no padding along y dim.
 * The macro then ensures that the dst tensor can be stored without any paddings in both x and y dim.
 *
 * In the y dimension, we place the partial blocks **at the beginning** while in the x dimension, we place the partial
 * blocks **at the end**.
 * Say, the dst tensor is of shape MxN and we have M0 and N0 as the block size, this is how we define "partial blocks"/
 * "boundary block" (we use the 2 terms "partial blocks" and "boundary blocks" interchangeably) and its various parameters:
 *
 * (diagram alignment reconstructed — verify against upstream if exact columns matter)
 *  *--x-->                         x == 0                        x == 1
 *  |                  |<------------------------------N-------------------------->|
 *  y                  |<--------------N0------------->|<----PARTIAL_STORE_N0----->|
 *  |     -------------#############################################################
 *  *     |            |...............................|...........................|
 * y == 0 | PAR_..._M0 |......Boundary block in y......|.Boundary block in x and y.|
 *  |     |            |...............................|...........................|
 *  M     --#############################################################
 *  |     |            |                               |...........................|
 * y == 1 |     M0     |      Non-boundary block       |....Boundary block in x....|
 *  |     |            |                               |...........................|
 *  |------------#############################################################
 *
 * Then @p PARTIAL_STORE_M0 = M % M0 and @p PARTIAL_STORE_N0 = N % N0
 *
 * @note in cases @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
 *
 * It automatically detects if a giving M,N,M0,N0 combination can yield partial blocks in either X and Y dimension,
 * and select corresponding store methods such that the boundary detection logic is only added when needed.
 *
 * The data to store is expected to have consecutive names for each row.
 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
 * The Z offset is expected to have consecutive names.
 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
 *
 * @param[in] M0               The number of rows to store, for non-partial blocks. Supported: 1-16
 * @param[in] N0               The size of each vector, for non-partial blocks.
Supported: 1, 2, 3, 4, 8, 16
 * @param[in] DATA_TYPE        The data type of the vectors
 * @param[in] BASENAME         The basename of the variables
 * @param[in] PTR              The base pointer
 * @param[in] STRIDE_Y         The stride value in y-axis direction
 * @param[in] Z                The offset in z-axis direction
 * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported: [0, @p M0)
 * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported: [0, @p N0)
 * @param[in] PARTIAL_COND_Y   Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0.
 * @param[in] PARTIAL_COND_X   Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0.
 * @{
 */
/* The case is selected at compile time from PARTIAL_STORE_M0/N0 so the
 * run-time boundary checks are only compiled in when actually needed. */
#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
// Case1: No partial blocks in either x or y
#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
// Case2: Partial blocks in y
#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)

#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
// Case3: Partial blocks in x
#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)

#else // PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 > 0
// Case4: Partial blocks in both x and y
#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)

#endif // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0

#endif // defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
/** @} */ // end of group STORE_BLOCK_BOUNDARY_AWARE

#if defined(PARTIAL_STORE_M0)
/** Compute the start m0 row (LHS, BIAS and DST) in a boundary-aware way so as to avoid padding
 * @name COMPUTE_M0_START_ROW
 * If there're any partial blocks in y dimension, they are placed at the beginning of the rows.
 * This shift amount is added to all rows such that the partial block (at the beginning) overlaps with the subsequent
 * blocks in the y dimension to avoid any padding.
 * EG: M0=4, PARTIAL_STORE_M0=1:
 *                  | Non-overlapping | +M0_ROW_SHIFT (Overlapping)
 * block 0 (partial)| start row = 0   | start row = 0
 * block 1 (full)   | start row = 4   | start row = 1
 * block 2 (full)   | start row = 8   | start row = 5
 *
 * @param[in] y                Global id of current block in y.
 * @param[in] M0               The number of rows to store, for non-partial blocks. Supported: 1-16
 * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported: [0, @p M0)
 * @{
 */
/* Shift every block up by (M0 - PARTIAL_STORE_M0) % M0 rows, clamped at 0,
 * so the first (partial) block overlaps the next one instead of padding. */
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0))))
#else // defined(PARTIAL_STORE_M0)
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(y * M0))
#endif // defined(PARTIAL_STORE_M0)
/** @} */ // end of group COMPUTE_M0_START_ROW

/** Store a vector that can only be partial in x.
 *
 * @note in case @p vec_size or @p leftover != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
 *
 * The data to store is expected to end in a 0.
 * E.g., for basename=c, the expected name is c0.
 *
 * @param[in] basename  The name of the variable without trailing 0
 * @param[in] data_type The data type of the vector
 * @param[in] ptr       The base pointer
 * @param[in] vec_size  The vector size if cond = false. Supported: 1, 2, 3, 4, 8, 16
 * @param[in] leftover  The vector size if cond = true. Supported range: [1, @p vec_size)
 * @param[in] cond      Condition to select either vec_size (false) or leftover (true)
 * @{
 */
/* Single-row (M0=1) wrapper over STORE_BLOCK_PARTIAL_IN_X; STRIDE_Y and Z
 * are irrelevant for one row and are passed as 0. */
#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)
/** @} */ // end of group STORE_VECTOR_SELECT

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)

#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)

#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)

#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
#pragma OPENCL EXTENSION cl_arm_printf : enable
#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)

// GPU architecture identifiers used for arch-specific code paths.
#define GPU_ARCH_MIDGARD 0x100
#define GPU_ARCH_BIFROST 0x200

/** Concatenate two inputs.
683 * 684 * @param[in] a The first input to be concatenated 685 * @param[in] b The second input to be concatenated 686 * 687 * @return The concatenated output 688 */ 689#define CONCAT(a, b) a##b 690 691/** Expand the given vector 692 * 693 * @param[in] x The vector to be expanded 694 * 695 * @return The expanded output 696 */ 697#define EXPAND(x) x 698 699/** Clamp the given value between an upper and lower bound. 700 * 701 * @param[in] x The value to be clamped 702 * @param[in] min_val The lower bound 703 * @param[in] max_val The upper bound 704 * 705 * @return The clamped value. 706 */ 707#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val) 708 709/** REVn reverses the given vector whose size is n. 710 * @name REVn 711 * 712 * @param[in] x The vector to be reversed 713 * 714 * @return The reversed vector 715 * @{ 716 */ 717#define REV1(x) ((x)) 718#define REV2(x) ((x).s10) 719#define REV3(x) ((x).s210) 720#define REV4(x) ((x).s3210) 721#define REV8(x) ((x).s76543210) 722#define REV16(x) ((x).sFEDCBA9876543210) 723/** @} */ // end of group REVn 724 725/** Reverse the given vector. 726 * @name REVERSE 727 * 728 * @param[in] x The vector to be reversed 729 * @param[in] s The size of the vector 730 * 731 * @return The reversed vector 732 * @{ 733 */ 734#define REVERSE_STR(x, s) REV##s((x)) 735#define REVERSE(x, s) REVERSE_STR(x, s) 736/** @} */ // end of group REVERSE 737 738/** Circular-right-shift (rotate-right) the vector of size s by the amount of n. 
739 * @name ROTs_n 740 * 741 * @param[in] x The vector to be shifted 742 * 743 * @return The shifted vector 744 * @{ 745 */ 746#define ROT1_0(x) ((x)) 747 748#define ROT2_0(x) ((x)) 749#define ROT2_1(x) ((x).s10) 750 751#define ROT3_0(x) ((x)) 752#define ROT3_1(x) ((x).s201) 753#define ROT3_2(x) ((x).s120) 754 755#define ROT4_0(x) ((x)) 756#define ROT4_1(x) ((x).s3012) 757#define ROT4_2(x) ((x).s2301) 758#define ROT4_3(x) ((x).s1230) 759 760#define ROT8_0(x) ((x)) 761#define ROT8_1(x) ((x).s70123456) 762#define ROT8_2(x) ((x).s67012345) 763#define ROT8_3(x) ((x).s56701234) 764#define ROT8_4(x) ((x).s45670123) 765#define ROT8_5(x) ((x).s34567012) 766#define ROT8_6(x) ((x).s23456701) 767#define ROT8_7(x) ((x).s12345670) 768 769#define ROT16_0(x) ((x)) 770#define ROT16_1(x) ((x).sF0123456789ABCDE) 771#define ROT16_2(x) ((x).sEF0123456789ABCD) 772#define ROT16_3(x) ((x).sDEF0123456789ABC) 773#define ROT16_4(x) ((x).sCDEF0123456789AB) 774#define ROT16_5(x) ((x).sBCDEF0123456789A) 775#define ROT16_6(x) ((x).sABCDEF0123456789) 776#define ROT16_7(x) ((x).s9ABCDEF012345678) 777#define ROT16_8(x) ((x).s89ABCDEF01234567) 778#define ROT16_9(x) ((x).s789ABCDEF0123456) 779#define ROT16_10(x) ((x).s6789ABCDEF012345) 780#define ROT16_11(x) ((x).s56789ABCDEF01234) 781#define ROT16_12(x) ((x).s456789ABCDEF0123) 782#define ROT16_13(x) ((x).s3456789ABCDEF012) 783#define ROT16_14(x) ((x).s23456789ABCDEF01) 784#define ROT16_15(x) ((x).s123456789ABCDEF0) 785/** @} */ // end of group ROTs_n 786 787/** Circular-right-shift (rotate-right) the given vector by the given amount. 
788 * @name ROTATE 789 * 790 * @param[in] x The vector to be shifted 791 * @param[in] s The size of the vector 792 * @param[in] n The amount to be shifted 793 * 794 * @return The shifted vector 795 * @{ 796 */ 797#define ROTATE_STR(x, s, n) ROT##s##_##n(x) 798#define ROTATE(x, s, n) ROTATE_STR(x, s, n) 799/** @} */ // end of group ROTATE 800 801/** Creates a vector of size n filled with offset values corresponding to the location of each element. 802 * @name V_OFFSn 803 * 804 * @param[in] dt The data type of the output vector 805 * 806 * @return The vector filled with offset values 807 * @{ 808 */ 809#define V_OFFS1(dt) (dt##1)(0) 810#define V_OFFS2(dt) (dt##2)(0, 1) 811#define V_OFFS3(dt) (dt##3)(0, 1, 2) 812#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3) 813#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7) 814#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) 815/** @} */ // end of group V_OFFSn 816 817/** Create a vector filled with offset values corresponding to the location of each element. 818 * @name VEC_OFFS 819 * 820 * @param[in] dt The data type of the output vector 821 * @param[in] s The size of the output vector 822 * 823 * @return The vector filled with offset values 824 * @{ 825 */ 826#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt) 827#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s) 828/** @} */ // end of group VEC_OFFS 829 830#define VLOAD_STR(size) vload##size 831#define VLOAD(size) VLOAD_STR(size) 832 833#define PIXEL_UNIT4 1 834#define PIXEL_UNIT8 2 835#define PIXEL_UNIT16 4 836 837/** Utility macro to convert a vector size in pixel unit. 838 * 839 * @name CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT 840 * 841 * @param[in] vec_size Vector size. 
Only 4,8 and 16 is supported 842 * 843 * @return The pixel unit (number of pixels) 844 * @{ 845 */ 846#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size 847#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) 848/** @} */ // end of group CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT 849 850#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord))); 851#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord))); 852#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord))); 853 854#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 855#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord))); 856#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord))); 857#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord))); 858#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 859 860/** Utility macro to read a 2D OpenCL image object. 861 * 862 * @note Coordinates are not normalized 863 * 864 * @param[in] data_type Data type 865 * @param[in] n0 Number of pixel to read. 
Only 1,2 and 4 is supported 866 * @param[in] img OpenCL image object 867 * @param[in] x_coord The x coordinate for the top-left pixel 868 * @param[in] y_coord The y coordinate for the top-left pixel 869 * 870 * @return Pixels from the 2D OpenCL image object 871 * @{ 872 */ 873#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord) 874#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) 875 876#define VSTORE_STR(size) vstore##size 877#define VSTORE(size) VSTORE_STR(size) 878 879#define float1 float 880#define half1 half 881#define char1 char 882#define uchar1 uchar 883#define short1 short 884#define ushort1 ushort 885#define int1 int 886#define uint1 uint 887#define long1 long 888#define ulong1 ulong 889#define double1 double 890 891#define vload1(OFFSET, PTR) *(OFFSET + PTR) 892#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA 893 894/** Extended partial vstore that correctly handles scalar values as well. 895 * Store the **lower** 0 to (n-1)th elements of the given vector while minimising the amount of vstore ops 896 * @name VSTORE_PARTIAL 897 * 898 * @note With this macro, the passed data can be both a vector and a scalar 899 * @note @p store_size needs to be <= @p size 900 * eg 1: Valid 901 * VSTORE_PARTIAL(16, 15) ...; 902 * eg 2: Invalid 903 * VSTORE_PARTIAL(4, 7) ...; 904 * 905 * @param[in] size The width of @p DATA. Supported values: 1(scalar), 2, 3, 4, 8, 16 906 * @param[in] store_size The number of lower elements to store. 
Supported values: 1-16, but has to be <= @p size 907 * @{ 908 */ 909#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size 910#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size) 911 912#define NO_STORE(data, offs, ptr) \ 913 { \ 914 } 915 916// Size == 1 (scalar) 917#define vstore_partial_1_0 NO_STORE 918#define vstore_partial_1_1 vstore1 919#define vstore_partial_1_2 NO_STORE 920#define vstore_partial_1_3 NO_STORE 921#define vstore_partial_1_4 NO_STORE 922#define vstore_partial_1_5 NO_STORE 923#define vstore_partial_1_6 NO_STORE 924#define vstore_partial_1_7 NO_STORE 925#define vstore_partial_1_8 NO_STORE 926#define vstore_partial_1_9 NO_STORE 927#define vstore_partial_1_10 NO_STORE 928#define vstore_partial_1_11 NO_STORE 929#define vstore_partial_1_12 NO_STORE 930#define vstore_partial_1_13 NO_STORE 931#define vstore_partial_1_14 NO_STORE 932#define vstore_partial_1_15 NO_STORE 933#define vstore_partial_1_16 NO_STORE 934// Size == 2 935#define vstore_partial_2_0 NO_STORE 936#define vstore_partial_2_1 vstore_partial_1 937#define vstore_partial_2_2 vstore_partial_2 938#define vstore_partial_2_3 NO_STORE 939#define vstore_partial_2_4 NO_STORE 940#define vstore_partial_2_5 NO_STORE 941#define vstore_partial_2_6 NO_STORE 942#define vstore_partial_2_7 NO_STORE 943#define vstore_partial_2_8 NO_STORE 944#define vstore_partial_2_9 NO_STORE 945#define vstore_partial_2_10 NO_STORE 946#define vstore_partial_2_11 NO_STORE 947#define vstore_partial_2_12 NO_STORE 948#define vstore_partial_2_13 NO_STORE 949#define vstore_partial_2_14 NO_STORE 950#define vstore_partial_2_15 NO_STORE 951#define vstore_partial_2_16 NO_STORE 952// Size == 3 953#define vstore_partial_3_0 NO_STORE 954#define vstore_partial_3_1 vstore_partial_1 955#define vstore_partial_3_2 vstore_partial_2 956#define vstore_partial_3_3 vstore_partial_3 957#define vstore_partial_3_4 NO_STORE 958#define vstore_partial_3_5 NO_STORE 959#define vstore_partial_3_6 
NO_STORE 960#define vstore_partial_3_7 NO_STORE 961#define vstore_partial_3_8 NO_STORE 962#define vstore_partial_3_9 NO_STORE 963#define vstore_partial_3_10 NO_STORE 964#define vstore_partial_3_11 NO_STORE 965#define vstore_partial_3_12 NO_STORE 966#define vstore_partial_3_13 NO_STORE 967#define vstore_partial_3_14 NO_STORE 968#define vstore_partial_3_15 NO_STORE 969#define vstore_partial_3_16 NO_STORE 970// Size == 4 971#define vstore_partial_4_0 NO_STORE 972#define vstore_partial_4_1 vstore_partial_1 973#define vstore_partial_4_2 vstore_partial_2 974#define vstore_partial_4_3 vstore_partial_3 975#define vstore_partial_4_4 vstore_partial_4 976#define vstore_partial_4_5 NO_STORE 977#define vstore_partial_4_6 NO_STORE 978#define vstore_partial_4_7 NO_STORE 979#define vstore_partial_4_8 NO_STORE 980#define vstore_partial_4_9 NO_STORE 981#define vstore_partial_4_10 NO_STORE 982#define vstore_partial_4_11 NO_STORE 983#define vstore_partial_4_12 NO_STORE 984#define vstore_partial_4_13 NO_STORE 985#define vstore_partial_4_14 NO_STORE 986#define vstore_partial_4_15 NO_STORE 987#define vstore_partial_4_16 NO_STORE 988// Size == 8 989#define vstore_partial_8_0 NO_STORE 990#define vstore_partial_8_1 vstore_partial_1 991#define vstore_partial_8_2 vstore_partial_2 992#define vstore_partial_8_3 vstore_partial_3 993#define vstore_partial_8_4 vstore_partial_4 994#define vstore_partial_8_5 vstore_partial_5 995#define vstore_partial_8_6 vstore_partial_6 996#define vstore_partial_8_7 vstore_partial_7 997#define vstore_partial_8_8 vstore_partial_8 998#define vstore_partial_8_9 NO_STORE 999#define vstore_partial_8_10 NO_STORE 1000#define vstore_partial_8_11 NO_STORE 1001#define vstore_partial_8_12 NO_STORE 1002#define vstore_partial_8_13 NO_STORE 1003#define vstore_partial_8_14 NO_STORE 1004#define vstore_partial_8_15 NO_STORE 1005#define vstore_partial_8_16 NO_STORE 1006// Size == 16 1007#define vstore_partial_16_0 NO_STORE 1008#define vstore_partial_16_1 vstore_partial_1 1009#define 
vstore_partial_16_2 vstore_partial_2 1010#define vstore_partial_16_3 vstore_partial_3 1011#define vstore_partial_16_4 vstore_partial_4 1012#define vstore_partial_16_5 vstore_partial_5 1013#define vstore_partial_16_6 vstore_partial_6 1014#define vstore_partial_16_7 vstore_partial_7 1015#define vstore_partial_16_8 vstore_partial_8 1016#define vstore_partial_16_9 vstore_partial_9 1017#define vstore_partial_16_10 vstore_partial_10 1018#define vstore_partial_16_11 vstore_partial_11 1019#define vstore_partial_16_12 vstore_partial_12 1020#define vstore_partial_16_13 vstore_partial_13 1021#define vstore_partial_16_14 vstore_partial_14 1022#define vstore_partial_16_15 vstore_partial_15 1023#define vstore_partial_16_16 vstore_partial_16 1024 1025/** Partial vstore. Store the **lower** 0 to (n-1)th elements of the given vector while minimising the amount of vstore ops 1026 * @name vstore_partial_n 1027 * 1028 * @note @p DATA needs to be a vector not a scalar 1029 * @note n needs to be <= the vector width of the input variable @p DATA 1030 * eg 1: Valid 1031 * vstore_partial_15(var:float16, 0, 0xabcd); 1032 * eg 2: Invalid 1033 * vstore_partial_7(var:float4, 0, 0xabcd); 1034 * 1035 * @note in cases n == 1, 2, 3, 4, 8, 16, no extra vstore is invoked, thus there's no performance penalty. 
1036 * 1037 * @param[in] DATA The name of the variable 1038 * @param[in] OFFSET Offset in n 1039 * @param[in] PTR The base pointer 1040 * @{ 1041 */ 1042#define vstore_partial_1(DATA, OFFSET, PTR) \ 1043 vstore1(DATA.s0, OFFSET, PTR); 1044 1045#define vstore_partial_2(DATA, OFFSET, PTR) \ 1046 vstore2(DATA.s01, OFFSET, PTR); 1047 1048#define vstore_partial_3(DATA, OFFSET, PTR) \ 1049 vstore3(DATA.s012, OFFSET, PTR); 1050 1051#define vstore_partial_4(DATA, OFFSET, PTR) \ 1052 vstore4(DATA.s0123, OFFSET, PTR); 1053 1054#define vstore_partial_5(DATA, OFFSET, PTR) \ 1055 vstore_partial_4(DATA.s0123, OFFSET, PTR); \ 1056 vstore1(DATA.s4, OFFSET, PTR + 4); 1057 1058#define vstore_partial_6(DATA, OFFSET, PTR) \ 1059 vstore_partial_4(DATA.s0123, OFFSET, PTR); \ 1060 vstore_partial_2(DATA.s45, OFFSET, PTR + 4); 1061 1062#define vstore_partial_7(DATA, OFFSET, PTR) \ 1063 vstore_partial_4(DATA.s0123, OFFSET, PTR); \ 1064 vstore_partial_3(DATA.s456, OFFSET, PTR + 4); 1065 1066#define vstore_partial_8(DATA, OFFSET, PTR) \ 1067 vstore8(DATA.s01234567, OFFSET, PTR); 1068 1069#define vstore_partial_9(DATA, OFFSET, PTR) \ 1070 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 1071 vstore1(DATA.s8, OFFSET, PTR + 8); 1072 1073#define vstore_partial_10(DATA, OFFSET, PTR) \ 1074 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 1075 vstore_partial_2(DATA.s89, OFFSET, PTR + 8); 1076 1077#define vstore_partial_11(DATA, OFFSET, PTR) \ 1078 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 1079 vstore_partial_3(DATA.s89a, OFFSET, PTR + 8); 1080 1081#define vstore_partial_12(DATA, OFFSET, PTR) \ 1082 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 1083 vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8); 1084 1085#define vstore_partial_13(DATA, OFFSET, PTR) \ 1086 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 1087 vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8); 1088 1089#define vstore_partial_14(DATA, OFFSET, PTR) \ 1090 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 1091 
vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8); 1092 1093#define vstore_partial_15(DATA, OFFSET, PTR) \ 1094 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 1095 vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8); 1096 1097#define vstore_partial_16(DATA, OFFSET, PTR) \ 1098 vstore16(DATA, OFFSET, PTR); 1099/** @} */ // end of groupd vstore_partial_n 1100/** @} */ // end of groupd VSTORE_PARTIAL 1101 1102// Convert built-in functions with _sat modifier are not supported in floating point so we create defines 1103// without _sat to overcome this issue 1104#define convert_float_sat convert_float 1105#define convert_float1_sat convert_float 1106#define convert_float2_sat convert_float2 1107#define convert_float3_sat convert_float3 1108#define convert_float4_sat convert_float4 1109#define convert_float8_sat convert_float8 1110#define convert_float16_sat convert_float16 1111#define convert_half_sat convert_float 1112#define convert_half1_sat convert_half 1113#define convert_half2_sat convert_half2 1114#define convert_half3_sat convert_half3 1115#define convert_half4_sat convert_half4 1116#define convert_half8_sat convert_half8 1117#define convert_half16_sat convert_half16 1118 1119#define convert_float1 convert_float 1120#define convert_half1 convert_half 1121#define convert_char1 convert_char 1122#define convert_uchar1 convert_uchar 1123#define convert_short1 convert_short 1124#define convert_ushort1 convert_ushort 1125#define convert_int1 convert_int 1126#define convert_uint1 convert_uint 1127#define convert_long1 convert_long 1128#define convert_ulong1 convert_ulong 1129#define convert_double1 convert_double 1130 1131#define convert_char1_sat convert_char_sat 1132#define convert_uchar1_sat convert_uchar_sat 1133#define convert_short1_sat convert_short_sat 1134#define convert_ushort1_sat convert_ushort_sat 1135#define convert_int1_sat convert_int_sat 1136#define convert_uint1_sat convert_uint_sat 1137#define convert_long1_sat convert_long_sat 1138#define 
convert_ulong1_sat convert_ulong_sat 1139#define convert_double1_sat convert_double_sat 1140 1141#define VEC_DATA_TYPE_STR(type, size) type##size 1142#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size) 1143 1144#define CONVERT_STR(x, type) (convert_##type((x))) 1145#define CONVERT(x, type) CONVERT_STR(x, type) 1146 1147#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x))) 1148#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type) 1149 1150#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x))) 1151#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round) 1152 1153#define select_vec_dt_uchar(size) uchar##size 1154#define select_vec_dt_char(size) char##size 1155#define select_vec_dt_ushort(size) ushort##size 1156#define select_vec_dt_short(size) short##size 1157#define select_vec_dt_half(size) short##size 1158#define select_vec_dt_uint(size) uint##size 1159#define select_vec_dt_int(size) int##size 1160#define select_vec_dt_float(size) int##size 1161#define select_vec_dt_ulong(size) ulong##size 1162#define select_vec_dt_long(size) long##size 1163 1164#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size) 1165#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size) 1166#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1) 1167 1168#define sum_reduce_1(x) (x) 1169#define sum_reduce_2(x) ((x).s0) + ((x).s1) 1170#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2) 1171#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23) 1172#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567) 1173#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF) 1174 1175#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x) 1176#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size) 1177 1178#define max_reduce_1(x) (x) 1179#define max_reduce_2(x) max(((x).s0), ((x).s1)) 1180#define max_reduce_3(x) max(max_reduce_2((x).s01), 
((x).s2)) 1181#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23)) 1182#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567)) 1183#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF)) 1184 1185#define MAX_REDUCE_STR(x, size) max_reduce_##size(x) 1186#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size) 1187 1188#define VECTOR_DECLARATION(name) \ 1189 __global uchar *name##_ptr, \ 1190 uint name##_stride_x, \ 1191 uint name##_step_x, \ 1192 uint name##_offset_first_element_in_bytes 1193 1194#define IMAGE_DECLARATION(name) \ 1195 __global uchar *name##_ptr, \ 1196 uint name##_stride_x, \ 1197 uint name##_step_x, \ 1198 uint name##_stride_y, \ 1199 uint name##_step_y, \ 1200 uint name##_offset_first_element_in_bytes 1201 1202#define TENSOR3D_DECLARATION(name) \ 1203 __global uchar *name##_ptr, \ 1204 uint name##_stride_x, \ 1205 uint name##_step_x, \ 1206 uint name##_stride_y, \ 1207 uint name##_step_y, \ 1208 uint name##_stride_z, \ 1209 uint name##_step_z, \ 1210 uint name##_offset_first_element_in_bytes 1211 1212#define TENSOR4D_DECLARATION(name) \ 1213 __global uchar *name##_ptr, \ 1214 uint name##_stride_x, \ 1215 uint name##_step_x, \ 1216 uint name##_stride_y, \ 1217 uint name##_step_y, \ 1218 uint name##_stride_z, \ 1219 uint name##_step_z, \ 1220 uint name##_stride_w, \ 1221 uint name##_step_w, \ 1222 uint name##_offset_first_element_in_bytes 1223 1224#define CONVERT_TO_VECTOR_STRUCT(name) \ 1225 update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x) 1226 1227#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \ 1228 update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0) 1229 1230#define CONVERT_TO_IMAGE_STRUCT(name) \ 1231 update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y) 1232 1233#define 
CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \ 1234 update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0) 1235 1236#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ 1237 update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) 1238 1239#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \ 1240 update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z) 1241 1242#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ 1243 update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) 1244 1245#define CONVERT_TO_TENSOR3D_STRUCT(name) \ 1246 update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 1247 name##_stride_z, name##_step_z) 1248 1249#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \ 1250 update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0) 1251 1252#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \ 1253 update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 1254 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size) 1255 1256#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \ 1257 update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size) 1258 1259#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name) \ 1260 tensor3D_ptr_no_update(name##_ptr, 
name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 1261 name##_stride_z, name##_step_z) 1262 1263/** Structure to hold Vector information */ 1264typedef struct Vector 1265{ 1266 __global uchar *ptr; /**< Pointer to the starting postion of the buffer */ 1267 int offset_first_element_in_bytes; /**< The offset of the first element in the source image */ 1268 int stride_x; /**< Stride of the image in X dimension (in bytes) */ 1269} Vector; 1270 1271/** Structure to hold Image information */ 1272typedef struct Image 1273{ 1274 __global uchar *ptr; /**< Pointer to the starting postion of the buffer */ 1275 int offset_first_element_in_bytes; /**< The offset of the first element in the source image */ 1276 int stride_x; /**< Stride of the image in X dimension (in bytes) */ 1277 int stride_y; /**< Stride of the image in Y dimension (in bytes) */ 1278} Image; 1279 1280/** Structure to hold 3D tensor information */ 1281typedef struct Tensor3D 1282{ 1283 __global uchar *ptr; /**< Pointer to the starting postion of the buffer */ 1284 int offset_first_element_in_bytes; /**< The offset of the first element in the source image */ 1285 int stride_x; /**< Stride of the image in X dimension (in bytes) */ 1286 int stride_y; /**< Stride of the image in Y dimension (in bytes) */ 1287 int stride_z; /**< Stride of the image in Z dimension (in bytes) */ 1288} Tensor3D; 1289 1290/** Structure to hold 4D tensor information */ 1291typedef struct Tensor4D 1292{ 1293 __global uchar *ptr; /**< Pointer to the starting postion of the buffer */ 1294 int offset_first_element_in_bytes; /**< The offset of the first element in the source image */ 1295 int stride_x; /**< Stride of the image in X dimension (in bytes) */ 1296 int stride_y; /**< Stride of the image in Y dimension (in bytes) */ 1297 int stride_z; /**< Stride of the image in Z dimension (in bytes) */ 1298 int stride_w; /**< Stride of the image in W dimension (in bytes) */ 1299} Tensor4D; 
1300 1301/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data. 1302 * 1303 * @param[in] ptr Pointer to the starting postion of the buffer 1304 * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector 1305 * @param[in] stride_x Stride of the vector in X dimension (in bytes) 1306 * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes) 1307 * 1308 * @return An image object 1309 */ 1310inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x) 1311{ 1312 Vector vector = 1313 { 1314 .ptr = ptr, 1315 .offset_first_element_in_bytes = offset_first_element_in_bytes, 1316 .stride_x = stride_x, 1317 }; 1318 vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x; 1319 return vector; 1320} 1321 1322/** Wrap image information into an Image structure, and make the pointer point at this workitem's data. 
1323 * 1324 * @param[in] ptr Pointer to the starting postion of the buffer 1325 * @param[in] offset_first_element_in_bytes The offset of the first element in the source image 1326 * @param[in] stride_x Stride of the image in X dimension (in bytes) 1327 * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes) 1328 * @param[in] stride_y Stride of the image in Y dimension (in bytes) 1329 * @param[in] step_y stride_y * number of elements along Y processed per workitem(in bytes) 1330 * 1331 * @return An image object 1332 */ 1333inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y) 1334{ 1335 Image img = 1336 { 1337 .ptr = ptr, 1338 .offset_first_element_in_bytes = offset_first_element_in_bytes, 1339 .stride_x = stride_x, 1340 .stride_y = stride_y 1341 }; 1342 img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y; 1343 return img; 1344} 1345 1346/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data. 
1347 * 1348 * @param[in] ptr Pointer to the starting postion of the buffer 1349 * @param[in] offset_first_element_in_bytes The offset of the first element in the source image 1350 * @param[in] stride_x Stride of the image in X dimension (in bytes) 1351 * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes) 1352 * @param[in] stride_y Stride of the image in Y dimension (in bytes) 1353 * @param[in] step_y stride_y * number of elements along Y processed per workitem(in bytes) 1354 * @param[in] stride_z Stride of the image in Z dimension (in bytes) 1355 * @param[in] step_z stride_z * number of elements along Z processed per workitem(in bytes) 1356 * 1357 * @return A 3D tensor object 1358 */ 1359inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 1360{ 1361 Image img = 1362 { 1363 .ptr = ptr, 1364 .offset_first_element_in_bytes = offset_first_element_in_bytes, 1365 .stride_x = stride_x, 1366 .stride_y = stride_y 1367 }; 1368 img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z; 1369 return img; 1370} 1371 1372/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data. 
1373 * 1374 * @param[in] ptr Pointer to the starting postion of the buffer 1375 * @param[in] offset_first_element_in_bytes The offset of the first element in the source image 1376 * @param[in] stride_x Stride of the image in X dimension (in bytes) 1377 * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes) 1378 * @param[in] stride_y Stride of the image in Y dimension (in bytes) 1379 * @param[in] step_y stride_y * number of elements along Y processed per workitem(in bytes) 1380 * @param[in] stride_z Stride of the image in Z dimension (in bytes) 1381 * @param[in] step_z stride_z * number of elements along Z processed per workitem(in bytes) 1382 * 1383 * @return A 3D tensor object 1384 */ 1385inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 1386{ 1387 Tensor3D tensor = 1388 { 1389 .ptr = ptr, 1390 .offset_first_element_in_bytes = offset_first_element_in_bytes, 1391 .stride_x = stride_x, 1392 .stride_y = stride_y, 1393 .stride_z = stride_z 1394 }; 1395 tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z; 1396 return tensor; 1397} 1398 1399/** Wrap 3D tensor information into an tensor structure. 
1400 * 1401 * @param[in] ptr Pointer to the starting postion of the buffer 1402 * @param[in] offset_first_element_in_bytes The offset of the first element in the source image 1403 * @param[in] stride_x Stride of the image in X dimension (in bytes) 1404 * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes) 1405 * @param[in] stride_y Stride of the image in Y dimension (in bytes) 1406 * @param[in] step_y stride_y * number of elements along Y processed per workitem(in bytes) 1407 * @param[in] stride_z Stride of the image in Z dimension (in bytes) 1408 * @param[in] step_z stride_z * number of elements along Z processed per workitem(in bytes) 1409 * 1410 * @return A 3D tensor object 1411 */ 1412inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 1413{ 1414 Tensor3D tensor = 1415 { 1416 .ptr = ptr, 1417 .offset_first_element_in_bytes = offset_first_element_in_bytes, 1418 .stride_x = stride_x, 1419 .stride_y = stride_y, 1420 .stride_z = stride_z 1421 }; 1422 return tensor; 1423} 1424 1425inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w, 1426 uint step_w, 1427 uint mod_size) 1428{ 1429 Tensor4D tensor = 1430 { 1431 .ptr = ptr, 1432 .offset_first_element_in_bytes = offset_first_element_in_bytes, 1433 .stride_x = stride_x, 1434 .stride_y = stride_y, 1435 .stride_z = stride_z, 1436 .stride_w = stride_w 1437 }; 1438 1439 tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w; 1440 return tensor; 1441} 1442 1443/** Get the pointer position of a Vector 1444 * 1445 * @param[in] vec Pointer to the starting position of the buffer 1446 * 
 * @param[in] x   Relative X position
 */
inline __global const uchar *vector_offset(const Vector *vec, int x)
{
    return vec->ptr + x * vec->stride_x;
}

/** Get the pointer position of an Image
 *
 * @param[in] img Pointer to the starting position of the buffer
 * @param[in] x   Relative X position
 * @param[in] y   Relative Y position
 */
inline __global uchar *offset(const Image *img, int x, int y)
{
    return img->ptr + x * img->stride_x + y * img->stride_y;
}

/** Get the pointer position of a Tensor3D
 *
 * @param[in] tensor Pointer to the starting position of the buffer
 * @param[in] x      Relative X position
 * @param[in] y      Relative Y position
 * @param[in] z      Relative Z position
 */
inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
{
    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
}

/** Get the pointer position of a Tensor4D
 *
 * @param[in] tensor Pointer to the starting position of the buffer
 * @param[in] x      Relative X position
 * @param[in] y      Relative Y position
 * @param[in] z      Relative Z position
 * @param[in] w      Relative W position
 */
inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
{
    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
}

/** Get the offset for a given linear index of a Tensor3D
 *
 * @param[in] tensor Pointer to the starting position of the buffer
 * @param[in] width  Width of the input tensor
 * @param[in] height Height of the input tensor
 * @param[in] depth  Depth of the input tensor
 * @param[in] index  Linear index
 */
inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
{
    // Decompose the linear index into (x, y, z) coordinates, slice by slice:
    // z selects the width*height plane, then y the row, then x the column.
    uint num_elements = width * height;

    const uint z = index / num_elements;

    index %= num_elements;

    const uint y = index / width;

    index %= width;

    const uint x = index;

    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes;
}

#endif // ARM_COMPUTE_HELPER_H

#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)

// When only the input quantization scale/offset is given, reuse it for the output.
#if defined(S1_VAL) && !defined(S2_VAL)
#define S2_VAL S1_VAL
#endif // defined(S1_VAL) && !defined(S2_VAL)
#if defined(O1_VAL) && !defined(O2_VAL)
#define O2_VAL O1_VAL
#endif // defined(O1_VAL) && !defined(O2_VAL)

// RELU Activation
inline TYPE relu_op(TYPE x)
{
    return max((TYPE)CONST_0, x);
}
// Bounded RELU Activation
inline TYPE brelu_op(TYPE x)
{
    return min((TYPE)A_VAL, max((TYPE)CONST_0, x));
}
// Lower Upper Bounded RELU Activation
inline TYPE lu_brelu_op(TYPE x)
{
    return min(max(x, (TYPE)B_VAL), (TYPE)A_VAL);
}
// Hard Swish Activation: x * relu6(x + 3) / 6 (1/6 approximated as 0.166666667f)
inline TYPE hard_swish_op(TYPE x)
{
    return (x * ((min(max((TYPE)(x + (TYPE)3.f), (TYPE)0.f), (TYPE)6.f)) * (TYPE)0.166666667f));
}

// Dispatch an activation by name: ACTIVATION_OP(relu, x) -> relu_op(x).
// The two-level expansion lets the act argument itself be a macro.
#define ACTIVATION_OP2(op, x) op##_op(x)
#define ACTIVATION_OP(op, x) ACTIVATION_OP2(op, x)

#if defined(S1_VAL) && defined(S2_VAL)
#if defined(O1_VAL) && defined(O2_VAL)
// Apply the activation, then requantize from (S1_VAL, O1_VAL) to (S2_VAL, O2_VAL)
// via the float domain with saturating conversion back to DATA_TYPE.
#define PERFORM_ACTIVATION_QUANT(act, data)                                                       \
    ({                                                                                            \
        data = ACTIVATION_OP(act, data);                                                          \
                                                                                                  \
        VEC_DATA_TYPE(float, VEC_SIZE)                                                            \
        fdata = CONVERT(data, VEC_DATA_TYPE(float, VEC_SIZE));                                    \
                                                                                                  \
        fdata = round((fdata - (float)O1_VAL) * ((float)S1_VAL / (float)S2_VAL) + (float)O2_VAL); \
        data  = CONVERT_SAT(fdata, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));                           \
    })
#else // defined(O1_VAL) && defined(O2_VAL)
// Same as above but without zero-point offsets: only rescale by S1_VAL / S2_VAL.
#define PERFORM_ACTIVATION_QUANT(act, data)                             \
    ({                                                                  \
        data = ACTIVATION_OP(act, data);                                \
                                                                        \
        VEC_DATA_TYPE(float, VEC_SIZE)                                  \
        fdata = CONVERT(data, VEC_DATA_TYPE(float, VEC_SIZE));          \
                                                                        \
        fdata = round((fdata) * ((float)S1_VAL / (float)S2_VAL));       \
        data  = CONVERT_SAT(fdata, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)); \
    })
#endif /* defined(O1_VAL) && defined(O2_VAL) */
#else /* defined(S1_VAL) && defined(S2_VAL) */
// No requantization parameters: apply the activation only.
#define PERFORM_ACTIVATION_QUANT(act, data)  \
    ({                                       \
        data = ACTIVATION_OP(act, data);     \
    })
#endif /* defined(S1_VAL) && defined(S2_VAL) */

#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)

#if defined(FLOAT_DOMAIN)
// Activations performed in the float domain

/*
 * Copyright (c) 2019-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

/*
 * Copyright (c) 2016-2020 Arm Limited.
1611 * 1612 * SPDX-License-Identifier: MIT 1613 * 1614 * Permission is hereby granted, free of charge, to any person obtaining a copy 1615 * of this software and associated documentation files (the "Software"), to 1616 * deal in the Software without restriction, including without limitation the 1617 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 1618 * sell copies of the Software, and to permit persons to whom the Software is 1619 * furnished to do so, subject to the following conditions: 1620 * 1621 * The above copyright notice and this permission notice shall be included in all 1622 * copies or substantial portions of the Software. 1623 * 1624 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 1625 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 1626 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 1627 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 1628 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 1629 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 1630 * SOFTWARE. 1631 */ 1632#ifndef ARM_COMPUTE_HELPER_H 1633#define ARM_COMPUTE_HELPER_H 1634 1635/* 1636 * Copyright (c) 2020 Arm Limited. 1637 * 1638 * SPDX-License-Identifier: MIT 1639 * 1640 * Permission is hereby granted, free of charge, to any person obtaining a copy 1641 * of this software and associated documentation files (the "Software"), to 1642 * deal in the Software without restriction, including without limitation the 1643 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 1644 * sell copies of the Software, and to permit persons to whom the Software is 1645 * furnished to do so, subject to the following conditions: 1646 * 1647 * The above copyright notice and this permission notice shall be included in all 1648 * copies or substantial portions of the Software. 
1649 * 1650 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 1651 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 1652 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 1653 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 1654 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 1655 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 1656 * SOFTWARE. 1657 */ 1658 1659/** Store the 0 to (n-1)th rows of the given variables 1660 * @name STORE_ROW_n 1661 * 1662 * @param[in] N0 The width of the passed in vector. Supported: 1, 2, 3, 4, 8, 16 1663 * @param[in] DATA_TYPE The data type of the vectors 1664 * @param[in] BASENAME The basename of the variables 1665 * @param[in] PTR The base pointer 1666 * @param[in] STRIDE_Y The stride value in y-axis direction 1667 * @param[in] Z The offset in z-axis direction 1668 * @{ 1669 */ 1670#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1671 VSTORE(N0) \ 1672 (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 1673 1674#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1675 STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1676 VSTORE(N0) \ 1677 (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 1678 1679#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1680 STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1681 VSTORE(N0) \ 1682 (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 1683 1684#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1685 STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1686 VSTORE(N0) \ 1687 (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 1688 1689#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1690 STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1691 VSTORE(N0) \ 1692 
(BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 1693 1694#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1695 STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1696 VSTORE(N0) \ 1697 (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 1698 1699#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1700 STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1701 VSTORE(N0) \ 1702 (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 1703 1704#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1705 STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1706 VSTORE(N0) \ 1707 (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 1708 1709#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1710 STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1711 VSTORE(N0) \ 1712 (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 1713 1714#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1715 STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1716 VSTORE(N0) \ 1717 (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 1718 1719#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1720 STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1721 VSTORE(N0) \ 1722 (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 1723 1724#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1725 STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1726 VSTORE(N0) \ 1727 (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 1728 1729#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1730 STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1731 VSTORE(N0) \ 1732 (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 1733 1734#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1735 STORE_ROW_13(N0, 
DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1736 VSTORE(N0) \ 1737 (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 1738 1739#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1740 STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1741 VSTORE(N0) \ 1742 (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 1743 1744#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1745 STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1746 VSTORE(N0) \ 1747 (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 1748/** @} */ // end of groupd STORE_ROW_n 1749 1750/** Convert and store the 0th to (n-1)th rows of the given variables 1751 * @name CONVERT_STORE_ROW_n 1752 * 1753 * @param[in] N0 The size of the vectors 1754 * @param[in] DATA_TYPE The data type of the vectors 1755 * @param[in] BASENAME The basename of the variables 1756 * @param[in] PTR The base pointer 1757 * @param[in] STRIDE_Y The stride value in y-axis direction 1758 * @param[in] Z The offset in z-axis direction 1759 * @{ 1760 */ 1761#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1762 VSTORE(N0) \ 1763 (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 1764 1765#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1766 CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1767 VSTORE(N0) \ 1768 (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 1769 1770#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1771 CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1772 VSTORE(N0) \ 1773 (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 1774 1775#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1776 CONVERT_STORE_ROW_3(N0, 
DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1777 VSTORE(N0) \ 1778 (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 1779 1780#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1781 CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1782 VSTORE(N0) \ 1783 (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 1784 1785#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1786 CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1787 VSTORE(N0) \ 1788 (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 1789 1790#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1791 CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1792 VSTORE(N0) \ 1793 (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 1794 1795#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1796 CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1797 VSTORE(N0) \ 1798 (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 1799 1800#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1801 CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1802 VSTORE(N0) \ 1803 (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 1804 1805#define CONVERT_STORE_ROW_10(N0, DATA, BASENAME, PTR, STRIDE_Y, Z) \ 1806 CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1807 VSTORE(N0) \ 1808 (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 1809 1810#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1811 
CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1812 VSTORE(N0) \ 1813 (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 1814 1815#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1816 CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1817 VSTORE(N0) \ 1818 (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 1819 1820#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1821 CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1822 VSTORE(N0) \ 1823 (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 1824 1825#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1826 CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1827 VSTORE(N0) \ 1828 (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 1829 1830#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1831 CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1832 VSTORE(N0) \ 1833 (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 1834 1835#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1836 CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1837 VSTORE(N0) \ 1838 (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 1839 1840/** @} */ // end of groupd CONVERT_STORE_ROW_n 1841 1842/** Store a block of the given size M0xN0 1843 * @name STORE_BLOCK 1844 * 1845 * Supported cases are M0=1,2,3,...,16 and N0=2,3,4,8,16. 1846 * The data to store is expected to have consecutive names for each row. 
 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
 * The Z offset is expected to have consecutive names.
 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
 *
 * @param[in] M0        The number of rows to store
 * @param[in] N0        The size of each vector
 * @param[in] DATA_TYPE The data type of the vectors
 * @param[in] BASENAME  The basename of the variables
 * @param[in] PTR       The base pointer
 * @param[in] STRIDE_Y  The stride value in y-axis direction
 * @param[in] Z         The offset in z-axis direction
 * @{
 */
#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
/** @} */ // end of group STORE_BLOCK

/** Convert and store a block of the given size M0xN0
 * @name CONVERT_STORE_BLOCK
 *
 * Supported cases are M0=1,2,3,...,16 and N0=2,3,4,8,16.
 * The data to store is expected to have consecutive names for each row.
 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
 * The Z offset is expected to have consecutive names.
 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
 *
 * @param[in] M0        The number of rows to store
 * @param[in] N0        The size of each vector
 * @param[in] DATA_TYPE The data type of the vectors
 * @param[in] BASENAME  The basename of the variables
 * @param[in] PTR       The base pointer
 * @param[in] STRIDE_Y  The stride value in y-axis direction
 * @param[in] Z         The offset in z-axis direction
 * @{
 */
#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
/** @} */ // end of group CONVERT_STORE_BLOCK

/** Partially store the 0 to (n-1)th rows of the given variables
 * @name STORE_ROW_PARTIAL_n
 * Within each row, store the lower @p STORE_N0 elements of vectors of width @p N0
 *
 * @note in case @p STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
 *
 * @param[in] N0        The width of the passed in vector. Supported: 1, 2, 3, 4, 8, 16
 * @param[in] STORE_N0  The **lower** size of the vectors to store. Supported: 1-16 and <= @p N0
 * @param[in] DATA_TYPE The data type of the vectors
 * @param[in] BASENAME  The basename of the variables
 * @param[in] PTR       The base pointer
 * @param[in] STRIDE_Y  The stride value in y-axis direction
 * @param[in] Z         The offset in z-axis direction
 * @{
 */
#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
/** @} */ // end of group STORE_ROW_PARTIAL_n

/** Partially store a block of the given size STORE_M0xSTORE_N0
 * @name STORE_BLOCK_PARTIAL
 *
 * @note The vector width @p N0 is also required for correct partial storing behaviour.
 * @note in case @p STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
 *
 * The data to store is expected to have consecutive names for each row.
 * E.g., for STORE_M0=3 and basename=c, the expected names are c0, c1 and c2.
 * The Z offset is expected to have consecutive names.
 * E.g., for STORE_M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
 *
 * @param[in] STORE_M0  The number of rows to store. Supported: 1-16
 * @param[in] STORE_N0  The lower number of elements of vectors to store. Supported: 1-16 and <= @p N0
 * @param[in] N0        The size of each vector.
 *                      Supported: 1, 2, 3, 4, 8, 16
 * @param[in] DATA_TYPE The data type of the vectors
 * @param[in] BASENAME  The basename of the variables
 * @param[in] PTR       The base pointer
 * @param[in] STRIDE_Y  The stride value in y-axis direction
 * @param[in] Z         The offset in z-axis direction
 * @{
 */
#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
/** Store a block that can be partial in both x and y dimensions
 *
 * @note in cases @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
 *
 * The data to store is expected to have consecutive names for each row.
 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
 * The Z offset is expected to have consecutive names.
 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
 *
 * @param[in] M0               The number of rows to store, for non-partial blocks. Supported: 1-16
 * @param[in] N0               The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
 * @param[in] DATA_TYPE        The data type of the vectors
 * @param[in] BASENAME         The basename of the variables
 * @param[in] PTR              The base pointer
 * @param[in] STRIDE_Y         The stride value in y-axis direction
 * @param[in] Z                The offset in z-axis direction
 * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported range: [1, @p M0)
 * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported range: [1, @p N0)
 * @param[in] PARTIAL_COND_Y   Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0.
 * @param[in] PARTIAL_COND_X   Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0.
 */
#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y))                                                                                                            \
    {                                                                                                                                                     \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                                           \
    }                                                                                                                                                     \
    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X))                                                                                                        \
    {                                                                                                                                                     \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
    }                                                                                                                                                     \
    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X))                                                                                                        \
    {                                                                                                                                                     \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
    }                                                                                                                                                     \
    else                                                                                                                                                  \
    {                                                                                                                                                     \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                               \
    }
/** Store a block that can only be partial in x but not y.
 *
 * @note in case @p N0 or @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
 *
 * The data to store is expected to have consecutive names for each row.
 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
 * The Z offset is expected to have consecutive names.
 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
 *
 * @param[in] M0               The number of rows to store, for non-partial blocks. Supported: 1-16
 * @param[in] N0               The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
 * @param[in] DATA_TYPE        The data type of the vectors
 * @param[in] BASENAME         The basename of the variables
 * @param[in] PTR              The base pointer
 * @param[in] STRIDE_Y         The stride value in y-axis direction
 * @param[in] Z                The offset in z-axis direction
 * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported range: [1, @p N0)
 * @param[in] PARTIAL_COND_X   Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0.
 */
#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X))                                                                                         \
    {                                                                                                             \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
    }                                                                                                             \
    else                                                                                                          \
    {                                                                                                             \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
    }
/** Store a block that can only be partial in y but not x.
 *
 * @note in case @p N0 or @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
 *
 * The data to store is expected to have consecutive names for each row.
 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
 * The Z offset is expected to have consecutive names.
 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
 *
 * @param[in] M0               The number of rows to store, for non-partial blocks. Supported: 1-16
 * @param[in] N0               The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
 * @param[in] DATA_TYPE        The data type of the vectors
 * @param[in] BASENAME         The basename of the variables
 * @param[in] PTR              The base pointer
 * @param[in] STRIDE_Y         The stride value in y-axis direction
 * @param[in] Z                The offset in z-axis direction
 * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported range: [1, @p M0)
 * @param[in] PARTIAL_COND_Y   Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0.
 */
#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
    if(!(PARTIAL_COND_Y))                                                                                         \
    {                                                                                                             \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
    }                                                                                                             \
    else                                                                                                          \
    {                                                                                                             \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
    }
/** @} */ // end of group STORE_BLOCK_PARTIAL

#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)

/** Boundary-aware GEMM block store
 * @name STORE_BLOCK_BOUNDARY_AWARE
 * This macro assumes the following schemes to achieve boundary-awareness:
 * - Overlapping load in Y axis from lhs tensor. This implies lhs has no padding along y dim.
 * - Non-Overlapping(normal) load from rhs tensor. This implies rhs can have paddings.
 * - Overlapping load in Y axis from bias tensor. This implies bias has no padding along y dim.
 * The macro then ensures that the dst tensor can be stored without any paddings in both x and y dim.
 *
 * In the y dimension, we place the partial blocks **at the beginning** while in the x dimension, we place the partial
 * blocks **at the end**.
 * Say, the dst tensor is of shape MxN and we have M0 and N0 as the block size, this is how we define "partial blocks"/
 * "boundary block" (we use the 2 terms "partial blocks" and "boundary blocks" interchangeably) and its various parameters:
 *
 *  *--x-->                          x == 0                        x == 1
 *  |                 |<------------------------------N-------------------------->|
 *  y                 |<--------------N0------------->|<----PARTIAL_STORE_N0----->|
 *  |    -------------#############################################################
 *  *    |          | |...............................|...........................|
 * y == 0| PAR_..._M0 |......Boundary block in y......|.Boundary block in x and y.|
 *       |          | |...............................|...........................|
 *   M   -------------#############################################################
 *       |          | |                               |...........................|
 * y == 1|         M0 |      Non-boundary block       |....Boundary block in x....|
 *       |          | |                               |...........................|
 *       |------------#############################################################
 *
 * Then @p PARTIAL_STORE_M0 = M % M0 and @p PARTIAL_STORE_N0 = N % N0
 *
 * @note in cases @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
 *
 * It automatically detects if a giving M,N,M0,N0 combination can yield partial blocks in either X and Y dimension,
 * and select corresponding store methods such that the boundary detection logic is only added when needed.
 *
 * The data to store is expected to have consecutive names for each row.
 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
 * The Z offset is expected to have consecutive names.
 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
 *
 * @param[in] M0               The number of rows to store, for non-partial blocks. Supported: 1-16
 * @param[in] N0               The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
 * @param[in] DATA_TYPE        The data type of the vectors
 * @param[in] BASENAME         The basename of the variables
 * @param[in] PTR              The base pointer
 * @param[in] STRIDE_Y         The stride value in y-axis direction
 * @param[in] Z                The offset in z-axis direction
 * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported: [0, @p M0)
 * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported: [0, @p N0)
 * @param[in] PARTIAL_COND_Y   Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0.
 * @param[in] PARTIAL_COND_X   Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0.
 * @{
 */
// The four cases below are resolved at compile time from the PARTIAL_STORE_M0/N0 build options, so the
// boundary-check branches are only compiled into the kernel when partial blocks can actually occur.
#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
// Case1: No partial blocks in either x or y
#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
// Case2: Partial blocks in y
#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)

#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
// Case3: Partial blocks in x
#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)

#else // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
// Case4: Partial blocks in both x and y
#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)

#endif // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0

#endif // defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
/** @} */ // end of group STORE_BLOCK_BOUNDARY_AWARE

#if defined(PARTIAL_STORE_M0)
/** Compute the start m0 row (LHS, BIAS and DST) in a boundary-aware way so as to avoid padding
 * @name COMPUTE_M0_START_ROW
 * If there're any partial blocks in y dimension, they are placed at the beginning of the rows.
 * This shift amount is added to all rows such that the partial block (at the beginning) overlaps with the subsequent
 * blocks in the y dimension to avoid any padding.
 * EG: M0=4, PARTIAL_STORE_M0=1:
 *                   | Non-overlapping | +M0_ROW_SHIFT (Overlapping)
 * block 0 (partial) | start row = 0   | start row = 0
 * block 1 (full)    | start row = 4   | start row = 1
 * block 2 (full)    | start row = 8   | start row = 5
 *
 * @param[in] y                Global id of current block in y.
 * @param[in] M0               The number of rows to store, for non-partial blocks. Supported: 1-16
 * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported: [0, @p M0)
 * @{
 */
// The "% M0" keeps the shift at 0 when PARTIAL_STORE_M0 == 0 (no partial block); max() clamps block 0 to row 0.
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0))))
#else // defined(PARTIAL_STORE_M0)
// Without the PARTIAL_STORE_M0 build option the blocks never overlap.
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(y * M0))
#endif // defined(PARTIAL_STORE_M0)
/** @} */ // end of group COMPUTE_M0_START_ROW

/** Store a vector that can only be partial in x.
 *
 * @note in case @p vec_size or @p leftover != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
 *
 * The data to store is expected to end in a 0.
 * E.g., for basename=c, the expected name is c0.
 *
 * @param[in] basename  The name of the variable without trailing 0
 * @param[in] data_type The data type of the vector
 * @param[in] ptr       The base pointer
 * @param[in] vec_size  The vector size if cond = false. Supported: 1, 2, 3, 4, 8, 16
 * @param[in] leftover  The vector size if cond = true. Supported range: [1, @p vec_size)
 * @param[in] cond      Condition to select either @p vec_size or @p leftover
 * @{
 */
// Single-row (M0 = 1) specialisation of STORE_BLOCK_PARTIAL_IN_X; stride and z-offset are irrelevant for one row.
#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)
/** @} */ // end of group STORE_VECTOR_SELECT

// Optional OpenCL extensions, enabled only when both the build option and the device support are present.
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)

#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)

#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)

#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
#pragma OPENCL EXTENSION cl_arm_printf : enable
#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)

// GPU architecture identifiers used to select arch-specific code paths.
#define GPU_ARCH_MIDGARD 0x100
#define GPU_ARCH_BIFROST 0x200

/** Concatenate two inputs.
 *
 * @param[in] a The first input to be concatenated
 * @param[in] b The second input to be concatenated
 *
 * @return The concatenated output
 */
#define CONCAT(a, b) a##b

/** Expand the given vector
 *
 * @param[in] x The vector to be expanded
 *
 * @return The expanded output
 */
#define EXPAND(x) x

/** Clamp the given value between an upper and lower bound.
 *
 * @param[in] x       The value to be clamped
 * @param[in] min_val The lower bound
 * @param[in] max_val The upper bound
 *
 * @return The clamped value.
 */
#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)

/** REVn reverses the given vector whose size is n.
 * @name REVn
 *
 * @param[in] x The vector to be reversed
 *
 * @return The reversed vector
 * @{
 */
#define REV1(x) ((x))
#define REV2(x) ((x).s10)
#define REV3(x) ((x).s210)
#define REV4(x) ((x).s3210)
#define REV8(x) ((x).s76543210)
#define REV16(x) ((x).sFEDCBA9876543210)
/** @} */ // end of group REVn

/** Reverse the given vector.
 * @name REVERSE
 *
 * @param[in] x The vector to be reversed
 * @param[in] s The size of the vector
 *
 * @return The reversed vector
 * @{
 */
// Two-level expansion so that s is macro-expanded before token pasting.
#define REVERSE_STR(x, s) REV##s((x))
#define REVERSE(x, s) REVERSE_STR(x, s)
/** @} */ // end of group REVERSE

/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
 * @name ROTs_n
 *
 * @param[in] x The vector to be shifted
 *
 * @return The shifted vector
 * @{
 */
// ROT<s>_<n>: swizzle implementing a rotate-right by n of an s-element vector.
#define ROT1_0(x) ((x))

#define ROT2_0(x) ((x))
#define ROT2_1(x) ((x).s10)

#define ROT3_0(x) ((x))
#define ROT3_1(x) ((x).s201)
#define ROT3_2(x) ((x).s120)

#define ROT4_0(x) ((x))
#define ROT4_1(x) ((x).s3012)
#define ROT4_2(x) ((x).s2301)
#define ROT4_3(x) ((x).s1230)

#define ROT8_0(x) ((x))
#define ROT8_1(x) ((x).s70123456)
#define ROT8_2(x) ((x).s67012345)
#define ROT8_3(x) ((x).s56701234)
#define ROT8_4(x) ((x).s45670123)
#define ROT8_5(x) ((x).s34567012)
#define ROT8_6(x) ((x).s23456701)
#define ROT8_7(x) ((x).s12345670)

#define ROT16_0(x) ((x))
#define ROT16_1(x) ((x).sF0123456789ABCDE)
#define ROT16_2(x) ((x).sEF0123456789ABCD)
#define ROT16_3(x) ((x).sDEF0123456789ABC)
#define ROT16_4(x) ((x).sCDEF0123456789AB)
#define ROT16_5(x) ((x).sBCDEF0123456789A)
#define ROT16_6(x) ((x).sABCDEF0123456789)
#define ROT16_7(x) ((x).s9ABCDEF012345678)
#define ROT16_8(x) ((x).s89ABCDEF01234567)
#define ROT16_9(x) ((x).s789ABCDEF0123456)
#define ROT16_10(x) ((x).s6789ABCDEF012345)
#define ROT16_11(x) ((x).s56789ABCDEF01234)
#define ROT16_12(x) ((x).s456789ABCDEF0123)
#define ROT16_13(x) ((x).s3456789ABCDEF012)
#define ROT16_14(x) ((x).s23456789ABCDEF01)
#define ROT16_15(x) ((x).s123456789ABCDEF0)
/** @} */ // end of group ROTs_n

/** Circular-right-shift (rotate-right) the given vector by the given amount.
 * @name ROTATE
 *
 * @param[in] x The vector to be shifted
 * @param[in] s The size of the vector
 * @param[in] n The amount to be shifted
 *
 * @return The shifted vector
 * @{
 */
// Two-level expansion so that s and n are macro-expanded before token pasting.
#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
/** @} */ // end of group ROTATE

/** Creates a vector of size n filled with offset values corresponding to the location of each element.
 * @name V_OFFSn
 *
 * @param[in] dt The data type of the output vector
 *
 * @return The vector filled with offset values
 * @{
 */
#define V_OFFS1(dt) (dt##1)(0)
#define V_OFFS2(dt) (dt##2)(0, 1)
#define V_OFFS3(dt) (dt##3)(0, 1, 2)
#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3)
#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
/** @} */ // end of group V_OFFSn

/** Create a vector filled with offset values corresponding to the location of each element.
 * @name VEC_OFFS
 *
 * @param[in] dt The data type of the output vector
 * @param[in] s  The size of the output vector
 *
 * @return The vector filled with offset values
 * @{
 */
#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
/** @} */ // end of group VEC_OFFS

// Size-parameterised vload dispatch (vload1 is defined further below for the scalar case).
#define VLOAD_STR(size) vload##size
#define VLOAD(size) VLOAD_STR(size)

// Number of image pixels (4-component texels) covered by a vector of the given width.
#define PIXEL_UNIT4 1
#define PIXEL_UNIT8 2
#define PIXEL_UNIT16 4

/** Utility macro to convert a vector size in pixel unit.
 *
 * @name CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT
 *
 * @param[in] vec_size Vector size.
Only 4, 8 and 16 are supported
 *
 * @return The pixel unit (number of pixels)
 * @{
 */
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)
/** @} */ // end of group CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT

// Read 1, 2 or 4 consecutive float4 texels starting at (x_coord, y_coord).
// NOTE(review): the trailing semicolons are part of the original definitions -- confirm before removing.
#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
// Read 1, 2 or 4 consecutive half4 texels starting at (x_coord, y_coord).
#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)

/** Utility macro to read a 2D OpenCL image object.
 *
 * @note Coordinates are not normalized
 *
 * @param[in] data_type Data type
 * @param[in] n0        Number of pixel to read.
Only 1, 2 and 4 are supported
 * @param[in] img       OpenCL image object
 * @param[in] x_coord   The x coordinate for the top-left pixel
 * @param[in] y_coord   The y coordinate for the top-left pixel
 *
 * @return Pixels from the 2D OpenCL image object
 * @{
 */
// Dispatches to read_image2d_<data_type>x<n0> defined above.
#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)

// Size-parameterised vstore dispatch (vstore1 is defined below for the scalar case).
#define VSTORE_STR(size) vstore##size
#define VSTORE(size) VSTORE_STR(size)

// Aliases so that "vector of width 1" expands to the plain scalar type.
#define float1 float
#define half1 half
#define char1 char
#define uchar1 uchar
#define short1 short
#define ushort1 ushort
#define int1 int
#define uint1 uint
#define long1 long
#define ulong1 ulong
#define double1 double

// Scalar counterparts of the vloadn/vstoren builtins.
#define vload1(OFFSET, PTR) *(OFFSET + PTR)
#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA

/** Extended partial vstore that correctly handles scalar values as well.
 * Store the **lower** 0 to (n-1)th elements of the given vector while minimising the amount of vstore ops
 * @name VSTORE_PARTIAL
 *
 * @note With this macro, the passed data can be both a vector and a scalar
 * @note @p store_size needs to be <= @p size
 * eg 1: Valid
 * VSTORE_PARTIAL(16, 15) ...;
 * eg 2: Invalid
 * VSTORE_PARTIAL(4, 7) ...;
 *
 * @param[in] size       The width of @p DATA. Supported values: 1(scalar), 2, 3, 4, 8, 16
 * @param[in] store_size The number of lower elements to store.
Supported values: 1-16, but has to be <= @p size
 * @{
 */
#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)

// No-op store used for the invalid (store_size > size) combinations in the dispatch tables below.
#define NO_STORE(data, offs, ptr) \
    {                             \
    }

// Dispatch tables: vstore_partial_<size>_<store_size> resolves to the vstore_partial_<store_size>
// implementation when the combination is valid, and to NO_STORE otherwise.
// Size == 1 (scalar)
#define vstore_partial_1_0 NO_STORE
#define vstore_partial_1_1 vstore1
#define vstore_partial_1_2 NO_STORE
#define vstore_partial_1_3 NO_STORE
#define vstore_partial_1_4 NO_STORE
#define vstore_partial_1_5 NO_STORE
#define vstore_partial_1_6 NO_STORE
#define vstore_partial_1_7 NO_STORE
#define vstore_partial_1_8 NO_STORE
#define vstore_partial_1_9 NO_STORE
#define vstore_partial_1_10 NO_STORE
#define vstore_partial_1_11 NO_STORE
#define vstore_partial_1_12 NO_STORE
#define vstore_partial_1_13 NO_STORE
#define vstore_partial_1_14 NO_STORE
#define vstore_partial_1_15 NO_STORE
#define vstore_partial_1_16 NO_STORE
// Size == 2
#define vstore_partial_2_0 NO_STORE
#define vstore_partial_2_1 vstore_partial_1
#define vstore_partial_2_2 vstore_partial_2
#define vstore_partial_2_3 NO_STORE
#define vstore_partial_2_4 NO_STORE
#define vstore_partial_2_5 NO_STORE
#define vstore_partial_2_6 NO_STORE
#define vstore_partial_2_7 NO_STORE
#define vstore_partial_2_8 NO_STORE
#define vstore_partial_2_9 NO_STORE
#define vstore_partial_2_10 NO_STORE
#define vstore_partial_2_11 NO_STORE
#define vstore_partial_2_12 NO_STORE
#define vstore_partial_2_13 NO_STORE
#define vstore_partial_2_14 NO_STORE
#define vstore_partial_2_15 NO_STORE
#define vstore_partial_2_16 NO_STORE
// Size == 3
#define vstore_partial_3_0 NO_STORE
#define vstore_partial_3_1 vstore_partial_1
#define vstore_partial_3_2 vstore_partial_2
#define vstore_partial_3_3 vstore_partial_3
#define vstore_partial_3_4 NO_STORE
#define vstore_partial_3_5 NO_STORE
#define vstore_partial_3_6 NO_STORE
#define vstore_partial_3_7 NO_STORE
#define vstore_partial_3_8 NO_STORE
#define vstore_partial_3_9 NO_STORE
#define vstore_partial_3_10 NO_STORE
#define vstore_partial_3_11 NO_STORE
#define vstore_partial_3_12 NO_STORE
#define vstore_partial_3_13 NO_STORE
#define vstore_partial_3_14 NO_STORE
#define vstore_partial_3_15 NO_STORE
#define vstore_partial_3_16 NO_STORE
// Size == 4
#define vstore_partial_4_0 NO_STORE
#define vstore_partial_4_1 vstore_partial_1
#define vstore_partial_4_2 vstore_partial_2
#define vstore_partial_4_3 vstore_partial_3
#define vstore_partial_4_4 vstore_partial_4
#define vstore_partial_4_5 NO_STORE
#define vstore_partial_4_6 NO_STORE
#define vstore_partial_4_7 NO_STORE
#define vstore_partial_4_8 NO_STORE
#define vstore_partial_4_9 NO_STORE
#define vstore_partial_4_10 NO_STORE
#define vstore_partial_4_11 NO_STORE
#define vstore_partial_4_12 NO_STORE
#define vstore_partial_4_13 NO_STORE
#define vstore_partial_4_14 NO_STORE
#define vstore_partial_4_15 NO_STORE
#define vstore_partial_4_16 NO_STORE
// Size == 8
#define vstore_partial_8_0 NO_STORE
#define vstore_partial_8_1 vstore_partial_1
#define vstore_partial_8_2 vstore_partial_2
#define vstore_partial_8_3 vstore_partial_3
#define vstore_partial_8_4 vstore_partial_4
#define vstore_partial_8_5 vstore_partial_5
#define vstore_partial_8_6 vstore_partial_6
#define vstore_partial_8_7 vstore_partial_7
#define vstore_partial_8_8 vstore_partial_8
#define vstore_partial_8_9 NO_STORE
#define vstore_partial_8_10 NO_STORE
#define vstore_partial_8_11 NO_STORE
#define vstore_partial_8_12 NO_STORE
#define vstore_partial_8_13 NO_STORE
#define vstore_partial_8_14 NO_STORE
#define vstore_partial_8_15 NO_STORE
#define vstore_partial_8_16 NO_STORE
// Size == 16
#define vstore_partial_16_0 NO_STORE
#define vstore_partial_16_1 vstore_partial_1
#define vstore_partial_16_2 vstore_partial_2
#define vstore_partial_16_3 vstore_partial_3
#define vstore_partial_16_4 vstore_partial_4
#define vstore_partial_16_5 vstore_partial_5
#define vstore_partial_16_6 vstore_partial_6
#define vstore_partial_16_7 vstore_partial_7
#define vstore_partial_16_8 vstore_partial_8
#define vstore_partial_16_9 vstore_partial_9
#define vstore_partial_16_10 vstore_partial_10
#define vstore_partial_16_11 vstore_partial_11
#define vstore_partial_16_12 vstore_partial_12
#define vstore_partial_16_13 vstore_partial_13
#define vstore_partial_16_14 vstore_partial_14
#define vstore_partial_16_15 vstore_partial_15
#define vstore_partial_16_16 vstore_partial_16

/** Partial vstore. Store the **lower** 0 to (n-1)th elements of the given vector while minimising the amount of vstore ops
 * @name vstore_partial_n
 *
 * @note @p DATA needs to be a vector not a scalar
 * @note n needs to be <= the vector width of the input variable @p DATA
 * eg 1: Valid
 * vstore_partial_15(var:float16, 0, 0xabcd);
 * eg 2: Invalid
 * vstore_partial_7(var:float4, 0, 0xabcd);
 *
 * @note in cases n == 1, 2, 3, 4, 8, 16, no extra vstore is invoked, thus there's no performance penalty.
2595 * 2596 * @param[in] DATA The name of the variable 2597 * @param[in] OFFSET Offset in n 2598 * @param[in] PTR The base pointer 2599 * @{ 2600 */ 2601#define vstore_partial_1(DATA, OFFSET, PTR) \ 2602 vstore1(DATA.s0, OFFSET, PTR); 2603 2604#define vstore_partial_2(DATA, OFFSET, PTR) \ 2605 vstore2(DATA.s01, OFFSET, PTR); 2606 2607#define vstore_partial_3(DATA, OFFSET, PTR) \ 2608 vstore3(DATA.s012, OFFSET, PTR); 2609 2610#define vstore_partial_4(DATA, OFFSET, PTR) \ 2611 vstore4(DATA.s0123, OFFSET, PTR); 2612 2613#define vstore_partial_5(DATA, OFFSET, PTR) \ 2614 vstore_partial_4(DATA.s0123, OFFSET, PTR); \ 2615 vstore1(DATA.s4, OFFSET, PTR + 4); 2616 2617#define vstore_partial_6(DATA, OFFSET, PTR) \ 2618 vstore_partial_4(DATA.s0123, OFFSET, PTR); \ 2619 vstore_partial_2(DATA.s45, OFFSET, PTR + 4); 2620 2621#define vstore_partial_7(DATA, OFFSET, PTR) \ 2622 vstore_partial_4(DATA.s0123, OFFSET, PTR); \ 2623 vstore_partial_3(DATA.s456, OFFSET, PTR + 4); 2624 2625#define vstore_partial_8(DATA, OFFSET, PTR) \ 2626 vstore8(DATA.s01234567, OFFSET, PTR); 2627 2628#define vstore_partial_9(DATA, OFFSET, PTR) \ 2629 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 2630 vstore1(DATA.s8, OFFSET, PTR + 8); 2631 2632#define vstore_partial_10(DATA, OFFSET, PTR) \ 2633 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 2634 vstore_partial_2(DATA.s89, OFFSET, PTR + 8); 2635 2636#define vstore_partial_11(DATA, OFFSET, PTR) \ 2637 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 2638 vstore_partial_3(DATA.s89a, OFFSET, PTR + 8); 2639 2640#define vstore_partial_12(DATA, OFFSET, PTR) \ 2641 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 2642 vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8); 2643 2644#define vstore_partial_13(DATA, OFFSET, PTR) \ 2645 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 2646 vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8); 2647 2648#define vstore_partial_14(DATA, OFFSET, PTR) \ 2649 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 2650 
vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8); 2651 2652#define vstore_partial_15(DATA, OFFSET, PTR) \ 2653 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 2654 vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8); 2655 2656#define vstore_partial_16(DATA, OFFSET, PTR) \ 2657 vstore16(DATA, OFFSET, PTR); 2658/** @} */ // end of groupd vstore_partial_n 2659/** @} */ // end of groupd VSTORE_PARTIAL 2660 2661// Convert built-in functions with _sat modifier are not supported in floating point so we create defines 2662// without _sat to overcome this issue 2663#define convert_float_sat convert_float 2664#define convert_float1_sat convert_float 2665#define convert_float2_sat convert_float2 2666#define convert_float3_sat convert_float3 2667#define convert_float4_sat convert_float4 2668#define convert_float8_sat convert_float8 2669#define convert_float16_sat convert_float16 2670#define convert_half_sat convert_float 2671#define convert_half1_sat convert_half 2672#define convert_half2_sat convert_half2 2673#define convert_half3_sat convert_half3 2674#define convert_half4_sat convert_half4 2675#define convert_half8_sat convert_half8 2676#define convert_half16_sat convert_half16 2677 2678#define convert_float1 convert_float 2679#define convert_half1 convert_half 2680#define convert_char1 convert_char 2681#define convert_uchar1 convert_uchar 2682#define convert_short1 convert_short 2683#define convert_ushort1 convert_ushort 2684#define convert_int1 convert_int 2685#define convert_uint1 convert_uint 2686#define convert_long1 convert_long 2687#define convert_ulong1 convert_ulong 2688#define convert_double1 convert_double 2689 2690#define convert_char1_sat convert_char_sat 2691#define convert_uchar1_sat convert_uchar_sat 2692#define convert_short1_sat convert_short_sat 2693#define convert_ushort1_sat convert_ushort_sat 2694#define convert_int1_sat convert_int_sat 2695#define convert_uint1_sat convert_uint_sat 2696#define convert_long1_sat convert_long_sat 2697#define 
convert_ulong1_sat convert_ulong_sat 2698#define convert_double1_sat convert_double_sat 2699 2700#define VEC_DATA_TYPE_STR(type, size) type##size 2701#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size) 2702 2703#define CONVERT_STR(x, type) (convert_##type((x))) 2704#define CONVERT(x, type) CONVERT_STR(x, type) 2705 2706#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x))) 2707#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type) 2708 2709#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x))) 2710#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round) 2711 2712#define select_vec_dt_uchar(size) uchar##size 2713#define select_vec_dt_char(size) char##size 2714#define select_vec_dt_ushort(size) ushort##size 2715#define select_vec_dt_short(size) short##size 2716#define select_vec_dt_half(size) short##size 2717#define select_vec_dt_uint(size) uint##size 2718#define select_vec_dt_int(size) int##size 2719#define select_vec_dt_float(size) int##size 2720#define select_vec_dt_ulong(size) ulong##size 2721#define select_vec_dt_long(size) long##size 2722 2723#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size) 2724#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size) 2725#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1) 2726 2727#define sum_reduce_1(x) (x) 2728#define sum_reduce_2(x) ((x).s0) + ((x).s1) 2729#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2) 2730#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23) 2731#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567) 2732#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF) 2733 2734#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x) 2735#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size) 2736 2737#define max_reduce_1(x) (x) 2738#define max_reduce_2(x) max(((x).s0), ((x).s1)) 2739#define max_reduce_3(x) max(max_reduce_2((x).s01), 
((x).s2)) 2740#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23)) 2741#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567)) 2742#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF)) 2743 2744#define MAX_REDUCE_STR(x, size) max_reduce_##size(x) 2745#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size) 2746 2747#define VECTOR_DECLARATION(name) \ 2748 __global uchar *name##_ptr, \ 2749 uint name##_stride_x, \ 2750 uint name##_step_x, \ 2751 uint name##_offset_first_element_in_bytes 2752 2753#define IMAGE_DECLARATION(name) \ 2754 __global uchar *name##_ptr, \ 2755 uint name##_stride_x, \ 2756 uint name##_step_x, \ 2757 uint name##_stride_y, \ 2758 uint name##_step_y, \ 2759 uint name##_offset_first_element_in_bytes 2760 2761#define TENSOR3D_DECLARATION(name) \ 2762 __global uchar *name##_ptr, \ 2763 uint name##_stride_x, \ 2764 uint name##_step_x, \ 2765 uint name##_stride_y, \ 2766 uint name##_step_y, \ 2767 uint name##_stride_z, \ 2768 uint name##_step_z, \ 2769 uint name##_offset_first_element_in_bytes 2770 2771#define TENSOR4D_DECLARATION(name) \ 2772 __global uchar *name##_ptr, \ 2773 uint name##_stride_x, \ 2774 uint name##_step_x, \ 2775 uint name##_stride_y, \ 2776 uint name##_step_y, \ 2777 uint name##_stride_z, \ 2778 uint name##_step_z, \ 2779 uint name##_stride_w, \ 2780 uint name##_step_w, \ 2781 uint name##_offset_first_element_in_bytes 2782 2783#define CONVERT_TO_VECTOR_STRUCT(name) \ 2784 update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x) 2785 2786#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \ 2787 update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0) 2788 2789#define CONVERT_TO_IMAGE_STRUCT(name) \ 2790 update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y) 2791 2792#define 
CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \ 2793 update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0) 2794 2795#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ 2796 update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) 2797 2798#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \ 2799 update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z) 2800 2801#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ 2802 update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) 2803 2804#define CONVERT_TO_TENSOR3D_STRUCT(name) \ 2805 update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 2806 name##_stride_z, name##_step_z) 2807 2808#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \ 2809 update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0) 2810 2811#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \ 2812 update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 2813 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size) 2814 2815#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \ 2816 update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size) 2817 2818#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name) \ 2819 tensor3D_ptr_no_update(name##_ptr, 
name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 2820 name##_stride_z, name##_step_z) 2821 2822/** Structure to hold Vector information */ 2823typedef struct Vector 2824{ 2825 __global uchar *ptr; /**< Pointer to the starting postion of the buffer */ 2826 int offset_first_element_in_bytes; /**< The offset of the first element in the source image */ 2827 int stride_x; /**< Stride of the image in X dimension (in bytes) */ 2828} Vector; 2829 2830/** Structure to hold Image information */ 2831typedef struct Image 2832{ 2833 __global uchar *ptr; /**< Pointer to the starting postion of the buffer */ 2834 int offset_first_element_in_bytes; /**< The offset of the first element in the source image */ 2835 int stride_x; /**< Stride of the image in X dimension (in bytes) */ 2836 int stride_y; /**< Stride of the image in Y dimension (in bytes) */ 2837} Image; 2838 2839/** Structure to hold 3D tensor information */ 2840typedef struct Tensor3D 2841{ 2842 __global uchar *ptr; /**< Pointer to the starting postion of the buffer */ 2843 int offset_first_element_in_bytes; /**< The offset of the first element in the source image */ 2844 int stride_x; /**< Stride of the image in X dimension (in bytes) */ 2845 int stride_y; /**< Stride of the image in Y dimension (in bytes) */ 2846 int stride_z; /**< Stride of the image in Z dimension (in bytes) */ 2847} Tensor3D; 2848 2849/** Structure to hold 4D tensor information */ 2850typedef struct Tensor4D 2851{ 2852 __global uchar *ptr; /**< Pointer to the starting postion of the buffer */ 2853 int offset_first_element_in_bytes; /**< The offset of the first element in the source image */ 2854 int stride_x; /**< Stride of the image in X dimension (in bytes) */ 2855 int stride_y; /**< Stride of the image in Y dimension (in bytes) */ 2856 int stride_z; /**< Stride of the image in Z dimension (in bytes) */ 2857 int stride_w; /**< Stride of the image in W dimension (in bytes) */ 2858} Tensor4D; 
2859 2860/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data. 2861 * 2862 * @param[in] ptr Pointer to the starting postion of the buffer 2863 * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector 2864 * @param[in] stride_x Stride of the vector in X dimension (in bytes) 2865 * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes) 2866 * 2867 * @return An image object 2868 */ 2869inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x) 2870{ 2871 Vector vector = 2872 { 2873 .ptr = ptr, 2874 .offset_first_element_in_bytes = offset_first_element_in_bytes, 2875 .stride_x = stride_x, 2876 }; 2877 vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x; 2878 return vector; 2879} 2880 2881/** Wrap image information into an Image structure, and make the pointer point at this workitem's data. 
2882 * 2883 * @param[in] ptr Pointer to the starting postion of the buffer 2884 * @param[in] offset_first_element_in_bytes The offset of the first element in the source image 2885 * @param[in] stride_x Stride of the image in X dimension (in bytes) 2886 * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes) 2887 * @param[in] stride_y Stride of the image in Y dimension (in bytes) 2888 * @param[in] step_y stride_y * number of elements along Y processed per workitem(in bytes) 2889 * 2890 * @return An image object 2891 */ 2892inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y) 2893{ 2894 Image img = 2895 { 2896 .ptr = ptr, 2897 .offset_first_element_in_bytes = offset_first_element_in_bytes, 2898 .stride_x = stride_x, 2899 .stride_y = stride_y 2900 }; 2901 img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y; 2902 return img; 2903} 2904 2905/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data. 
2906 * 2907 * @param[in] ptr Pointer to the starting postion of the buffer 2908 * @param[in] offset_first_element_in_bytes The offset of the first element in the source image 2909 * @param[in] stride_x Stride of the image in X dimension (in bytes) 2910 * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes) 2911 * @param[in] stride_y Stride of the image in Y dimension (in bytes) 2912 * @param[in] step_y stride_y * number of elements along Y processed per workitem(in bytes) 2913 * @param[in] stride_z Stride of the image in Z dimension (in bytes) 2914 * @param[in] step_z stride_z * number of elements along Z processed per workitem(in bytes) 2915 * 2916 * @return A 3D tensor object 2917 */ 2918inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 2919{ 2920 Image img = 2921 { 2922 .ptr = ptr, 2923 .offset_first_element_in_bytes = offset_first_element_in_bytes, 2924 .stride_x = stride_x, 2925 .stride_y = stride_y 2926 }; 2927 img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z; 2928 return img; 2929} 2930 2931/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data. 
2932 * 2933 * @param[in] ptr Pointer to the starting postion of the buffer 2934 * @param[in] offset_first_element_in_bytes The offset of the first element in the source image 2935 * @param[in] stride_x Stride of the image in X dimension (in bytes) 2936 * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes) 2937 * @param[in] stride_y Stride of the image in Y dimension (in bytes) 2938 * @param[in] step_y stride_y * number of elements along Y processed per workitem(in bytes) 2939 * @param[in] stride_z Stride of the image in Z dimension (in bytes) 2940 * @param[in] step_z stride_z * number of elements along Z processed per workitem(in bytes) 2941 * 2942 * @return A 3D tensor object 2943 */ 2944inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 2945{ 2946 Tensor3D tensor = 2947 { 2948 .ptr = ptr, 2949 .offset_first_element_in_bytes = offset_first_element_in_bytes, 2950 .stride_x = stride_x, 2951 .stride_y = stride_y, 2952 .stride_z = stride_z 2953 }; 2954 tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z; 2955 return tensor; 2956} 2957 2958/** Wrap 3D tensor information into an tensor structure. 
2959 * 2960 * @param[in] ptr Pointer to the starting postion of the buffer 2961 * @param[in] offset_first_element_in_bytes The offset of the first element in the source image 2962 * @param[in] stride_x Stride of the image in X dimension (in bytes) 2963 * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes) 2964 * @param[in] stride_y Stride of the image in Y dimension (in bytes) 2965 * @param[in] step_y stride_y * number of elements along Y processed per workitem(in bytes) 2966 * @param[in] stride_z Stride of the image in Z dimension (in bytes) 2967 * @param[in] step_z stride_z * number of elements along Z processed per workitem(in bytes) 2968 * 2969 * @return A 3D tensor object 2970 */ 2971inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 2972{ 2973 Tensor3D tensor = 2974 { 2975 .ptr = ptr, 2976 .offset_first_element_in_bytes = offset_first_element_in_bytes, 2977 .stride_x = stride_x, 2978 .stride_y = stride_y, 2979 .stride_z = stride_z 2980 }; 2981 return tensor; 2982} 2983 2984inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w, 2985 uint step_w, 2986 uint mod_size) 2987{ 2988 Tensor4D tensor = 2989 { 2990 .ptr = ptr, 2991 .offset_first_element_in_bytes = offset_first_element_in_bytes, 2992 .stride_x = stride_x, 2993 .stride_y = stride_y, 2994 .stride_z = stride_z, 2995 .stride_w = stride_w 2996 }; 2997 2998 tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w; 2999 return tensor; 3000} 3001 3002/** Get the pointer position of a Vector 3003 * 3004 * @param[in] vec Pointer to the starting position of the buffer 3005 * 
@param[in] x Relative X position 3006 */ 3007inline __global const uchar *vector_offset(const Vector *vec, int x) 3008{ 3009 return vec->ptr + x * vec->stride_x; 3010} 3011 3012/** Get the pointer position of a Image 3013 * 3014 * @param[in] img Pointer to the starting position of the buffer 3015 * @param[in] x Relative X position 3016 * @param[in] y Relative Y position 3017 */ 3018inline __global uchar *offset(const Image *img, int x, int y) 3019{ 3020 return img->ptr + x * img->stride_x + y * img->stride_y; 3021} 3022 3023/** Get the pointer position of a Tensor3D 3024 * 3025 * @param[in] tensor Pointer to the starting position of the buffer 3026 * @param[in] x Relative X position 3027 * @param[in] y Relative Y position 3028 * @param[in] z Relative Z position 3029 */ 3030inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z) 3031{ 3032 return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z; 3033} 3034 3035/** Get the pointer position of a Tensor4D 3036 * 3037 * @param[in] tensor Pointer to the starting position of the buffer 3038 * @param[in] x Relative X position 3039 * @param[in] y Relative Y position 3040 * @param[in] z Relative Z position 3041 * @param[in] w Relative W position 3042 */ 3043inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w) 3044{ 3045 return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w; 3046} 3047 3048/** Get the offset for a given linear index of a Tensor3D 3049 * 3050 * @param[in] tensor Pointer to the starting position of the buffer 3051 * @param[in] width Width of the input tensor 3052 * @param[in] height Height of the input tensor 3053 * @param[in] depth Depth of the input tensor 3054 * @param[in] index Linear index 3055 */ 3056inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index) 3057{ 3058 uint 
num_elements = width * height; 3059 3060 const uint z = index / num_elements; 3061 3062 index %= num_elements; 3063 3064 const uint y = index / width; 3065 3066 index %= width; 3067 3068 const uint x = index; 3069 3070 return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes; 3071} 3072 3073#endif // _HELPER_H 3074 3075#if GPU_ARCH == GPU_ARCH_BIFROST 3076#define MLA(a, b, c) (fma(c, b, a)) 3077#else // GPU_ARCH == GPU_ARCH_BIFROST 3078#define MLA(a, b, c) ((b) * (c) + (a)) 3079#endif // GPU_ARCH == GPU_ARCH_BIFROST 3080 3081// Hard-Swish 3082#define hard_swish_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * ((min(max((x + (DATA_TYPE)3.0), (DATA_TYPE)0.0), (DATA_TYPE)6.0)) * (DATA_TYPE)0.166666667)) 3083 3084// Logistic Activation 3085#define logistic_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((DATA_TYPE)1.0 / ((DATA_TYPE)1.0 + exp(-x))) 3086 3087// Hyperbolic Tangent Activation 3088#define tanh_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((DATA_TYPE)A_VAL * tanh((DATA_TYPE)B_VAL * x)) 3089 3090// RELU Tangent Activation 3091#define relu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (max((DATA_TYPE)0.0, x)) 3092 3093// Bounded RELU Activation 3094#define brelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (min((DATA_TYPE)A_VAL, max((DATA_TYPE)0.0, x))) 3095 3096// Lower Upper Bounded RELU Activation 3097#define lu_brelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL)) 3098 3099// Leaky RELU Activation 3100#define lrelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((min(x, (DATA_TYPE)0.0) * (DATA_TYPE)A_VAL) + max(x, (DATA_TYPE)0.0)) 3101 3102// Soft RELU Activation 3103#define srelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (log((DATA_TYPE)1.0 + exp(x))) 3104 3105// ELU Activation 3106#define elu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (select(((DATA_TYPE)A_VAL * (exp(x) - (DATA_TYPE)1.0)), x, (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))isgreaterequal(x, (DATA_TYPE)0.0))) 
3107 3108// Absolute Activation 3109#define abs_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (fabs(x)) 3110 3111// Square Activation 3112#define square_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * x) 3113 3114// Square-root Activation 3115#define sqrt_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (sqrt(x)) 3116 3117// Linear Activation 3118#define linear_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (MLA((DATA_TYPE)B_VAL, (DATA_TYPE)A_VAL, x)) 3119 3120// Identity Activation 3121#define identity_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x) 3122 3123#define ACT_OP(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) op##_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) 3124 3125#define ACTIVATION(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ACT_OP(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) 3126 3127/** This performs an activation function on quantized inputs with float transformations. 3128 * 3129 * @note In order to perform the activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time 3130 * 3131 * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short 3132 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 3133 * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE 3134 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively. 3135 * @note Quantization scales of the input/output tensors are passed in with -DS1_VAL= and -DS2_VAL= respectively. 3136 * @note Quantization offsets of the input/output tensors are passed in only if asymmetric with -DO1_VAL= and -DO2_VAL= respectively. 3137 * @note Quantized value of constant zero should be given as a preprocessor argument using -DCONST_0=value. e.g. -DCONST_0=128. 3138 * 3139 * @param[in] input_ptr Pointer to the source image. 
Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM16 3140 * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) 3141 * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) 3142 * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) 3143 * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) 3144 * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) 3145 * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) 3146 * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image 3147 * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr 3148 * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes) 3149 * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes) 3150 * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes) 3151 * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes) 3152 * @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes) 3153 * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes) 3154 * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image 3155 */ 3156__kernel void activation_layer_quant_f32( 3157 TENSOR3D_DECLARATION(input) 3158#ifndef IN_PLACE 3159 , 3160 TENSOR3D_DECLARATION(output) 3161#endif /* not IN_PLACE */ 3162) 3163{ 3164 uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0); 3165 3166 
// Get pixels pointer 3167 __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z; 3168#ifdef IN_PLACE 3169 __global uchar *output_addr = input_addr; 3170#else /* IN_PLACE */ 3171 __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z; 3172#endif /* IN_PLACE */ 3173 3174 // Load data 3175 TYPE data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr); 3176 3177 VEC_FLOAT data_flt = CONVERT(data0, VEC_FLOAT); 3178#if defined(O1_VAL) 3179 data_flt = round(data_flt - (float)O1_VAL) * ((float)S1_VAL); 3180#else // defined(O1_VAL) 3181 data_flt = round(data_flt) * ((float)S1_VAL); 3182#endif // defined(O1_VAL) 3183 data_flt = ACTIVATION(ACT, float, VEC_SIZE, data_flt, A_VAL, B_VAL); 3184 3185#if defined(O2_VAL) 3186 data0 = CONVERT_SAT(round(data_flt / ((float)S2_VAL)) + (float)O2_VAL, TYPE); 3187#else // defined(O2_VAL) 3188 data0 = CONVERT_SAT(round(data_flt / ((float)S2_VAL)), TYPE); 3189#endif // defined(O2_VAL) 3190 3191 // Store result 3192 STORE_VECTOR_SELECT(data, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) 3193} 3194 3195#else // defined(FLOAT_DOMAIN) 3196// Activations performed in the quantized domain 3197 3198#if defined(ACT) 3199/** This performs an activation function on quantized inputs. 3200 * 3201 * @note In order to perform the activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time 3202 * 3203 * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short 3204 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 3205 * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. 
It is defined as the remainder between the input's first dimension and VEC_SIZE 3206 * @note Activation function should be given as a preprocessor argument using -DACT=name. e.g. -DACT=TANH 3207 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively. 3208 * @note Quantization scales of the input/output tensors are passed in with -DS1_VAL= and -DS2_VAL= respectively. 3209 * @note Quantization offsets of the input/output tensors are passed in with -DO1_VAL= and -DO2_VAL= respectively. 3210 * @note Quantized value of constant zero should be given as a preprocessor argument using -DCONST_0=value. e.g. -DCONST_0=128. 3211 * 3212 * @param[in] input_ptr Pointer to the source image. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM16 3213 * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) 3214 * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) 3215 * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) 3216 * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) 3217 * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) 3218 * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) 3219 * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image 3220 * @param[out] output_ptr (Optional) Pointer to the destination image. 
Supported data types: same as @p input_ptr 3221 * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes) 3222 * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes) 3223 * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes) 3224 * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes) 3225 * @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes) 3226 * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes) 3227 * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image 3228 */ 3229__kernel void activation_layer_quant( 3230 TENSOR3D_DECLARATION(input) 3231#ifndef IN_PLACE 3232 , 3233 TENSOR3D_DECLARATION(output) 3234#endif /* not IN_PLACE */ 3235) 3236{ 3237 uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0); 3238 3239 // Get pixels pointer 3240 __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z; 3241#ifdef IN_PLACE 3242 __global uchar *output_addr = input_addr; 3243#else /* IN_PLACE */ 3244 __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z; 3245#endif /* IN_PLACE */ 3246 3247 // Load data 3248 TYPE data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr); 3249 3250 data0 = PERFORM_ACTIVATION_QUANT(ACT, data0); 3251 3252 // Store result 3253 STORE_VECTOR_SELECT(data, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) 3254} 3255#endif // defined(ACT) 
3256#endif // defined(FLOAT_DOMAIN) 3257 3258)"