1R"( 2 3/* 4 * Copyright (c) 2018 Arm Limited. 5 * 6 * SPDX-License-Identifier: MIT 7 * 8 * Permission is hereby granted, free of charge, to any person obtaining a copy 9 * of this software and associated documentation files (the "Software"), to 10 * deal in the Software without restriction, including without limitation the 11 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 12 * sell copies of the Software, and to permit persons to whom the Software is 13 * furnished to do so, subject to the following conditions: 14 * 15 * The above copyright notice and this permission notice shall be included in all 16 * copies or substantial portions of the Software. 17 * 18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 * SOFTWARE. 25 */ 26/* 27 * Copyright (c) 2017-2020 Arm Limited. 28 * 29 * SPDX-License-Identifier: MIT 30 * 31 * Permission is hereby granted, free of charge, to any person obtaining a copy 32 * of this software and associated documentation files (the "Software"), to 33 * deal in the Software without restriction, including without limitation the 34 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 35 * sell copies of the Software, and to permit persons to whom the Software is 36 * furnished to do so, subject to the following conditions: 37 * 38 * The above copyright notice and this permission notice shall be included in all 39 * copies or substantial portions of the Software. 40 * 41 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 42 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 43 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 44 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 45 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 46 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 47 * SOFTWARE. 48 */ 49#ifndef ARM_COMPUTE_HELPERS_ASYMM_H 50#define ARM_COMPUTE_HELPERS_ASYMM_H 51 52/* 53 * Copyright (c) 2016-2020 Arm Limited. 54 * 55 * SPDX-License-Identifier: MIT 56 * 57 * Permission is hereby granted, free of charge, to any person obtaining a copy 58 * of this software and associated documentation files (the "Software"), to 59 * deal in the Software without restriction, including without limitation the 60 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 61 * sell copies of the Software, and to permit persons to whom the Software is 62 * furnished to do so, subject to the following conditions: 63 * 64 * The above copyright notice and this permission notice shall be included in all 65 * copies or substantial portions of the Software. 66 * 67 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 68 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 69 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 70 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 71 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 72 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 73 * SOFTWARE. 74 */ 75#ifndef ARM_COMPUTE_HELPER_H 76#define ARM_COMPUTE_HELPER_H 77 78/* 79 * Copyright (c) 2020 Arm Limited. 80 * 81 * SPDX-License-Identifier: MIT 82 * 83 * Permission is hereby granted, free of charge, to any person obtaining a copy 84 * of this software and associated documentation files (the "Software"), to 85 * deal in the Software without restriction, including without limitation the 86 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 87 * sell copies of the Software, and to permit persons to whom the Software is 88 * furnished to do so, subject to the following conditions: 89 * 90 * The above copyright notice and this permission notice shall be included in all 91 * copies or substantial portions of the Software. 92 * 93 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 94 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 95 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 96 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 97 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 98 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 99 * SOFTWARE. 100 */ 101 102/** Store the 0 to (n-1)th rows of the given variables 103 * @name STORE_ROW_n 104 * 105 * @param[in] N0 The width of the passed in vector. Supported: 1, 2, 3, 4, 8, 16 106 * @param[in] DATA_TYPE The data type of the vectors 107 * @param[in] BASENAME The basename of the variables 108 * @param[in] PTR The base pointer 109 * @param[in] STRIDE_Y The stride value in y-axis direction 110 * @param[in] Z The offset in z-axis direction 111 * @{ 112 */ 113#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 114 VSTORE(N0) \ 115 (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 116 117#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 118 STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 119 VSTORE(N0) \ 120 (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 121 122#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 123 STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 124 VSTORE(N0) \ 125 (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 126 127#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 128 STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 129 VSTORE(N0) \ 130 (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 131 132#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 133 STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 134 VSTORE(N0) \ 135 (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 136 137#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 138 STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 139 VSTORE(N0) \ 140 (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 141 142#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 143 STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 144 VSTORE(N0) \ 145 (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 146 147#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 148 STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 149 VSTORE(N0) \ 150 (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 151 152#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 153 STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 154 VSTORE(N0) \ 155 (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 156 157#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 158 STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 159 VSTORE(N0) \ 160 (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 161 162#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 163 STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 164 VSTORE(N0) \ 165 (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 166 167#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 168 STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 169 VSTORE(N0) \ 170 (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 171 172#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 173 STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 174 VSTORE(N0) \ 175 (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 176 177#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 178 STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 179 VSTORE(N0) \ 180 (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 181 182#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 183 STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 184 VSTORE(N0) \ 185 (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 186 187#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 188 STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 189 VSTORE(N0) \ 190 (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 191/** @} */ // end of groupd STORE_ROW_n 192 193/** Convert and store the 0th to (n-1)th rows of the given variables 194 * @name CONVERT_STORE_ROW_n 195 * 196 * @param[in] N0 The size of the vectors 197 * @param[in] DATA_TYPE The data type of the vectors 198 * @param[in] BASENAME The basename of the variables 199 * @param[in] PTR The base pointer 200 * @param[in] STRIDE_Y The stride value in y-axis direction 201 * @param[in] Z The offset in z-axis direction 202 * @{ 203 */ 204#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 205 VSTORE(N0) \ 206 (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 207 208#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 209 CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 210 VSTORE(N0) \ 211 (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 212 213#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 214 CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 215 VSTORE(N0) \ 216 (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 217 218#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 219 CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 220 VSTORE(N0) \ 221 (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 222 223#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 224 CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 225 VSTORE(N0) \ 226 (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 227 228#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 229 CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 230 VSTORE(N0) \ 231 (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 232 233#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 234 CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 235 VSTORE(N0) \ 236 (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 237 238#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 239 CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 240 VSTORE(N0) \ 241 (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 242 243#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 244 CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 245 VSTORE(N0) \ 246 (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 247 248#define CONVERT_STORE_ROW_10(N0, DATA, BASENAME, PTR, STRIDE_Y, Z) \ 249 CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 250 VSTORE(N0) \ 251 (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 252 253#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 254 CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 255 VSTORE(N0) \ 256 (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 257 258#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 259 CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 260 VSTORE(N0) \ 261 (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 262 263#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 264 CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 265 VSTORE(N0) \ 266 (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 267 268#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 269 CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 270 VSTORE(N0) \ 271 (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 272 273#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 274 CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 275 VSTORE(N0) \ 276 (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 277 278#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 279 CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 280 VSTORE(N0) \ 281 (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 282 283/** @} */ // end of groupd CONVERT_STORE_ROW_n 284 285/** Store a block of the given size M0xN0 286 * @name STORE_BLOCK 287 * 288 * Supported cases are M0=1,2,3,...,16 and N0=2,3,4,8,16. 289 * The data to store is expected to have consecutive names for each row. 290 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2. 291 * The Z offset is expected to have consecutive names. 292 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2. 293 * 294 * @param[in] M0 The number of rows to store 295 * @param[in] N0 The size of each vector 296 * @param[in] DATA_TYPE The data type of the vectors 297 * @param[in] BASENAME The basename of the variables 298 * @param[in] PTR The base pointer 299 * @param[in] STRIDE_Y The stride value in y-axis direction 300 * @param[in] Z The offset in z-axis direction 301 * @{ 302 */ 303#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 304#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 305/** @} */ // end of group STORE_BLOCK 306 307/** Convert and store a block of the given size M0xN0 308 * @name CONVERT_STORE_BLOCK 309 * 310 * Supported cases are M0=1,2,3,...,16 and N0=2,3,4,8,16. 311 * The data to store is expected to have consecutive names for each row. 312 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2. 313 * The Z offset is expected to have consecutive names. 314 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2. 315 * 316 * @param[in] M0 The number of rows to store 317 * @param[in] N0 The size of each vector 318 * @param[in] DATA_TYPE The data type of the vectors 319 * @param[in] BASENAME The basename of the variables 320 * @param[in] PTR The base pointer 321 * @param[in] STRIDE_Y The stride value in y-axis direction 322 * @param[in] Z The offset in z-axis direction 323 * @{ 324 */ 325#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 326#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 327/** @} */ // end of group CONVERT_STORE_BLOCK 328 329/** Partially store the 0 to (n-1)th rows of the given variables 330 * @name STORE_ROW_PARTIAL_n 331 * Within each row, store the lower @p STORE_N0 elements of vectors of width @p N0 332 * 333 * @note in case @p STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty. 334 * 335 * @param[in] N0 The width of the passed in vector. Supported: 1, 2, 3, 4, 8, 16 336 * @param[in] STORE_N0 The **lower** size of the vectors to store. Supported: [1-16 and <= @p N0 337 * @param[in] DATA_TYPE The data type of the vectors 338 * @param[in] BASENAME The basename of the variables 339 * @param[in] PTR The base pointer 340 * @param[in] STRIDE_Y The stride value in y-axis direction 341 * @param[in] Z The offset in z-axis direction 342 * @{ 343 */ 344#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 345 VSTORE_PARTIAL(N0, STORE_N0) \ 346 (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 347 348#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 349 STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 350 VSTORE_PARTIAL(N0, STORE_N0) \ 351 (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 352 353#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 354 STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 355 VSTORE_PARTIAL(N0, STORE_N0) \ 356 (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 357 358#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 359 STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 360 VSTORE_PARTIAL(N0, STORE_N0) \ 361 (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 362 363#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 364 STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 365 VSTORE_PARTIAL(N0, STORE_N0) \ 366 (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 367 368#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 369 STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 370 VSTORE_PARTIAL(N0, STORE_N0) \ 371 (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 372 373#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 374 STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 375 VSTORE_PARTIAL(N0, STORE_N0) \ 376 (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 377 378#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 379 STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 380 VSTORE_PARTIAL(N0, STORE_N0) \ 381 (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 382 383#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 384 STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 385 VSTORE_PARTIAL(N0, STORE_N0) \ 386 (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 387 388#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 389 STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 390 VSTORE_PARTIAL(N0, STORE_N0) \ 391 (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 392 393#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 394 STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 395 VSTORE_PARTIAL(N0, STORE_N0) \ 396 (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 397 398#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 399 STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 400 VSTORE_PARTIAL(N0, STORE_N0) \ 401 (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 402 403#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 404 STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 405 VSTORE_PARTIAL(N0, STORE_N0) \ 406 (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 407 408#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 409 STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 410 VSTORE_PARTIAL(N0, STORE_N0) \ 411 (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 412 413#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 414 STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 415 VSTORE_PARTIAL(N0, STORE_N0) \ 416 (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 417 418#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 419 STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 420 VSTORE_PARTIAL(N0, STORE_N0) \ 421 (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 422/** @} */ // end of groupd STORE_ROW_PARTIAL_n 423 424/** Partially store a block of the given size STORE_M0xSTORE_N0 425 * @name STORE_BLOCK_PARTIAL 426 * 427 * @note The vector width @p N0 is also required for correct partial storing behaviour. 428 * @note in case @p STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty. 429 * 430 * The data to store is expected to have consecutive names for each row. 431 * E.g., for STORE_M0=3 and basename=c, the expected names are c0, c1 and c2. 432 * The Z offset is expected to have consecutive names. 433 * E.g., for STORE_M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2. 434 * 435 * @param[in] STORE_M0 The number of rows to store. Supported: 1-16 436 * @param[in] STORE_N0 The lower number of elements of vectors to store. Supported: 1-16 and <= @p N0 437 * @param[in] N0 The size of each vector. Supported: 1, 2, 3, 4, 8, 16 438 * @param[in] DATA_TYPE The data type of the vectors 439 * @param[in] BASENAME The basename of the variables 440 * @param[in] PTR The base pointer 441 * @param[in] STRIDE_Y The stride value in y-axis direction 442 * @param[in] Z The offset in z-axis direction 443 * @{ 444 */ 445#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 446#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 447/** Store a block that can be partial in both x and y dimensions 448 * 449 * @note in cases @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty. 450 * 451 * The data to store is expected to have consecutive names for each row. 452 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2. 453 * The Z offset is expected to have consecutive names. 454 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2. 455 * 456 * @param[in] M0 The number of rows to store, for non-partial blocks. Supported: 1-16 457 * @param[in] N0 The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16 458 * @param[in] DATA_TYPE The data type of the vectors 459 * @param[in] BASENAME The basename of the variables 460 * @param[in] PTR The base pointer 461 * @param[in] STRIDE_Y The stride value in y-axis direction 462 * @param[in] Z The offset in z-axis direction 463 * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported range: [1, @p M0) 464 * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported range: [1, @p N0) 465 * @param[in] PARTIAL_COND_Y Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0. 466 * @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0. 467 */ 468#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 469 if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \ 470 { \ 471 STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 472 } \ 473 else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \ 474 { \ 475 STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 476 } \ 477 else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \ 478 { \ 479 STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 480 } \ 481 else \ 482 { \ 483 STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 484 } 485/** Store a block that can only be partial in x but not y. 486 * 487 * @note in case @p N0 or @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty. 488 * 489 * The data to store is expected to have consecutive names for each row. 490 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2. 491 * The Z offset is expected to have consecutive names. 492 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2. 493 * 494 * @param[in] M0 The number of rows to store, for non-partial blocks. Supported: 1-16 495 * @param[in] N0 The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16 496 * @param[in] DATA_TYPE The data type of the vectors 497 * @param[in] BASENAME The basename of the variables 498 * @param[in] PTR The base pointer 499 * @param[in] STRIDE_Y The stride value in y-axis direction 500 * @param[in] Z The offset in z-axis direction 501 * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported range: [1, @p N0) 502 * @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0. 503 */ 504#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \ 505 if(!(PARTIAL_COND_X)) \ 506 { \ 507 STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 508 } \ 509 else \ 510 { \ 511 STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 512 } 513/** Store a block that can only be partial in y but not x. 514 * 515 * @note in case @p N0 or @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty. 516 * 517 * The data to store is expected to have consecutive names for each row. 518 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2. 519 * The Z offset is expected to have consecutive names. 520 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2. 521 * 522 * @param[in] M0 The number of rows to store, for non-partial blocks. Supported: 1-16 523 * @param[in] N0 The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16 524 * @param[in] DATA_TYPE The data type of the vectors 525 * @param[in] BASENAME The basename of the variables 526 * @param[in] PTR The base pointer 527 * @param[in] STRIDE_Y The stride value in y-axis direction 528 * @param[in] Z The offset in z-axis direction 529 * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported range: [1, @p M0) 530 * @param[in] PARTIAL_COND_Y Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0. 531 */ 532#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \ 533 if(!(PARTIAL_COND_Y)) \ 534 { \ 535 STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 536 } \ 537 else \ 538 { \ 539 STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 540 } 541/** @} */ // end of group STORE_BLOCK_PARTIAL 542 543#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) 544 545/** Boundary-aware GEMM block store 546 * @name STORE_BLOCK_BOUNDARY_AWARE 547 * This macro assumes the following schemes to achieve boundary-awareness: 548 * - Overlapping load in Y axis from lhs tensor. This implies lhs has no padding along y dim. 549 * - Non-Overlapping(normal) load from rhs tensor. This imples rhs can have paddings. 550 * - Overlapping load in Y axis from bias tensor. This implies rhs has no padding along y dim. 551 * The macro then ensures that the dst tensor can be stored without any paddings in both x and y dim. 552 * 553 * In the y dimension, we place the partial blocks **at the beginning** while in the x dimension, we place the partial 554 * blocks **at the end**. 555 * Say, the dst tensor is of shape MxN and we have M0 and N0 as the block size, this is how we define "partial blocks"/ 556 * "boundary block" (we use the 2 terms "partial blocks" and "boundary blocks" interchangeably) and its various parameters: 557 * 558 * *--x--> x == 0 x == 1 559 * | |<------------------------------N-------------------------->| 560 * y |<--------------N0------------->|<----PARTIAL_STORE_N0----->| 561 * | -------------############################################################# 562 * * | | |...............................|...........................| 563 * y == 0 | PAR_..._M0 |......Boundary block in y......|.Boundary block in x and y.| 564 * | | |...............................|...........................| 565 * M --############################################################# 566 * | | | |...........................| 567 * y == 1 | M0 | Non-boundary block |....Boundary block in x....| 568 * | | | |...........................| 569 * |------------############################################################# 570 * 571 * Then @p PARTIAL_STORE_M0 = M % M0 and @p PARTIAL_STORE_N0 = N % N0 572 * 573 * @note in cases @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty. 574 * 575 * It automatically detects if a giving M,N,M0,N0 combination can yield partial blocks in either X and Y dimension, 576 * and select corresponding store methods such that the boundary detection logic is only added when needed. 577 * 578 * The data to store is expected to have consecutive names for each row. 579 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2. 580 * The Z offset is expected to have consecutive names. 581 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2. 582 * 583 * @param[in] M0 The number of rows to store, for non-partial blocks. Supported: 1-16 584 * @param[in] N0 The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16 585 * @param[in] DATA_TYPE The data type of the vectors 586 * @param[in] BASENAME The basename of the variables 587 * @param[in] PTR The base pointer 588 * @param[in] STRIDE_Y The stride value in y-axis direction 589 * @param[in] Z The offset in z-axis direction 590 * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported: [0, @p M0) 591 * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported: [0, @p N0) 592 * @param[in] PARTIAL_COND_Y Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0. 593 * @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0. 594 * @{ 595 */ 596#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 597// Case1: No partial blocks in either x or y 598#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 599 STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 600 601#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0 602// Case2: Partial blocks in y 603#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 604 STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) 605 606#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0 607// Case3: Partial blocks in x 608#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 609 STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) 610 611#else // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 612// Case4: Partial blocks in both x and y 613#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 614 STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) 615 616#endif // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 617 618#endif // defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) 619/** @} */ // end of group STORE_BLOCK_BOUNDARY_AWARE 620 621#if defined(PARTIAL_STORE_M0) 622/** Compute the start m0 row (LHS, BIAS and DST) in a boundary-aware way so as to avoid padding 623 * @name COMPUTE_M0_START_ROW 624 * If there're any partial blocks in y dimension, they are placed at the beginning of the rows. 625 * This shift amount is added to all rows such that the partial block (at the beginning) overlaps with the subsequent 626 * blocks in the y dimension to avoid any padding. 627 * EG: M0=4, PARTIAL_STORE_M0=1: 628 * | Non-overlapping | +M0_ROW_SHIFT (Overlapping) 629 * block 0 (partial)| start row = 0 | start row = 0 630 * block 1 (full) | start row = 4 | start row = 1 631 * block 2 (full) | start row = 8 | start row = 5 632 * 633 * @param[in] y Global id of current block in y. 634 * @param[in] M0 The number of rows to store, for non-partial blocks. Supported: 1-16 635 * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported: [0, @p M0) 636 * @{ 637 */ 638#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \ 639 ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0)))) 640#else // defined(PARTIAL_STORE_M0) 641#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \ 642 ((uint)(y * M0)) 643#endif // defined(PARTIAL_STORE_M0) 644/** @} */ // end of group COMPUTE_M0_START_ROW 645 646/** Store a vector that can only be partial in x. 647 * 648 * @note in case @p vec_size or @p leftover != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty. 649 * 650 * The data to store is expected to end in a 0. 651 * E.g., for basename=c, the expected name is c0. 652 * 653 * @param[in] basename The name of the variable without trailing 0 654 * @param[in] data_type The data type of the vector 655 * @param[in] ptr The base pointer 656 * @param[in] vec_size The vector size if cond = false. Supported: 1, 2, 3, 4, 8, 16 657 * @param[in] leftover The vector size if cond = true. Supported range: [1, @p vec_size0) 658 * @param[in] cond Condition to select either vec_size0 or vec_size1 659 * @{ 660 */ 661#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \ 662 STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond) 663/** @} */ // end of group STORE_VECTOR_SELECT 664 665#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 666#pragma OPENCL EXTENSION cl_khr_fp16 : enable 667#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 668 669#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) 670#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable 671#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) 672 673#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) 674#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable 675#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) 676 677#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf) 678#pragma OPENCL EXTENSION cl_arm_printf : enable 679#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf) 680 681#define GPU_ARCH_MIDGARD 0x100 682#define GPU_ARCH_BIFROST 0x200 683 684/** Concatenate two inputs. 685 * 686 * @param[in] a The first input to be concatenated 687 * @param[in] b The second input to be concatenated 688 * 689 * @return The concatenated output 690 */ 691#define CONCAT(a, b) a##b 692 693/** Expand the given vector 694 * 695 * @param[in] x The vector to be expanded 696 * 697 * @return The expanded output 698 */ 699#define EXPAND(x) x 700 701/** Clamp the given value between an upper and lower bound. 702 * 703 * @param[in] x The value to be clamped 704 * @param[in] min_val The lower bound 705 * @param[in] max_val The upper bound 706 * 707 * @return The clamped value. 708 */ 709#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val) 710 711/** REVn reverses the given vector whose size is n. 712 * @name REVn 713 * 714 * @param[in] x The vector to be reversed 715 * 716 * @return The reversed vector 717 * @{ 718 */ 719#define REV1(x) ((x)) 720#define REV2(x) ((x).s10) 721#define REV3(x) ((x).s210) 722#define REV4(x) ((x).s3210) 723#define REV8(x) ((x).s76543210) 724#define REV16(x) ((x).sFEDCBA9876543210) 725/** @} */ // end of group REVn 726 727/** Reverse the given vector. 728 * @name REVERSE 729 * 730 * @param[in] x The vector to be reversed 731 * @param[in] s The size of the vector 732 * 733 * @return The reversed vector 734 * @{ 735 */ 736#define REVERSE_STR(x, s) REV##s((x)) 737#define REVERSE(x, s) REVERSE_STR(x, s) 738/** @} */ // end of group REVERSE 739 740/** Circular-right-shift (rotate-right) the vector of size s by the amount of n. 741 * @name ROTs_n 742 * 743 * @param[in] x The vector to be shifted 744 * 745 * @return The shifted vector 746 * @{ 747 */ 748#define ROT1_0(x) ((x)) 749 750#define ROT2_0(x) ((x)) 751#define ROT2_1(x) ((x).s10) 752 753#define ROT3_0(x) ((x)) 754#define ROT3_1(x) ((x).s201) 755#define ROT3_2(x) ((x).s120) 756 757#define ROT4_0(x) ((x)) 758#define ROT4_1(x) ((x).s3012) 759#define ROT4_2(x) ((x).s2301) 760#define ROT4_3(x) ((x).s1230) 761 762#define ROT8_0(x) ((x)) 763#define ROT8_1(x) ((x).s70123456) 764#define ROT8_2(x) ((x).s67012345) 765#define ROT8_3(x) ((x).s56701234) 766#define ROT8_4(x) ((x).s45670123) 767#define ROT8_5(x) ((x).s34567012) 768#define ROT8_6(x) ((x).s23456701) 769#define ROT8_7(x) ((x).s12345670) 770 771#define ROT16_0(x) ((x)) 772#define ROT16_1(x) ((x).sF0123456789ABCDE) 773#define ROT16_2(x) ((x).sEF0123456789ABCD) 774#define ROT16_3(x) ((x).sDEF0123456789ABC) 775#define ROT16_4(x) ((x).sCDEF0123456789AB) 776#define ROT16_5(x) ((x).sBCDEF0123456789A) 777#define ROT16_6(x) ((x).sABCDEF0123456789) 778#define ROT16_7(x) ((x).s9ABCDEF012345678) 779#define ROT16_8(x) ((x).s89ABCDEF01234567) 780#define ROT16_9(x) ((x).s789ABCDEF0123456) 781#define ROT16_10(x) ((x).s6789ABCDEF012345) 782#define ROT16_11(x) ((x).s56789ABCDEF01234) 783#define ROT16_12(x) ((x).s456789ABCDEF0123) 784#define ROT16_13(x) ((x).s3456789ABCDEF012) 785#define ROT16_14(x) ((x).s23456789ABCDEF01) 786#define ROT16_15(x) ((x).s123456789ABCDEF0) 787/** @} */ // end of group ROTs_n 788 789/** Circular-right-shift (rotate-right) the given vector by the given amount. 790 * @name ROTATE 791 * 792 * @param[in] x The vector to be shifted 793 * @param[in] s The size of the vector 794 * @param[in] n The amount to be shifted 795 * 796 * @return The shifted vector 797 * @{ 798 */ 799#define ROTATE_STR(x, s, n) ROT##s##_##n(x) 800#define ROTATE(x, s, n) ROTATE_STR(x, s, n) 801/** @} */ // end of group ROTATE 802 803/** Creates a vector of size n filled with offset values corresponding to the location of each element. 804 * @name V_OFFSn 805 * 806 * @param[in] dt The data type of the output vector 807 * 808 * @return The vector filled with offset values 809 * @{ 810 */ 811#define V_OFFS1(dt) (dt##1)(0) 812#define V_OFFS2(dt) (dt##2)(0, 1) 813#define V_OFFS3(dt) (dt##3)(0, 1, 2) 814#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3) 815#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7) 816#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) 817/** @} */ // end of group V_OFFSn 818 819/** Create a vector filled with offset values corresponding to the location of each element. 820 * @name VEC_OFFS 821 * 822 * @param[in] dt The data type of the output vector 823 * @param[in] s The size of the output vector 824 * 825 * @return The vector filled with offset values 826 * @{ 827 */ 828#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt) 829#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s) 830/** @} */ // end of group VEC_OFFS 831 832#define VLOAD_STR(size) vload##size 833#define VLOAD(size) VLOAD_STR(size) 834 835#define PIXEL_UNIT4 1 836#define PIXEL_UNIT8 2 837#define PIXEL_UNIT16 4 838 839/** Utility macro to convert a vector size in pixel unit. 840 * 841 * @name CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT 842 * 843 * @param[in] vec_size Vector size. Only 4,8 and 16 is supported 844 * 845 * @return The pixel unit (number of pixels) 846 * @{ 847 */ 848#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size 849#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) 850/** @} */ // end of group CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT 851 852#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord))); 853#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord))); 854#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord))); 855 856#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 857#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord))); 858#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord))); 859#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord))); 860#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 861 862/** Utility macro to read a 2D OpenCL image object. 863 * 864 * @note Coordinates are not normalized 865 * 866 * @param[in] data_type Data type 867 * @param[in] n0 Number of pixel to read. Only 1,2 and 4 is supported 868 * @param[in] img OpenCL image object 869 * @param[in] x_coord The x coordinate for the top-left pixel 870 * @param[in] y_coord The y coordinate for the top-left pixel 871 * 872 * @return Pixels from the 2D OpenCL image object 873 * @{ 874 */ 875#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord) 876#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) 877 878#define VSTORE_STR(size) vstore##size 879#define VSTORE(size) VSTORE_STR(size) 880 881#define float1 float 882#define half1 half 883#define char1 char 884#define uchar1 uchar 885#define short1 short 886#define ushort1 ushort 887#define int1 int 888#define uint1 uint 889#define long1 long 890#define ulong1 ulong 891#define double1 double 892 893#define vload1(OFFSET, PTR) *(OFFSET + PTR) 894#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA 895 896/** Extended partial vstore that correctly handles scalar values as well. 897 * Store the **lower** 0 to (n-1)th elements of the given vector while minimising the amount of vstore ops 898 * @name VSTORE_PARTIAL 899 * 900 * @note With this macro, the passed data can be both a vector and a scalar 901 * @note @p store_size needs to be <= @p size 902 * eg 1: Valid 903 * VSTORE_PARTIAL(16, 15) ...; 904 * eg 2: Invalid 905 * VSTORE_PARTIAL(4, 7) ...; 906 * 907 * @param[in] size The width of @p DATA. Supported values: 1(scalar), 2, 3, 4, 8, 16 908 * @param[in] store_size The number of lower elements to store. Supported values: 1-16, but has to be <= @p size 909 * @{ 910 */ 911#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size 912#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size) 913 914#define NO_STORE(data, offs, ptr) \ 915 { \ 916 } 917 918// Size == 1 (scalar) 919#define vstore_partial_1_0 NO_STORE 920#define vstore_partial_1_1 vstore1 921#define vstore_partial_1_2 NO_STORE 922#define vstore_partial_1_3 NO_STORE 923#define vstore_partial_1_4 NO_STORE 924#define vstore_partial_1_5 NO_STORE 925#define vstore_partial_1_6 NO_STORE 926#define vstore_partial_1_7 NO_STORE 927#define vstore_partial_1_8 NO_STORE 928#define vstore_partial_1_9 NO_STORE 929#define vstore_partial_1_10 NO_STORE 930#define vstore_partial_1_11 NO_STORE 931#define vstore_partial_1_12 NO_STORE 932#define vstore_partial_1_13 NO_STORE 933#define vstore_partial_1_14 NO_STORE 934#define vstore_partial_1_15 NO_STORE 935#define vstore_partial_1_16 NO_STORE 936// Size == 2 937#define vstore_partial_2_0 NO_STORE 938#define vstore_partial_2_1 vstore_partial_1 939#define vstore_partial_2_2 vstore_partial_2 940#define vstore_partial_2_3 NO_STORE 941#define vstore_partial_2_4 NO_STORE 942#define vstore_partial_2_5 NO_STORE 943#define vstore_partial_2_6 NO_STORE 944#define vstore_partial_2_7 NO_STORE 945#define vstore_partial_2_8 NO_STORE 946#define vstore_partial_2_9 NO_STORE 947#define vstore_partial_2_10 NO_STORE 948#define vstore_partial_2_11 NO_STORE 949#define vstore_partial_2_12 NO_STORE 950#define vstore_partial_2_13 NO_STORE 951#define vstore_partial_2_14 NO_STORE 952#define vstore_partial_2_15 NO_STORE 953#define vstore_partial_2_16 NO_STORE 954// Size == 3 955#define vstore_partial_3_0 NO_STORE 956#define vstore_partial_3_1 vstore_partial_1 957#define vstore_partial_3_2 vstore_partial_2 958#define vstore_partial_3_3 vstore_partial_3 959#define vstore_partial_3_4 NO_STORE 960#define vstore_partial_3_5 NO_STORE 961#define vstore_partial_3_6 NO_STORE 962#define vstore_partial_3_7 NO_STORE 963#define vstore_partial_3_8 NO_STORE 964#define vstore_partial_3_9 NO_STORE 965#define vstore_partial_3_10 NO_STORE 966#define vstore_partial_3_11 NO_STORE 967#define vstore_partial_3_12 NO_STORE 968#define vstore_partial_3_13 NO_STORE 969#define vstore_partial_3_14 NO_STORE 970#define vstore_partial_3_15 NO_STORE 971#define vstore_partial_3_16 NO_STORE 972// Size == 4 973#define vstore_partial_4_0 NO_STORE 974#define vstore_partial_4_1 vstore_partial_1 975#define vstore_partial_4_2 vstore_partial_2 976#define vstore_partial_4_3 vstore_partial_3 977#define vstore_partial_4_4 vstore_partial_4 978#define vstore_partial_4_5 NO_STORE 979#define vstore_partial_4_6 NO_STORE 980#define vstore_partial_4_7 NO_STORE 981#define vstore_partial_4_8 NO_STORE 982#define vstore_partial_4_9 NO_STORE 983#define vstore_partial_4_10 NO_STORE 984#define vstore_partial_4_11 NO_STORE 985#define vstore_partial_4_12 NO_STORE 986#define vstore_partial_4_13 NO_STORE 987#define vstore_partial_4_14 NO_STORE 988#define vstore_partial_4_15 NO_STORE 989#define vstore_partial_4_16 NO_STORE 990// Size == 8 991#define vstore_partial_8_0 NO_STORE 992#define vstore_partial_8_1 vstore_partial_1 993#define vstore_partial_8_2 vstore_partial_2 994#define vstore_partial_8_3 vstore_partial_3 995#define vstore_partial_8_4 vstore_partial_4 996#define vstore_partial_8_5 vstore_partial_5 997#define vstore_partial_8_6 vstore_partial_6 998#define vstore_partial_8_7 vstore_partial_7 999#define vstore_partial_8_8 vstore_partial_8 1000#define vstore_partial_8_9 NO_STORE 1001#define vstore_partial_8_10 NO_STORE 1002#define vstore_partial_8_11 NO_STORE 1003#define vstore_partial_8_12 NO_STORE 1004#define vstore_partial_8_13 NO_STORE 1005#define vstore_partial_8_14 NO_STORE 1006#define vstore_partial_8_15 NO_STORE 1007#define vstore_partial_8_16 NO_STORE 1008// Size == 16 1009#define vstore_partial_16_0 NO_STORE 1010#define vstore_partial_16_1 vstore_partial_1 1011#define vstore_partial_16_2 vstore_partial_2 1012#define vstore_partial_16_3 vstore_partial_3 1013#define vstore_partial_16_4 vstore_partial_4 1014#define vstore_partial_16_5 vstore_partial_5 1015#define vstore_partial_16_6 vstore_partial_6 1016#define vstore_partial_16_7 vstore_partial_7 1017#define vstore_partial_16_8 vstore_partial_8 1018#define vstore_partial_16_9 vstore_partial_9 1019#define vstore_partial_16_10 vstore_partial_10 1020#define vstore_partial_16_11 vstore_partial_11 1021#define vstore_partial_16_12 vstore_partial_12 1022#define vstore_partial_16_13 vstore_partial_13 1023#define vstore_partial_16_14 vstore_partial_14 1024#define vstore_partial_16_15 vstore_partial_15 1025#define vstore_partial_16_16 vstore_partial_16 1026 1027/** Partial vstore. Store the **lower** 0 to (n-1)th elements of the given vector while minimising the amount of vstore ops 1028 * @name vstore_partial_n 1029 * 1030 * @note @p DATA needs to be a vector not a scalar 1031 * @note n needs to be <= the vector width of the input variable @p DATA 1032 * eg 1: Valid 1033 * vstore_partial_15(var:float16, 0, 0xabcd); 1034 * eg 2: Invalid 1035 * vstore_partial_7(var:float4, 0, 0xabcd); 1036 * 1037 * @note in cases n == 1, 2, 3, 4, 8, 16, no extra vstore is invoked, thus there's no performance penalty. 1038 * 1039 * @param[in] DATA The name of the variable 1040 * @param[in] OFFSET Offset in n 1041 * @param[in] PTR The base pointer 1042 * @{ 1043 */ 1044#define vstore_partial_1(DATA, OFFSET, PTR) \ 1045 vstore1(DATA.s0, OFFSET, PTR); 1046 1047#define vstore_partial_2(DATA, OFFSET, PTR) \ 1048 vstore2(DATA.s01, OFFSET, PTR); 1049 1050#define vstore_partial_3(DATA, OFFSET, PTR) \ 1051 vstore3(DATA.s012, OFFSET, PTR); 1052 1053#define vstore_partial_4(DATA, OFFSET, PTR) \ 1054 vstore4(DATA.s0123, OFFSET, PTR); 1055 1056#define vstore_partial_5(DATA, OFFSET, PTR) \ 1057 vstore_partial_4(DATA.s0123, OFFSET, PTR); \ 1058 vstore1(DATA.s4, OFFSET, PTR + 4); 1059 1060#define vstore_partial_6(DATA, OFFSET, PTR) \ 1061 vstore_partial_4(DATA.s0123, OFFSET, PTR); \ 1062 vstore_partial_2(DATA.s45, OFFSET, PTR + 4); 1063 1064#define vstore_partial_7(DATA, OFFSET, PTR) \ 1065 vstore_partial_4(DATA.s0123, OFFSET, PTR); \ 1066 vstore_partial_3(DATA.s456, OFFSET, PTR + 4); 1067 1068#define vstore_partial_8(DATA, OFFSET, PTR) \ 1069 vstore8(DATA.s01234567, OFFSET, PTR); 1070 1071#define vstore_partial_9(DATA, OFFSET, PTR) \ 1072 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 1073 vstore1(DATA.s8, OFFSET, PTR + 8); 1074 1075#define vstore_partial_10(DATA, OFFSET, PTR) \ 1076 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 1077 vstore_partial_2(DATA.s89, OFFSET, PTR + 8); 1078 1079#define vstore_partial_11(DATA, OFFSET, PTR) \ 1080 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 1081 vstore_partial_3(DATA.s89a, OFFSET, PTR + 8); 1082 1083#define vstore_partial_12(DATA, OFFSET, PTR) \ 1084 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 1085 vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8); 1086 1087#define vstore_partial_13(DATA, OFFSET, PTR) \ 1088 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 1089 vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8); 1090 1091#define vstore_partial_14(DATA, OFFSET, PTR) \ 1092 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 1093 vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8); 1094 1095#define vstore_partial_15(DATA, OFFSET, PTR) \ 1096 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 1097 vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8); 1098 1099#define vstore_partial_16(DATA, OFFSET, PTR) \ 1100 vstore16(DATA, OFFSET, PTR); 1101/** @} */ // end of groupd vstore_partial_n 1102/** @} */ // end of groupd VSTORE_PARTIAL 1103 1104// Convert built-in functions with _sat modifier are not supported in floating point so we create defines 1105// without _sat to overcome this issue 1106#define convert_float_sat convert_float 1107#define convert_float1_sat convert_float 1108#define convert_float2_sat convert_float2 1109#define convert_float3_sat convert_float3 1110#define convert_float4_sat convert_float4 1111#define convert_float8_sat convert_float8 1112#define convert_float16_sat convert_float16 1113#define convert_half_sat convert_float 1114#define convert_half1_sat convert_half 1115#define convert_half2_sat convert_half2 1116#define convert_half3_sat convert_half3 1117#define convert_half4_sat convert_half4 1118#define convert_half8_sat convert_half8 1119#define convert_half16_sat convert_half16 1120 1121#define convert_float1 convert_float 1122#define convert_half1 convert_half 1123#define convert_char1 convert_char 1124#define convert_uchar1 convert_uchar 1125#define convert_short1 convert_short 1126#define convert_ushort1 convert_ushort 1127#define convert_int1 convert_int 1128#define convert_uint1 convert_uint 1129#define convert_long1 convert_long 1130#define convert_ulong1 convert_ulong 1131#define convert_double1 convert_double 1132 1133#define convert_char1_sat convert_char_sat 1134#define convert_uchar1_sat convert_uchar_sat 1135#define convert_short1_sat convert_short_sat 1136#define convert_ushort1_sat convert_ushort_sat 1137#define convert_int1_sat convert_int_sat 1138#define convert_uint1_sat convert_uint_sat 1139#define convert_long1_sat convert_long_sat 1140#define convert_ulong1_sat convert_ulong_sat 1141#define convert_double1_sat convert_double_sat 1142 1143#define VEC_DATA_TYPE_STR(type, size) type##size 1144#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size) 1145 1146#define CONVERT_STR(x, type) (convert_##type((x))) 1147#define CONVERT(x, type) CONVERT_STR(x, type) 1148 1149#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x))) 1150#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type) 1151 1152#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x))) 1153#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round) 1154 1155#define select_vec_dt_uchar(size) uchar##size 1156#define select_vec_dt_char(size) char##size 1157#define select_vec_dt_ushort(size) ushort##size 1158#define select_vec_dt_short(size) short##size 1159#define select_vec_dt_half(size) short##size 1160#define select_vec_dt_uint(size) uint##size 1161#define select_vec_dt_int(size) int##size 1162#define select_vec_dt_float(size) int##size 1163#define select_vec_dt_ulong(size) ulong##size 1164#define select_vec_dt_long(size) long##size 1165 1166#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size) 1167#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size) 1168#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1) 1169 1170#define sum_reduce_1(x) (x) 1171#define sum_reduce_2(x) ((x).s0) + ((x).s1) 1172#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2) 1173#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23) 1174#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567) 1175#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF) 1176 1177#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x) 1178#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size) 1179 1180#define max_reduce_1(x) (x) 1181#define max_reduce_2(x) max(((x).s0), ((x).s1)) 1182#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2)) 1183#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23)) 1184#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567)) 1185#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF)) 1186 1187#define MAX_REDUCE_STR(x, size) max_reduce_##size(x) 1188#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size) 1189 1190#define VECTOR_DECLARATION(name) \ 1191 __global uchar *name##_ptr, \ 1192 uint name##_stride_x, \ 1193 uint name##_step_x, \ 1194 uint name##_offset_first_element_in_bytes 1195 1196#define IMAGE_DECLARATION(name) \ 1197 __global uchar *name##_ptr, \ 1198 uint name##_stride_x, \ 1199 uint name##_step_x, \ 1200 uint name##_stride_y, \ 1201 uint name##_step_y, \ 1202 uint name##_offset_first_element_in_bytes 1203 1204#define TENSOR3D_DECLARATION(name) \ 1205 __global uchar *name##_ptr, \ 1206 uint name##_stride_x, \ 1207 uint name##_step_x, \ 1208 uint name##_stride_y, \ 1209 uint name##_step_y, \ 1210 uint name##_stride_z, \ 1211 uint name##_step_z, \ 1212 uint name##_offset_first_element_in_bytes 1213 1214#define TENSOR4D_DECLARATION(name) \ 1215 __global uchar *name##_ptr, \ 1216 uint name##_stride_x, \ 1217 uint name##_step_x, \ 1218 uint name##_stride_y, \ 1219 uint name##_step_y, \ 1220 uint name##_stride_z, \ 1221 uint name##_step_z, \ 1222 uint name##_stride_w, \ 1223 uint name##_step_w, \ 1224 uint name##_offset_first_element_in_bytes 1225 1226#define CONVERT_TO_VECTOR_STRUCT(name) \ 1227 update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x) 1228 1229#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \ 1230 update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0) 1231 1232#define CONVERT_TO_IMAGE_STRUCT(name) \ 1233 update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y) 1234 1235#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \ 1236 update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0) 1237 1238#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ 1239 update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) 1240 1241#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \ 1242 update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z) 1243 1244#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ 1245 update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) 1246 1247#define CONVERT_TO_TENSOR3D_STRUCT(name) \ 1248 update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 1249 name##_stride_z, name##_step_z) 1250 1251#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \ 1252 update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0) 1253 1254#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \ 1255 update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 1256 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size) 1257 1258#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \ 1259 update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size) 1260 1261#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name) \ 1262 tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 1263 name##_stride_z, name##_step_z) 1264 1265/** Structure to hold Vector information */ 1266typedef struct Vector 1267{ 1268 __global uchar *ptr; /**< Pointer to the starting postion of the buffer */ 1269 int offset_first_element_in_bytes; /**< The offset of the first element in the source image */ 1270 int stride_x; /**< Stride of the image in X dimension (in bytes) */ 1271} Vector; 1272 1273/** Structure to hold Image information */ 1274typedef struct Image 1275{ 1276 __global uchar *ptr; /**< Pointer to the starting postion of the buffer */ 1277 int offset_first_element_in_bytes; /**< The offset of the first element in the source image */ 1278 int stride_x; /**< Stride of the image in X dimension (in bytes) */ 1279 int stride_y; /**< Stride of the image in Y dimension (in bytes) */ 1280} Image; 1281 1282/** Structure to hold 3D tensor information */ 1283typedef struct Tensor3D 1284{ 1285 __global uchar *ptr; /**< Pointer to the starting postion of the buffer */ 1286 int offset_first_element_in_bytes; /**< The offset of the first element in the source image */ 1287 int stride_x; /**< Stride of the image in X dimension (in bytes) */ 1288 int stride_y; /**< Stride of the image in Y dimension (in bytes) */ 1289 int stride_z; /**< Stride of the image in Z dimension (in bytes) */ 1290} Tensor3D; 1291 1292/** Structure to hold 4D tensor information */ 1293typedef struct Tensor4D 1294{ 1295 __global uchar *ptr; /**< Pointer to the starting postion of the buffer */ 1296 int offset_first_element_in_bytes; /**< The offset of the first element in the source image */ 1297 int stride_x; /**< Stride of the image in X dimension (in bytes) */ 1298 int stride_y; /**< Stride of the image in Y dimension (in bytes) */ 1299 int stride_z; /**< Stride of the image in Z dimension (in bytes) */ 1300 int stride_w; /**< Stride of the image in W dimension (in bytes) */ 1301} Tensor4D; 1302 1303/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data. 1304 * 1305 * @param[in] ptr Pointer to the starting postion of the buffer 1306 * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector 1307 * @param[in] stride_x Stride of the vector in X dimension (in bytes) 1308 * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes) 1309 * 1310 * @return An image object 1311 */ 1312inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x) 1313{ 1314 Vector vector = 1315 { 1316 .ptr = ptr, 1317 .offset_first_element_in_bytes = offset_first_element_in_bytes, 1318 .stride_x = stride_x, 1319 }; 1320 vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x; 1321 return vector; 1322} 1323 1324/** Wrap image information into an Image structure, and make the pointer point at this workitem's data. 1325 * 1326 * @param[in] ptr Pointer to the starting postion of the buffer 1327 * @param[in] offset_first_element_in_bytes The offset of the first element in the source image 1328 * @param[in] stride_x Stride of the image in X dimension (in bytes) 1329 * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes) 1330 * @param[in] stride_y Stride of the image in Y dimension (in bytes) 1331 * @param[in] step_y stride_y * number of elements along Y processed per workitem(in bytes) 1332 * 1333 * @return An image object 1334 */ 1335inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y) 1336{ 1337 Image img = 1338 { 1339 .ptr = ptr, 1340 .offset_first_element_in_bytes = offset_first_element_in_bytes, 1341 .stride_x = stride_x, 1342 .stride_y = stride_y 1343 }; 1344 img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y; 1345 return img; 1346} 1347 1348/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data. 1349 * 1350 * @param[in] ptr Pointer to the starting postion of the buffer 1351 * @param[in] offset_first_element_in_bytes The offset of the first element in the source image 1352 * @param[in] stride_x Stride of the image in X dimension (in bytes) 1353 * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes) 1354 * @param[in] stride_y Stride of the image in Y dimension (in bytes) 1355 * @param[in] step_y stride_y * number of elements along Y processed per workitem(in bytes) 1356 * @param[in] stride_z Stride of the image in Z dimension (in bytes) 1357 * @param[in] step_z stride_z * number of elements along Z processed per workitem(in bytes) 1358 * 1359 * @return A 3D tensor object 1360 */ 1361inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 1362{ 1363 Image img = 1364 { 1365 .ptr = ptr, 1366 .offset_first_element_in_bytes = offset_first_element_in_bytes, 1367 .stride_x = stride_x, 1368 .stride_y = stride_y 1369 }; 1370 img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z; 1371 return img; 1372} 1373 1374/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data. 1375 * 1376 * @param[in] ptr Pointer to the starting postion of the buffer 1377 * @param[in] offset_first_element_in_bytes The offset of the first element in the source image 1378 * @param[in] stride_x Stride of the image in X dimension (in bytes) 1379 * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes) 1380 * @param[in] stride_y Stride of the image in Y dimension (in bytes) 1381 * @param[in] step_y stride_y * number of elements along Y processed per workitem(in bytes) 1382 * @param[in] stride_z Stride of the image in Z dimension (in bytes) 1383 * @param[in] step_z stride_z * number of elements along Z processed per workitem(in bytes) 1384 * 1385 * @return A 3D tensor object 1386 */ 1387inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 1388{ 1389 Tensor3D tensor = 1390 { 1391 .ptr = ptr, 1392 .offset_first_element_in_bytes = offset_first_element_in_bytes, 1393 .stride_x = stride_x, 1394 .stride_y = stride_y, 1395 .stride_z = stride_z 1396 }; 1397 tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z; 1398 return tensor; 1399} 1400 1401/** Wrap 3D tensor information into an tensor structure. 1402 * 1403 * @param[in] ptr Pointer to the starting postion of the buffer 1404 * @param[in] offset_first_element_in_bytes The offset of the first element in the source image 1405 * @param[in] stride_x Stride of the image in X dimension (in bytes) 1406 * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes) 1407 * @param[in] stride_y Stride of the image in Y dimension (in bytes) 1408 * @param[in] step_y stride_y * number of elements along Y processed per workitem(in bytes) 1409 * @param[in] stride_z Stride of the image in Z dimension (in bytes) 1410 * @param[in] step_z stride_z * number of elements along Z processed per workitem(in bytes) 1411 * 1412 * @return A 3D tensor object 1413 */ 1414inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 1415{ 1416 Tensor3D tensor = 1417 { 1418 .ptr = ptr, 1419 .offset_first_element_in_bytes = offset_first_element_in_bytes, 1420 .stride_x = stride_x, 1421 .stride_y = stride_y, 1422 .stride_z = stride_z 1423 }; 1424 return tensor; 1425} 1426 1427inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w, 1428 uint step_w, 1429 uint mod_size) 1430{ 1431 Tensor4D tensor = 1432 { 1433 .ptr = ptr, 1434 .offset_first_element_in_bytes = offset_first_element_in_bytes, 1435 .stride_x = stride_x, 1436 .stride_y = stride_y, 1437 .stride_z = stride_z, 1438 .stride_w = stride_w 1439 }; 1440 1441 tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w; 1442 return tensor; 1443} 1444 1445/** Get the pointer position of a Vector 1446 * 1447 * @param[in] vec Pointer to the starting position of the buffer 1448 * @param[in] x Relative X position 1449 */ 1450inline __global const uchar *vector_offset(const Vector *vec, int x) 1451{ 1452 return vec->ptr + x * vec->stride_x; 1453} 1454 1455/** Get the pointer position of a Image 1456 * 1457 * @param[in] img Pointer to the starting position of the buffer 1458 * @param[in] x Relative X position 1459 * @param[in] y Relative Y position 1460 */ 1461inline __global uchar *offset(const Image *img, int x, int y) 1462{ 1463 return img->ptr + x * img->stride_x + y * img->stride_y; 1464} 1465 1466/** Get the pointer position of a Tensor3D 1467 * 1468 * @param[in] tensor Pointer to the starting position of the buffer 1469 * @param[in] x Relative X position 1470 * @param[in] y Relative Y position 1471 * @param[in] z Relative Z position 1472 */ 1473inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z) 1474{ 1475 return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z; 1476} 1477 1478/** Get the pointer position of a Tensor4D 1479 * 1480 * @param[in] tensor Pointer to the starting position of the buffer 1481 * @param[in] x Relative X position 1482 * @param[in] y Relative Y position 1483 * @param[in] z Relative Z position 1484 * @param[in] w Relative W position 1485 */ 1486inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w) 1487{ 1488 return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w; 1489} 1490 1491/** Get the offset for a given linear index of a Tensor3D 1492 * 1493 * @param[in] tensor Pointer to the starting position of the buffer 1494 * @param[in] width Width of the input tensor 1495 * @param[in] height Height of the input tensor 1496 * @param[in] depth Depth of the input tensor 1497 * @param[in] index Linear index 1498 */ 1499inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index) 1500{ 1501 uint num_elements = width * height; 1502 1503 const uint z = index / num_elements; 1504 1505 index %= num_elements; 1506 1507 const uint y = index / width; 1508 1509 index %= width; 1510 1511 const uint x = index; 1512 1513 return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes; 1514} 1515 1516#endif // _HELPER_H 1517 1518/** Convert the given vector with round to nearest even rounding mode 1519 * 1520 * @param[in] x The target to be converted 1521 * @param[in] type The target type 1522 * 1523 * @return The converted vector 1524 */ 1525#define CONVERT_DOWN_RTE_STR(x, type) (convert_##type##_rte((x))) 1526#define CONVERT_DOWN_RTE(x, type) CONVERT_DOWN_RTE_STR(x, type) 1527 1528/** Quantize a floating-point scalar value to 8-bit asymmetric 1529 * 1530 * @param[in] input Input value to quantize 1531 * @param[in] offset Quantization offset 1532 * @param[in] scale Quantization scale 1533 * 1534 * @return quantized value 1535 */ 1536inline uchar quantize_qasymm8(float input, float offset, float scale) 1537{ 1538 float out_f32 = input / scale + offset; 1539 uchar res_u8 = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, int), uchar); 1540 return res_u8; 1541} 1542 1543/** Dequantize a scalar value from 8-bit asymmetric to floating-point 1544 * 1545 * @param[in] input Input value to quantize 1546 * @param[in] offset Quantization offset 1547 * @param[in] scale Quantization scale 1548 * 1549 * @return quantized value 1550 */ 1551inline float dequantize_qasymm8(uchar input, float offset, float scale) 1552{ 1553 return ((float)input - offset) * scale; 1554} 1555 1556/** Dequantize a scalar value from signed 8-bit asymmetric to floating-point 1557 * 1558 * @param[in] input Input value to quantize 1559 * @param[in] offset Quantization offset 1560 * @param[in] scale Quantization scale 1561 * 1562 * @return quantized value 1563 */ 1564inline float dequantize_qasymm8_signed(char input, float offset, float scale) 1565{ 1566 return ((float)input - offset) * scale; 1567} 1568 1569/** Quantize a vector of values from floating-point 1570 * 1571 * @param[in] type Output data type. 1572 * @param[in] size Size of vector. 1573 * 1574 * @return quantized values 1575 */ 1576#define QUANTIZE_IMPL(type, size) \ 1577 inline VEC_DATA_TYPE(type, size) quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \ 1578 { \ 1579 VEC_DATA_TYPE(float, size) \ 1580 out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset); \ 1581 VEC_DATA_TYPE(type, size) \ 1582 res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), VEC_DATA_TYPE(type, size)); \ 1583 return res; \ 1584 } 1585 1586/** Dequantize a vector of values to floating-point 1587 * 1588 * @param[in] type Input data type. 1589 * @param[in] size Size of vector. 1590 * 1591 * @return dequantized values in floating point 1592 */ 1593#define DEQUANTIZE_IMPL(type, size) \ 1594 inline VEC_DATA_TYPE(float, size) dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \ 1595 { \ 1596 return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale; \ 1597 } 1598 1599/** Correctly-rounded-to-nearest division by a power-of-two. 1600 * 1601 * @param[in] size Size of vector. 1602 * 1603 * @return Correctly-rounded-to-nearest division by a power-of-two. 1604 */ 1605#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \ 1606 inline VEC_DATA_TYPE(int, size) asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \ 1607 { \ 1608 const VEC_DATA_TYPE(int, size) \ 1609 zero = (VEC_DATA_TYPE(int, size))0; \ 1610 const VEC_DATA_TYPE(int, size) \ 1611 one = (VEC_DATA_TYPE(int, size))1; \ 1612 VEC_DATA_TYPE(int, size) \ 1613 mask = (one << exponent) - one; \ 1614 VEC_DATA_TYPE(int, size) \ 1615 threshold = (mask >> 1) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))(x < 0)); \ 1616 return (x >> exponent) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))((x & mask) > threshold)); \ 1617 } 1618 1619/** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1), 1620 * rounding to the nearest value, and saturating -1 * -1 to the maximum value. 1621 * 1622 * @param[in] size Size of vector. 1623 * 1624 * @return Product of two fixed-point numbers. 1625 */ 1626#define ASYMM_MULT_IMPL(size) \ 1627 inline VEC_DATA_TYPE(int, size) asymm_mult##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \ 1628 { \ 1629 VEC_DATA_TYPE(int, size) \ 1630 overflow = a == b && a == INT_MIN; \ 1631 VEC_DATA_TYPE(long, size) \ 1632 a_64 = convert_long##size(a); \ 1633 VEC_DATA_TYPE(long, size) \ 1634 b_64 = convert_long##size(b); \ 1635 VEC_DATA_TYPE(long, size) \ 1636 ab_64 = a_64 * b_64; \ 1637 /* Revert COMPMID-907 */ \ 1638 VEC_DATA_TYPE(long, size) \ 1639 mask1 = 1 << 30; \ 1640 VEC_DATA_TYPE(long, size) \ 1641 mask2 = 1 - (1 << 30); \ 1642 VEC_DATA_TYPE(long, size) \ 1643 is_positive_or_zero = ab_64 >= 0; \ 1644 VEC_DATA_TYPE(long, size) \ 1645 nudge = select(mask2, mask1, (SELECT_VEC_DATA_TYPE(long, size))(is_positive_or_zero)); \ 1646 VEC_DATA_TYPE(long, size) \ 1647 mask = 1ll << 31; \ 1648 VEC_DATA_TYPE(int, size) \ 1649 ab_x2_high32 = convert_int##size((ab_64 + nudge) / mask); \ 1650 return select(ab_x2_high32, INT_MAX, (SELECT_VEC_DATA_TYPE(int, size))(overflow)); \ 1651 } 1652 1653/** Calculates \f$ exp(x) \f$ for x in [-1/4, 0). 1654 * 1655 * @param[in] size Size of vector. 1656 * 1657 * @return Result in fixed-point format Q0. 1658 */ 1659#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size) \ 1660 inline VEC_DATA_TYPE(int, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) a) \ 1661 { \ 1662 const VEC_DATA_TYPE(int, size) constant_term = 1895147668; \ 1663 const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883; \ 1664 const int k_fractional_bits = 31; \ 1665 VEC_DATA_TYPE(int, size) \ 1666 x = a + (1 << (k_fractional_bits - 3)); \ 1667 VEC_DATA_TYPE(int, size) \ 1668 x2 = ASYMM_MULT(x, x, size); \ 1669 VEC_DATA_TYPE(int, size) \ 1670 x3 = ASYMM_MULT(x2, x, size); \ 1671 VEC_DATA_TYPE(int, size) \ 1672 x4 = ASYMM_MULT(x2, x2, size); \ 1673 VEC_DATA_TYPE(int, size) \ 1674 x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size); \ 1675 VEC_DATA_TYPE(int, size) \ 1676 x4_over_24_plus_x3_over_6_plus_x2 = ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2; \ 1677 VEC_DATA_TYPE(int, size) \ 1678 x4_over_24_plus_x3_over_6_plus_x2_over_2 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size); \ 1679 return constant_term + ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size); \ 1680 } 1681 1682/** Each bit of the result is set to the corresponding bit of either then_val or 1683 * else_val depending on whether the corresponding bit of if_mask is set. 1684 * Equivalent to the VBSL instruction in ARM NEON. 1685 * 1686 * @param[in] size Size of vector. 1687 * 1688 * @returns Result contaning bits from @p then_val or from @p else_val depending on corresponding bit in @p if_mask is set or not. 1689 */ 1690#define ASYMM_SELECT_USING_MASK_IMPL(size) \ 1691 inline VEC_DATA_TYPE(int, size) asymm_select_using_mask##size(VEC_DATA_TYPE(int, size) if_mask, VEC_DATA_TYPE(int, size) then_val, VEC_DATA_TYPE(int, size) else_val) \ 1692 { \ 1693 return (if_mask & then_val) ^ (~if_mask & else_val); \ 1694 } 1695 1696/** For each element of input vector, the corresponding bits of the result item are set 1697 * if the input item is zero. 1698 * 1699 * @param[in] size Size of vector. 1700 * 1701 * @returns Output vector with bits set when corresponding bit in @p a is zero. 1702 */ 1703#define ASYMM_MASK_IF_ZERO_IMPL(size) \ 1704 inline VEC_DATA_TYPE(int, size) asymm_mask_if_zero##size(VEC_DATA_TYPE(int, size) a) \ 1705 { \ 1706 const VEC_DATA_TYPE(int, size) all_zeros = 0; \ 1707 const VEC_DATA_TYPE(int, size) all_ones = ~0; \ 1708 return select(all_zeros, all_ones, (SELECT_VEC_DATA_TYPE(int, size))(a == 0)); \ 1709 } 1710 1711/** For each element of input vector, the corresponding bits of the result item are set 1712 * if the input item is non-zero. 1713 * 1714 * @param[in] size Size of vector. 1715 * 1716 * @returns Output vector with bits set when corresponding bit in @p a is non zero. 1717 */ 1718#define ASYMM_MASK_IF_NON_ZERO_IMPL(size) \ 1719 inline VEC_DATA_TYPE(int, size) asymm_mask_if_non_zero##size(VEC_DATA_TYPE(int, size) a) \ 1720 { \ 1721 const VEC_DATA_TYPE(int, size) all_zeros = 0; \ 1722 const VEC_DATA_TYPE(int, size) all_ones = ~0; \ 1723 return select(all_zeros, all_ones, (SELECT_VEC_DATA_TYPE(int, size))(a != 0)); \ 1724 } 1725 1726#define EXP_BARREL_SHIFTER_IMPL(size) \ 1727 inline VEC_DATA_TYPE(int, size) exp_barrel_shifter##size(VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \ 1728 { \ 1729 if(k_integer_bits > exponent) \ 1730 { \ 1731 const int k_shift_amount = k_integer_bits > exponent ? k_fractional_bits + exponent : 0; \ 1732 return ASYMM_SELECT_USING_MASK( \ 1733 ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size), \ 1734 ASYMM_MULT(result, fp_multiplier, size), result, size); \ 1735 } \ 1736 \ 1737 return result; \ 1738 } 1739 1740/** Calculates \f$ exp(x) \f$ for x < 0. 1741 * 1742 * @param[in] size Size of vector. 1743 * 1744 * @return Result in fixed-point format Q0. 1745 */ 1746#define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size) \ 1747 inline VEC_DATA_TYPE(int, size) asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits) \ 1748 { \ 1749 const int k_fractional_bits = 31 - k_integer_bits; \ 1750 VEC_DATA_TYPE(int, size) \ 1751 k_one_quarter = 1 << (k_fractional_bits - 2); \ 1752 VEC_DATA_TYPE(int, size) \ 1753 mask = k_one_quarter - 1; \ 1754 VEC_DATA_TYPE(int, size) \ 1755 a_mod_quarter_minus_one_quarter = (a & mask) - k_one_quarter; \ 1756 VEC_DATA_TYPE(int, size) \ 1757 a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits; \ 1758 VEC_DATA_TYPE(int, size) \ 1759 result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a_mod_quarter_minus_one_quarter_scaled, size); \ 1760 VEC_DATA_TYPE(int, size) \ 1761 remainder = a_mod_quarter_minus_one_quarter - a; \ 1762 \ 1763 result = EXP_BARREL_SHIFTER(result, -2, 1672461947, k_integer_bits, k_fractional_bits, remainder, size); \ 1764 result = EXP_BARREL_SHIFTER(result, -1, 1302514674, k_integer_bits, k_fractional_bits, remainder, size); \ 1765 result = EXP_BARREL_SHIFTER(result, +0, 790015084, k_integer_bits, k_fractional_bits, remainder, size); \ 1766 result = EXP_BARREL_SHIFTER(result, +1, 290630308, k_integer_bits, k_fractional_bits, remainder, size); \ 1767 result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, remainder, size); \ 1768 result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, size); \ 1769 result = EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size); \ 1770 \ 1771 if(k_integer_bits > 5) \ 1772 { \ 1773 const VEC_DATA_TYPE(int, size) clamp = -(1 << (k_fractional_bits + 5)); \ 1774 result = ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(a < clamp, size), 0, result, size); \ 1775 } \ 1776 \ 1777 const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \ 1778 return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_ZERO(a, size), Q0_one, result, size); \ 1779 } 1780 1781/** Calculates the product of a integer value by a power of two, with either a positive exponent 1782 * (equivalent to an arithmetic left shift, saturating) or a negative exponent 1783 * (equivalent to an arithmetic right shift, rounding to nearest). 1784 * 1785 * @param[in] size Size of vector. 1786 * 1787 * @return Arithmetic left or right shift. 1788 */ 1789#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size) \ 1790 inline VEC_DATA_TYPE(int, size) asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \ 1791 { \ 1792 if(exponent < 0) \ 1793 { \ 1794 return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size); \ 1795 } \ 1796 \ 1797 const VEC_DATA_TYPE(int, size) min = INT_MIN; \ 1798 const VEC_DATA_TYPE(int, size) max = INT_MAX; \ 1799 int threshold = ((1 << (31 - exponent)) - 1); \ 1800 VEC_DATA_TYPE(int, size) \ 1801 positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size); \ 1802 VEC_DATA_TYPE(int, size) \ 1803 negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size); \ 1804 VEC_DATA_TYPE(int, size) \ 1805 result = x << exponent; \ 1806 result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size); \ 1807 result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size); \ 1808 return result; \ 1809 } 1810 1811/** Calculates (a+b)/2, rounded to the nearest integer. 1812 * Equivalent to VRHADD in the ARM NEON instruction set. 1813 * 1814 * @param[in] size Size of vector. 1815 * 1816 * @return (a+b)/2, rounded to the nearest integer. 1817 */ 1818#define ASYMM_ROUNDING_HALF_SUM_IMPL(size) \ 1819 inline VEC_DATA_TYPE(int, size) asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \ 1820 { \ 1821 VEC_DATA_TYPE(long, size) \ 1822 a64 = convert_long##size(a); \ 1823 VEC_DATA_TYPE(long, size) \ 1824 b64 = convert_long##size(b); \ 1825 VEC_DATA_TYPE(long, size) \ 1826 sum = a64 + b64; \ 1827 const VEC_DATA_TYPE(long, size) one = 1; \ 1828 const VEC_DATA_TYPE(long, size) minus_one = -1; \ 1829 VEC_DATA_TYPE(long, size) \ 1830 sign = select(minus_one, one, (SELECT_VEC_DATA_TYPE(long, size))(sum >= 0)); \ 1831 return convert_int##size((sum + sign) / 2); \ 1832 } 1833 1834/** Calculates \f$ 1 / (1 + x) \f$ for x in (0, 1). 1835 * 1836 * @param[in] size Size of vector. 1837 * 1838 * @return Result in fixed-point format Q0. 1839 */ 1840#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(size) \ 1841 inline VEC_DATA_TYPE(int, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(VEC_DATA_TYPE(int, size) a) \ 1842 { \ 1843 const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \ 1844 const VEC_DATA_TYPE(int, size) Q2_one = 1 << (31 - 2); \ 1845 VEC_DATA_TYPE(int, size) \ 1846 half_denominator = ASYMM_ROUNDING_HALF_SUM(a, Q0_one, size); \ 1847 const VEC_DATA_TYPE(int, size) Q2_48_over_17 = 1515870810; \ 1848 const VEC_DATA_TYPE(int, size) Q2_neg_32_over_17 = -1010580540; \ 1849 VEC_DATA_TYPE(int, size) \ 1850 x = Q2_48_over_17 + ASYMM_MULT(half_denominator, Q2_neg_32_over_17, size); \ 1851 for(int i = 0; i < 3; i++) \ 1852 { \ 1853 VEC_DATA_TYPE(int, size) \ 1854 half_denominator_times_x = ASYMM_MULT(half_denominator, x, size); \ 1855 VEC_DATA_TYPE(int, size) \ 1856 one_minus_half_denominator_times_x = Q2_one - half_denominator_times_x; \ 1857 VEC_DATA_TYPE(int, size) \ 1858 tmp = ASYMM_MULT(x, one_minus_half_denominator_times_x, size); \ 1859 x = x + ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(tmp, 2, size); \ 1860 } \ 1861 return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, 1, size); \ 1862 } 1863 1864/** Considering the integer value as fixed-point, change the number of integer bits and update value accordingly. 1865 * 1866 * @param[in] size Size of vector. 1867 * 1868 * @return Rescaled value. 1869 */ 1870#define ASYMM_RESCALE_IMPL(size) \ 1871 inline VEC_DATA_TYPE(int, size) asymm_rescale##size(VEC_DATA_TYPE(int, size) value, int src_integer_bits, int dst_integer_bits) \ 1872 { \ 1873 int exponent = src_integer_bits - dst_integer_bits; \ 1874 return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size); \ 1875 } 1876 1877#define QUANTIZE_STR(input, offset, scale, type, size) quantize_##type##size(input, offset, scale) 1878#define QUANTIZE(input, offset, scale, type, size) QUANTIZE_STR(input, offset, scale, type, size) 1879#define DEQUANTIZE_STR(input, offset, scale, type, size) dequantize_##type##size(input, offset, scale) 1880#define DEQUANTIZE(input, offset, scale, type, size) DEQUANTIZE_STR(input, offset, scale, type, size) 1881 1882#define ASYMM_ROUNDING_DIVIDE_BY_POW2_STR(x, exponent, size) asymm_rounding_divide_by_POW2_##size(x, exponent) 1883#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) ASYMM_ROUNDING_DIVIDE_BY_POW2_STR(x, exponent, size) 1884#define ASYMM_MULT_STR(a, b, size) asymm_mult##size(a, b) 1885#define ASYMM_MULT(a, b, size) ASYMM_MULT_STR(a, b, size) 1886#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size) \ 1887 ASYMM_MULT(x *((VEC_DATA_TYPE(int, size))(1) << (-left_shift)), quantized_multiplier, size) 1888#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \ 1889 ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size) 1890#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a) 1891#define ASYMM_SELECT_USING_MASK(if_mask, then_val, else_val, size) asymm_select_using_mask##size(if_mask, then_val, else_val) 1892#define ASYMM_MASK_IF_ZERO(a, size) asymm_mask_if_zero##size(a) 1893#define ASYMM_MASK_IF_NON_ZERO(a, size) asymm_mask_if_non_zero##size(a) 1894#define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder, size) exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder) 1895#define ASYMM_EXP_ON_NEGATIVE_VALUES_STR(a, k_integer_bits, size) asymm_exp_on_negative_values##size(a, k_integer_bits) 1896#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) ASYMM_EXP_ON_NEGATIVE_VALUES_STR(a, k_integer_bits, size) 1897#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(a) 1898#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size) 1899#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) asymm_saturating_rounding_mult_by_pow2##size(x, exponent) 1900#define ASYMM_ROUNDING_HALF_SUM(a, b, size) asymm_rounding_half_sum##size(a, b) 1901#define ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size) asymm_rescale##size(value, src_integer_bits, dst_integer_bits) 1902#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size) 1903 1904#define MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size) \ 1905 inline VEC_DATA_TYPE(int, size) multiply_by_quantized_multiplier##size(VEC_DATA_TYPE(int, size) input, int qmul, int shift) \ 1906 { \ 1907 const int left_shift = shift > 0 ? shift : 0; \ 1908 const int right_shift = shift > 0 ? 0 : -shift; \ 1909 return ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(input * (1 << left_shift), qmul, size), right_shift, size); \ 1910 } 1911#define MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, size) multiply_by_quantized_multiplier##size(input, qmul, shift) 1912 1913QUANTIZE_IMPL(uchar, 1) 1914QUANTIZE_IMPL(char, 1) 1915QUANTIZE_IMPL(uint, 1) 1916QUANTIZE_IMPL(int, 1) 1917QUANTIZE_IMPL(uchar, 4) 1918QUANTIZE_IMPL(ushort, 4) 1919QUANTIZE_IMPL(short, 4) 1920QUANTIZE_IMPL(uchar, 16) 1921QUANTIZE_IMPL(char, 16) 1922QUANTIZE_IMPL(ushort, 16) 1923QUANTIZE_IMPL(short, 16) 1924QUANTIZE_IMPL(uint, 16) 1925QUANTIZE_IMPL(int, 16) 1926 1927DEQUANTIZE_IMPL(uchar, 1) 1928DEQUANTIZE_IMPL(char, 1) 1929DEQUANTIZE_IMPL(uint, 1) 1930DEQUANTIZE_IMPL(int, 1) 1931DEQUANTIZE_IMPL(uchar, 4) 1932DEQUANTIZE_IMPL(ushort, 4) 1933DEQUANTIZE_IMPL(short, 4) 1934DEQUANTIZE_IMPL(uchar, 16) 1935DEQUANTIZE_IMPL(char, 16) 1936DEQUANTIZE_IMPL(ushort, 16) 1937DEQUANTIZE_IMPL(short, 16) 1938DEQUANTIZE_IMPL(uint, 16) 1939DEQUANTIZE_IMPL(int, 16) 1940 1941ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(1) 1942ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(2) 1943ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(3) 1944ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(4) 1945ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(8) 1946ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(16) 1947 1948ASYMM_MULT_IMPL(1) 1949ASYMM_MULT_IMPL(2) 1950ASYMM_MULT_IMPL(3) 1951ASYMM_MULT_IMPL(4) 1952ASYMM_MULT_IMPL(8) 1953ASYMM_MULT_IMPL(16) 1954 1955ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(1) 1956ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(2) 1957ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(3) 1958ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(4) 1959ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(8) 1960ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(16) 1961 1962ASYMM_SELECT_USING_MASK_IMPL(1) 1963ASYMM_SELECT_USING_MASK_IMPL(2) 1964ASYMM_SELECT_USING_MASK_IMPL(3) 1965ASYMM_SELECT_USING_MASK_IMPL(4) 1966ASYMM_SELECT_USING_MASK_IMPL(8) 1967ASYMM_SELECT_USING_MASK_IMPL(16) 1968 1969ASYMM_MASK_IF_ZERO_IMPL(1) 1970ASYMM_MASK_IF_ZERO_IMPL(2) 1971ASYMM_MASK_IF_ZERO_IMPL(3) 1972ASYMM_MASK_IF_ZERO_IMPL(4) 1973ASYMM_MASK_IF_ZERO_IMPL(8) 1974ASYMM_MASK_IF_ZERO_IMPL(16) 1975 1976ASYMM_MASK_IF_NON_ZERO_IMPL(1) 1977ASYMM_MASK_IF_NON_ZERO_IMPL(2) 1978ASYMM_MASK_IF_NON_ZERO_IMPL(3) 1979ASYMM_MASK_IF_NON_ZERO_IMPL(4) 1980ASYMM_MASK_IF_NON_ZERO_IMPL(8) 1981ASYMM_MASK_IF_NON_ZERO_IMPL(16) 1982 1983EXP_BARREL_SHIFTER_IMPL(1) 1984EXP_BARREL_SHIFTER_IMPL(2) 1985EXP_BARREL_SHIFTER_IMPL(3) 1986EXP_BARREL_SHIFTER_IMPL(4) 1987EXP_BARREL_SHIFTER_IMPL(8) 1988EXP_BARREL_SHIFTER_IMPL(16) 1989 1990ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(1) 1991ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(2) 1992ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(3) 1993ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(4) 1994ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(8) 1995ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(16) 1996 1997ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(1) 1998ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(2) 1999ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(3) 2000ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(4) 2001ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(8) 2002ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(16) 2003 2004ASYMM_ROUNDING_HALF_SUM_IMPL(1) 2005ASYMM_ROUNDING_HALF_SUM_IMPL(2) 2006ASYMM_ROUNDING_HALF_SUM_IMPL(3) 2007ASYMM_ROUNDING_HALF_SUM_IMPL(4) 2008ASYMM_ROUNDING_HALF_SUM_IMPL(8) 2009ASYMM_ROUNDING_HALF_SUM_IMPL(16) 2010 2011ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(1) 2012ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(2) 2013ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(3) 2014ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(4) 2015ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(8) 2016ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(16) 2017 2018ASYMM_RESCALE_IMPL(1) 2019ASYMM_RESCALE_IMPL(2) 2020ASYMM_RESCALE_IMPL(3) 2021ASYMM_RESCALE_IMPL(4) 2022ASYMM_RESCALE_IMPL(8) 2023ASYMM_RESCALE_IMPL(16) 2024 2025MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(1) 2026MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(2) 2027MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(3) 2028MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(4) 2029MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(8) 2030MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(16) 2031 2032#endif // ARM_COMPUTE_HELPERS_ASYMM_H 2033/* 2034 * Copyright (c) 2018-2020 Arm Limited. 2035 * 2036 * SPDX-License-Identifier: MIT 2037 * 2038 * Permission is hereby granted, free of charge, to any person obtaining a copy 2039 * of this software and associated documentation files (the "Software"), to 2040 * deal in the Software without restriction, including without limitation the 2041 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 2042 * sell copies of the Software, and to permit persons to whom the Software is 2043 * furnished to do so, subject to the following conditions: 2044 * 2045 * The above copyright notice and this permission notice shall be included in all 2046 * copies or substantial portions of the Software. 2047 * 2048 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 2049 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 2050 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 2051 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 2052 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 2053 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 2054 * SOFTWARE. 2055 */ 2056/* 2057 * Copyright (c) 2017-2020 Arm Limited. 2058 * 2059 * SPDX-License-Identifier: MIT 2060 * 2061 * Permission is hereby granted, free of charge, to any person obtaining a copy 2062 * of this software and associated documentation files (the "Software"), to 2063 * deal in the Software without restriction, including without limitation the 2064 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 2065 * sell copies of the Software, and to permit persons to whom the Software is 2066 * furnished to do so, subject to the following conditions: 2067 * 2068 * The above copyright notice and this permission notice shall be included in all 2069 * copies or substantial portions of the Software. 2070 * 2071 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 2072 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 2073 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 2074 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 2075 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 2076 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 2077 * SOFTWARE. 2078 */ 2079#ifndef ARM_COMPUTE_HELPERS_ASYMM_H 2080#define ARM_COMPUTE_HELPERS_ASYMM_H 2081 2082/* 2083 * Copyright (c) 2016-2020 Arm Limited. 2084 * 2085 * SPDX-License-Identifier: MIT 2086 * 2087 * Permission is hereby granted, free of charge, to any person obtaining a copy 2088 * of this software and associated documentation files (the "Software"), to 2089 * deal in the Software without restriction, including without limitation the 2090 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 2091 * sell copies of the Software, and to permit persons to whom the Software is 2092 * furnished to do so, subject to the following conditions: 2093 * 2094 * The above copyright notice and this permission notice shall be included in all 2095 * copies or substantial portions of the Software. 2096 * 2097 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 2098 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 2099 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 2100 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 2101 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 2102 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 2103 * SOFTWARE. 2104 */ 2105#ifndef ARM_COMPUTE_HELPER_H 2106#define ARM_COMPUTE_HELPER_H 2107 2108/* 2109 * Copyright (c) 2020 Arm Limited. 2110 * 2111 * SPDX-License-Identifier: MIT 2112 * 2113 * Permission is hereby granted, free of charge, to any person obtaining a copy 2114 * of this software and associated documentation files (the "Software"), to 2115 * deal in the Software without restriction, including without limitation the 2116 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 2117 * sell copies of the Software, and to permit persons to whom the Software is 2118 * furnished to do so, subject to the following conditions: 2119 * 2120 * The above copyright notice and this permission notice shall be included in all 2121 * copies or substantial portions of the Software. 2122 * 2123 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 2124 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 2125 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 2126 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 2127 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 2128 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 2129 * SOFTWARE. 2130 */ 2131 2132/** Store the 0 to (n-1)th rows of the given variables 2133 * @name STORE_ROW_n 2134 * 2135 * @param[in] N0 The width of the passed in vector. Supported: 1, 2, 3, 4, 8, 16 2136 * @param[in] DATA_TYPE The data type of the vectors 2137 * @param[in] BASENAME The basename of the variables 2138 * @param[in] PTR The base pointer 2139 * @param[in] STRIDE_Y The stride value in y-axis direction 2140 * @param[in] Z The offset in z-axis direction 2141 * @{ 2142 */ 2143#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2144 VSTORE(N0) \ 2145 (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 2146 2147#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2148 STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2149 VSTORE(N0) \ 2150 (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 2151 2152#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2153 STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2154 VSTORE(N0) \ 2155 (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 2156 2157#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2158 STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2159 VSTORE(N0) \ 2160 (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 2161 2162#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2163 STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2164 VSTORE(N0) \ 2165 (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 2166 2167#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2168 STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2169 VSTORE(N0) \ 2170 (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 2171 2172#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2173 STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2174 VSTORE(N0) \ 2175 (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 2176 2177#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2178 STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2179 VSTORE(N0) \ 2180 (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 2181 2182#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2183 STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2184 VSTORE(N0) \ 2185 (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 2186 2187#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2188 STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2189 VSTORE(N0) \ 2190 (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 2191 2192#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2193 STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2194 VSTORE(N0) \ 2195 (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 2196 2197#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2198 STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2199 VSTORE(N0) \ 2200 (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 2201 2202#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2203 STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2204 VSTORE(N0) \ 2205 (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 2206 2207#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2208 STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2209 VSTORE(N0) \ 2210 (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 2211 2212#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2213 STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2214 VSTORE(N0) \ 2215 (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 2216 2217#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2218 STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2219 VSTORE(N0) \ 2220 (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 2221/** @} */ // end of groupd STORE_ROW_n 2222 2223/** Convert and store the 0th to (n-1)th rows of the given variables 2224 * @name CONVERT_STORE_ROW_n 2225 * 2226 * @param[in] N0 The size of the vectors 2227 * @param[in] DATA_TYPE The data type of the vectors 2228 * @param[in] BASENAME The basename of the variables 2229 * @param[in] PTR The base pointer 2230 * @param[in] STRIDE_Y The stride value in y-axis direction 2231 * @param[in] Z The offset in z-axis direction 2232 * @{ 2233 */ 2234#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2235 VSTORE(N0) \ 2236 (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 2237 2238#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2239 CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2240 VSTORE(N0) \ 2241 (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 2242 2243#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2244 CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2245 VSTORE(N0) \ 2246 (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 2247 2248#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2249 CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2250 VSTORE(N0) \ 2251 (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 2252 2253#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2254 CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2255 VSTORE(N0) \ 2256 (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 2257 2258#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2259 CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2260 VSTORE(N0) \ 2261 (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 2262 2263#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2264 CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2265 VSTORE(N0) \ 2266 (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 2267 2268#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2269 CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2270 VSTORE(N0) \ 2271 (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 2272 2273#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2274 CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2275 VSTORE(N0) \ 2276 (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 2277 2278#define CONVERT_STORE_ROW_10(N0, DATA, BASENAME, PTR, STRIDE_Y, Z) \ 2279 CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2280 VSTORE(N0) \ 2281 (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 2282 2283#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2284 CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2285 VSTORE(N0) \ 2286 (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 2287 2288#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2289 CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2290 VSTORE(N0) \ 2291 (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 2292 2293#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2294 CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2295 VSTORE(N0) \ 2296 (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 2297 2298#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2299 CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2300 VSTORE(N0) \ 2301 (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 2302 2303#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2304 CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2305 VSTORE(N0) \ 2306 (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 2307 2308#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2309 CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2310 VSTORE(N0) \ 2311 (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 2312 2313/** @} */ // end of groupd CONVERT_STORE_ROW_n 2314 2315/** Store a block of the given size M0xN0 2316 * @name STORE_BLOCK 2317 * 2318 * Supported cases are M0=1,2,3,...,16 and N0=2,3,4,8,16. 2319 * The data to store is expected to have consecutive names for each row. 2320 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2. 2321 * The Z offset is expected to have consecutive names. 2322 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2. 2323 * 2324 * @param[in] M0 The number of rows to store 2325 * @param[in] N0 The size of each vector 2326 * @param[in] DATA_TYPE The data type of the vectors 2327 * @param[in] BASENAME The basename of the variables 2328 * @param[in] PTR The base pointer 2329 * @param[in] STRIDE_Y The stride value in y-axis direction 2330 * @param[in] Z The offset in z-axis direction 2331 * @{ 2332 */ 2333#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 2334#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 2335/** @} */ // end of group STORE_BLOCK 2336 2337/** Convert and store a block of the given size M0xN0 2338 * @name CONVERT_STORE_BLOCK 2339 * 2340 * Supported cases are M0=1,2,3,...,16 and N0=2,3,4,8,16. 2341 * The data to store is expected to have consecutive names for each row. 2342 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2. 2343 * The Z offset is expected to have consecutive names. 2344 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2. 2345 * 2346 * @param[in] M0 The number of rows to store 2347 * @param[in] N0 The size of each vector 2348 * @param[in] DATA_TYPE The data type of the vectors 2349 * @param[in] BASENAME The basename of the variables 2350 * @param[in] PTR The base pointer 2351 * @param[in] STRIDE_Y The stride value in y-axis direction 2352 * @param[in] Z The offset in z-axis direction 2353 * @{ 2354 */ 2355#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 2356#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 2357/** @} */ // end of group CONVERT_STORE_BLOCK 2358 2359/** Partially store the 0 to (n-1)th rows of the given variables 2360 * @name STORE_ROW_PARTIAL_n 2361 * Within each row, store the lower @p STORE_N0 elements of vectors of width @p N0 2362 * 2363 * @note in case @p STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty. 2364 * 2365 * @param[in] N0 The width of the passed in vector. Supported: 1, 2, 3, 4, 8, 16 2366 * @param[in] STORE_N0 The **lower** size of the vectors to store. Supported: [1-16 and <= @p N0 2367 * @param[in] DATA_TYPE The data type of the vectors 2368 * @param[in] BASENAME The basename of the variables 2369 * @param[in] PTR The base pointer 2370 * @param[in] STRIDE_Y The stride value in y-axis direction 2371 * @param[in] Z The offset in z-axis direction 2372 * @{ 2373 */ 2374#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2375 VSTORE_PARTIAL(N0, STORE_N0) \ 2376 (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 2377 2378#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2379 STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2380 VSTORE_PARTIAL(N0, STORE_N0) \ 2381 (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 2382 2383#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2384 STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2385 VSTORE_PARTIAL(N0, STORE_N0) \ 2386 (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 2387 2388#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2389 STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2390 VSTORE_PARTIAL(N0, STORE_N0) \ 2391 (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 2392 2393#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2394 STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2395 VSTORE_PARTIAL(N0, STORE_N0) \ 2396 (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 2397 2398#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2399 STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2400 VSTORE_PARTIAL(N0, STORE_N0) \ 2401 (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 2402 2403#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2404 STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2405 VSTORE_PARTIAL(N0, STORE_N0) \ 2406 (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 2407 2408#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2409 STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2410 VSTORE_PARTIAL(N0, STORE_N0) \ 2411 (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 2412 2413#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2414 STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2415 VSTORE_PARTIAL(N0, STORE_N0) \ 2416 (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 2417 2418#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2419 STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2420 VSTORE_PARTIAL(N0, STORE_N0) \ 2421 (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 2422 2423#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2424 STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2425 VSTORE_PARTIAL(N0, STORE_N0) \ 2426 (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 2427 2428#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2429 STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2430 VSTORE_PARTIAL(N0, STORE_N0) \ 2431 (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 2432 2433#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2434 STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2435 VSTORE_PARTIAL(N0, STORE_N0) \ 2436 (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 2437 2438#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2439 STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2440 VSTORE_PARTIAL(N0, STORE_N0) \ 2441 (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 2442 2443#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2444 STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2445 VSTORE_PARTIAL(N0, STORE_N0) \ 2446 (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 2447 2448#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2449 STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 2450 VSTORE_PARTIAL(N0, STORE_N0) \ 2451 (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 2452/** @} */ // end of groupd STORE_ROW_PARTIAL_n 2453 2454/** Partially store a block of the given size STORE_M0xSTORE_N0 2455 * @name STORE_BLOCK_PARTIAL 2456 * 2457 * @note The vector width @p N0 is also required for correct partial storing behaviour. 2458 * @note in case @p STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty. 2459 * 2460 * The data to store is expected to have consecutive names for each row. 2461 * E.g., for STORE_M0=3 and basename=c, the expected names are c0, c1 and c2. 2462 * The Z offset is expected to have consecutive names. 2463 * E.g., for STORE_M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2. 2464 * 2465 * @param[in] STORE_M0 The number of rows to store. Supported: 1-16 2466 * @param[in] STORE_N0 The lower number of elements of vectors to store. Supported: 1-16 and <= @p N0 2467 * @param[in] N0 The size of each vector. Supported: 1, 2, 3, 4, 8, 16 2468 * @param[in] DATA_TYPE The data type of the vectors 2469 * @param[in] BASENAME The basename of the variables 2470 * @param[in] PTR The base pointer 2471 * @param[in] STRIDE_Y The stride value in y-axis direction 2472 * @param[in] Z The offset in z-axis direction 2473 * @{ 2474 */ 2475#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 2476#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 2477/** Store a block that can be partial in both x and y dimensions 2478 * 2479 * @note in cases @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty. 2480 * 2481 * The data to store is expected to have consecutive names for each row. 2482 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2. 2483 * The Z offset is expected to have consecutive names. 2484 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2. 2485 * 2486 * @param[in] M0 The number of rows to store, for non-partial blocks. Supported: 1-16 2487 * @param[in] N0 The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16 2488 * @param[in] DATA_TYPE The data type of the vectors 2489 * @param[in] BASENAME The basename of the variables 2490 * @param[in] PTR The base pointer 2491 * @param[in] STRIDE_Y The stride value in y-axis direction 2492 * @param[in] Z The offset in z-axis direction 2493 * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported range: [1, @p M0) 2494 * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported range: [1, @p N0) 2495 * @param[in] PARTIAL_COND_Y Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0. 2496 * @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0. 2497 */ 2498#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 2499 if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \ 2500 { \ 2501 STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 2502 } \ 2503 else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \ 2504 { \ 2505 STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 2506 } \ 2507 else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \ 2508 { \ 2509 STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 2510 } \ 2511 else \ 2512 { \ 2513 STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 2514 } 2515/** Store a block that can only be partial in x but not y. 2516 * 2517 * @note in case @p N0 or @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty. 2518 * 2519 * The data to store is expected to have consecutive names for each row. 2520 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2. 2521 * The Z offset is expected to have consecutive names. 2522 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2. 2523 * 2524 * @param[in] M0 The number of rows to store, for non-partial blocks. Supported: 1-16 2525 * @param[in] N0 The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16 2526 * @param[in] DATA_TYPE The data type of the vectors 2527 * @param[in] BASENAME The basename of the variables 2528 * @param[in] PTR The base pointer 2529 * @param[in] STRIDE_Y The stride value in y-axis direction 2530 * @param[in] Z The offset in z-axis direction 2531 * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported range: [1, @p N0) 2532 * @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0. 2533 */ 2534#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \ 2535 if(!(PARTIAL_COND_X)) \ 2536 { \ 2537 STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 2538 } \ 2539 else \ 2540 { \ 2541 STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 2542 } 2543/** Store a block that can only be partial in y but not x. 2544 * 2545 * @note in case @p N0 or @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty. 2546 * 2547 * The data to store is expected to have consecutive names for each row. 2548 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2. 2549 * The Z offset is expected to have consecutive names. 2550 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2. 2551 * 2552 * @param[in] M0 The number of rows to store, for non-partial blocks. Supported: 1-16 2553 * @param[in] N0 The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16 2554 * @param[in] DATA_TYPE The data type of the vectors 2555 * @param[in] BASENAME The basename of the variables 2556 * @param[in] PTR The base pointer 2557 * @param[in] STRIDE_Y The stride value in y-axis direction 2558 * @param[in] Z The offset in z-axis direction 2559 * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported range: [1, @p M0) 2560 * @param[in] PARTIAL_COND_Y Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0. 2561 */ 2562#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \ 2563 if(!(PARTIAL_COND_Y)) \ 2564 { \ 2565 STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 2566 } \ 2567 else \ 2568 { \ 2569 STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 2570 } 2571/** @} */ // end of group STORE_BLOCK_PARTIAL 2572 2573#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) 2574 2575/** Boundary-aware GEMM block store 2576 * @name STORE_BLOCK_BOUNDARY_AWARE 2577 * This macro assumes the following schemes to achieve boundary-awareness: 2578 * - Overlapping load in Y axis from lhs tensor. This implies lhs has no padding along y dim. 2579 * - Non-Overlapping(normal) load from rhs tensor. This imples rhs can have paddings. 2580 * - Overlapping load in Y axis from bias tensor. This implies rhs has no padding along y dim. 2581 * The macro then ensures that the dst tensor can be stored without any paddings in both x and y dim. 2582 * 2583 * In the y dimension, we place the partial blocks **at the beginning** while in the x dimension, we place the partial 2584 * blocks **at the end**. 2585 * Say, the dst tensor is of shape MxN and we have M0 and N0 as the block size, this is how we define "partial blocks"/ 2586 * "boundary block" (we use the 2 terms "partial blocks" and "boundary blocks" interchangeably) and its various parameters: 2587 * 2588 * *--x--> x == 0 x == 1 2589 * | |<------------------------------N-------------------------->| 2590 * y |<--------------N0------------->|<----PARTIAL_STORE_N0----->| 2591 * | -------------############################################################# 2592 * * | | |...............................|...........................| 2593 * y == 0 | PAR_..._M0 |......Boundary block in y......|.Boundary block in x and y.| 2594 * | | |...............................|...........................| 2595 * M --############################################################# 2596 * | | | |...........................| 2597 * y == 1 | M0 | Non-boundary block |....Boundary block in x....| 2598 * | | | |...........................| 2599 * |------------############################################################# 2600 * 2601 * Then @p PARTIAL_STORE_M0 = M % M0 and @p PARTIAL_STORE_N0 = N % N0 2602 * 2603 * @note in cases @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty. 2604 * 2605 * It automatically detects if a giving M,N,M0,N0 combination can yield partial blocks in either X and Y dimension, 2606 * and select corresponding store methods such that the boundary detection logic is only added when needed. 2607 * 2608 * The data to store is expected to have consecutive names for each row. 2609 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2. 2610 * The Z offset is expected to have consecutive names. 2611 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2. 2612 * 2613 * @param[in] M0 The number of rows to store, for non-partial blocks. Supported: 1-16 2614 * @param[in] N0 The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16 2615 * @param[in] DATA_TYPE The data type of the vectors 2616 * @param[in] BASENAME The basename of the variables 2617 * @param[in] PTR The base pointer 2618 * @param[in] STRIDE_Y The stride value in y-axis direction 2619 * @param[in] Z The offset in z-axis direction 2620 * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported: [0, @p M0) 2621 * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported: [0, @p N0) 2622 * @param[in] PARTIAL_COND_Y Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0. 2623 * @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0. 2624 * @{ 2625 */ 2626#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 2627// Case1: No partial blocks in either x or y 2628#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 2629 STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 2630 2631#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0 2632// Case2: Partial blocks in y 2633#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 2634 STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) 2635 2636#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0 2637// Case3: Partial blocks in x 2638#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 2639 STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) 2640 2641#else // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 2642// Case4: Partial blocks in both x and y 2643#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 2644 STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) 2645 2646#endif // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 2647 2648#endif // defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) 2649/** @} */ // end of group STORE_BLOCK_BOUNDARY_AWARE 2650 2651#if defined(PARTIAL_STORE_M0) 2652/** Compute the start m0 row (LHS, BIAS and DST) in a boundary-aware way so as to avoid padding 2653 * @name COMPUTE_M0_START_ROW 2654 * If there're any partial blocks in y dimension, they are placed at the beginning of the rows. 2655 * This shift amount is added to all rows such that the partial block (at the beginning) overlaps with the subsequent 2656 * blocks in the y dimension to avoid any padding. 2657 * EG: M0=4, PARTIAL_STORE_M0=1: 2658 * | Non-overlapping | +M0_ROW_SHIFT (Overlapping) 2659 * block 0 (partial)| start row = 0 | start row = 0 2660 * block 1 (full) | start row = 4 | start row = 1 2661 * block 2 (full) | start row = 8 | start row = 5 2662 * 2663 * @param[in] y Global id of current block in y. 2664 * @param[in] M0 The number of rows to store, for non-partial blocks. Supported: 1-16 2665 * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported: [0, @p M0) 2666 * @{ 2667 */ 2668#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \ 2669 ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0)))) 2670#else // defined(PARTIAL_STORE_M0) 2671#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \ 2672 ((uint)(y * M0)) 2673#endif // defined(PARTIAL_STORE_M0) 2674/** @} */ // end of group COMPUTE_M0_START_ROW 2675 2676/** Store a vector that can only be partial in x. 2677 * 2678 * @note in case @p vec_size or @p leftover != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty. 2679 * 2680 * The data to store is expected to end in a 0. 2681 * E.g., for basename=c, the expected name is c0. 2682 * 2683 * @param[in] basename The name of the variable without trailing 0 2684 * @param[in] data_type The data type of the vector 2685 * @param[in] ptr The base pointer 2686 * @param[in] vec_size The vector size if cond = false. Supported: 1, 2, 3, 4, 8, 16 2687 * @param[in] leftover The vector size if cond = true. Supported range: [1, @p vec_size0) 2688 * @param[in] cond Condition to select either vec_size0 or vec_size1 2689 * @{ 2690 */ 2691#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \ 2692 STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond) 2693/** @} */ // end of group STORE_VECTOR_SELECT 2694 2695#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 2696#pragma OPENCL EXTENSION cl_khr_fp16 : enable 2697#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 2698 2699#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) 2700#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable 2701#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) 2702 2703#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) 2704#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable 2705#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) 2706 2707#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf) 2708#pragma OPENCL EXTENSION cl_arm_printf : enable 2709#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf) 2710 2711#define GPU_ARCH_MIDGARD 0x100 2712#define GPU_ARCH_BIFROST 0x200 2713 2714/** Concatenate two inputs. 2715 * 2716 * @param[in] a The first input to be concatenated 2717 * @param[in] b The second input to be concatenated 2718 * 2719 * @return The concatenated output 2720 */ 2721#define CONCAT(a, b) a##b 2722 2723/** Expand the given vector 2724 * 2725 * @param[in] x The vector to be expanded 2726 * 2727 * @return The expanded output 2728 */ 2729#define EXPAND(x) x 2730 2731/** Clamp the given value between an upper and lower bound. 2732 * 2733 * @param[in] x The value to be clamped 2734 * @param[in] min_val The lower bound 2735 * @param[in] max_val The upper bound 2736 * 2737 * @return The clamped value. 2738 */ 2739#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val) 2740 2741/** REVn reverses the given vector whose size is n. 2742 * @name REVn 2743 * 2744 * @param[in] x The vector to be reversed 2745 * 2746 * @return The reversed vector 2747 * @{ 2748 */ 2749#define REV1(x) ((x)) 2750#define REV2(x) ((x).s10) 2751#define REV3(x) ((x).s210) 2752#define REV4(x) ((x).s3210) 2753#define REV8(x) ((x).s76543210) 2754#define REV16(x) ((x).sFEDCBA9876543210) 2755/** @} */ // end of group REVn 2756 2757/** Reverse the given vector. 2758 * @name REVERSE 2759 * 2760 * @param[in] x The vector to be reversed 2761 * @param[in] s The size of the vector 2762 * 2763 * @return The reversed vector 2764 * @{ 2765 */ 2766#define REVERSE_STR(x, s) REV##s((x)) 2767#define REVERSE(x, s) REVERSE_STR(x, s) 2768/** @} */ // end of group REVERSE 2769 2770/** Circular-right-shift (rotate-right) the vector of size s by the amount of n. 2771 * @name ROTs_n 2772 * 2773 * @param[in] x The vector to be shifted 2774 * 2775 * @return The shifted vector 2776 * @{ 2777 */ 2778#define ROT1_0(x) ((x)) 2779 2780#define ROT2_0(x) ((x)) 2781#define ROT2_1(x) ((x).s10) 2782 2783#define ROT3_0(x) ((x)) 2784#define ROT3_1(x) ((x).s201) 2785#define ROT3_2(x) ((x).s120) 2786 2787#define ROT4_0(x) ((x)) 2788#define ROT4_1(x) ((x).s3012) 2789#define ROT4_2(x) ((x).s2301) 2790#define ROT4_3(x) ((x).s1230) 2791 2792#define ROT8_0(x) ((x)) 2793#define ROT8_1(x) ((x).s70123456) 2794#define ROT8_2(x) ((x).s67012345) 2795#define ROT8_3(x) ((x).s56701234) 2796#define ROT8_4(x) ((x).s45670123) 2797#define ROT8_5(x) ((x).s34567012) 2798#define ROT8_6(x) ((x).s23456701) 2799#define ROT8_7(x) ((x).s12345670) 2800 2801#define ROT16_0(x) ((x)) 2802#define ROT16_1(x) ((x).sF0123456789ABCDE) 2803#define ROT16_2(x) ((x).sEF0123456789ABCD) 2804#define ROT16_3(x) ((x).sDEF0123456789ABC) 2805#define ROT16_4(x) ((x).sCDEF0123456789AB) 2806#define ROT16_5(x) ((x).sBCDEF0123456789A) 2807#define ROT16_6(x) ((x).sABCDEF0123456789) 2808#define ROT16_7(x) ((x).s9ABCDEF012345678) 2809#define ROT16_8(x) ((x).s89ABCDEF01234567) 2810#define ROT16_9(x) ((x).s789ABCDEF0123456) 2811#define ROT16_10(x) ((x).s6789ABCDEF012345) 2812#define ROT16_11(x) ((x).s56789ABCDEF01234) 2813#define ROT16_12(x) ((x).s456789ABCDEF0123) 2814#define ROT16_13(x) ((x).s3456789ABCDEF012) 2815#define ROT16_14(x) ((x).s23456789ABCDEF01) 2816#define ROT16_15(x) ((x).s123456789ABCDEF0) 2817/** @} */ // end of group ROTs_n 2818 2819/** Circular-right-shift (rotate-right) the given vector by the given amount. 2820 * @name ROTATE 2821 * 2822 * @param[in] x The vector to be shifted 2823 * @param[in] s The size of the vector 2824 * @param[in] n The amount to be shifted 2825 * 2826 * @return The shifted vector 2827 * @{ 2828 */ 2829#define ROTATE_STR(x, s, n) ROT##s##_##n(x) 2830#define ROTATE(x, s, n) ROTATE_STR(x, s, n) 2831/** @} */ // end of group ROTATE 2832 2833/** Creates a vector of size n filled with offset values corresponding to the location of each element. 2834 * @name V_OFFSn 2835 * 2836 * @param[in] dt The data type of the output vector 2837 * 2838 * @return The vector filled with offset values 2839 * @{ 2840 */ 2841#define V_OFFS1(dt) (dt##1)(0) 2842#define V_OFFS2(dt) (dt##2)(0, 1) 2843#define V_OFFS3(dt) (dt##3)(0, 1, 2) 2844#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3) 2845#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7) 2846#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) 2847/** @} */ // end of group V_OFFSn 2848 2849/** Create a vector filled with offset values corresponding to the location of each element. 2850 * @name VEC_OFFS 2851 * 2852 * @param[in] dt The data type of the output vector 2853 * @param[in] s The size of the output vector 2854 * 2855 * @return The vector filled with offset values 2856 * @{ 2857 */ 2858#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt) 2859#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s) 2860/** @} */ // end of group VEC_OFFS 2861 2862#define VLOAD_STR(size) vload##size 2863#define VLOAD(size) VLOAD_STR(size) 2864 2865#define PIXEL_UNIT4 1 2866#define PIXEL_UNIT8 2 2867#define PIXEL_UNIT16 4 2868 2869/** Utility macro to convert a vector size in pixel unit. 2870 * 2871 * @name CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT 2872 * 2873 * @param[in] vec_size Vector size. Only 4,8 and 16 is supported 2874 * 2875 * @return The pixel unit (number of pixels) 2876 * @{ 2877 */ 2878#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size 2879#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) 2880/** @} */ // end of group CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT 2881 2882#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord))); 2883#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord))); 2884#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord))); 2885 2886#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 2887#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord))); 2888#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord))); 2889#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord))); 2890#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 2891 2892/** Utility macro to read a 2D OpenCL image object. 2893 * 2894 * @note Coordinates are not normalized 2895 * 2896 * @param[in] data_type Data type 2897 * @param[in] n0 Number of pixel to read. Only 1,2 and 4 is supported 2898 * @param[in] img OpenCL image object 2899 * @param[in] x_coord The x coordinate for the top-left pixel 2900 * @param[in] y_coord The y coordinate for the top-left pixel 2901 * 2902 * @return Pixels from the 2D OpenCL image object 2903 * @{ 2904 */ 2905#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord) 2906#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) 2907 2908#define VSTORE_STR(size) vstore##size 2909#define VSTORE(size) VSTORE_STR(size) 2910 2911#define float1 float 2912#define half1 half 2913#define char1 char 2914#define uchar1 uchar 2915#define short1 short 2916#define ushort1 ushort 2917#define int1 int 2918#define uint1 uint 2919#define long1 long 2920#define ulong1 ulong 2921#define double1 double 2922 2923#define vload1(OFFSET, PTR) *(OFFSET + PTR) 2924#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA 2925 2926/** Extended partial vstore that correctly handles scalar values as well. 2927 * Store the **lower** 0 to (n-1)th elements of the given vector while minimising the amount of vstore ops 2928 * @name VSTORE_PARTIAL 2929 * 2930 * @note With this macro, the passed data can be both a vector and a scalar 2931 * @note @p store_size needs to be <= @p size 2932 * eg 1: Valid 2933 * VSTORE_PARTIAL(16, 15) ...; 2934 * eg 2: Invalid 2935 * VSTORE_PARTIAL(4, 7) ...; 2936 * 2937 * @param[in] size The width of @p DATA. Supported values: 1(scalar), 2, 3, 4, 8, 16 2938 * @param[in] store_size The number of lower elements to store. Supported values: 1-16, but has to be <= @p size 2939 * @{ 2940 */ 2941#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size 2942#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size) 2943 2944#define NO_STORE(data, offs, ptr) \ 2945 { \ 2946 } 2947 2948// Size == 1 (scalar) 2949#define vstore_partial_1_0 NO_STORE 2950#define vstore_partial_1_1 vstore1 2951#define vstore_partial_1_2 NO_STORE 2952#define vstore_partial_1_3 NO_STORE 2953#define vstore_partial_1_4 NO_STORE 2954#define vstore_partial_1_5 NO_STORE 2955#define vstore_partial_1_6 NO_STORE 2956#define vstore_partial_1_7 NO_STORE 2957#define vstore_partial_1_8 NO_STORE 2958#define vstore_partial_1_9 NO_STORE 2959#define vstore_partial_1_10 NO_STORE 2960#define vstore_partial_1_11 NO_STORE 2961#define vstore_partial_1_12 NO_STORE 2962#define vstore_partial_1_13 NO_STORE 2963#define vstore_partial_1_14 NO_STORE 2964#define vstore_partial_1_15 NO_STORE 2965#define vstore_partial_1_16 NO_STORE 2966// Size == 2 2967#define vstore_partial_2_0 NO_STORE 2968#define vstore_partial_2_1 vstore_partial_1 2969#define vstore_partial_2_2 vstore_partial_2 2970#define vstore_partial_2_3 NO_STORE 2971#define vstore_partial_2_4 NO_STORE 2972#define vstore_partial_2_5 NO_STORE 2973#define vstore_partial_2_6 NO_STORE 2974#define vstore_partial_2_7 NO_STORE 2975#define vstore_partial_2_8 NO_STORE 2976#define vstore_partial_2_9 NO_STORE 2977#define vstore_partial_2_10 NO_STORE 2978#define vstore_partial_2_11 NO_STORE 2979#define vstore_partial_2_12 NO_STORE 2980#define vstore_partial_2_13 NO_STORE 2981#define vstore_partial_2_14 NO_STORE 2982#define vstore_partial_2_15 NO_STORE 2983#define vstore_partial_2_16 NO_STORE 2984// Size == 3 2985#define vstore_partial_3_0 NO_STORE 2986#define vstore_partial_3_1 vstore_partial_1 2987#define vstore_partial_3_2 vstore_partial_2 2988#define vstore_partial_3_3 vstore_partial_3 2989#define vstore_partial_3_4 NO_STORE 2990#define vstore_partial_3_5 NO_STORE 2991#define vstore_partial_3_6 NO_STORE 2992#define vstore_partial_3_7 NO_STORE 2993#define vstore_partial_3_8 NO_STORE 2994#define vstore_partial_3_9 NO_STORE 2995#define vstore_partial_3_10 NO_STORE 2996#define vstore_partial_3_11 NO_STORE 2997#define vstore_partial_3_12 NO_STORE 2998#define vstore_partial_3_13 NO_STORE 2999#define vstore_partial_3_14 NO_STORE 3000#define vstore_partial_3_15 NO_STORE 3001#define vstore_partial_3_16 NO_STORE 3002// Size == 4 3003#define vstore_partial_4_0 NO_STORE 3004#define vstore_partial_4_1 vstore_partial_1 3005#define vstore_partial_4_2 vstore_partial_2 3006#define vstore_partial_4_3 vstore_partial_3 3007#define vstore_partial_4_4 vstore_partial_4 3008#define vstore_partial_4_5 NO_STORE 3009#define vstore_partial_4_6 NO_STORE 3010#define vstore_partial_4_7 NO_STORE 3011#define vstore_partial_4_8 NO_STORE 3012#define vstore_partial_4_9 NO_STORE 3013#define vstore_partial_4_10 NO_STORE 3014#define vstore_partial_4_11 NO_STORE 3015#define vstore_partial_4_12 NO_STORE 3016#define vstore_partial_4_13 NO_STORE 3017#define vstore_partial_4_14 NO_STORE 3018#define vstore_partial_4_15 NO_STORE 3019#define vstore_partial_4_16 NO_STORE 3020// Size == 8 3021#define vstore_partial_8_0 NO_STORE 3022#define vstore_partial_8_1 vstore_partial_1 3023#define vstore_partial_8_2 vstore_partial_2 3024#define vstore_partial_8_3 vstore_partial_3 3025#define vstore_partial_8_4 vstore_partial_4 3026#define vstore_partial_8_5 vstore_partial_5 3027#define vstore_partial_8_6 vstore_partial_6 3028#define vstore_partial_8_7 vstore_partial_7 3029#define vstore_partial_8_8 vstore_partial_8 3030#define vstore_partial_8_9 NO_STORE 3031#define vstore_partial_8_10 NO_STORE 3032#define vstore_partial_8_11 NO_STORE 3033#define vstore_partial_8_12 NO_STORE 3034#define vstore_partial_8_13 NO_STORE 3035#define vstore_partial_8_14 NO_STORE 3036#define vstore_partial_8_15 NO_STORE 3037#define vstore_partial_8_16 NO_STORE 3038// Size == 16 3039#define vstore_partial_16_0 NO_STORE 3040#define vstore_partial_16_1 vstore_partial_1 3041#define vstore_partial_16_2 vstore_partial_2 3042#define vstore_partial_16_3 vstore_partial_3 3043#define vstore_partial_16_4 vstore_partial_4 3044#define vstore_partial_16_5 vstore_partial_5 3045#define vstore_partial_16_6 vstore_partial_6 3046#define vstore_partial_16_7 vstore_partial_7 3047#define vstore_partial_16_8 vstore_partial_8 3048#define vstore_partial_16_9 vstore_partial_9 3049#define vstore_partial_16_10 vstore_partial_10 3050#define vstore_partial_16_11 vstore_partial_11 3051#define vstore_partial_16_12 vstore_partial_12 3052#define vstore_partial_16_13 vstore_partial_13 3053#define vstore_partial_16_14 vstore_partial_14 3054#define vstore_partial_16_15 vstore_partial_15 3055#define vstore_partial_16_16 vstore_partial_16 3056 3057/** Partial vstore. Store the **lower** 0 to (n-1)th elements of the given vector while minimising the amount of vstore ops 3058 * @name vstore_partial_n 3059 * 3060 * @note @p DATA needs to be a vector not a scalar 3061 * @note n needs to be <= the vector width of the input variable @p DATA 3062 * eg 1: Valid 3063 * vstore_partial_15(var:float16, 0, 0xabcd); 3064 * eg 2: Invalid 3065 * vstore_partial_7(var:float4, 0, 0xabcd); 3066 * 3067 * @note in cases n == 1, 2, 3, 4, 8, 16, no extra vstore is invoked, thus there's no performance penalty. 3068 * 3069 * @param[in] DATA The name of the variable 3070 * @param[in] OFFSET Offset in n 3071 * @param[in] PTR The base pointer 3072 * @{ 3073 */ 3074#define vstore_partial_1(DATA, OFFSET, PTR) \ 3075 vstore1(DATA.s0, OFFSET, PTR); 3076 3077#define vstore_partial_2(DATA, OFFSET, PTR) \ 3078 vstore2(DATA.s01, OFFSET, PTR); 3079 3080#define vstore_partial_3(DATA, OFFSET, PTR) \ 3081 vstore3(DATA.s012, OFFSET, PTR); 3082 3083#define vstore_partial_4(DATA, OFFSET, PTR) \ 3084 vstore4(DATA.s0123, OFFSET, PTR); 3085 3086#define vstore_partial_5(DATA, OFFSET, PTR) \ 3087 vstore_partial_4(DATA.s0123, OFFSET, PTR); \ 3088 vstore1(DATA.s4, OFFSET, PTR + 4); 3089 3090#define vstore_partial_6(DATA, OFFSET, PTR) \ 3091 vstore_partial_4(DATA.s0123, OFFSET, PTR); \ 3092 vstore_partial_2(DATA.s45, OFFSET, PTR + 4); 3093 3094#define vstore_partial_7(DATA, OFFSET, PTR) \ 3095 vstore_partial_4(DATA.s0123, OFFSET, PTR); \ 3096 vstore_partial_3(DATA.s456, OFFSET, PTR + 4); 3097 3098#define vstore_partial_8(DATA, OFFSET, PTR) \ 3099 vstore8(DATA.s01234567, OFFSET, PTR); 3100 3101#define vstore_partial_9(DATA, OFFSET, PTR) \ 3102 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 3103 vstore1(DATA.s8, OFFSET, PTR + 8); 3104 3105#define vstore_partial_10(DATA, OFFSET, PTR) \ 3106 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 3107 vstore_partial_2(DATA.s89, OFFSET, PTR + 8); 3108 3109#define vstore_partial_11(DATA, OFFSET, PTR) \ 3110 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 3111 vstore_partial_3(DATA.s89a, OFFSET, PTR + 8); 3112 3113#define vstore_partial_12(DATA, OFFSET, PTR) \ 3114 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 3115 vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8); 3116 3117#define vstore_partial_13(DATA, OFFSET, PTR) \ 3118 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 3119 vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8); 3120 3121#define vstore_partial_14(DATA, OFFSET, PTR) \ 3122 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 3123 vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8); 3124 3125#define vstore_partial_15(DATA, OFFSET, PTR) \ 3126 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 3127 vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8); 3128 3129#define vstore_partial_16(DATA, OFFSET, PTR) \ 3130 vstore16(DATA, OFFSET, PTR); 3131/** @} */ // end of groupd vstore_partial_n 3132/** @} */ // end of groupd VSTORE_PARTIAL 3133 3134// Convert built-in functions with _sat modifier are not supported in floating point so we create defines 3135// without _sat to overcome this issue 3136#define convert_float_sat convert_float 3137#define convert_float1_sat convert_float 3138#define convert_float2_sat convert_float2 3139#define convert_float3_sat convert_float3 3140#define convert_float4_sat convert_float4 3141#define convert_float8_sat convert_float8 3142#define convert_float16_sat convert_float16 3143#define convert_half_sat convert_float 3144#define convert_half1_sat convert_half 3145#define convert_half2_sat convert_half2 3146#define convert_half3_sat convert_half3 3147#define convert_half4_sat convert_half4 3148#define convert_half8_sat convert_half8 3149#define convert_half16_sat convert_half16 3150 3151#define convert_float1 convert_float 3152#define convert_half1 convert_half 3153#define convert_char1 convert_char 3154#define convert_uchar1 convert_uchar 3155#define convert_short1 convert_short 3156#define convert_ushort1 convert_ushort 3157#define convert_int1 convert_int 3158#define convert_uint1 convert_uint 3159#define convert_long1 convert_long 3160#define convert_ulong1 convert_ulong 3161#define convert_double1 convert_double 3162 3163#define convert_char1_sat convert_char_sat 3164#define convert_uchar1_sat convert_uchar_sat 3165#define convert_short1_sat convert_short_sat 3166#define convert_ushort1_sat convert_ushort_sat 3167#define convert_int1_sat convert_int_sat 3168#define convert_uint1_sat convert_uint_sat 3169#define convert_long1_sat convert_long_sat 3170#define convert_ulong1_sat convert_ulong_sat 3171#define convert_double1_sat convert_double_sat 3172 3173#define VEC_DATA_TYPE_STR(type, size) type##size 3174#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size) 3175 3176#define CONVERT_STR(x, type) (convert_##type((x))) 3177#define CONVERT(x, type) CONVERT_STR(x, type) 3178 3179#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x))) 3180#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type) 3181 3182#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x))) 3183#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round) 3184 3185#define select_vec_dt_uchar(size) uchar##size 3186#define select_vec_dt_char(size) char##size 3187#define select_vec_dt_ushort(size) ushort##size 3188#define select_vec_dt_short(size) short##size 3189#define select_vec_dt_half(size) short##size 3190#define select_vec_dt_uint(size) uint##size 3191#define select_vec_dt_int(size) int##size 3192#define select_vec_dt_float(size) int##size 3193#define select_vec_dt_ulong(size) ulong##size 3194#define select_vec_dt_long(size) long##size 3195 3196#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size) 3197#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size) 3198#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1) 3199 3200#define sum_reduce_1(x) (x) 3201#define sum_reduce_2(x) ((x).s0) + ((x).s1) 3202#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2) 3203#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23) 3204#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567) 3205#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF) 3206 3207#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x) 3208#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size) 3209 3210#define max_reduce_1(x) (x) 3211#define max_reduce_2(x) max(((x).s0), ((x).s1)) 3212#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2)) 3213#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23)) 3214#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567)) 3215#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF)) 3216 3217#define MAX_REDUCE_STR(x, size) max_reduce_##size(x) 3218#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size) 3219 3220#define VECTOR_DECLARATION(name) \ 3221 __global uchar *name##_ptr, \ 3222 uint name##_stride_x, \ 3223 uint name##_step_x, \ 3224 uint name##_offset_first_element_in_bytes 3225 3226#define IMAGE_DECLARATION(name) \ 3227 __global uchar *name##_ptr, \ 3228 uint name##_stride_x, \ 3229 uint name##_step_x, \ 3230 uint name##_stride_y, \ 3231 uint name##_step_y, \ 3232 uint name##_offset_first_element_in_bytes 3233 3234#define TENSOR3D_DECLARATION(name) \ 3235 __global uchar *name##_ptr, \ 3236 uint name##_stride_x, \ 3237 uint name##_step_x, \ 3238 uint name##_stride_y, \ 3239 uint name##_step_y, \ 3240 uint name##_stride_z, \ 3241 uint name##_step_z, \ 3242 uint name##_offset_first_element_in_bytes 3243 3244#define TENSOR4D_DECLARATION(name) \ 3245 __global uchar *name##_ptr, \ 3246 uint name##_stride_x, \ 3247 uint name##_step_x, \ 3248 uint name##_stride_y, \ 3249 uint name##_step_y, \ 3250 uint name##_stride_z, \ 3251 uint name##_step_z, \ 3252 uint name##_stride_w, \ 3253 uint name##_step_w, \ 3254 uint name##_offset_first_element_in_bytes 3255 3256#define CONVERT_TO_VECTOR_STRUCT(name) \ 3257 update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x) 3258 3259#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \ 3260 update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0) 3261 3262#define CONVERT_TO_IMAGE_STRUCT(name) \ 3263 update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y) 3264 3265#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \ 3266 update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0) 3267 3268#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ 3269 update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) 3270 3271#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \ 3272 update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z) 3273 3274#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ 3275 update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) 3276 3277#define CONVERT_TO_TENSOR3D_STRUCT(name) \ 3278 update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 3279 name##_stride_z, name##_step_z) 3280 3281#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \ 3282 update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0) 3283 3284#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \ 3285 update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 3286 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size) 3287 3288#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \ 3289 update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size) 3290 3291#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name) \ 3292 tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 3293 name##_stride_z, name##_step_z) 3294 3295/** Structure to hold Vector information */ 3296typedef struct Vector 3297{ 3298 __global uchar *ptr; /**< Pointer to the starting postion of the buffer */ 3299 int offset_first_element_in_bytes; /**< The offset of the first element in the source image */ 3300 int stride_x; /**< Stride of the image in X dimension (in bytes) */ 3301} Vector; 3302 3303/** Structure to hold Image information */ 3304typedef struct Image 3305{ 3306 __global uchar *ptr; /**< Pointer to the starting postion of the buffer */ 3307 int offset_first_element_in_bytes; /**< The offset of the first element in the source image */ 3308 int stride_x; /**< Stride of the image in X dimension (in bytes) */ 3309 int stride_y; /**< Stride of the image in Y dimension (in bytes) */ 3310} Image; 3311 3312/** Structure to hold 3D tensor information */ 3313typedef struct Tensor3D 3314{ 3315 __global uchar *ptr; /**< Pointer to the starting postion of the buffer */ 3316 int offset_first_element_in_bytes; /**< The offset of the first element in the source image */ 3317 int stride_x; /**< Stride of the image in X dimension (in bytes) */ 3318 int stride_y; /**< Stride of the image in Y dimension (in bytes) */ 3319 int stride_z; /**< Stride of the image in Z dimension (in bytes) */ 3320} Tensor3D; 3321 3322/** Structure to hold 4D tensor information */ 3323typedef struct Tensor4D 3324{ 3325 __global uchar *ptr; /**< Pointer to the starting postion of the buffer */ 3326 int offset_first_element_in_bytes; /**< The offset of the first element in the source image */ 3327 int stride_x; /**< Stride of the image in X dimension (in bytes) */ 3328 int stride_y; /**< Stride of the image in Y dimension (in bytes) */ 3329 int stride_z; /**< Stride of the image in Z dimension (in bytes) */ 3330 int stride_w; /**< Stride of the image in W dimension (in bytes) */ 3331} Tensor4D; 3332 3333/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data. 3334 * 3335 * @param[in] ptr Pointer to the starting postion of the buffer 3336 * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector 3337 * @param[in] stride_x Stride of the vector in X dimension (in bytes) 3338 * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes) 3339 * 3340 * @return An image object 3341 */ 3342inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x) 3343{ 3344 Vector vector = 3345 { 3346 .ptr = ptr, 3347 .offset_first_element_in_bytes = offset_first_element_in_bytes, 3348 .stride_x = stride_x, 3349 }; 3350 vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x; 3351 return vector; 3352} 3353 3354/** Wrap image information into an Image structure, and make the pointer point at this workitem's data. 3355 * 3356 * @param[in] ptr Pointer to the starting postion of the buffer 3357 * @param[in] offset_first_element_in_bytes The offset of the first element in the source image 3358 * @param[in] stride_x Stride of the image in X dimension (in bytes) 3359 * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes) 3360 * @param[in] stride_y Stride of the image in Y dimension (in bytes) 3361 * @param[in] step_y stride_y * number of elements along Y processed per workitem(in bytes) 3362 * 3363 * @return An image object 3364 */ 3365inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y) 3366{ 3367 Image img = 3368 { 3369 .ptr = ptr, 3370 .offset_first_element_in_bytes = offset_first_element_in_bytes, 3371 .stride_x = stride_x, 3372 .stride_y = stride_y 3373 }; 3374 img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y; 3375 return img; 3376} 3377 3378/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data. 3379 * 3380 * @param[in] ptr Pointer to the starting postion of the buffer 3381 * @param[in] offset_first_element_in_bytes The offset of the first element in the source image 3382 * @param[in] stride_x Stride of the image in X dimension (in bytes) 3383 * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes) 3384 * @param[in] stride_y Stride of the image in Y dimension (in bytes) 3385 * @param[in] step_y stride_y * number of elements along Y processed per workitem(in bytes) 3386 * @param[in] stride_z Stride of the image in Z dimension (in bytes) 3387 * @param[in] step_z stride_z * number of elements along Z processed per workitem(in bytes) 3388 * 3389 * @return A 3D tensor object 3390 */ 3391inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 3392{ 3393 Image img = 3394 { 3395 .ptr = ptr, 3396 .offset_first_element_in_bytes = offset_first_element_in_bytes, 3397 .stride_x = stride_x, 3398 .stride_y = stride_y 3399 }; 3400 img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z; 3401 return img; 3402} 3403 3404/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data. 3405 * 3406 * @param[in] ptr Pointer to the starting postion of the buffer 3407 * @param[in] offset_first_element_in_bytes The offset of the first element in the source image 3408 * @param[in] stride_x Stride of the image in X dimension (in bytes) 3409 * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes) 3410 * @param[in] stride_y Stride of the image in Y dimension (in bytes) 3411 * @param[in] step_y stride_y * number of elements along Y processed per workitem(in bytes) 3412 * @param[in] stride_z Stride of the image in Z dimension (in bytes) 3413 * @param[in] step_z stride_z * number of elements along Z processed per workitem(in bytes) 3414 * 3415 * @return A 3D tensor object 3416 */ 3417inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 3418{ 3419 Tensor3D tensor = 3420 { 3421 .ptr = ptr, 3422 .offset_first_element_in_bytes = offset_first_element_in_bytes, 3423 .stride_x = stride_x, 3424 .stride_y = stride_y, 3425 .stride_z = stride_z 3426 }; 3427 tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z; 3428 return tensor; 3429} 3430 3431/** Wrap 3D tensor information into an tensor structure. 3432 * 3433 * @param[in] ptr Pointer to the starting postion of the buffer 3434 * @param[in] offset_first_element_in_bytes The offset of the first element in the source image 3435 * @param[in] stride_x Stride of the image in X dimension (in bytes) 3436 * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes) 3437 * @param[in] stride_y Stride of the image in Y dimension (in bytes) 3438 * @param[in] step_y stride_y * number of elements along Y processed per workitem(in bytes) 3439 * @param[in] stride_z Stride of the image in Z dimension (in bytes) 3440 * @param[in] step_z stride_z * number of elements along Z processed per workitem(in bytes) 3441 * 3442 * @return A 3D tensor object 3443 */ 3444inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 3445{ 3446 Tensor3D tensor = 3447 { 3448 .ptr = ptr, 3449 .offset_first_element_in_bytes = offset_first_element_in_bytes, 3450 .stride_x = stride_x, 3451 .stride_y = stride_y, 3452 .stride_z = stride_z 3453 }; 3454 return tensor; 3455} 3456 3457inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w, 3458 uint step_w, 3459 uint mod_size) 3460{ 3461 Tensor4D tensor = 3462 { 3463 .ptr = ptr, 3464 .offset_first_element_in_bytes = offset_first_element_in_bytes, 3465 .stride_x = stride_x, 3466 .stride_y = stride_y, 3467 .stride_z = stride_z, 3468 .stride_w = stride_w 3469 }; 3470 3471 tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w; 3472 return tensor; 3473} 3474 3475/** Get the pointer position of a Vector 3476 * 3477 * @param[in] vec Pointer to the starting position of the buffer 3478 * @param[in] x Relative X position 3479 */ 3480inline __global const uchar *vector_offset(const Vector *vec, int x) 3481{ 3482 return vec->ptr + x * vec->stride_x; 3483} 3484 3485/** Get the pointer position of a Image 3486 * 3487 * @param[in] img Pointer to the starting position of the buffer 3488 * @param[in] x Relative X position 3489 * @param[in] y Relative Y position 3490 */ 3491inline __global uchar *offset(const Image *img, int x, int y) 3492{ 3493 return img->ptr + x * img->stride_x + y * img->stride_y; 3494} 3495 3496/** Get the pointer position of a Tensor3D 3497 * 3498 * @param[in] tensor Pointer to the starting position of the buffer 3499 * @param[in] x Relative X position 3500 * @param[in] y Relative Y position 3501 * @param[in] z Relative Z position 3502 */ 3503inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z) 3504{ 3505 return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z; 3506} 3507 3508/** Get the pointer position of a Tensor4D 3509 * 3510 * @param[in] tensor Pointer to the starting position of the buffer 3511 * @param[in] x Relative X position 3512 * @param[in] y Relative Y position 3513 * @param[in] z Relative Z position 3514 * @param[in] w Relative W position 3515 */ 3516inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w) 3517{ 3518 return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w; 3519} 3520 3521/** Get the offset for a given linear index of a Tensor3D 3522 * 3523 * @param[in] tensor Pointer to the starting position of the buffer 3524 * @param[in] width Width of the input tensor 3525 * @param[in] height Height of the input tensor 3526 * @param[in] depth Depth of the input tensor 3527 * @param[in] index Linear index 3528 */ 3529inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index) 3530{ 3531 uint num_elements = width * height; 3532 3533 const uint z = index / num_elements; 3534 3535 index %= num_elements; 3536 3537 const uint y = index / width; 3538 3539 index %= width; 3540 3541 const uint x = index; 3542 3543 return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes; 3544} 3545 3546#endif // _HELPER_H 3547 3548/** Convert the given vector with round to nearest even rounding mode 3549 * 3550 * @param[in] x The target to be converted 3551 * @param[in] type The target type 3552 * 3553 * @return The converted vector 3554 */ 3555#define CONVERT_DOWN_RTE_STR(x, type) (convert_##type##_rte((x))) 3556#define CONVERT_DOWN_RTE(x, type) CONVERT_DOWN_RTE_STR(x, type) 3557 3558/** Quantize a floating-point scalar value to 8-bit asymmetric 3559 * 3560 * @param[in] input Input value to quantize 3561 * @param[in] offset Quantization offset 3562 * @param[in] scale Quantization scale 3563 * 3564 * @return quantized value 3565 */ 3566inline uchar quantize_qasymm8(float input, float offset, float scale) 3567{ 3568 float out_f32 = input / scale + offset; 3569 uchar res_u8 = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, int), uchar); 3570 return res_u8; 3571} 3572 3573/** Dequantize a scalar value from 8-bit asymmetric to floating-point 3574 * 3575 * @param[in] input Input value to quantize 3576 * @param[in] offset Quantization offset 3577 * @param[in] scale Quantization scale 3578 * 3579 * @return quantized value 3580 */ 3581inline float dequantize_qasymm8(uchar input, float offset, float scale) 3582{ 3583 return ((float)input - offset) * scale; 3584} 3585 3586/** Dequantize a scalar value from signed 8-bit asymmetric to floating-point 3587 * 3588 * @param[in] input Input value to quantize 3589 * @param[in] offset Quantization offset 3590 * @param[in] scale Quantization scale 3591 * 3592 * @return quantized value 3593 */ 3594inline float dequantize_qasymm8_signed(char input, float offset, float scale) 3595{ 3596 return ((float)input - offset) * scale; 3597} 3598 3599/** Quantize a vector of values from floating-point 3600 * 3601 * @param[in] type Output data type. 3602 * @param[in] size Size of vector. 3603 * 3604 * @return quantized values 3605 */ 3606#define QUANTIZE_IMPL(type, size) \ 3607 inline VEC_DATA_TYPE(type, size) quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \ 3608 { \ 3609 VEC_DATA_TYPE(float, size) \ 3610 out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset); \ 3611 VEC_DATA_TYPE(type, size) \ 3612 res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), VEC_DATA_TYPE(type, size)); \ 3613 return res; \ 3614 } 3615 3616/** Dequantize a vector of values to floating-point 3617 * 3618 * @param[in] type Input data type. 3619 * @param[in] size Size of vector. 3620 * 3621 * @return dequantized values in floating point 3622 */ 3623#define DEQUANTIZE_IMPL(type, size) \ 3624 inline VEC_DATA_TYPE(float, size) dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \ 3625 { \ 3626 return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale; \ 3627 } 3628 3629/** Correctly-rounded-to-nearest division by a power-of-two. 3630 * 3631 * @param[in] size Size of vector. 3632 * 3633 * @return Correctly-rounded-to-nearest division by a power-of-two. 3634 */ 3635#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \ 3636 inline VEC_DATA_TYPE(int, size) asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \ 3637 { \ 3638 const VEC_DATA_TYPE(int, size) \ 3639 zero = (VEC_DATA_TYPE(int, size))0; \ 3640 const VEC_DATA_TYPE(int, size) \ 3641 one = (VEC_DATA_TYPE(int, size))1; \ 3642 VEC_DATA_TYPE(int, size) \ 3643 mask = (one << exponent) - one; \ 3644 VEC_DATA_TYPE(int, size) \ 3645 threshold = (mask >> 1) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))(x < 0)); \ 3646 return (x >> exponent) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))((x & mask) > threshold)); \ 3647 } 3648 3649/** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1), 3650 * rounding to the nearest value, and saturating -1 * -1 to the maximum value. 3651 * 3652 * @param[in] size Size of vector. 3653 * 3654 * @return Product of two fixed-point numbers. 3655 */ 3656#define ASYMM_MULT_IMPL(size) \ 3657 inline VEC_DATA_TYPE(int, size) asymm_mult##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \ 3658 { \ 3659 VEC_DATA_TYPE(int, size) \ 3660 overflow = a == b && a == INT_MIN; \ 3661 VEC_DATA_TYPE(long, size) \ 3662 a_64 = convert_long##size(a); \ 3663 VEC_DATA_TYPE(long, size) \ 3664 b_64 = convert_long##size(b); \ 3665 VEC_DATA_TYPE(long, size) \ 3666 ab_64 = a_64 * b_64; \ 3667 /* Revert COMPMID-907 */ \ 3668 VEC_DATA_TYPE(long, size) \ 3669 mask1 = 1 << 30; \ 3670 VEC_DATA_TYPE(long, size) \ 3671 mask2 = 1 - (1 << 30); \ 3672 VEC_DATA_TYPE(long, size) \ 3673 is_positive_or_zero = ab_64 >= 0; \ 3674 VEC_DATA_TYPE(long, size) \ 3675 nudge = select(mask2, mask1, (SELECT_VEC_DATA_TYPE(long, size))(is_positive_or_zero)); \ 3676 VEC_DATA_TYPE(long, size) \ 3677 mask = 1ll << 31; \ 3678 VEC_DATA_TYPE(int, size) \ 3679 ab_x2_high32 = convert_int##size((ab_64 + nudge) / mask); \ 3680 return select(ab_x2_high32, INT_MAX, (SELECT_VEC_DATA_TYPE(int, size))(overflow)); \ 3681 } 3682 3683/** Calculates \f$ exp(x) \f$ for x in [-1/4, 0). 3684 * 3685 * @param[in] size Size of vector. 3686 * 3687 * @return Result in fixed-point format Q0. 3688 */ 3689#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size) \ 3690 inline VEC_DATA_TYPE(int, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) a) \ 3691 { \ 3692 const VEC_DATA_TYPE(int, size) constant_term = 1895147668; \ 3693 const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883; \ 3694 const int k_fractional_bits = 31; \ 3695 VEC_DATA_TYPE(int, size) \ 3696 x = a + (1 << (k_fractional_bits - 3)); \ 3697 VEC_DATA_TYPE(int, size) \ 3698 x2 = ASYMM_MULT(x, x, size); \ 3699 VEC_DATA_TYPE(int, size) \ 3700 x3 = ASYMM_MULT(x2, x, size); \ 3701 VEC_DATA_TYPE(int, size) \ 3702 x4 = ASYMM_MULT(x2, x2, size); \ 3703 VEC_DATA_TYPE(int, size) \ 3704 x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size); \ 3705 VEC_DATA_TYPE(int, size) \ 3706 x4_over_24_plus_x3_over_6_plus_x2 = ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2; \ 3707 VEC_DATA_TYPE(int, size) \ 3708 x4_over_24_plus_x3_over_6_plus_x2_over_2 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size); \ 3709 return constant_term + ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size); \ 3710 } 3711 3712/** Each bit of the result is set to the corresponding bit of either then_val or 3713 * else_val depending on whether the corresponding bit of if_mask is set. 3714 * Equivalent to the VBSL instruction in ARM NEON. 3715 * 3716 * @param[in] size Size of vector. 3717 * 3718 * @returns Result contaning bits from @p then_val or from @p else_val depending on corresponding bit in @p if_mask is set or not. 3719 */ 3720#define ASYMM_SELECT_USING_MASK_IMPL(size) \ 3721 inline VEC_DATA_TYPE(int, size) asymm_select_using_mask##size(VEC_DATA_TYPE(int, size) if_mask, VEC_DATA_TYPE(int, size) then_val, VEC_DATA_TYPE(int, size) else_val) \ 3722 { \ 3723 return (if_mask & then_val) ^ (~if_mask & else_val); \ 3724 } 3725 3726/** For each element of input vector, the corresponding bits of the result item are set 3727 * if the input item is zero. 3728 * 3729 * @param[in] size Size of vector. 3730 * 3731 * @returns Output vector with bits set when corresponding bit in @p a is zero. 3732 */ 3733#define ASYMM_MASK_IF_ZERO_IMPL(size) \ 3734 inline VEC_DATA_TYPE(int, size) asymm_mask_if_zero##size(VEC_DATA_TYPE(int, size) a) \ 3735 { \ 3736 const VEC_DATA_TYPE(int, size) all_zeros = 0; \ 3737 const VEC_DATA_TYPE(int, size) all_ones = ~0; \ 3738 return select(all_zeros, all_ones, (SELECT_VEC_DATA_TYPE(int, size))(a == 0)); \ 3739 } 3740 3741/** For each element of input vector, the corresponding bits of the result item are set 3742 * if the input item is non-zero. 3743 * 3744 * @param[in] size Size of vector. 3745 * 3746 * @returns Output vector with bits set when corresponding bit in @p a is non zero. 3747 */ 3748#define ASYMM_MASK_IF_NON_ZERO_IMPL(size) \ 3749 inline VEC_DATA_TYPE(int, size) asymm_mask_if_non_zero##size(VEC_DATA_TYPE(int, size) a) \ 3750 { \ 3751 const VEC_DATA_TYPE(int, size) all_zeros = 0; \ 3752 const VEC_DATA_TYPE(int, size) all_ones = ~0; \ 3753 return select(all_zeros, all_ones, (SELECT_VEC_DATA_TYPE(int, size))(a != 0)); \ 3754 } 3755 3756#define EXP_BARREL_SHIFTER_IMPL(size) \ 3757 inline VEC_DATA_TYPE(int, size) exp_barrel_shifter##size(VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \ 3758 { \ 3759 if(k_integer_bits > exponent) \ 3760 { \ 3761 const int k_shift_amount = k_integer_bits > exponent ? k_fractional_bits + exponent : 0; \ 3762 return ASYMM_SELECT_USING_MASK( \ 3763 ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size), \ 3764 ASYMM_MULT(result, fp_multiplier, size), result, size); \ 3765 } \ 3766 \ 3767 return result; \ 3768 } 3769 3770/** Calculates \f$ exp(x) \f$ for x < 0. 3771 * 3772 * @param[in] size Size of vector. 3773 * 3774 * @return Result in fixed-point format Q0. 3775 */ 3776#define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size) \ 3777 inline VEC_DATA_TYPE(int, size) asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits) \ 3778 { \ 3779 const int k_fractional_bits = 31 - k_integer_bits; \ 3780 VEC_DATA_TYPE(int, size) \ 3781 k_one_quarter = 1 << (k_fractional_bits - 2); \ 3782 VEC_DATA_TYPE(int, size) \ 3783 mask = k_one_quarter - 1; \ 3784 VEC_DATA_TYPE(int, size) \ 3785 a_mod_quarter_minus_one_quarter = (a & mask) - k_one_quarter; \ 3786 VEC_DATA_TYPE(int, size) \ 3787 a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits; \ 3788 VEC_DATA_TYPE(int, size) \ 3789 result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a_mod_quarter_minus_one_quarter_scaled, size); \ 3790 VEC_DATA_TYPE(int, size) \ 3791 remainder = a_mod_quarter_minus_one_quarter - a; \ 3792 \ 3793 result = EXP_BARREL_SHIFTER(result, -2, 1672461947, k_integer_bits, k_fractional_bits, remainder, size); \ 3794 result = EXP_BARREL_SHIFTER(result, -1, 1302514674, k_integer_bits, k_fractional_bits, remainder, size); \ 3795 result = EXP_BARREL_SHIFTER(result, +0, 790015084, k_integer_bits, k_fractional_bits, remainder, size); \ 3796 result = EXP_BARREL_SHIFTER(result, +1, 290630308, k_integer_bits, k_fractional_bits, remainder, size); \ 3797 result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, remainder, size); \ 3798 result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, size); \ 3799 result = EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size); \ 3800 \ 3801 if(k_integer_bits > 5) \ 3802 { \ 3803 const VEC_DATA_TYPE(int, size) clamp = -(1 << (k_fractional_bits + 5)); \ 3804 result = ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(a < clamp, size), 0, result, size); \ 3805 } \ 3806 \ 3807 const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \ 3808 return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_ZERO(a, size), Q0_one, result, size); \ 3809 } 3810 3811/** Calculates the product of a integer value by a power of two, with either a positive exponent 3812 * (equivalent to an arithmetic left shift, saturating) or a negative exponent 3813 * (equivalent to an arithmetic right shift, rounding to nearest). 3814 * 3815 * @param[in] size Size of vector. 3816 * 3817 * @return Arithmetic left or right shift. 3818 */ 3819#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size) \ 3820 inline VEC_DATA_TYPE(int, size) asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \ 3821 { \ 3822 if(exponent < 0) \ 3823 { \ 3824 return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size); \ 3825 } \ 3826 \ 3827 const VEC_DATA_TYPE(int, size) min = INT_MIN; \ 3828 const VEC_DATA_TYPE(int, size) max = INT_MAX; \ 3829 int threshold = ((1 << (31 - exponent)) - 1); \ 3830 VEC_DATA_TYPE(int, size) \ 3831 positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size); \ 3832 VEC_DATA_TYPE(int, size) \ 3833 negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size); \ 3834 VEC_DATA_TYPE(int, size) \ 3835 result = x << exponent; \ 3836 result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size); \ 3837 result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size); \ 3838 return result; \ 3839 } 3840 3841/** Calculates (a+b)/2, rounded to the nearest integer. 3842 * Equivalent to VRHADD in the ARM NEON instruction set. 3843 * 3844 * @param[in] size Size of vector. 3845 * 3846 * @return (a+b)/2, rounded to the nearest integer. 3847 */ 3848#define ASYMM_ROUNDING_HALF_SUM_IMPL(size) \ 3849 inline VEC_DATA_TYPE(int, size) asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \ 3850 { \ 3851 VEC_DATA_TYPE(long, size) \ 3852 a64 = convert_long##size(a); \ 3853 VEC_DATA_TYPE(long, size) \ 3854 b64 = convert_long##size(b); \ 3855 VEC_DATA_TYPE(long, size) \ 3856 sum = a64 + b64; \ 3857 const VEC_DATA_TYPE(long, size) one = 1; \ 3858 const VEC_DATA_TYPE(long, size) minus_one = -1; \ 3859 VEC_DATA_TYPE(long, size) \ 3860 sign = select(minus_one, one, (SELECT_VEC_DATA_TYPE(long, size))(sum >= 0)); \ 3861 return convert_int##size((sum + sign) / 2); \ 3862 } 3863 3864/** Calculates \f$ 1 / (1 + x) \f$ for x in (0, 1). 3865 * 3866 * @param[in] size Size of vector. 3867 * 3868 * @return Result in fixed-point format Q0. 3869 */ 3870#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(size) \ 3871 inline VEC_DATA_TYPE(int, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(VEC_DATA_TYPE(int, size) a) \ 3872 { \ 3873 const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \ 3874 const VEC_DATA_TYPE(int, size) Q2_one = 1 << (31 - 2); \ 3875 VEC_DATA_TYPE(int, size) \ 3876 half_denominator = ASYMM_ROUNDING_HALF_SUM(a, Q0_one, size); \ 3877 const VEC_DATA_TYPE(int, size) Q2_48_over_17 = 1515870810; \ 3878 const VEC_DATA_TYPE(int, size) Q2_neg_32_over_17 = -1010580540; \ 3879 VEC_DATA_TYPE(int, size) \ 3880 x = Q2_48_over_17 + ASYMM_MULT(half_denominator, Q2_neg_32_over_17, size); \ 3881 for(int i = 0; i < 3; i++) \ 3882 { \ 3883 VEC_DATA_TYPE(int, size) \ 3884 half_denominator_times_x = ASYMM_MULT(half_denominator, x, size); \ 3885 VEC_DATA_TYPE(int, size) \ 3886 one_minus_half_denominator_times_x = Q2_one - half_denominator_times_x; \ 3887 VEC_DATA_TYPE(int, size) \ 3888 tmp = ASYMM_MULT(x, one_minus_half_denominator_times_x, size); \ 3889 x = x + ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(tmp, 2, size); \ 3890 } \ 3891 return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, 1, size); \ 3892 } 3893 3894/** Considering the integer value as fixed-point, change the number of integer bits and update value accordingly. 3895 * 3896 * @param[in] size Size of vector. 3897 * 3898 * @return Rescaled value. 3899 */ 3900#define ASYMM_RESCALE_IMPL(size) \ 3901 inline VEC_DATA_TYPE(int, size) asymm_rescale##size(VEC_DATA_TYPE(int, size) value, int src_integer_bits, int dst_integer_bits) \ 3902 { \ 3903 int exponent = src_integer_bits - dst_integer_bits; \ 3904 return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size); \ 3905 } 3906 3907#define QUANTIZE_STR(input, offset, scale, type, size) quantize_##type##size(input, offset, scale) 3908#define QUANTIZE(input, offset, scale, type, size) QUANTIZE_STR(input, offset, scale, type, size) 3909#define DEQUANTIZE_STR(input, offset, scale, type, size) dequantize_##type##size(input, offset, scale) 3910#define DEQUANTIZE(input, offset, scale, type, size) DEQUANTIZE_STR(input, offset, scale, type, size) 3911 3912#define ASYMM_ROUNDING_DIVIDE_BY_POW2_STR(x, exponent, size) asymm_rounding_divide_by_POW2_##size(x, exponent) 3913#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) ASYMM_ROUNDING_DIVIDE_BY_POW2_STR(x, exponent, size) 3914#define ASYMM_MULT_STR(a, b, size) asymm_mult##size(a, b) 3915#define ASYMM_MULT(a, b, size) ASYMM_MULT_STR(a, b, size) 3916#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size) \ 3917 ASYMM_MULT(x *((VEC_DATA_TYPE(int, size))(1) << (-left_shift)), quantized_multiplier, size) 3918#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \ 3919 ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size) 3920#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a) 3921#define ASYMM_SELECT_USING_MASK(if_mask, then_val, else_val, size) asymm_select_using_mask##size(if_mask, then_val, else_val) 3922#define ASYMM_MASK_IF_ZERO(a, size) asymm_mask_if_zero##size(a) 3923#define ASYMM_MASK_IF_NON_ZERO(a, size) asymm_mask_if_non_zero##size(a) 3924#define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder, size) exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder) 3925#define ASYMM_EXP_ON_NEGATIVE_VALUES_STR(a, k_integer_bits, size) asymm_exp_on_negative_values##size(a, k_integer_bits) 3926#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) ASYMM_EXP_ON_NEGATIVE_VALUES_STR(a, k_integer_bits, size) 3927#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(a) 3928#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size) 3929#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) asymm_saturating_rounding_mult_by_pow2##size(x, exponent) 3930#define ASYMM_ROUNDING_HALF_SUM(a, b, size) asymm_rounding_half_sum##size(a, b) 3931#define ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size) asymm_rescale##size(value, src_integer_bits, dst_integer_bits) 3932#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size) 3933 3934#define MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size) \ 3935 inline VEC_DATA_TYPE(int, size) multiply_by_quantized_multiplier##size(VEC_DATA_TYPE(int, size) input, int qmul, int shift) \ 3936 { \ 3937 const int left_shift = shift > 0 ? shift : 0; \ 3938 const int right_shift = shift > 0 ? 0 : -shift; \ 3939 return ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(input * (1 << left_shift), qmul, size), right_shift, size); \ 3940 } 3941#define MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, size) multiply_by_quantized_multiplier##size(input, qmul, shift) 3942 3943QUANTIZE_IMPL(uchar, 1) 3944QUANTIZE_IMPL(char, 1) 3945QUANTIZE_IMPL(uint, 1) 3946QUANTIZE_IMPL(int, 1) 3947QUANTIZE_IMPL(uchar, 4) 3948QUANTIZE_IMPL(ushort, 4) 3949QUANTIZE_IMPL(short, 4) 3950QUANTIZE_IMPL(uchar, 16) 3951QUANTIZE_IMPL(char, 16) 3952QUANTIZE_IMPL(ushort, 16) 3953QUANTIZE_IMPL(short, 16) 3954QUANTIZE_IMPL(uint, 16) 3955QUANTIZE_IMPL(int, 16) 3956 3957DEQUANTIZE_IMPL(uchar, 1) 3958DEQUANTIZE_IMPL(char, 1) 3959DEQUANTIZE_IMPL(uint, 1) 3960DEQUANTIZE_IMPL(int, 1) 3961DEQUANTIZE_IMPL(uchar, 4) 3962DEQUANTIZE_IMPL(ushort, 4) 3963DEQUANTIZE_IMPL(short, 4) 3964DEQUANTIZE_IMPL(uchar, 16) 3965DEQUANTIZE_IMPL(char, 16) 3966DEQUANTIZE_IMPL(ushort, 16) 3967DEQUANTIZE_IMPL(short, 16) 3968DEQUANTIZE_IMPL(uint, 16) 3969DEQUANTIZE_IMPL(int, 16) 3970 3971ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(1) 3972ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(2) 3973ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(3) 3974ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(4) 3975ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(8) 3976ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(16) 3977 3978ASYMM_MULT_IMPL(1) 3979ASYMM_MULT_IMPL(2) 3980ASYMM_MULT_IMPL(3) 3981ASYMM_MULT_IMPL(4) 3982ASYMM_MULT_IMPL(8) 3983ASYMM_MULT_IMPL(16) 3984 3985ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(1) 3986ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(2) 3987ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(3) 3988ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(4) 3989ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(8) 3990ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(16) 3991 3992ASYMM_SELECT_USING_MASK_IMPL(1) 3993ASYMM_SELECT_USING_MASK_IMPL(2) 3994ASYMM_SELECT_USING_MASK_IMPL(3) 3995ASYMM_SELECT_USING_MASK_IMPL(4) 3996ASYMM_SELECT_USING_MASK_IMPL(8) 3997ASYMM_SELECT_USING_MASK_IMPL(16) 3998 3999ASYMM_MASK_IF_ZERO_IMPL(1) 4000ASYMM_MASK_IF_ZERO_IMPL(2) 4001ASYMM_MASK_IF_ZERO_IMPL(3) 4002ASYMM_MASK_IF_ZERO_IMPL(4) 4003ASYMM_MASK_IF_ZERO_IMPL(8) 4004ASYMM_MASK_IF_ZERO_IMPL(16) 4005 4006ASYMM_MASK_IF_NON_ZERO_IMPL(1) 4007ASYMM_MASK_IF_NON_ZERO_IMPL(2) 4008ASYMM_MASK_IF_NON_ZERO_IMPL(3) 4009ASYMM_MASK_IF_NON_ZERO_IMPL(4) 4010ASYMM_MASK_IF_NON_ZERO_IMPL(8) 4011ASYMM_MASK_IF_NON_ZERO_IMPL(16) 4012 4013EXP_BARREL_SHIFTER_IMPL(1) 4014EXP_BARREL_SHIFTER_IMPL(2) 4015EXP_BARREL_SHIFTER_IMPL(3) 4016EXP_BARREL_SHIFTER_IMPL(4) 4017EXP_BARREL_SHIFTER_IMPL(8) 4018EXP_BARREL_SHIFTER_IMPL(16) 4019 4020ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(1) 4021ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(2) 4022ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(3) 4023ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(4) 4024ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(8) 4025ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(16) 4026 4027ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(1) 4028ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(2) 4029ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(3) 4030ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(4) 4031ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(8) 4032ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(16) 4033 4034ASYMM_ROUNDING_HALF_SUM_IMPL(1) 4035ASYMM_ROUNDING_HALF_SUM_IMPL(2) 4036ASYMM_ROUNDING_HALF_SUM_IMPL(3) 4037ASYMM_ROUNDING_HALF_SUM_IMPL(4) 4038ASYMM_ROUNDING_HALF_SUM_IMPL(8) 4039ASYMM_ROUNDING_HALF_SUM_IMPL(16) 4040 4041ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(1) 4042ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(2) 4043ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(3) 4044ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(4) 4045ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(8) 4046ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(16) 4047 4048ASYMM_RESCALE_IMPL(1) 4049ASYMM_RESCALE_IMPL(2) 4050ASYMM_RESCALE_IMPL(3) 4051ASYMM_RESCALE_IMPL(4) 4052ASYMM_RESCALE_IMPL(8) 4053ASYMM_RESCALE_IMPL(16) 4054 4055MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(1) 4056MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(2) 4057MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(3) 4058MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(4) 4059MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(8) 4060MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(16) 4061 4062#endif // ARM_COMPUTE_HELPERS_ASYMM_H 4063 4064/** Clamps the given coordinates to the borders according to the border size. 4065 * 4066 * @param[in] coords Vector of 2D coordinates to clamp. Even positions are X coords, odd positions are Y coords. 4067 * @param[in] width Width of the image 4068 * @param[in] height Height of the image 4069 * @param[in] border_size Border size of the image 4070 * 4071 */ 4072inline const float8 clamp_to_border_with_size_quantized(float8 coords, const float width, const float height, const float border_size) 4073{ 4074 const float4 clamped_x = clamp(coords.even, 0.0f - border_size, width - 1 + border_size); 4075 const float4 clamped_y = clamp(coords.odd, 0.0f - border_size, height - 1 + border_size); 4076 return (float8)(clamped_x.s0, clamped_y.s0, clamped_x.s1, clamped_y.s1, clamped_x.s2, clamped_y.s2, clamped_x.s3, clamped_y.s3); 4077} 4078 4079/* FIXME(COMPMID-682): Clamp border properly in UNDEFINED border mode in Warp, Scale, Remap */ 4080/** Clamps the given coordinates to the borders. 4081 * 4082 * @param[in] coords Vector of 2D coordinates to clamp. Even positions are X coords, odd positions are Y coords. 4083 * @param[in] width Width of the image 4084 * @param[in] height Height of the image 4085 * 4086 */ 4087inline const float8 clamp_to_border_quantized(float8 coords, const float width, const float height) 4088{ 4089 return clamp_to_border_with_size_quantized(coords, width, height, 1); 4090} 4091 4092/** Given a texel coordinates this function will return the following array of coordinates: 4093 * [ P, right neighbour, below neighbour, below right neighbour ] 4094 * 4095 * @note No checks to see if the coordinates are out of the image are done here. 4096 * 4097 * @param[in] coord Input coordinates 4098 * 4099 * @return vector of 8 floats with the coordinates, even positions are x and odd y. 4100 */ 4101inline const float8 get_neighbour_coords_quantized(const float2 coord) 4102{ 4103 return (float8)(/*tl*/ coord.s0, coord.s1, /*tr*/ coord.s0 + 1, coord.s1, /*bl*/ coord.s0, coord.s1 + 1, /*br*/ coord.s0 + 1, coord.s1 + 1); 4104} 4105 4106/** Returns the current thread coordinates. */ 4107inline const float2 get_current_coords_quantized() 4108{ 4109 return (float2)(get_global_id(0) * 4, get_global_id(1)); 4110} 4111 4112/** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values 4113 * 4114 * @param[in] in Pointer to the source image. 4115 * @param[in] coords Vector of four 2D coordinates. Even pos is x and odd y. 4116 * @param[in] width Width of the image 4117 * @param[in] height Height of the image 4118 * @param[in] border_size Border size 4119 * @param[in] scale Scale value 4120 * @param[in] offset_qasymm Offset value 4121 */ 4122inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate_with_border_quantized(const Image *in, const float8 coords, const float width, const float height, const float border_size, 4123 const float scale, const int offset_qasymm) 4124{ 4125 // If any of the 4 texels is out of the image's boundaries we use the border value (REPLICATE or CONSTANT) for any texel out of the image. 4126 4127 // Sets the 4x4 coordinates for each of the four input texels 4128 const float8 fc = floor(coords); 4129 const float16 c1 = (float16)( 4130 clamp_to_border_with_size_quantized(get_neighbour_coords_quantized((float2)(fc.s0, fc.s1)), width, height, border_size), 4131 clamp_to_border_with_size_quantized(get_neighbour_coords_quantized((float2)(fc.s2, fc.s3)), width, height, border_size)); 4132 const float16 c2 = (float16)( 4133 clamp_to_border_with_size_quantized(get_neighbour_coords_quantized((float2)(fc.s4, fc.s5)), width, height, border_size), 4134 clamp_to_border_with_size_quantized(get_neighbour_coords_quantized((float2)(fc.s6, fc.s7)), width, height, border_size)); 4135 4136 // Loads the values from the input image 4137 const int16 t = (int16)( 4138 /* tl, tr, bl, br */ 4139 * ((__global DATA_TYPE *)offset(in, c1.s0, c1.s1)), *((__global DATA_TYPE *)offset(in, c1.s2, c1.s3)), 4140 *((__global DATA_TYPE *)offset(in, c1.s4, c1.s5)), *((__global DATA_TYPE *)offset(in, c1.s6, c1.s7)), 4141 *((__global DATA_TYPE *)offset(in, c1.s8, c1.s9)), *((__global DATA_TYPE *)offset(in, c1.sa, c1.sb)), 4142 *((__global DATA_TYPE *)offset(in, c1.sc, c1.sd)), *((__global DATA_TYPE *)offset(in, c1.se, c1.sf)), 4143 *((__global DATA_TYPE *)offset(in, c2.s0, c2.s1)), *((__global DATA_TYPE *)offset(in, c2.s2, c2.s3)), 4144 *((__global DATA_TYPE *)offset(in, c2.s4, c2.s5)), *((__global DATA_TYPE *)offset(in, c2.s6, c2.s7)), 4145 *((__global DATA_TYPE *)offset(in, c2.s8, c2.s9)), *((__global DATA_TYPE *)offset(in, c2.sa, c2.sb)), 4146 *((__global DATA_TYPE *)offset(in, c2.sc, c2.sd)), *((__global DATA_TYPE *)offset(in, c2.se, c2.sf))); 4147 4148 const float16 inf32 = convert_float16(t - (int16)offset_qasymm) * (float16)scale; 4149 4150 const float8 a = coords - fc; 4151 const float8 b = ((float8)(1.f)) - a; 4152 const float4 fr = (float4)( 4153 ((inf32.s0 * b.s0 * b.s1) + (inf32.s1 * a.s0 * b.s1) + (inf32.s2 * b.s0 * a.s1) + (inf32.s3 * a.s0 * a.s1)), 4154 ((inf32.s4 * b.s2 * b.s3) + (inf32.s5 * a.s2 * b.s3) + (inf32.s6 * b.s2 * a.s3) + (inf32.s7 * a.s2 * a.s3)), 4155 ((inf32.s8 * b.s4 * b.s5) + (inf32.s9 * a.s4 * b.s5) + (inf32.sa * b.s4 * a.s5) + (inf32.sb * a.s4 * a.s5)), 4156 ((inf32.sc * b.s6 * b.s7) + (inf32.sd * a.s6 * b.s7) + (inf32.se * b.s6 * a.s7) + (inf32.sf * a.s6 * a.s7))); 4157 4158 const VEC_DATA_TYPE(DATA_TYPE, 4) res = CONVERT_SAT(convert_int4_sat_rtp(fr / scale) + offset_qasymm, VEC_DATA_TYPE(DATA_TYPE, 4)); 4159 4160 return res; 4161} 4162 4163/* FIXME(COMPMID-682): Clamp border properly in UNDEFINED border mode in Warp, Scale, Remap */ 4164/** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values 4165 * 4166 * @param[in] in Pointer to the source image. 4167 * @param[in] coords Vector of four 2D coordinates. Even pos is x and odd y. 4168 * @param[in] width Width of the image 4169 * @param[in] height Height of the image 4170 * @param[in] scale Scale value 4171 * @param[in] offset_qasymm Offset value 4172 */ 4173inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate_quantized(const Image *in, const float8 coords, const float width, const float height, const float scale, const int offset_qasymm) 4174{ 4175 return bilinear_interpolate_with_border_quantized(in, coords, width, height, 1, scale, offset_qasymm); 4176} 4177 4178/** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates. 4179 * 4180 * @param[in] coord 2D coordinates to transform. 4181 * @param[in] scale input/output scale ratio 4182 * 4183 * @return a float8 containing 4 2D transformed values in the input image. 4184 */ 4185inline const float8 transform_bilinear_quantized(const float2 coord, const float2 scale) 4186{ 4187 const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0); 4188#ifdef SAMPLING_POLICY_TOP_LEFT 4189 const float4 new_x = in_x_coords * (float4)(scale.s0); 4190 const float4 new_y = (float4)(coord.s1 * scale.s1); 4191 return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3); 4192#elif SAMPLING_POLICY_CENTER 4193 const float4 new_x = (in_x_coords + ((float4)(0.5f))) * (float4)(scale.s0) - (float4)(0.5f); 4194 const float4 new_y = (float4)((coord.s1 + 0.5f) * scale.s1 - 0.5f); 4195 return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3); 4196#else /* SAMPLING_POLICY */ 4197#error("Unsupported sampling policy"); 4198#endif /* SAMPLING_POLICY */ 4199} 4200 4201/** Performs an affine transformation on an image interpolating with the BILINEAR method. 4202 * 4203 * @note Sampling policy to used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT 4204 * @note Scale value for QASYMM8 data type to used is passed as -DSCALE=<VALUE> e.g. -DSCALE=0.5 4205 * @note Offset value for QASYMM8 data type to used is passed as -DOFFSET=<VALUE> e.g. -DOFFSET=1 4206 * 4207 * @param[in] in_ptr Pointer to the source image. Supported data types: QASYMM8. 4208 * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) 4209 * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes) 4210 * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes) 4211 * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) 4212 * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image 4213 * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16. (Must be the same as the input) 4214 * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) 4215 * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) 4216 * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) 4217 * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) 4218 * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image 4219 * @param[in] input_width Input image width 4220 * @param[in] input_height Input image height 4221 * @param[in] scale_x The scale factor along x dimension 4222 * @param[in] scale_y The scale factor along y dimension 4223 */ 4224__kernel void scale_bilinear_quantized_nchw( 4225 IMAGE_DECLARATION(in), 4226 IMAGE_DECLARATION(out), 4227 const float input_width, 4228 const float input_height, 4229 const float scale_x, 4230 const float scale_y) 4231{ 4232 Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in); 4233 Image out = CONVERT_TO_IMAGE_STRUCT(out); 4234 const float2 r = (float2)(scale_x, scale_y); 4235 const float8 tc = transform_bilinear_quantized(get_current_coords_quantized(), r); 4236 vstore4(bilinear_interpolate_with_border_quantized(&in, tc, input_width, input_height, BORDER_SIZE, SCALE, OFFSET), 0, (__global DATA_TYPE *)out.ptr); 4237} 4238 4239#if defined(DEPTH_OUT) 4240/** Performs scale on an image interpolating with the BILINEAR method. (NHWC) 4241 * 4242 * @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT 4243 * @note Scale value for QASYMM8 data type to used is passed as -DSCALE=<VALUE> e.g. -DSCALE=0.5 4244 * @note Offset value for QASYMM8 data type to used is passed as -DOFFSET=<VALUE> e.g. -DOFFSET=1 4245 * @note If border mode replicate is used, is should be passed as -DBORDER_MODE_REPLICATE 4246 * @note Output tensor's depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH=16 4247 * 4248 * @param[in] in_ptr Pointer to the source image. Supported data types: QASYMM8. 4249 * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) 4250 * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes) 4251 * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes) 4252 * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) 4253 * @param[in] in_stride_z Stride of the source image in Z dimension (in bytes) 4254 * @param[in] in_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) 4255 * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image 4256 * @param[out] out_ptr Pointer to the destination image. Supported data types: same as @p in_ptr 4257 * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) 4258 * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) 4259 * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) 4260 * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) 4261 * @param[in] out_stride_z Stride of the destination image in Z dimension (in bytes) 4262 * @param[in] out_step_z dst_stride_y * number of elements along Z processed per workitem(in bytes) 4263 * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image 4264 * @param[in] input_width Input image width 4265 * @param[in] input_height Input image height 4266 * @param[in] scale_x The scale factor along x dimension 4267 * @param[in] scale_y The scale factor along y dimension 4268 */ 4269__kernel void scale_bilinear_quantized_nhwc( 4270 TENSOR4D_DECLARATION(in), 4271 TENSOR4D_DECLARATION(out), 4272 const float input_width, 4273 const float input_height, 4274 const float scale_x, 4275 const float scale_y) 4276{ 4277 Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(in, 0); 4278 Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT); 4279 4280#ifdef SAMPLING_POLICY_TOP_LEFT 4281 const float new_x = get_global_id(1) * scale_x; 4282 const float new_y = (get_global_id(2) % DEPTH_OUT) * scale_y; 4283#elif SAMPLING_POLICY_CENTER 4284 const float new_x = (get_global_id(1) + 0.5f) * scale_x - 0.5f; 4285 const float new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * scale_y - 0.5f; 4286#else /* SAMPLING_POLICY */ 4287#error("Unsupported sampling policy"); 4288#endif /* SAMPLING_POLICY */ 4289 4290 const float new_xf = floor(new_x); 4291 const float new_yf = floor(new_y); 4292 float clamped_x = clamp(new_xf, 0.0f, input_width - 1); 4293 float clamped_x1 = clamp(new_xf + 1, 0.0f, input_width - 1); 4294 float clamped_x_ = clamped_x; 4295 float clamped_x1_ = clamped_x1; 4296 const float clamped_y = clamp(new_yf, 0.0f, input_height - 1); 4297 const float clamped_y1 = clamp(new_yf + 1, 0.0f, input_height - 1); 4298 4299#ifndef BORDER_MODE_REPLICATE 4300 clamped_x1 = select(clamped_x1, 0.0f - BORDER_SIZE, new_yf + 1 < 0.f || new_yf + 1 > input_height - 1 || new_xf + 1 < 0.f || new_xf + 1 > input_width - 1); 4301 clamped_x_ = select(clamped_x_, 0.0f - BORDER_SIZE, new_yf + 1 > input_height - 1 || new_xf < 0.f || new_xf > input_width - 1); 4302 clamped_x = select(clamped_x, 0.0f - BORDER_SIZE, new_yf < 0.f || new_yf > input_height - 1 || new_xf < 0.f || new_xf > input_width - 1); 4303 clamped_x1_ = select(clamped_x1_, 0.0f - BORDER_SIZE, new_xf + 1 < 0.f || new_xf + 1 > input_width - 1 || new_yf < 0.f || new_yf > input_height - 1); 4304#endif /* BORDER_MODE_REPLICATE */ 4305 4306 int4 ins = (int4)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))), 4307 *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1_), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))), 4308 *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x_), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))), 4309 *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT)))); 4310 4311 const float a = new_x - new_xf; 4312 const float b = 1.f - a; 4313 const float a1 = new_y - new_yf; 4314 const float b1 = 1.f - a1; 4315 const float4 insf32 = convert_float4(ins - (int4)OFFSET) * (float4)SCALE; 4316 4317 const float fr = ((insf32.s0 * b * b1) + (insf32.s1 * a * b1) + (insf32.s2 * b * a1) + (insf32.s3 * a * a1)); 4318 4319 DATA_TYPE res = CONVERT_SAT(convert_int_sat_rtp(fr / SCALE) + OFFSET, DATA_TYPE); 4320 4321 *((__global DATA_TYPE *)out.ptr) = res; 4322} 4323#endif /* defined(DEPTH_OUT) */ 4324 4325)"