R"(

/*
 * Copyright (c) 2016-2019 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
/*
 * Copyright (c) 2016-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#ifndef ARM_COMPUTE_HELPER_H
#define ARM_COMPUTE_HELPER_H

/*
 * Copyright (c) 2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

/** Store the 0 to (n-1)th rows of the given variables
 * @name STORE_ROW_n
 *
 * @param[in] N0        The width of the passed in vector. Supported: 1, 2, 3, 4, 8, 16
 * @param[in] DATA_TYPE The data type of the vectors
 * @param[in] BASENAME  The basename of the variables
 * @param[in] PTR       The base pointer
 * @param[in] STRIDE_Y  The stride value in y-axis direction
 * @param[in] Z         The offset in z-axis direction
 * @{
 */
#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
/** @} */ // end of group STORE_ROW_n

/** Convert and store the 0th to (n-1)th rows of the given variables
 * @name CONVERT_STORE_ROW_n
 *
 * @param[in] N0        The size of the vectors
 * @param[in] DATA_TYPE The data type of the vectors
 * @param[in] BASENAME  The basename of the variables
 * @param[in] PTR       The base pointer
 * @param[in] STRIDE_Y  The stride value in y-axis direction
 * @param[in] Z         The offset in z-axis direction
 * @{
 */
#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));

/** @} */ // end of group CONVERT_STORE_ROW_n

/** Store a block of the given size M0xN0
 * @name STORE_BLOCK
 *
 * Supported cases are M0=1,2,3,...,16 and N0=2,3,4,8,16.
 * The data to store is expected to have consecutive names for each row.
 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
 * The Z offset is expected to have consecutive names.
 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
 *
 * @param[in] M0        The number of rows to store
 * @param[in] N0        The size of each vector
 * @param[in] DATA_TYPE The data type of the vectors
 * @param[in] BASENAME  The basename of the variables
 * @param[in] PTR       The base pointer
 * @param[in] STRIDE_Y  The stride value in y-axis direction
 * @param[in] Z         The offset in z-axis direction
 * @{
 */
#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
/** @} */ // end of group STORE_BLOCK
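
// Example (illustrative, hypothetical kernel variables): assuming a kernel has
// computed three float4 accumulators named c0, c1 and c2,
//
//   STORE_BLOCK(3, 4, float, c, dst_addr, dst_stride_y, zout);
//
// expands to three vstore4 calls that write c0, c1 and c2 at
// dst_addr + 0/1/2 * dst_stride_y (+ zout0/zout1/zout2). dst_addr and
// zout0..zout2 are not defined here; they must exist in the calling kernel.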

/** Convert and store a block of the given size M0xN0
 * @name CONVERT_STORE_BLOCK
 *
 * Supported cases are M0=1,2,3,...,16 and N0=2,3,4,8,16.
 * The data to store is expected to have consecutive names for each row.
 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
 * The Z offset is expected to have consecutive names.
 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
 *
 * @param[in] M0        The number of rows to store
 * @param[in] N0        The size of each vector
 * @param[in] DATA_TYPE The data type of the vectors
 * @param[in] BASENAME  The basename of the variables
 * @param[in] PTR       The base pointer
 * @param[in] STRIDE_Y  The stride value in y-axis direction
 * @param[in] Z         The offset in z-axis direction
 * @{
 */
#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
/** @} */ // end of group CONVERT_STORE_BLOCK
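
// Example (illustrative, hypothetical kernel variables): with float4
// accumulators c0..c2 and a uchar destination buffer,
//
//   CONVERT_STORE_BLOCK(3, 4, uchar, c, dst_addr, dst_stride_y, zout);
//
// saturates each float4 row to uchar4 via CONVERT_SAT (convert_uchar4_sat)
// before storing it, which is the typical path for quantized outputs.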

/** Partially store the 0 to (n-1)th rows of the given variables
 * @name STORE_ROW_PARTIAL_n
 * Within each row, store the lower @p STORE_N0 elements of vectors of width @p N0
 *
 * @note in case @p STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring a small performance penalty.
 *
 * @param[in] N0        The width of the passed in vector. Supported: 1, 2, 3, 4, 8, 16
 * @param[in] STORE_N0  The **lower** size of the vectors to store. Supported: 1-16 and <= @p N0
 * @param[in] DATA_TYPE The data type of the vectors
 * @param[in] BASENAME  The basename of the variables
 * @param[in] PTR       The base pointer
 * @param[in] STRIDE_Y  The stride value in y-axis direction
 * @param[in] Z         The offset in z-axis direction
 * @{
 */
#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
/** @} */ // end of group STORE_ROW_PARTIAL_n

/** Partially store a block of the given size STORE_M0xSTORE_N0
 * @name STORE_BLOCK_PARTIAL
 *
 * @note The vector width @p N0 is also required for correct partial storing behaviour.
 * @note in case @p STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring a small performance penalty.
 *
 * The data to store is expected to have consecutive names for each row.
 * E.g., for STORE_M0=3 and basename=c, the expected names are c0, c1 and c2.
 * The Z offset is expected to have consecutive names.
 * E.g., for STORE_M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
 *
 * @param[in] STORE_M0  The number of rows to store. Supported: 1-16
 * @param[in] STORE_N0  The lower number of elements of vectors to store. Supported: 1-16 and <= @p N0
 * @param[in] N0        The size of each vector. Supported: 1, 2, 3, 4, 8, 16
 * @param[in] DATA_TYPE The data type of the vectors
 * @param[in] BASENAME  The basename of the variables
 * @param[in] PTR       The base pointer
 * @param[in] STRIDE_Y  The stride value in y-axis direction
 * @param[in] Z         The offset in z-axis direction
 * @{
 */
#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
/** Store a block that can be partial in both x and y dimensions
 *
 * @note in case @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring a small performance penalty.
 *
 * The data to store is expected to have consecutive names for each row.
 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
 * The Z offset is expected to have consecutive names.
 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
 *
 * @param[in] M0               The number of rows to store, for non-partial blocks. Supported: 1-16
 * @param[in] N0               The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
 * @param[in] DATA_TYPE        The data type of the vectors
 * @param[in] BASENAME         The basename of the variables
 * @param[in] PTR              The base pointer
 * @param[in] STRIDE_Y         The stride value in y-axis direction
 * @param[in] Z                The offset in z-axis direction
 * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported range: [1, @p M0)
 * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported range: [1, @p N0)
 * @param[in] PARTIAL_COND_Y   Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0.
 * @param[in] PARTIAL_COND_X   Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0.
 */
#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \
    { \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \
    { \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \
    { \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else \
    { \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }
/** Store a block that can only be partial in x but not y.
 *
 * @note in case @p N0 or @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring a small performance penalty.
 *
 * The data to store is expected to have consecutive names for each row.
 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
 * The Z offset is expected to have consecutive names.
 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
 *
 * @param[in] M0               The number of rows to store, for non-partial blocks. Supported: 1-16
 * @param[in] N0               The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
 * @param[in] DATA_TYPE        The data type of the vectors
 * @param[in] BASENAME         The basename of the variables
 * @param[in] PTR              The base pointer
 * @param[in] STRIDE_Y         The stride value in y-axis direction
 * @param[in] Z                The offset in z-axis direction
 * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported range: [1, @p N0)
 * @param[in] PARTIAL_COND_X   Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0.
 */
#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X)) \
    { \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else \
    { \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }
/** Store a block that can only be partial in y but not x.
 *
 * @note in case @p N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring a small performance penalty.
 *
 * The data to store is expected to have consecutive names for each row.
 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
 * The Z offset is expected to have consecutive names.
 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
 *
 * @param[in] M0               The number of rows to store, for non-partial blocks. Supported: 1-16
 * @param[in] N0               The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
 * @param[in] DATA_TYPE        The data type of the vectors
 * @param[in] BASENAME         The basename of the variables
 * @param[in] PTR              The base pointer
 * @param[in] STRIDE_Y         The stride value in y-axis direction
 * @param[in] Z                The offset in z-axis direction
 * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported range: [1, @p M0)
 * @param[in] PARTIAL_COND_Y   Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0.
 */
#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
    if(!(PARTIAL_COND_Y)) \
    { \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else \
    { \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }
/** @} */ // end of group STORE_BLOCK_PARTIAL
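
// Example (illustrative, hypothetical kernel variables): storing a 4x8 float
// block where the last work-item column has only 5 valid elements left in x.
// With a predicate such as
//   x_cond = (get_global_id(0) == (int)get_global_size(0) - 1)
// the call
//
//   STORE_BLOCK_PARTIAL_IN_X(4, 8, float, c, dst_addr, dst_stride_y, zout, 5, x_cond);
//
// emits full 8-wide vstores when x_cond is false, and a 5-element partial
// store per row (a vstore4 followed by a vstore1) when it is true.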

#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)

/** Boundary-aware GEMM block store
 * @name STORE_BLOCK_BOUNDARY_AWARE
 * This macro assumes the following schemes to achieve boundary-awareness:
 * - Overlapping load in Y axis from lhs tensor. This implies lhs has no padding along y dim.
 * - Non-Overlapping (normal) load from rhs tensor. This implies rhs can have paddings.
 * - Overlapping load in Y axis from bias tensor. This implies bias has no padding along y dim.
 * The macro then ensures that the dst tensor can be stored without any paddings in both x and y dim.
 *
 * In the y dimension, we place the partial blocks **at the beginning** while in the x dimension, we place the partial
 * blocks **at the end**.
 * Say the dst tensor is of shape MxN and we have M0 and N0 as the block size, this is how we define "partial blocks"/
 * "boundary blocks" (we use the two terms "partial blocks" and "boundary blocks" interchangeably) and their various parameters:
 *
 *     *--x-->                        x == 0                        x == 1
 *     |                 |<------------------------------N-------------------------->|
 *     y                 |<--------------N0------------->|<----PARTIAL_STORE_N0----->|
 *     |    -------------#############################################################
 *     *    |           |...............................|...........................|
 * y == 0   | PAR_..._M0 |......Boundary block in y......|.Boundary block in x and y.|
 *     |    |           |...............................|...........................|
 *     M    -------------#############################################################
 *     |    |           |                               |...........................|
 * y == 1   |        M0  |      Non-boundary block      |....Boundary block in x....|
 *     |    |           |                               |...........................|
 *     |    -------------#############################################################
 *
 * Then @p PARTIAL_STORE_M0 = M % M0 and @p PARTIAL_STORE_N0 = N % N0
 *
 * @note in case @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring a small performance penalty.
 *
 * It automatically detects if a given M,N,M0,N0 combination can yield partial blocks in either the X or Y dimension,
 * and selects the corresponding store methods such that the boundary detection logic is only added when needed.
 *
 * The data to store is expected to have consecutive names for each row.
 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
 * The Z offset is expected to have consecutive names.
 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
 *
 * @param[in] M0               The number of rows to store, for non-partial blocks. Supported: 1-16
 * @param[in] N0               The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
 * @param[in] DATA_TYPE        The data type of the vectors
 * @param[in] BASENAME         The basename of the variables
 * @param[in] PTR              The base pointer
 * @param[in] STRIDE_Y         The stride value in y-axis direction
 * @param[in] Z                The offset in z-axis direction
 * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported: [0, @p M0)
 * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported: [0, @p N0)
 * @param[in] PARTIAL_COND_Y   Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0.
 * @param[in] PARTIAL_COND_X   Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0.
 * @{
 */
#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
// Case1: No partial blocks in either x or y
#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
// Case2: Partial blocks in y
#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)

#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
// Case3: Partial blocks in x
#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)

#else // PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 > 0
// Case4: Partial blocks in both x and y
#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)

#endif // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0

#endif // defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
/** @} */ // end of group STORE_BLOCK_BOUNDARY_AWARE
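
// Example (illustrative): for M = 10, N = 21, M0 = 4, N0 = 8, the host would
// build the kernel with -DPARTIAL_STORE_M0=2 -DPARTIAL_STORE_N0=5 (M % M0 and
// N % N0), selecting Case4 above. Hypothetical conditions such as
//   y_cond = (get_global_id(1) == 0);                           // partial rows come first in y
//   x_cond = (get_global_id(0) == (int)get_global_size(0) - 1); // partial columns come last in x
// then steer STORE_BLOCK_BOUNDARY_AWARE to the matching partial store variant.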

#if defined(PARTIAL_STORE_M0)
/** Compute the start m0 row (LHS, BIAS and DST) in a boundary-aware way so as to avoid padding
 * @name COMPUTE_M0_START_ROW
 * If there are any partial blocks in y dimension, they are placed at the beginning of the rows.
 * This shift amount is added to all rows such that the partial block (at the beginning) overlaps with the subsequent
 * blocks in the y dimension to avoid any padding.
 * E.g., M0=4, PARTIAL_STORE_M0=1:
 *                   | Non-overlapping | +M0_ROW_SHIFT (Overlapping)
 * block 0 (partial) | start row = 0   | start row = 0
 * block 1 (full)    | start row = 4   | start row = 1
 * block 2 (full)    | start row = 8   | start row = 5
 *
 * @param[in] y                Global id of current block in y.
 * @param[in] M0               The number of rows to store, for non-partial blocks. Supported: 1-16
 * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported: [0, @p M0)
 * @{
 */
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0))))
#else // defined(PARTIAL_STORE_M0)
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(y * M0))
#endif // defined(PARTIAL_STORE_M0)
/** @} */ // end of group COMPUTE_M0_START_ROW
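
// Worked example (illustrative): with M0 = 4 and PARTIAL_STORE_M0 = 1 the row
// shift is (4 - 1) % 4 = 3, so COMPUTE_M0_START_ROW(2, 4, 1) evaluates to
// max(0, 2 * 4 - 3) = 5, matching the "+M0_ROW_SHIFT" column in the table above.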

/** Store a vector that can only be partial in x.
 *
 * @note in case @p vec_size or @p leftover != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring a small performance penalty.
 *
 * The data to store is expected to end in a 0.
 * E.g., for basename=c, the expected name is c0.
 *
 * @param[in] basename  The name of the variable without trailing 0
 * @param[in] data_type The data type of the vector
 * @param[in] ptr       The base pointer
 * @param[in] vec_size  The vector size if cond = false. Supported: 1, 2, 3, 4, 8, 16
 * @param[in] leftover  The vector size if cond = true. Supported range: [1, @p vec_size)
 * @param[in] cond      Condition to select either vec_size or leftover
 * @{
 */
#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)
/** @} */ // end of group STORE_VECTOR_SELECT
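
// Example (illustrative, hypothetical names): writing a float8 result res0
// when only PARTIAL_N0 elements remain valid at the tensor edge:
//
//   STORE_VECTOR_SELECT(res, float, out_addr, 8, PARTIAL_N0, x_cond);
//
// PARTIAL_N0 must be a compile-time constant (it is token-pasted into a
// vstore_partial_* name); out_addr and x_cond are kernel variables.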

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)

#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)

#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)

#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
#pragma OPENCL EXTENSION cl_arm_printf : enable
#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)

#define GPU_ARCH_MIDGARD 0x100
#define GPU_ARCH_BIFROST 0x200

/** Concatenate two inputs.
 *
 * @param[in] a The first input to be concatenated
 * @param[in] b The second input to be concatenated
 *
 * @return The concatenated output
 */
#define CONCAT(a, b) a##b

/** Expand the given vector
 *
 * @param[in] x The vector to be expanded
 *
 * @return The expanded output
 */
#define EXPAND(x) x

/** Clamp the given value between an upper and lower bound.
 *
 * @param[in] x       The value to be clamped
 * @param[in] min_val The lower bound
 * @param[in] max_val The upper bound
 *
 * @return The clamped value.
 */
#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)

/** REVn reverses the given vector whose size is n.
 * @name REVn
 *
 * @param[in] x The vector to be reversed
 *
 * @return The reversed vector
 * @{
 */
#define REV1(x) ((x))
#define REV2(x) ((x).s10)
#define REV3(x) ((x).s210)
#define REV4(x) ((x).s3210)
#define REV8(x) ((x).s76543210)
#define REV16(x) ((x).sFEDCBA9876543210)
/** @} */ // end of group REVn

/** Reverse the given vector.
 * @name REVERSE
 *
 * @param[in] x The vector to be reversed
 * @param[in] s The size of the vector
 *
 * @return The reversed vector
 * @{
 */
#define REVERSE_STR(x, s) REV##s((x))
#define REVERSE(x, s) REVERSE_STR(x, s)
/** @} */ // end of group REVERSE

/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
 * @name ROTs_n
 *
 * @param[in] x The vector to be shifted
 *
 * @return The shifted vector
 * @{
 */
#define ROT1_0(x) ((x))

#define ROT2_0(x) ((x))
#define ROT2_1(x) ((x).s10)

#define ROT3_0(x) ((x))
#define ROT3_1(x) ((x).s201)
#define ROT3_2(x) ((x).s120)

#define ROT4_0(x) ((x))
#define ROT4_1(x) ((x).s3012)
#define ROT4_2(x) ((x).s2301)
#define ROT4_3(x) ((x).s1230)

#define ROT8_0(x) ((x))
#define ROT8_1(x) ((x).s70123456)
#define ROT8_2(x) ((x).s67012345)
#define ROT8_3(x) ((x).s56701234)
#define ROT8_4(x) ((x).s45670123)
#define ROT8_5(x) ((x).s34567012)
#define ROT8_6(x) ((x).s23456701)
#define ROT8_7(x) ((x).s12345670)

#define ROT16_0(x) ((x))
#define ROT16_1(x) ((x).sF0123456789ABCDE)
#define ROT16_2(x) ((x).sEF0123456789ABCD)
#define ROT16_3(x) ((x).sDEF0123456789ABC)
#define ROT16_4(x) ((x).sCDEF0123456789AB)
#define ROT16_5(x) ((x).sBCDEF0123456789A)
#define ROT16_6(x) ((x).sABCDEF0123456789)
#define ROT16_7(x) ((x).s9ABCDEF012345678)
#define ROT16_8(x) ((x).s89ABCDEF01234567)
#define ROT16_9(x) ((x).s789ABCDEF0123456)
#define ROT16_10(x) ((x).s6789ABCDEF012345)
#define ROT16_11(x) ((x).s56789ABCDEF01234)
#define ROT16_12(x) ((x).s456789ABCDEF0123)
#define ROT16_13(x) ((x).s3456789ABCDEF012)
#define ROT16_14(x) ((x).s23456789ABCDEF01)
#define ROT16_15(x) ((x).s123456789ABCDEF0)
/** @} */ // end of group ROTs_n

/** Circular-right-shift (rotate-right) the given vector by the given amount.
 * @name ROTATE
 *
 * @param[in] x The vector to be shifted
 * @param[in] s The size of the vector
 * @param[in] n The amount to be shifted
 *
 * @return The shifted vector
 * @{
 */
#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
/** @} */ // end of group ROTATE
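
// Example (illustrative): given uchar4 v = (uchar4)(1, 2, 3, 4),
// ROTATE(v, 4, 1) expands to ROT4_1(v) == v.s3012 == (uchar4)(4, 1, 2, 3),
// i.e. every element moves one lane to the right and the last wraps around.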

/** Creates a vector of size n filled with offset values corresponding to the location of each element.
 * @name V_OFFSn
 *
 * @param[in] dt The data type of the output vector
 *
 * @return The vector filled with offset values
 * @{
 */
#define V_OFFS1(dt) (dt##1)(0)
#define V_OFFS2(dt) (dt##2)(0, 1)
#define V_OFFS3(dt) (dt##3)(0, 1, 2)
#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3)
#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
/** @} */ // end of group V_OFFSn

/** Create a vector filled with offset values corresponding to the location of each element.
 * @name VEC_OFFS
 *
 * @param[in] dt The data type of the output vector
 * @param[in] s  The size of the output vector
 *
 * @return The vector filled with offset values
 * @{
 */
#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
/** @} */ // end of group VEC_OFFS
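
// Example (illustrative): VEC_OFFS(int, 4) expands to V_OFFS4(int), i.e.
// (int4)(0, 1, 2, 3), which is handy for computing per-lane indices or
// coordinates in vectorized kernels.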

#define VLOAD_STR(size) vload##size
#define VLOAD(size) VLOAD_STR(size)

#define PIXEL_UNIT4 1
#define PIXEL_UNIT8 2
#define PIXEL_UNIT16 4

/** Utility macro to convert a vector size to a pixel unit.
 *
 * @name CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT
 *
 * @param[in] vec_size Vector size. Only 4, 8 and 16 are supported
 *
 * @return The pixel unit (number of pixels)
 * @{
 */
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)
/** @} */ // end of group CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT

#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)

/** Utility macro to read a 2D OpenCL image object.
 *
 * @note Coordinates are not normalized
 *
 * @param[in] data_type Data type
 * @param[in] n0        Number of pixels to read. Only 1, 2 and 4 are supported
 * @param[in] img       OpenCL image object
 * @param[in] x_coord   The x coordinate for the top-left pixel
 * @param[in] y_coord   The y coordinate for the top-left pixel
 *
 * @return Pixels from the 2D OpenCL image object
 * @{
 */
#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)
/** @} */ // end of group READ_IMAGE2D
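
// Example (illustrative, hypothetical image argument): given a __read_only
// image2d_t named lhs_img, READ_IMAGE2D(float, 4, lhs_img, x, y) expands to
// read_image2d_floatx4(lhs_img, x, y), i.e. a float16 gathered from four
// read_imagef calls at integer coordinates (x, y) .. (x + 3, y).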

#define VSTORE_STR(size) vstore##size
#define VSTORE(size) VSTORE_STR(size)

#define float1 float
#define half1 half
#define char1 char
#define uchar1 uchar
#define short1 short
#define ushort1 ushort
#define int1 int
#define uint1 uint
#define long1 long
#define ulong1 ulong
#define double1 double

#define vload1(OFFSET, PTR) *(OFFSET + PTR)
#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA

/** Extended partial vstore that correctly handles scalar values as well.
 * Store the **lower** 0 to (n-1)th elements of the given vector while minimising the amount of vstore ops
 * @name VSTORE_PARTIAL
 *
 * @note With this macro, the passed data can be both a vector and a scalar
 * @note @p store_size needs to be <= @p size
 * eg 1: Valid
 *   VSTORE_PARTIAL(16, 15) ...;
 * eg 2: Invalid
 *   VSTORE_PARTIAL(4, 7) ...;
 *
 * @param[in] size       The width of @p DATA. Supported values: 1(scalar), 2, 3, 4, 8, 16
 * @param[in] store_size The number of lower elements to store. Supported values: 1-16, but has to be <= @p size
 * @{
 */
#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)

#define NO_STORE(data, offs, ptr) \
    { \
    }

// Size == 1 (scalar)
#define vstore_partial_1_0 NO_STORE
#define vstore_partial_1_1 vstore1
#define vstore_partial_1_2 NO_STORE
#define vstore_partial_1_3 NO_STORE
#define vstore_partial_1_4 NO_STORE
#define vstore_partial_1_5 NO_STORE
#define vstore_partial_1_6 NO_STORE
#define vstore_partial_1_7 NO_STORE
#define vstore_partial_1_8 NO_STORE
#define vstore_partial_1_9 NO_STORE
#define vstore_partial_1_10 NO_STORE
#define vstore_partial_1_11 NO_STORE
#define vstore_partial_1_12 NO_STORE
#define vstore_partial_1_13 NO_STORE
#define vstore_partial_1_14 NO_STORE
#define vstore_partial_1_15 NO_STORE
#define vstore_partial_1_16 NO_STORE
// Size == 2
#define vstore_partial_2_0 NO_STORE
#define vstore_partial_2_1 vstore_partial_1
#define vstore_partial_2_2 vstore_partial_2
#define vstore_partial_2_3 NO_STORE
#define vstore_partial_2_4 NO_STORE
#define vstore_partial_2_5 NO_STORE
#define vstore_partial_2_6 NO_STORE
#define vstore_partial_2_7 NO_STORE
#define vstore_partial_2_8 NO_STORE
#define vstore_partial_2_9 NO_STORE
#define vstore_partial_2_10 NO_STORE
#define vstore_partial_2_11 NO_STORE
#define vstore_partial_2_12 NO_STORE
#define vstore_partial_2_13 NO_STORE
#define vstore_partial_2_14 NO_STORE
#define vstore_partial_2_15 NO_STORE
#define vstore_partial_2_16 NO_STORE
// Size == 3
#define vstore_partial_3_0 NO_STORE
#define vstore_partial_3_1 vstore_partial_1
#define vstore_partial_3_2 vstore_partial_2
#define vstore_partial_3_3 vstore_partial_3
#define vstore_partial_3_4 NO_STORE
#define vstore_partial_3_5 NO_STORE
#define vstore_partial_3_6 NO_STORE
#define vstore_partial_3_7 NO_STORE
#define vstore_partial_3_8 NO_STORE
#define vstore_partial_3_9 NO_STORE
#define vstore_partial_3_10 NO_STORE
#define vstore_partial_3_11 NO_STORE
#define vstore_partial_3_12 NO_STORE
#define vstore_partial_3_13 NO_STORE
#define vstore_partial_3_14 NO_STORE
#define vstore_partial_3_15 NO_STORE
#define vstore_partial_3_16 NO_STORE
// Size == 4
#define vstore_partial_4_0 NO_STORE
#define vstore_partial_4_1 vstore_partial_1
#define vstore_partial_4_2 vstore_partial_2
#define vstore_partial_4_3 vstore_partial_3
#define vstore_partial_4_4 vstore_partial_4
#define vstore_partial_4_5 NO_STORE
#define vstore_partial_4_6 NO_STORE
#define vstore_partial_4_7 NO_STORE
#define vstore_partial_4_8 NO_STORE
#define vstore_partial_4_9 NO_STORE
#define vstore_partial_4_10 NO_STORE
#define vstore_partial_4_11 NO_STORE
#define vstore_partial_4_12 NO_STORE
#define vstore_partial_4_13 NO_STORE
#define vstore_partial_4_14 NO_STORE
#define vstore_partial_4_15 NO_STORE
#define vstore_partial_4_16 NO_STORE
// Size == 8
#define vstore_partial_8_0 NO_STORE
#define vstore_partial_8_1 vstore_partial_1
#define vstore_partial_8_2 vstore_partial_2
#define vstore_partial_8_3 vstore_partial_3
#define vstore_partial_8_4 vstore_partial_4
#define vstore_partial_8_5 vstore_partial_5
#define vstore_partial_8_6 vstore_partial_6
#define vstore_partial_8_7 vstore_partial_7
#define vstore_partial_8_8 vstore_partial_8
#define vstore_partial_8_9 NO_STORE
#define vstore_partial_8_10 NO_STORE
#define vstore_partial_8_11 NO_STORE
#define vstore_partial_8_12 NO_STORE
#define vstore_partial_8_13 NO_STORE
#define vstore_partial_8_14 NO_STORE
#define vstore_partial_8_15 NO_STORE
#define vstore_partial_8_16 NO_STORE
// Size == 16
#define vstore_partial_16_0 NO_STORE
#define vstore_partial_16_1 vstore_partial_1
#define vstore_partial_16_2 vstore_partial_2
#define vstore_partial_16_3 vstore_partial_3
#define vstore_partial_16_4 vstore_partial_4
#define vstore_partial_16_5 vstore_partial_5
#define vstore_partial_16_6 vstore_partial_6
#define vstore_partial_16_7 vstore_partial_7
#define vstore_partial_16_8 vstore_partial_8
#define vstore_partial_16_9 vstore_partial_9
#define vstore_partial_16_10 vstore_partial_10
#define vstore_partial_16_11 vstore_partial_11
#define vstore_partial_16_12 vstore_partial_12
#define vstore_partial_16_13 vstore_partial_13
#define vstore_partial_16_14 vstore_partial_14
#define vstore_partial_16_15 vstore_partial_15
#define vstore_partial_16_16 vstore_partial_16

/** Partial vstore. Store the **lower** 0 to (n-1)th elements of the given vector while minimising the amount of vstore ops
 * @name vstore_partial_n
 *
 * @note @p DATA needs to be a vector not a scalar
 * @note n needs to be <= the vector width of the input variable @p DATA
 * eg 1: Valid
 *   vstore_partial_15(var:float16, 0, 0xabcd);
 * eg 2: Invalid
 *   vstore_partial_7(var:float4, 0, 0xabcd);
 *
 * @note in cases n == 1, 2, 3, 4, 8, 16, no extra vstore is invoked, thus there's no performance penalty.
 *
 * @param[in] DATA   The name of the variable
 * @param[in] OFFSET Offset in n
 * @param[in] PTR    The base pointer
 * @{
 */
#define vstore_partial_1(DATA, OFFSET, PTR) \
    vstore1(DATA.s0, OFFSET, PTR);

#define vstore_partial_2(DATA, OFFSET, PTR) \
    vstore2(DATA.s01, OFFSET, PTR);

#define vstore_partial_3(DATA, OFFSET, PTR) \
    vstore3(DATA.s012, OFFSET, PTR);

#define vstore_partial_4(DATA, OFFSET, PTR) \
    vstore4(DATA.s0123, OFFSET, PTR);

#define vstore_partial_5(DATA, OFFSET, PTR) \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore1(DATA.s4, OFFSET, PTR + 4);

#define vstore_partial_6(DATA, OFFSET, PTR) \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore_partial_2(DATA.s45, OFFSET, PTR + 4);

#define vstore_partial_7(DATA, OFFSET, PTR) \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore_partial_3(DATA.s456, OFFSET, PTR + 4);

#define vstore_partial_8(DATA, OFFSET, PTR) \
    vstore8(DATA.s01234567, OFFSET, PTR);

#define vstore_partial_9(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore1(DATA.s8, OFFSET, PTR + 8);

#define vstore_partial_10(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_2(DATA.s89, OFFSET, PTR + 8);

#define vstore_partial_11(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_3(DATA.s89a, OFFSET, PTR + 8);

#define vstore_partial_12(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8);

#define vstore_partial_13(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_14(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_15(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_16(DATA, OFFSET, PTR) \
    vstore16(DATA, OFFSET, PTR);
/** @} */ // end of group vstore_partial_n
/** @} */ // end of group VSTORE_PARTIAL
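
// Example (illustrative): for a float8 v, vstore_partial_6(v, 0, p) expands to
// a vstore4 of the low four lanes at p followed by a vstore2 of lanes .s45 at
// p + 4, i.e. two vector stores instead of six scalar ones.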

// The convert_* built-in functions with the _sat modifier are not supported on floating point
// types, so we create defines without _sat to overcome this issue
#define convert_float_sat convert_float
#define convert_float1_sat convert_float
#define convert_float2_sat convert_float2
#define convert_float3_sat convert_float3
#define convert_float4_sat convert_float4
#define convert_float8_sat convert_float8
#define convert_float16_sat convert_float16
#define convert_half_sat convert_half
#define convert_half1_sat convert_half
#define convert_half2_sat convert_half2
#define convert_half3_sat convert_half3
#define convert_half4_sat convert_half4
#define convert_half8_sat convert_half8
#define convert_half16_sat convert_half16

#define convert_float1 convert_float
#define convert_half1 convert_half
#define convert_char1 convert_char
#define convert_uchar1 convert_uchar
#define convert_short1 convert_short
#define convert_ushort1 convert_ushort
#define convert_int1 convert_int
#define convert_uint1 convert_uint
#define convert_long1 convert_long
#define convert_ulong1 convert_ulong
#define convert_double1 convert_double

#define convert_char1_sat convert_char_sat
#define convert_uchar1_sat convert_uchar_sat
#define convert_short1_sat convert_short_sat
#define convert_ushort1_sat convert_ushort_sat
#define convert_int1_sat convert_int_sat
#define convert_uint1_sat convert_uint_sat
#define convert_long1_sat convert_long_sat
#define convert_ulong1_sat convert_ulong_sat
#define convert_double1_sat convert_double_sat

#define VEC_DATA_TYPE_STR(type, size) type##size
#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)

#define CONVERT_STR(x, type) (convert_##type((x)))
#define CONVERT(x, type) CONVERT_STR(x, type)

#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)

#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)

#define select_vec_dt_uchar(size) uchar##size
#define select_vec_dt_char(size) char##size
#define select_vec_dt_ushort(size) ushort##size
#define select_vec_dt_short(size) short##size
#define select_vec_dt_half(size) short##size
#define select_vec_dt_uint(size) uint##size
#define select_vec_dt_int(size) int##size
#define select_vec_dt_float(size) int##size
#define select_vec_dt_ulong(size) ulong##size
#define select_vec_dt_long(size) long##size

#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)
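
// Example (illustrative): VEC_DATA_TYPE(uchar, 16) expands to uchar16, and
// CONVERT_SAT(acc, VEC_DATA_TYPE(uchar, 16)) expands to convert_uchar16_sat((acc)),
// clamping a hypothetical int16 accumulator acc into the uchar range. The
// two-level *_STR indirection ensures macro arguments are fully expanded
// before token pasting.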
#define VECTOR_DECLARATION(name) \
    __global uchar *name##_ptr,  \
    uint name##_stride_x,        \
    uint name##_step_x,          \
    uint name##_offset_first_element_in_bytes

#define IMAGE_DECLARATION(name) \
    __global uchar *name##_ptr, \
    uint name##_stride_x,       \
    uint name##_step_x,         \
    uint name##_stride_y,       \
    uint name##_step_y,         \
    uint name##_offset_first_element_in_bytes

#define TENSOR3D_DECLARATION(name) \
    __global uchar *name##_ptr,    \
    uint name##_stride_x,          \
    uint name##_step_x,            \
    uint name##_stride_y,          \
    uint name##_step_y,            \
    uint name##_stride_z,          \
    uint name##_step_z,            \
    uint name##_offset_first_element_in_bytes

#define TENSOR4D_DECLARATION(name) \
    __global uchar *name##_ptr,    \
    uint name##_stride_x,          \
    uint name##_step_x,            \
    uint name##_stride_y,          \
    uint name##_step_y,            \
    uint name##_stride_z,          \
    uint name##_step_z,            \
    uint name##_stride_w,          \
    uint name##_step_w,            \
    uint name##_offset_first_element_in_bytes

#define CONVERT_TO_VECTOR_STRUCT(name) \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)

#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)

#define CONVERT_TO_IMAGE_STRUCT(name) \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)

#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)

#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)

#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)

#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z)

#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)

#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)

#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)

#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name)                                                                                       \
    tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                           name##_stride_z, name##_step_z)

/** Structure to hold Vector information */
typedef struct Vector
{
    __global uchar *ptr;               /**< Pointer to the starting position of the buffer */
    int offset_first_element_in_bytes; /**< The offset of the first element in the source vector */
    int stride_x;                      /**< Stride of the vector in X dimension (in bytes) */
} Vector;

/** Structure to hold Image information */
typedef struct Image
{
    __global uchar *ptr;               /**< Pointer to the starting position of the buffer */
    int offset_first_element_in_bytes; /**< The offset of the first element in the source image */
    int stride_x;                      /**< Stride of the image in X dimension (in bytes) */
    int stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
} Image;

/** Structure to hold 3D tensor information */
typedef struct Tensor3D
{
    __global uchar *ptr;               /**< Pointer to the starting position of the buffer */
    int offset_first_element_in_bytes; /**< The offset of the first element in the source tensor */
    int stride_x;                      /**< Stride of the tensor in X dimension (in bytes) */
    int stride_y;                      /**< Stride of the tensor in Y dimension (in bytes) */
    int stride_z;                      /**< Stride of the tensor in Z dimension (in bytes) */
} Tensor3D;

/** Structure to hold 4D tensor information */
typedef struct Tensor4D
{
    __global uchar *ptr;               /**< Pointer to the starting position of the buffer */
    int offset_first_element_in_bytes; /**< The offset of the first element in the source tensor */
    int stride_x;                      /**< Stride of the tensor in X dimension (in bytes) */
    int stride_y;                      /**< Stride of the tensor in Y dimension (in bytes) */
    int stride_z;                      /**< Stride of the tensor in Z dimension (in bytes) */
    int stride_w;                      /**< Stride of the tensor in W dimension (in bytes) */
} Tensor4D;
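// For reference, IMAGE_DECLARATION(src) expands to the six kernel parameters
//     __global uchar *src_ptr,
//     uint src_stride_x, uint src_step_x,
//     uint src_stride_y, uint src_step_y,
//     uint src_offset_first_element_in_bytes
// which CONVERT_TO_IMAGE_STRUCT(src) forwards to update_image_workitem_ptr() below
// to build an Image pointing at this workitem's pixels.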
/** Wrap vector information into a Vector structure, and make the pointer point at this workitem's data.
 *
 * @param[in] ptr                           Pointer to the starting position of the buffer
 * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
 * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
 * @param[in] step_x                        stride_x * number of elements along X processed per workitem (in bytes)
 *
 * @return A vector object
 */
inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
{
    Vector vector =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
    };
    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
    return vector;
}

/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
 *
 * @param[in] ptr                           Pointer to the starting position of the buffer
 * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
 * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
 * @param[in] step_x                        stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
 * @param[in] step_y                        stride_y * number of elements along Y processed per workitem (in bytes)
 *
 * @return An image object
 */
inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
{
    Image img =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
        .stride_y                      = stride_y
    };
    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
    return img;
}

/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
 *
 * @param[in] ptr                           Pointer to the starting position of the buffer
 * @param[in] offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in] stride_x                      Stride of the tensor in X dimension (in bytes)
 * @param[in] step_x                        stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] stride_y                      Stride of the tensor in Y dimension (in bytes)
 * @param[in] step_y                        stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] stride_z                      Stride of the tensor in Z dimension (in bytes)
 * @param[in] step_z                        stride_z * number of elements along Z processed per workitem (in bytes)
 *
 * @return A 2D image object
 */
inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Image img =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
        .stride_y                      = stride_y
    };
    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
    return img;
}
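// Illustrative sketch, not part of the library: a trivial copy kernel built from the
// declaration, struct and update helpers above. The guard and kernel name are
// assumptions for demonstration only. Each work-item copies one uchar8.
#ifdef ARM_COMPUTE_HELPER_EXAMPLES
__kernel void example_copy(
    IMAGE_DECLARATION(src),
    IMAGE_DECLARATION(dst))
{
    // Point src.ptr/dst.ptr at this workitem's data using the per-dimension steps
    Image src = CONVERT_TO_IMAGE_STRUCT(src);
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    vstore8(vload8(0, src.ptr), 0, dst.ptr);
}
#endif // ARM_COMPUTE_HELPER_EXAMPLES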
/** Wrap 3D tensor information into a tensor structure, and make the pointer point at this workitem's data.
 *
 * @param[in] ptr                           Pointer to the starting position of the buffer
 * @param[in] offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in] stride_x                      Stride of the tensor in X dimension (in bytes)
 * @param[in] step_x                        stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] stride_y                      Stride of the tensor in Y dimension (in bytes)
 * @param[in] step_y                        stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] stride_z                      Stride of the tensor in Z dimension (in bytes)
 * @param[in] step_z                        stride_z * number of elements along Z processed per workitem (in bytes)
 *
 * @return A 3D tensor object
 */
inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Tensor3D tensor =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
        .stride_y                      = stride_y,
        .stride_z                      = stride_z
    };
    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
    return tensor;
}

/** Wrap 3D tensor information into a tensor structure, without moving the pointer to this workitem's data.
 *
 * @param[in] ptr                           Pointer to the starting position of the buffer
 * @param[in] offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in] stride_x                      Stride of the tensor in X dimension (in bytes)
 * @param[in] step_x                        stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] stride_y                      Stride of the tensor in Y dimension (in bytes)
 * @param[in] step_y                        stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] stride_z                      Stride of the tensor in Z dimension (in bytes)
 * @param[in] step_z                        stride_z * number of elements along Z processed per workitem (in bytes)
 *
 * @return A 3D tensor object
 */
inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Tensor3D tensor =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
        .stride_y                      = stride_y,
        .stride_z                      = stride_z
    };
    return tensor;
}

/** Wrap 4D tensor information into a tensor structure, and make the pointer point at this workitem's data.
 *
 * @param[in] ptr                           Pointer to the starting position of the buffer
 * @param[in] offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in] stride_x                      Stride of the tensor in X dimension (in bytes)
 * @param[in] step_x                        stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] stride_y                      Stride of the tensor in Y dimension (in bytes)
 * @param[in] step_y                        stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] stride_z                      Stride of the tensor in Z dimension (in bytes)
 * @param[in] step_z                        stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in] stride_w                      Stride of the tensor in W dimension (in bytes)
 * @param[in] step_w                        stride_w * number of elements along W processed per workitem (in bytes)
 * @param[in] mod_size                      Depth of the tensor, used to split the flattened third dispatch dimension into Z and W
 *
 * @return A 4D tensor object
 */
inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
                                             uint step_w,
                                             uint mod_size)
{
    Tensor4D tensor =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
        .stride_y                      = stride_y,
        .stride_z                      = stride_z,
        .stride_w                      = stride_w
    };

    // The third dispatch dimension is flattened: unpack it into Z and W using mod_size
    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
    return tensor;
}
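// Worked example (illustrative): dispatching a [width, height, depth * batches] NDRange
// over a 4D tensor with mod_size == depth. For depth == 4 and get_global_id(2) == 9,
// the pointer advances by (9 % 4) * step_z = 1 * step_z in Z and by
// (9 / 4) * step_w = 2 * step_w in W, i.e. the flattened third dimension encodes both
// the Z slice and the W (batch) index.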
/** Get the pointer position of a Vector
 *
 * @param[in] vec Pointer to the Vector structure
 * @param[in] x   Relative X position
 */
inline __global const uchar *vector_offset(const Vector *vec, int x)
{
    return vec->ptr + x * vec->stride_x;
}

/** Get the pointer position of an Image
 *
 * @param[in] img Pointer to the Image structure
 * @param[in] x   Relative X position
 * @param[in] y   Relative Y position
 */
inline __global uchar *offset(const Image *img, int x, int y)
{
    return img->ptr + x * img->stride_x + y * img->stride_y;
}

/** Get the pointer position of a Tensor3D
 *
 * @param[in] tensor Pointer to the Tensor3D structure
 * @param[in] x      Relative X position
 * @param[in] y      Relative Y position
 * @param[in] z      Relative Z position
 */
inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
{
    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
}

/** Get the pointer position of a Tensor4D
 *
 * @param[in] tensor Pointer to the Tensor4D structure
 * @param[in] x      Relative X position
 * @param[in] y      Relative Y position
 * @param[in] z      Relative Z position
 * @param[in] w      Relative W position
 */
inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
{
    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
}

/** Get the offset for a given linear index of a Tensor3D
 *
 * @param[in] tensor Pointer to the Tensor3D structure
 * @param[in] width  Width of the input tensor
 * @param[in] height Height of the input tensor
 * @param[in] depth  Depth of the input tensor
 * @param[in] index  Linear index
 */
inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
{
    uint num_elements = width * height;

    const uint z = index / num_elements;

    index %= num_elements;

    const uint y = index / width;

    index %= width;

    const uint x = index;

    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes;
}
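// Worked example (illustrative): for width == 4, height == 3 and index == 17,
// num_elements == 12, so z = 17 / 12 = 1; the remainder 5 gives y = 5 / 4 = 1 and
// x = 5 % 4 = 1, and the returned pointer is
// ptr + 1 * stride_x + 1 * stride_y + 1 * stride_z + offset_first_element_in_bytes.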
#endif // ARM_COMPUTE_HELPER_H

#ifndef DATA_TYPE
#define DATA_TYPE short
#endif /* DATA_TYPE */

#ifndef COMPUTE_TYPE
#define COMPUTE_TYPE int
#endif /* COMPUTE_TYPE */

#ifndef DATA_TYPE_OUT
#define DATA_TYPE_OUT uchar
#endif /* DATA_TYPE_OUT */

/** Compute a 1D horizontal convolution of size 5 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e. 8 pixels).
 *
 * @param[in] left_pixel   Pointer to the leftmost pixel of the 5-tap window
 * @param[in] left1_coeff  Weight of the leftmost pixel
 * @param[in] left2_coeff  Weight of the left pixel
 * @param[in] middle_coeff Weight of the middle pixel
 * @param[in] right1_coeff Weight of the right pixel
 * @param[in] right2_coeff Weight of the rightmost pixel
 *
 * @return a vector of 8 convolved values whose element type is DATA_TYPE.
 */
VEC_DATA_TYPE(DATA_TYPE, 8)
convolution1x5(
    __global const uchar *left_pixel,
    const short           left1_coeff,
    const short           left2_coeff,
    const short           middle_coeff,
    const short           right1_coeff,
    const short           right2_coeff)
{
    uchar16 temp = vload16(0, left_pixel);

    VEC_DATA_TYPE(DATA_TYPE, 8)
    left1 = CONVERT(temp.s01234567, VEC_DATA_TYPE(DATA_TYPE, 8));
    VEC_DATA_TYPE(DATA_TYPE, 8)
    left2 = CONVERT(temp.s12345678, VEC_DATA_TYPE(DATA_TYPE, 8));
    VEC_DATA_TYPE(DATA_TYPE, 8)
    middle = CONVERT(temp.s23456789, VEC_DATA_TYPE(DATA_TYPE, 8));
    VEC_DATA_TYPE(DATA_TYPE, 8)
    right1 = CONVERT(temp.s3456789a, VEC_DATA_TYPE(DATA_TYPE, 8));
    VEC_DATA_TYPE(DATA_TYPE, 8)
    right2 = CONVERT(temp.s456789ab, VEC_DATA_TYPE(DATA_TYPE, 8));

    return left1 * (VEC_DATA_TYPE(DATA_TYPE, 8))left1_coeff + left2 * (VEC_DATA_TYPE(DATA_TYPE, 8))left2_coeff
           + middle * (VEC_DATA_TYPE(DATA_TYPE, 8))middle_coeff + right1 * (VEC_DATA_TYPE(DATA_TYPE, 8))right1_coeff + right2 * (VEC_DATA_TYPE(DATA_TYPE, 8))right2_coeff;
}
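// Illustrative sketch, not part of the library: a horizontal 5-tap binomial (Gaussian)
// pass [1 4 6 4 1] over 8 pixels per work-item, mirroring the static kernel further
// down. The guard and kernel name are assumptions for demonstration only.
#ifdef ARM_COMPUTE_HELPER_EXAMPLES
__kernel void example_gaussian1x5(
    IMAGE_DECLARATION(src),
    IMAGE_DECLARATION(dst))
{
    Image src = CONVERT_TO_IMAGE_STRUCT(src);
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    // One vload16 inside convolution1x5 feeds all five shifted 8-lane windows
    VEC_DATA_TYPE(DATA_TYPE, 8)
    pixels = convolution1x5(offset(&src, -2, 0), 1, 4, 6, 4, 1);

    vstore8(pixels, 0, (__global DATA_TYPE *)dst.ptr);
}
#endif // ARM_COMPUTE_HELPER_EXAMPLES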
/** Compute a 1D vertical convolution of size 5 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e. 8 pixels).
 *
 * @param[in] src          Pointer to source image.
 * @param[in] up1_coeff    Weight of the topmost pixel
 * @param[in] up2_coeff    Weight of the pixel above
 * @param[in] middle_coeff Weight of the middle pixel
 * @param[in] down1_coeff  Weight of the pixel below
 * @param[in] down2_coeff  Weight of the bottommost pixel
 *
 * @return a vector of 8 convolved values whose element type is COMPUTE_TYPE.
 */
VEC_DATA_TYPE(COMPUTE_TYPE, 8)
convolution5x1(
    Image      *src,
    const short up1_coeff,
    const short up2_coeff,
    const short middle_coeff,
    const short down1_coeff,
    const short down2_coeff)
{
    VEC_DATA_TYPE(COMPUTE_TYPE, 8)
    val;
    VEC_DATA_TYPE(COMPUTE_TYPE, 8)
    out = (VEC_DATA_TYPE(COMPUTE_TYPE, 8))0;

    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up1_coeff;

    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up2_coeff;

    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 0)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))middle_coeff;

    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down1_coeff;

    val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
    out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down2_coeff;

    return out;
}

/** Apply a 5x5 convolution matrix to a single channel U8 input image and return the result.
 *
 * Convolution matrix layout:\n
 * [ mat0,  mat1,  mat2,  mat3,  mat4 ]\n
 * [ mat5,  mat6,  mat7,  mat8,  mat9 ]\n
 * [ mat10, mat11, mat12, mat13, mat14 ]\n
 * [ mat15, mat16, mat17, mat18, mat19 ]\n
 * [ mat20, mat21, mat22, mat23, mat24 ]
 *
 * @param[in] src   A pointer to source Image structure.
 * @param[in] mat0  Coefficient from the convolution matrix
 * @param[in] mat1  Coefficient from the convolution matrix
 * @param[in] mat2  Coefficient from the convolution matrix
 * @param[in] mat3  Coefficient from the convolution matrix
 * @param[in] mat4  Coefficient from the convolution matrix
 * @param[in] mat5  Coefficient from the convolution matrix
 * @param[in] mat6  Coefficient from the convolution matrix
 * @param[in] mat7  Coefficient from the convolution matrix
 * @param[in] mat8  Coefficient from the convolution matrix
 * @param[in] mat9  Coefficient from the convolution matrix
 * @param[in] mat10 Coefficient from the convolution matrix
 * @param[in] mat11 Coefficient from the convolution matrix
 * @param[in] mat12 Coefficient from the convolution matrix
 * @param[in] mat13 Coefficient from the convolution matrix
 * @param[in] mat14 Coefficient from the convolution matrix
 * @param[in] mat15 Coefficient from the convolution matrix
 * @param[in] mat16 Coefficient from the convolution matrix
 * @param[in] mat17 Coefficient from the convolution matrix
 * @param[in] mat18 Coefficient from the convolution matrix
 * @param[in] mat19 Coefficient from the convolution matrix
 * @param[in] mat20 Coefficient from the convolution matrix
 * @param[in] mat21 Coefficient from the convolution matrix
 * @param[in] mat22 Coefficient from the convolution matrix
 * @param[in] mat23 Coefficient from the convolution matrix
 * @param[in] mat24 Coefficient from the convolution matrix
 * @param[in] scale Convolution matrix scale (sum of the coefficients, or 1 if the sum is 0)
 *
 * @return a short8 containing 8 convolved and scaled values.
 */
short8 convolution5x5(
    Image *src,
    const short mat0, const short mat1, const short mat2, const short mat3, const short mat4,
    const short mat5, const short mat6, const short mat7, const short mat8, const short mat9,
    const short mat10, const short mat11, const short mat12, const short mat13, const short mat14,
    const short mat15, const short mat16, const short mat17, const short mat18, const short mat19,
    const short mat20, const short mat21, const short mat22, const short mat23, const short mat24,
    uint scale)
{
    VEC_DATA_TYPE(DATA_TYPE, 8)
    pixels;

    pixels = convolution1x5(offset(src, -2, -2), mat0, mat1, mat2, mat3, mat4);
    pixels += convolution1x5(offset(src, -2, -1), mat5, mat6, mat7, mat8, mat9);
    pixels += convolution1x5(offset(src, -2, 0), mat10, mat11, mat12, mat13, mat14);
    pixels += convolution1x5(offset(src, -2, 1), mat15, mat16, mat17, mat18, mat19);
    pixels += convolution1x5(offset(src, -2, 2), mat20, mat21, mat22, mat23, mat24);

    if(scale > 0)
    {
        pixels /= (VEC_DATA_TYPE(DATA_TYPE, 8))scale;
    }

    return convert_short8_sat(pixels);
}
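// Note (illustrative): scale is normally the sum of the 25 coefficients, e.g. 256 for
// the 5x5 binomial kernel built from [1 4 6 4 1] (16 * 16), so the division above
// renormalises the accumulated sum; passing 0 skips the division entirely.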
#ifndef DYNAMIC_MATRIX_CONVOLUTION

/** Apply a 1x5 static convolution matrix to a single channel U8 input image and output a single channel temporary image (supported output types: U16, S16, S32).
 *
 * @attention The matrix coefficients (MAT0, MAT1, MAT2, MAT3, MAT4) and DATA_TYPE need to be passed at compile time:\n
 * e.g. -DMAT0=1, -DMAT1=2, -DMAT2=3, -DMAT3=4, -DMAT4=5, -DDATA_TYPE=int
 *
 * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
 * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U16, S16, S32
 * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
 */
__kernel void convolution_separable1x5_static(
    IMAGE_DECLARATION(src),
    IMAGE_DECLARATION(dst))
{
    Image src = CONVERT_TO_IMAGE_STRUCT(src);
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    // Output pixels
    VEC_DATA_TYPE(DATA_TYPE, 8)
    pixels = convolution1x5(offset(&src, -2, 0), MAT0, MAT1, MAT2, MAT3, MAT4);

    // Store result in dst
    vstore8(pixels, 0, (__global DATA_TYPE *)dst.ptr);
}
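// Usage note (illustrative, inferred from the data types documented above): the
// separable path runs in two passes, e.g.
//   pass 1: convolution_separable1x5_static, U8 source -> U16/S16/S32 intermediate,
//           built with -DMAT0...-DMAT4 and -DDATA_TYPE
//   pass 2: convolution_separable5x1_static, intermediate -> U8/S16 destination,
//           built with -DMAT5...-DMAT9, -DSCALE, -DCOMPUTE_TYPE and -DDATA_TYPE_OUT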
/** Apply a 5x1 static convolution matrix to the single channel intermediate image produced by the 1x5 pass and output a single channel image.
 *
 * @attention The matrix coefficients (MAT5, MAT6, MAT7, MAT8, MAT9, SCALE), COMPUTE_TYPE and DATA_TYPE_OUT need to be passed at compile time:\n
 * e.g. -DMAT5=1, -DMAT6=2, -DMAT7=3, -DMAT8=4, -DMAT9=5, -DSCALE=6, -DCOMPUTE_TYPE=int, -DDATA_TYPE_OUT=int
 *
 * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U16, S16, S32
 * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8, S16
 * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
 */
__kernel void convolution_separable5x1_static(
    IMAGE_DECLARATION(src),
    IMAGE_DECLARATION(dst))
{
    Image src = CONVERT_TO_IMAGE_STRUCT(src);
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    // Output pixels
    VEC_DATA_TYPE(COMPUTE_TYPE, 8)
    pixels = convolution5x1(&src, MAT5, MAT6, MAT7, MAT8, MAT9);

    // Divide by the scale
    pixels /= (VEC_DATA_TYPE(COMPUTE_TYPE, 8))SCALE;

    // Store result in dst
    vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
}

/** Apply a static 5x5 convolution matrix to a single channel U8 input image and output a single channel image including borders
 *
 * @attention The matrix coefficients (MAT0, MAT1, ... MAT24, SCALE) and DATA_TYPE_OUT need to be passed at compile time:\n
 * e.g. -DMAT0=1, -DMAT1=2, ... -DMAT24=24, -DSCALE=6, -DDATA_TYPE_OUT=int
 *
 * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
 * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8, S16
 * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
 */
__kernel void convolution5x5_static(
    IMAGE_DECLARATION(src),
    IMAGE_DECLARATION(dst))
{
    Image src = CONVERT_TO_IMAGE_STRUCT(src);
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    short8 pixels = convolution5x5(&src,
                                   MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8, MAT9, MAT10, MAT11, MAT12, MAT13,
                                   MAT14, MAT15, MAT16, MAT17, MAT18, MAT19, MAT20, MAT21, MAT22, MAT23, MAT24, SCALE);

    // Store the result as is in dst
    vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
}

#endif // DYNAMIC_MATRIX_CONVOLUTION

)"