/*
 * Copyright (c) 2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "activation_float_helpers.h"
#include "helpers.h"
#include "tile_helpers.h"
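
// Work decomposition used by both kernels below, shown with an illustrative (hypothetical)
// configuration of MMUL_M0 = MMUL_N0 = MMUL_K0 = 4 and N0 = 4; the real values are chosen by the
// host at compile time:
//
//   MMUL_BLOCK_SIZE = MMUL_N0 * MMUL_K0 = 16 work-items cooperate on one MMUL block
//
//   For the work-item with x0 = 19:
//     block_id  = 19 / 16 = 1  // MMUL block index along the N dimension
//     thread_id = 19 % 16 = 3  // lane within the MMUL block
//     block_x   = 3 % 4 = 3    // column lane within the block
//     block_y   = 3 / 4 = 0    // row lane within the block
//     dst_x     = block_x * N0 + block_id * MMUL_N0 * N0 = 12 + 16 = 28
//
// Each MMUL block therefore covers MMUL_N0 * N0 = 16 destination columns and MMUL_M0 * M0
// destination rows.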

#if defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_MMUL)
/** This OpenCL kernel computes the matrix multiplication between 2 matrices using the MMUL extension:
 *
 * The LHS matrix is NOT reshaped
 * The RHS is reshaped with @ref ClGemmMatrixMultiplyReshapedOnlyRhsKernel and the block K0xN0 is NOT transposed
 *
 * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
 * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
 * @note The number of output columns processed by the cooperative MMUL extension must be passed at compile time using -DMMUL_N0 (e.g. -DMMUL_N0=2)
 * @note The number of output rows processed by the cooperative MMUL extension must be passed at compile time using -DMMUL_M0 (e.g. -DMMUL_M0=2)
 * @note The number of LHS columns (or RHS rows) processed by the cooperative MMUL extension must be passed at compile time using -DMMUL_K0 (e.g. -DMMUL_K0=2)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 > 0
 *  - N0 = 1, 2, 3, 4, 8, 16
 *  - K0 = 1
 *
 * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions must also be passed at compile time, using -DA_VAL= and -DB_VAL= respectively.
 *       The activation function is performed after the bias addition
 *
 * @param[in]  lhs_ptr                           Pointer to the LHS tensor. Supported data types: F16/F32
 * @param[in]  lhs_stride_y                      Stride of the LHS tensor in Y dimension (in bytes)
 * @param[in]  lhs_stride_z                      Stride of the LHS tensor in Z dimension (in bytes)
 * @param[in]  lhs_w                             The size of the width dimension of the LHS tensor
 * @param[in]  lhs_h                             The size of the height dimension of the LHS tensor
 * @param[in]  lhs_n                             The size of the depth dimension of the LHS tensor
 * @param[in]  lhs_offset_first_element_in_bytes The offset of the first element in the LHS tensor
 * @param[in]  rhs_ptr                           Pointer to the RHS reshaped tensor. Supported data type: same as @p lhs_ptr
 * @param[in]  rhs_stride_y                      Stride of the RHS tensor in Y dimension (in bytes)
 * @param[in]  rhs_stride_z                      Stride of the RHS tensor in Z dimension (in bytes)
 * @param[in]  rhs_w                             The size of the width dimension of the RHS tensor
 * @param[in]  rhs_h                             The size of the height dimension of the RHS tensor
 * @param[in]  rhs_n                             The size of the depth dimension of the RHS tensor
 * @param[in]  rhs_offset_first_element_in_bytes The offset of the first element in the RHS tensor
 * @param[in]  bia_ptr                           (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
 * @param[in]  bia_stride_y                      (Optional) Stride of the bias tensor in Y dimension (in bytes)
 * @param[in]  bia_stride_z                      (Optional) Stride of the bias tensor in Z dimension (in bytes)
 * @param[in]  bia_w                             (Optional) The size of the width dimension of the bias tensor
 * @param[in]  bia_h                             (Optional) The size of the height dimension of the bias tensor
 * @param[in]  bia_n                             (Optional) The size of the depth dimension of the bias tensor
 * @param[in]  bia_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data type: same as @p lhs_ptr
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_w                             The size of the width dimension of the destination tensor
 * @param[in]  dst_h                             The size of the height dimension of the destination tensor
 * @param[in]  dst_n                             The size of the depth dimension of the destination tensor
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[in]  M                                 Number of rows in the LHS matrix not reshaped
 * @param[in]  N                                 Number of columns in the RHS matrix not reshaped
 * @param[in]  K                                 Number of columns in the LHS matrix and rows in the RHS matrix not reshaped
 */
__kernel void gemm_mm_reshaped_only_rhs_nt_mmul(
    TENSOR3D_T(lhs, BUFFER),
    TENSOR3D_T(rhs, BUFFER),
#if defined(BETA)
    TENSOR3D_T(bia, BUFFER),
#endif // defined(BETA)
    TENSOR3D_T(dst, BUFFER),
    const int M,
    const int N,
    const int K)
{
#define MMUL_BLOCK_SIZE (MMUL_N0 * MMUL_K0)

    uint x0 = get_global_id(0); // (N / N0) * MMUL_K0
    uint y0 = get_global_id(1); // (M / M0) / MMUL_M0
    uint z  = get_global_id(2); // Batch

    // Get block ID and thread ID within the block
    uint block_id  = (x0 / MMUL_BLOCK_SIZE);
    uint thread_id = (x0 % MMUL_BLOCK_SIZE);

    // Coordinate within a block
    uint block_x = thread_id % MMUL_N0;
    uint block_y = (thread_id / MMUL_M0);

    // Starting destination coordinates
    uint dst_x = min(block_x * N0 + block_id * MMUL_N0 * N0, (uint)(N - 1));
    uint dst_y = min(block_y * M0 + y0 * M0 * MMUL_M0, (uint)(M - M0));

    // Note: We need to clamp dst_x and dst_y because we always need to execute a complete MMUL block. Only after the matrix
    // multiplication can we exit the kernel if the coordinates are out-of-bound. Remember, this is a cooperative matrix
    // multiplication, so we need a full block to get correct results.
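
    // Example of the clamping above (illustrative numbers): with M = 100 and M0 = 4, a thread whose
    // unclamped row start would be 100 gets dst_y clamped to M - M0 = 96. It still takes part in the
    // cooperative multiplication (reading the valid rows 96..99), but it returns before the store
    // phase because its unclamped coordinate fails the bounds checks placed after the loop.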

    // Starting LHS coordinates
    uint lhs_x = block_x;
    uint lhs_y = dst_y;

    // Starting RHS coordinates
    uint rhs_x = block_y * N0 * MMUL_N0 + block_x * N0;
    uint rhs_y = block_id;

    // Compute LHS/RHS/DST matrix address
    lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z;
    rhs_offset_first_element_in_bytes += rhs_x * sizeof(DATA_TYPE) + rhs_y * rhs_stride_y + z * rhs_stride_z;
    dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z;

    // Note: If RHS derives from the weights of a convolution 2D layer, RHS will always be 2D and rhs_stride_z will always be
    // equal to 0, so the tensor does not slide along the batch dimension.

    // Initialize the accumulators
    // The MMUL extension accumulates the result in F32 for both F32 and F16 inputs
    TILE(float, M0, N0, c_f32);

#if !defined(HALF_PRECISION)
#define c c_f32
#endif // !defined(HALF_PRECISION)

    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        c_f32[i].v = 0;
    })

    for(int k = 0; k <= K - MMUL_K0; k += MMUL_K0)
    {
        TILE(DATA_TYPE, M0, 1, a);
        TILE(DATA_TYPE, 1, N0, b);

        // Load tiles from the lhs/rhs tensors
        T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
        T_LOAD(DATA_TYPE, 1, N0, BUFFER, rhs, 0, 0, 1, 0, b);

        LOOP_UNROLLING(int, m0, 0, 1, M0,
        {
            LOOP_UNROLLING(int, n0, 0, 1, N0,
            {
                c_f32[m0].s[n0] = arm_matrix_multiply(a[m0].s[0], b[0].s[n0], c_f32[m0].s[n0]);
            })
        })

        lhs_offset_first_element_in_bytes += MMUL_K0 * sizeof(DATA_TYPE);
        rhs_offset_first_element_in_bytes += MMUL_K0 * MMUL_N0 * N0 * sizeof(DATA_TYPE);
    }
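
    // Note: arm_matrix_multiply() is cooperative: each of the MMUL_BLOCK_SIZE work-items in the
    // block contributes one a/b operand pair per call, and the reduction runs across the MMUL_K0
    // cooperating lanes. As a mental model only (not how the extension actually executes), the
    // value accumulated for one output element over the whole loop is equivalent to:
    //
    //   for(int k = 0; k < K; ++k)
    //   {
    //       c_f32[m0].s[n0] += (float)lhs_row[k] * (float)rhs_col[k]; // lhs_row/rhs_col: hypothetical views of the original, un-reshaped matrices
    //   }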

    if(block_x * N0 + block_id * MMUL_N0 * N0 >= N)
    {
        return;
    }

    if(block_y * M0 + y0 * M0 * MMUL_M0 >= M)
    {
        return;
    }

#if defined(HALF_PRECISION)
    TILE(DATA_TYPE, M0, N0, c);

    // Conversion required for the half precision
    LOOP_UNROLLING(int, m0, 0, 1, M0,
    {
        LOOP_UNROLLING(int, n0, 0, 1, N0,
        {
            c[m0].s[n0] = c_f32[m0].s[n0];
        })
    })
#endif // defined(HALF_PRECISION)

    // Multiply by the weight of the matrix-matrix product (alpha)
#if defined(ALPHA)
    T_SCALE_CONSTANT(DATA_TYPE, M0, N0, c, (DATA_TYPE)ALPHA, c);
#endif // defined(ALPHA)

    // Add beta*bias
#if defined(BETA)
#if defined(BROADCAST_BIAS)
    bia_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE);

    TILE(DATA_TYPE, 1, N0, bias0);

    if(dst_x + N0 <= N || N0_LEFTOVER == 0)
    {
        bias0[0].v = VLOAD(N0)(0, (DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes));
    }
    else
    {
        VLOAD_PARTIAL(N0, N0_LEFTOVER)
        (bias0[0].v, 0, (DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes));
    }

#ifndef UNIT_BETA
    T_SCALE_CONSTANT(DATA_TYPE, 1, N0, bias0, (DATA_TYPE)BETA, bias0);
#endif // UNIT_BETA

    // c = c + bias[broadcasted]
    T_ELTWISE_BROADCAST_X(V_ADD, DATA_TYPE, M0, N0, c, bias0, c);
#else // defined(BROADCAST_BIAS)
    TILE(DATA_TYPE, M0, N0, bias0);

    bia_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * bia_stride_y + z * bia_stride_z;

    if(dst_x + N0 <= N || N0_LEFTOVER == 0)
    {
        LOOP_UNROLLING(int, m0, 0, 1, M0,
        {
            if(dst_y + m0 < M || M0_LEFTOVER == 0)
            {
                bias0[m0].v = VLOAD(N0)(0, (DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes + m0 * bia_stride_y));
            }
        })
    }
    else
    {
        LOOP_UNROLLING(int, m0, 0, 1, M0,
        {
            if(dst_y + m0 < M || M0_LEFTOVER == 0)
            {
                VLOAD_PARTIAL(N0, N0_LEFTOVER)
                (bias0[m0].v, 0, (DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes + m0 * bia_stride_y));
            }
        })
    }

#ifndef UNIT_BETA
    T_SCALE_CONSTANT(DATA_TYPE, M0, N0, bias0, (DATA_TYPE)BETA, bias0);
#endif // UNIT_BETA

    // c = c + bias
    T_ADD(DATA_TYPE, M0, N0, c, bias0, c);
#endif // defined(BROADCAST_BIAS)
#endif // defined(BETA)

    T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, c, c);

    // Store
    if(dst_x + N0 <= N || N0_LEFTOVER == 0)
    {
        LOOP_UNROLLING(int, m0, 0, 1, M0,
        {
            if(dst_y + m0 < M || M0_LEFTOVER == 0)
            {
                VSTORE(N0)
                (c[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
            }
        })
    }
    else
    {
        LOOP_UNROLLING(int, m0, 0, 1, M0,
        {
            if(dst_y + m0 < M || M0_LEFTOVER == 0)
            {
                VSTORE_PARTIAL(N0, N0_LEFTOVER)
                (c[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
            }
        })
    }

#undef MMUL_BLOCK_SIZE
}
#endif // defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_MMUL)
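
// A minimal, illustrative set of build options for the kernel above, assuming F32 data and a
// 4x4x4 MMUL block (every value here is an example, not a recommendation; the host-side
// configuration chooses the real ones):
//
//   -DGEMM_MM_RESHAPED_ONLY_RHS_NT_MMUL
//   -DDATA_TYPE=float -DM0=4 -DN0=4 -DK0=1
//   -DMMUL_M0=4 -DMMUL_N0=4 -DMMUL_K0=4
//   -DM0_LEFTOVER=0 -DN0_LEFTOVER=0
//   -DACTIVATION_TYPE=identity -DA_VAL=0.0f -DB_VAL=0.0f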

#if defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_MMUL_TEXTURE)
/** This OpenCL kernel computes the matrix multiplication between 2 matrices using the MMUL extension and an OpenCL image for the RHS:
 *
 * The LHS matrix is NOT reshaped
 * The RHS is reshaped with @ref ClGemmMatrixMultiplyReshapedOnlyRhsKernel and the block K0xN0 is NOT transposed
 *
 * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
 * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
 * @note The number of output columns processed by the cooperative MMUL extension must be passed at compile time using -DMMUL_N0 (e.g. -DMMUL_N0=2)
 * @note The number of output rows processed by the cooperative MMUL extension must be passed at compile time using -DMMUL_M0 (e.g. -DMMUL_M0=2)
 * @note The number of LHS columns (or RHS rows) processed by the cooperative MMUL extension must be passed at compile time using -DMMUL_K0 (e.g. -DMMUL_K0=2)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 > 0
 *  - N0 = 1, 2, 3, 4, 8, 16
 *  - K0 = 1
 *
 * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions must also be passed at compile time, using -DA_VAL= and -DB_VAL= respectively.
 *       The activation function is performed after the bias addition
 *
 * @param[in]  lhs_ptr                           Pointer to the LHS tensor. Supported data types: F16/F32
 * @param[in]  lhs_stride_y                      Stride of the LHS tensor in Y dimension (in bytes)
 * @param[in]  lhs_stride_z                      Stride of the LHS tensor in Z dimension (in bytes)
 * @param[in]  lhs_w                             The size of the width dimension of the LHS tensor
 * @param[in]  lhs_h                             The size of the height dimension of the LHS tensor
 * @param[in]  lhs_n                             The size of the depth dimension of the LHS tensor
 * @param[in]  lhs_offset_first_element_in_bytes The offset of the first element in the LHS tensor
 * @param[in]  rhs_ptr                           Pointer to the RHS reshaped tensor. Supported data type: same as @p lhs_ptr
 * @param[in]  rhs_stride_y                      Stride of the RHS tensor in Y dimension (in bytes)
 * @param[in]  rhs_stride_z                      Stride of the RHS tensor in Z dimension (in bytes)
 * @param[in]  rhs_w                             The size of the width dimension of the RHS tensor
 * @param[in]  rhs_h                             The size of the height dimension of the RHS tensor
 * @param[in]  rhs_n                             The size of the depth dimension of the RHS tensor
 * @param[in]  rhs_offset_first_element_in_bytes The offset of the first element in the RHS tensor
 * @param[in]  bia_ptr                           (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
 * @param[in]  bia_stride_y                      (Optional) Stride of the bias tensor in Y dimension (in bytes)
 * @param[in]  bia_stride_z                      (Optional) Stride of the bias tensor in Z dimension (in bytes)
 * @param[in]  bia_w                             (Optional) The size of the width dimension of the bias tensor
 * @param[in]  bia_h                             (Optional) The size of the height dimension of the bias tensor
 * @param[in]  bia_n                             (Optional) The size of the depth dimension of the bias tensor
 * @param[in]  bia_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data type: same as @p lhs_ptr
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_w                             The size of the width dimension of the destination tensor
 * @param[in]  dst_h                             The size of the height dimension of the destination tensor
 * @param[in]  dst_n                             The size of the depth dimension of the destination tensor
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[in]  M                                 Number of rows in the LHS matrix not reshaped
 * @param[in]  N                                 Number of columns in the RHS matrix not reshaped
 * @param[in]  K                                 Number of columns in the LHS matrix and rows in the RHS matrix not reshaped
 */
__kernel void gemm_mm_reshaped_only_rhs_nt_mmul_texture(
    TENSOR3D_T(lhs, BUFFER),
    TENSOR3D_T(rhs, IMAGE),
#if defined(BETA)
    TENSOR3D_T(bia, BUFFER),
#endif // defined(BETA)
    TENSOR3D_T(dst, BUFFER),
    const int M,
    const int N,
    const int K)
{
#define MMUL_BLOCK_SIZE (MMUL_N0 * MMUL_K0)

    uint x0 = get_global_id(0); // (N / N0) * MMUL_K0
    uint y0 = get_global_id(1); // (M / M0) / MMUL_M0
    uint z  = get_global_id(2); // Batch

    // Get block ID and thread ID within the block
    uint block_id  = (x0 / MMUL_BLOCK_SIZE);
    uint thread_id = (x0 % MMUL_BLOCK_SIZE);

    // Coordinate within a block
    uint block_x = thread_id % MMUL_N0;
    uint block_y = (thread_id / MMUL_M0);

    // Starting destination coordinates
    uint dst_x = min(block_x * N0 + block_id * MMUL_N0 * N0, (uint)(N - 1));
    uint dst_y = min(block_y * M0 + y0 * M0 * MMUL_M0, (uint)(M - M0));

    // Note: We need to clamp dst_x and dst_y because we always need to execute a complete MMUL block. Only after the matrix
    // multiplication can we exit the kernel if the coordinates are out-of-bound. Remember, this is a cooperative matrix
    // multiplication, so we need a full block to get correct results.
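
    // Note: Unlike the buffer variant, the RHS is addressed below through (rhs_x, rhs_y) image
    // coordinates instead of a byte offset. A 2D image has no z stride, so the batch is folded into
    // the y coordinate as z * rhs_h.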

    // Starting LHS coordinates
    uint lhs_x = block_x;
    uint lhs_y = dst_y;

    // Starting RHS coordinates
    uint rhs_x = block_y * N0 * MMUL_N0 + block_x * N0;
    uint rhs_y = block_id + z * rhs_h;

    // Compute LHS/DST matrix address
    lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z;
    dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z;

    // Initialize the accumulators
    // The MMUL extension accumulates the result in F32 for both F32 and F16 inputs
    TILE(float, M0, N0, c_f32);

#if !defined(HALF_PRECISION)
#define c c_f32
#endif // !defined(HALF_PRECISION)

    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        c_f32[i].v = 0;
    })

    for(int k = 0; k <= K - MMUL_K0; k += MMUL_K0)
    {
        TILE(DATA_TYPE, M0, 1, a);
        TILE(DATA_TYPE, 1, N0, b);

        // Load tiles from the lhs/rhs tensors
        T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
        T_LOAD(DATA_TYPE, 1, N0, IMAGE, rhs, rhs_x, rhs_y, 1, rhs_stride_y, b);

        LOOP_UNROLLING(int, m0, 0, 1, M0,
        {
            LOOP_UNROLLING(int, n0, 0, 1, N0,
            {
                c_f32[m0].s[n0] = arm_matrix_multiply(a[m0].s[0], b[0].s[n0], c_f32[m0].s[n0]);
            })
        })

        lhs_offset_first_element_in_bytes += MMUL_K0 * sizeof(DATA_TYPE);
        rhs_x += MMUL_K0 * MMUL_N0 * N0;
    }
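
    // Note: Each loop iteration consumes one reshaped-RHS segment of MMUL_K0 * MMUL_N0 * N0
    // elements (4 * 4 * 4 = 64 with the illustrative configuration above). The buffer variant
    // advances the RHS byte offset by that amount times sizeof(DATA_TYPE); this variant advances
    // the x image coordinate instead.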

    if(block_x * N0 + block_id * MMUL_N0 * N0 >= N)
    {
        return;
    }

    if(block_y * M0 + y0 * M0 * MMUL_M0 >= M)
    {
        return;
    }

#if defined(HALF_PRECISION)
    TILE(DATA_TYPE, M0, N0, c);

    // Conversion required for the half precision
    LOOP_UNROLLING(int, m0, 0, 1, M0,
    {
        LOOP_UNROLLING(int, n0, 0, 1, N0,
        {
            c[m0].s[n0] = c_f32[m0].s[n0];
        })
    })
#endif // defined(HALF_PRECISION)

    // Multiply by the weight of the matrix-matrix product (alpha)
#if defined(ALPHA)
    T_SCALE_CONSTANT(DATA_TYPE, M0, N0, c, (DATA_TYPE)ALPHA, c);
#endif // defined(ALPHA)

    // Add beta*bias
#if defined(BETA)
#if defined(BROADCAST_BIAS)
    bia_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE);

    TILE(DATA_TYPE, 1, N0, bias0);

    if(dst_x + N0 <= N || N0_LEFTOVER == 0)
    {
        bias0[0].v = VLOAD(N0)(0, (DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes));
    }
    else
    {
        VLOAD_PARTIAL(N0, N0_LEFTOVER)
        (bias0[0].v, 0, (DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes));
    }

#ifndef UNIT_BETA
    T_SCALE_CONSTANT(DATA_TYPE, 1, N0, bias0, (DATA_TYPE)BETA, bias0);
#endif // UNIT_BETA

    // c = c + bias[broadcasted]
    T_ELTWISE_BROADCAST_X(V_ADD, DATA_TYPE, M0, N0, c, bias0, c);
#else // defined(BROADCAST_BIAS)
    TILE(DATA_TYPE, M0, N0, bias0);

    bia_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * bia_stride_y + z * bia_stride_z;

    if(dst_x + N0 <= N || N0_LEFTOVER == 0)
    {
        LOOP_UNROLLING(int, m0, 0, 1, M0,
        {
            if(dst_y + m0 < M || M0_LEFTOVER == 0)
            {
                bias0[m0].v = VLOAD(N0)(0, (DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes + m0 * bia_stride_y));
            }
        })
    }
    else
    {
        LOOP_UNROLLING(int, m0, 0, 1, M0,
        {
            if(dst_y + m0 < M || M0_LEFTOVER == 0)
            {
                VLOAD_PARTIAL(N0, N0_LEFTOVER)
                (bias0[m0].v, 0, (DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes + m0 * bia_stride_y));
            }
        })
    }

#ifndef UNIT_BETA
    T_SCALE_CONSTANT(DATA_TYPE, M0, N0, bias0, (DATA_TYPE)BETA, bias0);
#endif // UNIT_BETA

    // c = c + bias
    T_ADD(DATA_TYPE, M0, N0, c, bias0, c);
#endif // defined(BROADCAST_BIAS)
#endif // defined(BETA)

    T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, c, c);

    // Store
    if(dst_x + N0 <= N || N0_LEFTOVER == 0)
    {
        LOOP_UNROLLING(int, m0, 0, 1, M0,
        {
            if(dst_y + m0 < M || M0_LEFTOVER == 0)
            {
                VSTORE(N0)
                (c[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
            }
        })
    }
    else
    {
        LOOP_UNROLLING(int, m0, 0, 1, M0,
        {
            if(dst_y + m0 < M || M0_LEFTOVER == 0)
            {
                VSTORE_PARTIAL(N0, N0_LEFTOVER)
                (c[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
            }
        })
    }

#undef MMUL_BLOCK_SIZE
}
#endif // defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_MMUL_TEXTURE)
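
// Host-side sizing sketch, derived only from the x0/y0 comments in the kernels above (an
// assumption, not taken from the host code that launches these kernels):
//
//   gws[0] = (N / (N0 * MMUL_N0)) * MMUL_BLOCK_SIZE = (N / N0) * MMUL_K0
//   gws[1] = (M / M0) / MMUL_M0, rounded up
//   gws[2] = number of batches
//
// with the dispatch rounded up so that every MMUL block is complete; the in-kernel clamping and
// bounds checks handle the threads that fall past the matrix edges.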