/*
 * Copyright (c) 2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "activation_float_helpers.h"
#include "helpers.h"
#include "tile_helpers.h"

#if defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_MMUL)
/** This OpenCL kernel computes the matrix multiplication between 2 matrices using the MMUL extension:
 *
 *  The LHS matrix is NOT reshaped
 *  The RHS is reshaped with @ref ClGemmMatrixMultiplyReshapedOnlyRhsKernel and the block K0xN0 is NOT transposed
 *
 * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
 * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
 * @note The number of output columns processed by the cooperative MMUL extension must be passed at compile time using -DMMUL_N0 (e.g. -DMMUL_N0=2)
 * @note The number of output rows processed by the cooperative MMUL extension must be passed at compile time using -DMMUL_M0 (e.g. -DMMUL_M0=2)
 * @note The number of LHS columns (or RHS rows) processed by the cooperative MMUL extension must be passed at compile time using -DMMUL_K0 (e.g. -DMMUL_K0=2)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 > 0
 *  - N0 = 1, 2, 3, 4, 8, 16
 *  - K0 = 1
 *
 * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B values required by some activation functions must also be passed at compile time using -DA_VAL= and -DB_VAL= respectively.
 *       The activation function is applied after the bias addition
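 *
 * @note Example build options (illustrative values only; the exact set depends on the configuration chosen by the host code):
 *       -DDATA_TYPE=float -DM0=4 -DN0=4 -DK0=1 -DMMUL_M0=4 -DMMUL_N0=4 -DMMUL_K0=4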
 *
 * @param[in]  lhs_ptr                           Pointer to the LHS tensor. Supported data types: F16/F32
 * @param[in]  lhs_stride_y                      Stride of the LHS tensor in Y dimension (in bytes)
 * @param[in]  lhs_stride_z                      Stride of the LHS tensor in Z dimension (in bytes)
 * @param[in]  lhs_w                             The size of the width dimension of the LHS tensor
 * @param[in]  lhs_h                             The size of the height dimension of the LHS tensor
 * @param[in]  lhs_n                             The size of the depth dimension of the LHS tensor
 * @param[in]  lhs_offset_first_element_in_bytes The offset of the first element in the LHS tensor
 * @param[in]  rhs_ptr                           Pointer to the RHS reshaped tensor. Supported data type: same as @p lhs_ptr
 * @param[in]  rhs_stride_y                      Stride of the RHS tensor in Y dimension (in bytes)
 * @param[in]  rhs_stride_z                      Stride of the RHS tensor in Z dimension (in bytes)
 * @param[in]  rhs_w                             The size of the width dimension of the RHS tensor
 * @param[in]  rhs_h                             The size of the height dimension of the RHS tensor
 * @param[in]  rhs_n                             The size of the depth dimension of the RHS tensor
 * @param[in]  rhs_offset_first_element_in_bytes The offset of the first element in the RHS tensor
 * @param[in]  bia_ptr                           (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
 * @param[in]  bia_stride_y                      (Optional) Stride of the bias tensor in Y dimension (in bytes)
 * @param[in]  bia_stride_z                      (Optional) Stride of the bias tensor in Z dimension (in bytes)
 * @param[in]  bia_w                             (Optional) The size of the width dimension of the bias tensor
 * @param[in]  bia_h                             (Optional) The size of the height dimension of the bias tensor
 * @param[in]  bia_n                             (Optional) The size of the depth dimension of the bias tensor
 * @param[in]  bia_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data type: same as @p lhs_ptr
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_w                             The size of the width dimension of the destination tensor
 * @param[in]  dst_h                             The size of the height dimension of the destination tensor
 * @param[in]  dst_n                             The size of the depth dimension of the destination tensor
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[in]  M                                 Number of rows in LHS matrix not reshaped
 * @param[in]  N                                 Number of columns in RHS matrix not reshaped
 * @param[in]  K                                 Number of columns in LHS matrix and rows in RHS matrix not reshaped
 */
__kernel void gemm_mm_reshaped_only_rhs_nt_mmul(
    TENSOR3D_T(lhs, BUFFER),
    TENSOR3D_T(rhs, BUFFER),
#if defined(BETA)
    TENSOR3D_T(bia, BUFFER),
#endif // defined(BETA)
    TENSOR3D_T(dst, BUFFER),
    const int M,
    const int N,
    const int K)
{
#define MMUL_BLOCK_SIZE (MMUL_N0 * MMUL_K0)

    uint x0 = get_global_id(0); // (N / N0) * MMUL_K0
    uint y0 = get_global_id(1); // (M / M0) / MMUL_M0
    uint z  = get_global_id(2); // Batch

    // Get block ID and thread ID within the block
    uint block_id  = (x0 / MMUL_BLOCK_SIZE);
    uint thread_id = (x0 % MMUL_BLOCK_SIZE);

    // Coordinate within a block
    uint block_x = thread_id % MMUL_N0;
    uint block_y = (thread_id / MMUL_M0);
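    // Worked example (assuming MMUL_N0 = MMUL_M0 = MMUL_K0 = 4, so MMUL_BLOCK_SIZE = 16):
    // thread_id = 13 -> block_x = 13 % 4 = 1, block_y = 13 / 4 = 3, i.e. the thread owns the
    // N0-wide column 1 and the M0-tall row 3 of its MMUL block.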

    // Starting destination coordinates
    uint dst_x = min(block_x * N0 + block_id * MMUL_N0 * N0, (uint)(N - 1));
    uint dst_y = min(block_y * M0 + y0 * M0 * MMUL_M0, (uint)(M - M0));

    // Note: We clamp dst_x and dst_y because we must always execute a complete MMUL block: the matrix multiplication is
    // cooperative, so a full block is required to get correct results. Only after the matrix multiplication part can we exit the kernel if the coordinates are out of bounds

    // Starting LHS coordinates
    uint lhs_x = block_x;
    uint lhs_y = dst_y;

    // Starting RHS coordinates
    uint rhs_x = block_y * N0 * MMUL_N0 + block_x * N0;
    uint rhs_y = block_id;

    // Compute LHS/RHS/DST matrix address
    lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z;
    rhs_offset_first_element_in_bytes += rhs_x * sizeof(DATA_TYPE) + rhs_y * rhs_stride_y + z * rhs_stride_z;
    dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z;

    // Note: If the RHS tensor comes from the weights of a 2D convolution layer, it is always 2D and rhs_stride_z is
    // always 0, so the tensor does not slide along Z

    // Initialize the accumulators
    // The MMUL extension accumulates the result in F32 for both F32 and F16 inputs
    TILE(float, M0, N0, c_f32);

#if !defined(HALF_PRECISION)
#define c c_f32
#endif // !defined(HALF_PRECISION)

    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        c_f32[i].v = 0;
    })

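    // The loop below advances MMUL_K0 positions along K per iteration. arm_matrix_multiply() (from the
    // cl_arm_matrix_multiply extension) is cooperative: the operands contributed by all threads of the
    // MMUL block are combined to produce the accumulated result, which is why no thread may exit before
    // this loop completes (see the clamping note above).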
    for(int k = 0; k <= K - MMUL_K0; k += MMUL_K0)
    {
        TILE(DATA_TYPE, M0, 1, a);
        TILE(DATA_TYPE, 1, N0, b);

        // Load tile from the lhs/rhs tensors
        T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
        T_LOAD(DATA_TYPE, 1, N0, BUFFER, rhs, 0, 0, 1, 0, b);

        LOOP_UNROLLING(int, m0, 0, 1, M0,
        {
            LOOP_UNROLLING(int, n0, 0, 1, N0,
            {
                c_f32[m0].s[n0] = arm_matrix_multiply(a[m0].s[0], b[0].s[n0], c_f32[m0].s[n0]);
            })
        })

        lhs_offset_first_element_in_bytes += MMUL_K0 * sizeof(DATA_TYPE);
        rhs_offset_first_element_in_bytes += MMUL_K0 * MMUL_N0 * N0 * sizeof(DATA_TYPE);
    }

    if(block_x * N0 + block_id * MMUL_N0 * N0 >= N)
    {
        return;
    }

    if(block_y * M0 + y0 * M0 * MMUL_M0 >= M)
    {
        return;
    }

#if defined(HALF_PRECISION)
    TILE(DATA_TYPE, M0, N0, c);

    // Conversion required for the half precision
    LOOP_UNROLLING(int, m0, 0, 1, M0,
    {
        LOOP_UNROLLING(int, n0, 0, 1, N0,
        {
            c[m0].s[n0] = c_f32[m0].s[n0];
        })
    })
#endif // defined(HALF_PRECISION)

    // Multiply by the weight of the matrix-matrix product
#if defined(ALPHA)
    T_SCALE_CONSTANT(DATA_TYPE, M0, N0, c, (DATA_TYPE)ALPHA, c);
#endif // defined(ALPHA)

    // Add beta*bias
#if defined(BETA)
#if defined(BROADCAST_BIAS)
    bia_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE);

    TILE(DATA_TYPE, 1, N0, bias0);

    if(dst_x + N0 <= N || N0_LEFTOVER == 0)
    {
        bias0[0].v = VLOAD(N0)(0, (DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes));
    }
    else
    {
        VLOAD_PARTIAL(N0, N0_LEFTOVER)
        (bias0[0].v, 0, (DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes));
    }

#ifndef UNIT_BETA
    T_SCALE_CONSTANT(DATA_TYPE, 1, N0, bias0, (DATA_TYPE)BETA, bias0);
#endif // UNIT_BETA

    // c = c + bias[broadcasted]
    T_ELTWISE_BROADCAST_X(V_ADD, DATA_TYPE, M0, N0, c, bias0, c);
#else // defined(BROADCAST_BIAS)
    TILE(DATA_TYPE, M0, N0, bias0);

    bia_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * bia_stride_y + z * bia_stride_z;

    if(dst_x + N0 <= N || N0_LEFTOVER == 0)
    {
        LOOP_UNROLLING(int, m0, 0, 1, M0,
        {
            if(dst_y + m0 < M || M0_LEFTOVER == 0)
            {
                bias0[m0].v = VLOAD(N0)(0, (DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes + m0 * bia_stride_y));
            }
        })
    }
    else
    {
        LOOP_UNROLLING(int, m0, 0, 1, M0,
        {
            if(dst_y + m0 < M || M0_LEFTOVER == 0)
            {
                VLOAD_PARTIAL(N0, N0_LEFTOVER)
                (bias0[m0].v, 0, (DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes + m0 * bia_stride_y));
            }
        })
    }

#ifndef UNIT_BETA
    T_SCALE_CONSTANT(DATA_TYPE, M0, N0, bias0, (DATA_TYPE)BETA, bias0);
#endif // UNIT_BETA

    // c = c + bias
    T_ADD(DATA_TYPE, M0, N0, c, bias0, c);
#endif // defined(BROADCAST_BIAS)
#endif // defined(BETA)

    T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, c, c);

    // Store
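    // Note: M0_LEFTOVER and N0_LEFTOVER are assumed to be passed at compile time as M % M0 and N % N0;
    // when they are 0, every tile is complete and the partial load/store paths are never taken.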
    if(dst_x + N0 <= N || N0_LEFTOVER == 0)
    {
        LOOP_UNROLLING(int, m0, 0, 1, M0,
        {
            if(dst_y + m0 < M || M0_LEFTOVER == 0)
            {
                VSTORE(N0)
                (c[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
            }
        })
    }
    else
    {
        LOOP_UNROLLING(int, m0, 0, 1, M0,
        {
            if(dst_y + m0 < M || M0_LEFTOVER == 0)
            {
                VSTORE_PARTIAL(N0, N0_LEFTOVER)
                (c[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
            }
        })
    }

#undef MMUL_BLOCK_SIZE
}
#endif // defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_MMUL)

#if defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_MMUL_TEXTURE)
/** This OpenCL kernel computes the matrix multiplication between 2 matrices using the MMUL extension and the OpenCL image for RHS:
 *
 *  The LHS matrix is NOT reshaped
 *  The RHS is reshaped with @ref ClGemmMatrixMultiplyReshapedOnlyRhsKernel and the block K0xN0 is NOT transposed
 *
 * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
 * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
 * @note The number of output columns processed by the cooperative MMUL extension must be passed at compile time using -DMMUL_N0 (e.g. -DMMUL_N0=2)
 * @note The number of output rows processed by the cooperative MMUL extension must be passed at compile time using -DMMUL_M0 (e.g. -DMMUL_M0=2)
 * @note The number of LHS columns (or RHS rows) processed by the cooperative MMUL extension must be passed at compile time using -DMMUL_K0 (e.g. -DMMUL_K0=2)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 > 0
 *  - N0 = 1, 2, 3, 4, 8, 16
 *  - K0 = 1
 *
 * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B values required by some activation functions must also be passed at compile time using -DA_VAL= and -DB_VAL= respectively.
 *       The activation function is applied after the bias addition
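 *
 * @note Example build options (illustrative values only; the exact configuration is chosen by the host code,
 *       which must also bind the RHS tensor as an OpenCL image):
 *       -DDATA_TYPE=float -DM0=4 -DN0=4 -DK0=1 -DMMUL_M0=4 -DMMUL_N0=4 -DMMUL_K0=4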
 *
 * @param[in]  lhs_ptr                           Pointer to the LHS tensor. Supported data types: F16/F32
 * @param[in]  lhs_stride_y                      Stride of the LHS tensor in Y dimension (in bytes)
 * @param[in]  lhs_stride_z                      Stride of the LHS tensor in Z dimension (in bytes)
 * @param[in]  lhs_w                             The size of the width dimension of the LHS tensor
 * @param[in]  lhs_h                             The size of the height dimension of the LHS tensor
 * @param[in]  lhs_n                             The size of the depth dimension of the LHS tensor
 * @param[in]  lhs_offset_first_element_in_bytes The offset of the first element in the LHS tensor
 * @param[in]  rhs_ptr                           Pointer to the RHS reshaped tensor. Supported data type: same as @p lhs_ptr
 * @param[in]  rhs_stride_y                      Stride of the RHS tensor in Y dimension (in bytes)
 * @param[in]  rhs_stride_z                      Stride of the RHS tensor in Z dimension (in bytes)
 * @param[in]  rhs_w                             The size of the width dimension of the RHS tensor
 * @param[in]  rhs_h                             The size of the height dimension of the RHS tensor
 * @param[in]  rhs_n                             The size of the depth dimension of the RHS tensor
 * @param[in]  rhs_offset_first_element_in_bytes The offset of the first element in the RHS tensor
 * @param[in]  bia_ptr                           (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
 * @param[in]  bia_stride_y                      (Optional) Stride of the bias tensor in Y dimension (in bytes)
 * @param[in]  bia_stride_z                      (Optional) Stride of the bias tensor in Z dimension (in bytes)
 * @param[in]  bia_w                             (Optional) The size of the width dimension of the bias tensor
 * @param[in]  bia_h                             (Optional) The size of the height dimension of the bias tensor
 * @param[in]  bia_n                             (Optional) The size of the depth dimension of the bias tensor
 * @param[in]  bia_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data type: same as @p lhs_ptr
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_w                             The size of the width dimension of the destination tensor
 * @param[in]  dst_h                             The size of the height dimension of the destination tensor
 * @param[in]  dst_n                             The size of the depth dimension of the destination tensor
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[in]  M                                 Number of rows in LHS matrix not reshaped
 * @param[in]  N                                 Number of columns in RHS matrix not reshaped
 * @param[in]  K                                 Number of columns in LHS matrix and rows in RHS matrix not reshaped
 */
__kernel void gemm_mm_reshaped_only_rhs_nt_mmul_texture(
    TENSOR3D_T(lhs, BUFFER),
    TENSOR3D_T(rhs, IMAGE),
#if defined(BETA)
    TENSOR3D_T(bia, BUFFER),
#endif // defined(BETA)
    TENSOR3D_T(dst, BUFFER),
    const int M,
    const int N,
    const int K)
{
#define MMUL_BLOCK_SIZE (MMUL_N0 * MMUL_K0)

    uint x0 = get_global_id(0); // (N / N0) * MMUL_K0
    uint y0 = get_global_id(1); // (M / M0) / MMUL_M0
    uint z  = get_global_id(2); // Batch

    // Get block ID and thread ID within the block
    uint block_id  = (x0 / MMUL_BLOCK_SIZE);
    uint thread_id = (x0 % MMUL_BLOCK_SIZE);

    // Coordinate within a block
    uint block_x = thread_id % MMUL_N0;
    uint block_y = (thread_id / MMUL_M0);

    // Starting destination coordinates
    uint dst_x = min(block_x * N0 + block_id * MMUL_N0 * N0, (uint)(N - 1));
    uint dst_y = min(block_y * M0 + y0 * M0 * MMUL_M0, (uint)(M - M0));

    // Note: We clamp dst_x and dst_y because we must always execute a complete MMUL block: the matrix multiplication is
    // cooperative, so a full block is required to get correct results. Only after the matrix multiplication part can we exit the kernel if the coordinates are out of bounds

    // Starting LHS coordinates
    uint lhs_x = block_x;
    uint lhs_y = dst_y;

    // Starting RHS coordinates
    uint rhs_x = block_y * N0 * MMUL_N0 + block_x * N0;
    uint rhs_y = block_id + z * rhs_h;
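    // Note: Unlike the buffer variant, the RHS batch offset is folded into the image y coordinate
    // (z * rhs_h) because the texture path addresses the RHS through image coordinates rather than
    // a byte offset.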

    // Compute LHS/DST matrix address
    lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z;
    dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z;

    // Initialize the accumulators
    // The MMUL extension accumulates the result in F32 for both F32 and F16 inputs
    TILE(float, M0, N0, c_f32);

#if !defined(HALF_PRECISION)
#define c c_f32
#endif // !defined(HALF_PRECISION)

    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        c_f32[i].v = 0;
    })

    for(int k = 0; k <= K - MMUL_K0; k += MMUL_K0)
    {
        TILE(DATA_TYPE, M0, 1, a);
        TILE(DATA_TYPE, 1, N0, b);

        // Load tile from the lhs/rhs tensors
        T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
        T_LOAD(DATA_TYPE, 1, N0, IMAGE, rhs, rhs_x, rhs_y, 1, rhs_stride_y, b);

        LOOP_UNROLLING(int, m0, 0, 1, M0,
        {
            LOOP_UNROLLING(int, n0, 0, 1, N0,
            {
                c_f32[m0].s[n0] = arm_matrix_multiply(a[m0].s[0], b[0].s[n0], c_f32[m0].s[n0]);
            })
        })

        lhs_offset_first_element_in_bytes += MMUL_K0 * sizeof(DATA_TYPE);
        rhs_x += MMUL_K0 * MMUL_N0 * N0;
    }

    if(block_x * N0 + block_id * MMUL_N0 * N0 >= N)
    {
        return;
    }

    if(block_y * M0 + y0 * M0 * MMUL_M0 >= M)
    {
        return;
    }

#if defined(HALF_PRECISION)
    TILE(DATA_TYPE, M0, N0, c);

    // Conversion required for the half precision
    LOOP_UNROLLING(int, m0, 0, 1, M0,
    {
        LOOP_UNROLLING(int, n0, 0, 1, N0,
        {
            c[m0].s[n0] = c_f32[m0].s[n0];
        })
    })
#endif // defined(HALF_PRECISION)

    // Multiply by the weight of the matrix-matrix product
#if defined(ALPHA)
    T_SCALE_CONSTANT(DATA_TYPE, M0, N0, c, (DATA_TYPE)ALPHA, c);
#endif // defined(ALPHA)

    // Add beta*bias
#if defined(BETA)
#if defined(BROADCAST_BIAS)
    bia_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE);

    TILE(DATA_TYPE, 1, N0, bias0);

    if(dst_x + N0 <= N || N0_LEFTOVER == 0)
    {
        bias0[0].v = VLOAD(N0)(0, (DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes));
    }
    else
    {
        VLOAD_PARTIAL(N0, N0_LEFTOVER)
        (bias0[0].v, 0, (DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes));
    }

#ifndef UNIT_BETA
    T_SCALE_CONSTANT(DATA_TYPE, 1, N0, bias0, (DATA_TYPE)BETA, bias0);
#endif // UNIT_BETA

    // c = c + bias[broadcasted]
    T_ELTWISE_BROADCAST_X(V_ADD, DATA_TYPE, M0, N0, c, bias0, c);
#else // defined(BROADCAST_BIAS)
    TILE(DATA_TYPE, M0, N0, bias0);

    bia_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * bia_stride_y + z * bia_stride_z;

    if(dst_x + N0 <= N || N0_LEFTOVER == 0)
    {
        LOOP_UNROLLING(int, m0, 0, 1, M0,
        {
            if(dst_y + m0 < M || M0_LEFTOVER == 0)
            {
                bias0[m0].v = VLOAD(N0)(0, (DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes + m0 * bia_stride_y));
            }
        })
    }
    else
    {
        LOOP_UNROLLING(int, m0, 0, 1, M0,
        {
            if(dst_y + m0 < M || M0_LEFTOVER == 0)
            {
                VLOAD_PARTIAL(N0, N0_LEFTOVER)
                (bias0[m0].v, 0, (DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes + m0 * bia_stride_y));
            }
        })
    }

#ifndef UNIT_BETA
    T_SCALE_CONSTANT(DATA_TYPE, M0, N0, bias0, (DATA_TYPE)BETA, bias0);
#endif // UNIT_BETA

    // c = c + bias
    T_ADD(DATA_TYPE, M0, N0, c, bias0, c);
#endif // defined(BROADCAST_BIAS)
#endif // defined(BETA)

    T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, c, c);

    // Store
    if(dst_x + N0 <= N || N0_LEFTOVER == 0)
    {
        LOOP_UNROLLING(int, m0, 0, 1, M0,
        {
            if(dst_y + m0 < M || M0_LEFTOVER == 0)
            {
                VSTORE(N0)
                (c[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
            }
        })
    }
    else
    {
        LOOP_UNROLLING(int, m0, 0, 1, M0,
        {
            if(dst_y + m0 < M || M0_LEFTOVER == 0)
            {
                VSTORE_PARTIAL(N0, N0_LEFTOVER)
                (c[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
            }
        })
    }

#undef MMUL_BLOCK_SIZE
}
#endif // defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_MMUL_TEXTURE)