// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/6x8-aarch64-neonfma-ld64.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

// Purpose
//   f32 GEMM micro-kernel with caller-supplied initial accumulators (acc).
//   Each pass of the outer loop (label 0) handles up to 8 output columns for
//   up to 6 rows: it loads 12 accumulator vectors from acc, adds A * B over
//   kc bytes of every A row (B is streamed from w), clamps the result, and
//   stores it through the six C row pointers.
//
// Syntax / ABI
//   GNU assembler, AArch64, AAPCS64. Leaf function: no stack frame is
//   created and no callee-saved register is written.
//
// NOTE(review): x18 is deliberately NOT used. AAPCS64 designates it the
//   platform register and some platforms reserve it, so c3 lives in x8 and
//   the params pointer is only held in x14 until the clamp values have been
//   loaded (x14 is then reused for cn_stride).
// NOTE(review): this file is generated; the same register assignment has to
//   be made in the template named above or it will be lost on regeneration.

# void xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
#     const float*restrict acc,  [sp + 8] -> x15
#     const union xnn_f32_output_params params[restrict static 1])  [sp + 16] -> x14 (temporary)

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.

# A pointers
# x3  a0
# x9  a1
# x10 a2
# x11 a3
# x12 a4
# x4  a5

# C pointers
# x6  c0
# x16 c1
# x17 c2
# x8  c3
# x13 c4
# x7  c5

# Vector register usage
# A0   v0
# A1   v1
# A2   v2
# A3   v3
# A4   v4
# A5   v5
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7   (v6: upper bound, applied with FMIN; v7: lower bound, applied with FMAX)
# unused A   v8 v9 v10 v11
# unused B   v12 v13 v14 v15

BEGIN_FUNCTION xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64

        # Clamp A and C pointers
        // Rows at index >= mr alias the previous row, so every load and
        // store below stays inside the rows the caller actually provided.
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x8, x17, x7          // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x8, x17, x8, LO     //   c3 = c2

        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x8, x7          // c4 = c3 + cm_stride
                                 // if mr <= 4 (flags still from CMP x0, 4)
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x8, x13, LS    //   c4 = c3

        # Load acc, params pointer
        // x14 holds params only until the LD2R below has consumed it.
        LDP x15, x14, [sp, 8]

        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        # Load clamping_params values
        // First float of params is replicated into v6, second into v7.
        LD2R {v6.4s, v7.4s}, [x14]

        # Load cn_stride
        // Must follow the LD2R: this overwrites the params pointer in x14.
        LDR x14, [sp]

0:
        # Load initial accumulators
        // 12 vectors (6 rows x 8 floats) per pass; x15 keeps advancing so
        // the next pass picks up the following accumulator tile.
        LDP q20, q21, [x15], 32
        LDP q22, q23, [x15], 32
        LDP q24, q25, [x15], 32
        LDP q26, q27, [x15], 32
        LDP q28, q29, [x15], 32
        LDP q30, q31, [x15], 32
        PRFM PLDL1KEEP, [x5, 0]     // Prefetch B
        PRFM PLDL1KEEP, [x5, 64]
        PRFM PLDL1KEEP, [x5, 128]
        PRFM PLDL1KEEP, [x5, 192]
        PRFM PLDL1KEEP, [x3]        // Prefetch A
        PRFM PLDL1KEEP, [x9]
        PRFM PLDL1KEEP, [x10]
        PRFM PLDL1KEEP, [x11]
        PRFM PLDL1KEEP, [x12]
        PRFM PLDL1KEEP, [x4]

        # Are there at least 2 floats (8 bytes) for the main loop?
        // NOTE(review): branching straight to the 1-float remainder when
        // kc < 8 presumably relies on kc being a non-zero multiple of
        // 4 bytes - confirm against callers.
        SUBS x0, x2, 8           // k = kc - 8
        B.LO 4f

        # Main loop - 2 floats of A (8 bytes)
        # 24 FMA + 6 LD64 A + 2 LDP B
1:
        LDR d0, [x3], 8
        LDP q16, q17, [x5], 32
        LDR d1, [x9], 8
        LDR d2, [x10], 8
        LDR d3, [x11], 8
        LDR d4, [x12], 8
        LDR d5, [x4], 8
        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v1.s[0]
        FMLA v24.4s, v16.4s, v2.s[0]
        FMLA v26.4s, v16.4s, v3.s[0]
        LDP q18, q19, [x5], 32
        FMLA v28.4s, v16.4s, v4.s[0]
        FMLA v30.4s, v16.4s, v5.s[0]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v1.s[0]
        FMLA v25.4s, v17.4s, v2.s[0]
        FMLA v27.4s, v17.4s, v3.s[0]
        FMLA v29.4s, v17.4s, v4.s[0]
        FMLA v31.4s, v17.4s, v5.s[0]

        FMLA v20.4s, v18.4s, v0.s[1]
        FMLA v22.4s, v18.4s, v1.s[1]
        FMLA v24.4s, v18.4s, v2.s[1]
        FMLA v26.4s, v18.4s, v3.s[1]
        FMLA v28.4s, v18.4s, v4.s[1]
        FMLA v30.4s, v18.4s, v5.s[1]
        FMLA v21.4s, v19.4s, v0.s[1]
        FMLA v23.4s, v19.4s, v1.s[1]
        FMLA v25.4s, v19.4s, v2.s[1]
        FMLA v27.4s, v19.4s, v3.s[1]
        SUBS x0, x0, 8           // k -= 8; flags consumed by B.HS below
        FMLA v29.4s, v19.4s, v4.s[1]
        FMLA v31.4s, v19.4s, v5.s[1]
        B.HS 1b

        # Is there a remainder? - 1 float of A (4 bytes)
        TBNZ x0, 2, 4f
3:
        # Clamp
        FMIN v20.4s, v20.4s, v6.4s
        // nc -= 8. The vector instructions and plain SUBs that follow do
        // not write the flags, so both B.LO 5f and B.HI 0b test this result.
        SUBS x1, x1, 8
        FMIN v21.4s, v21.4s, v6.4s
        FMIN v22.4s, v22.4s, v6.4s
        FMIN v23.4s, v23.4s, v6.4s
        FMIN v24.4s, v24.4s, v6.4s
        FMIN v25.4s, v25.4s, v6.4s
        FMIN v26.4s, v26.4s, v6.4s
        FMIN v27.4s, v27.4s, v6.4s
        FMIN v28.4s, v28.4s, v6.4s
        FMIN v29.4s, v29.4s, v6.4s
        FMIN v30.4s, v30.4s, v6.4s
        FMIN v31.4s, v31.4s, v6.4s
        FMAX v20.4s, v20.4s, v7.4s
        FMAX v21.4s, v21.4s, v7.4s
        FMAX v22.4s, v22.4s, v7.4s
        FMAX v23.4s, v23.4s, v7.4s
        FMAX v24.4s, v24.4s, v7.4s
        FMAX v25.4s, v25.4s, v7.4s
        FMAX v26.4s, v26.4s, v7.4s
        FMAX v27.4s, v27.4s, v7.4s
        FMAX v28.4s, v28.4s, v7.4s
        FMAX v29.4s, v29.4s, v7.4s
        FMAX v30.4s, v30.4s, v7.4s
        FMAX v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        B.LO 5f                  // fewer than 8 columns left: partial store

        // Each C pointer advances by cn_stride; each A pointer is rewound
        // by kc so the next column block re-reads the same A rows.
        ST1 {v30.16b, v31.16b}, [x7], x14
        SUB x3, x3, x2           // a0 -= kc
        ST1 {v28.16b, v29.16b}, [x13], x14
        SUB x9, x9, x2           // a1 -= kc
        ST1 {v26.16b, v27.16b}, [x8], x14
        SUB x10, x10, x2         // a2 -= kc
        ST1 {v24.16b, v25.16b}, [x17], x14
        SUB x11, x11, x2         // a3 -= kc
        ST1 {v22.16b, v23.16b}, [x16], x14
        SUB x12, x12, x2         // a4 -= kc
        ST1 {v20.16b, v21.16b}, [x6], x14
        SUB x4, x4, x2           // a5 -= kc

        B.HI 0b                  // more columns remain (nc was > 8)
        RET

4:
        # Remainder - 1 float of A (4 bytes)
        LDR s0, [x3], 4
        LDP q16, q17, [x5], 32
        LDR s1, [x9], 4
        LDR s2, [x10], 4
        LDR s3, [x11], 4
        LDR s4, [x12], 4
        LDR s5, [x4], 4
        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v1.s[0]
        FMLA v24.4s, v16.4s, v2.s[0]
        FMLA v26.4s, v16.4s, v3.s[0]
        FMLA v28.4s, v16.4s, v4.s[0]
        FMLA v30.4s, v16.4s, v5.s[0]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v1.s[0]
        FMLA v25.4s, v17.4s, v2.s[0]
        FMLA v27.4s, v17.4s, v3.s[0]
        FMLA v29.4s, v17.4s, v4.s[0]
        FMLA v31.4s, v17.4s, v5.s[0]
        B 3b

        # Store odd width
        // x1 holds nc - 8 here; subtracting 8 leaves bits 0-2 unchanged, so
        // they still select the 4-, 2- and 1-column pieces to write.
5:
        TBZ x1, 2, 6f
        STR q30, [x7], 16
        MOV v30.16b, v31.16b
        STR q28, [x13], 16
        MOV v28.16b, v29.16b
        STR q26, [x8], 16
        MOV v26.16b, v27.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q20, [x6], 16
        MOV v20.16b, v21.16b

6:
        TBZ x1, 1, 7f
        STR d30, [x7], 8
        DUP d30, v30.d[1]
        STR d28, [x13], 8
        DUP d28, v28.d[1]
        STR d26, [x8], 8
        DUP d26, v26.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d20, [x6], 8
        DUP d20, v20.d[1]

7:
        TBZ x1, 0, 8f
        STR s30, [x7]
        STR s28, [x13]
        STR s26, [x8]
        STR s24, [x17]
        STR s22, [x16]
        STR s20, [x6]
8:
        RET

END_FUNCTION xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif