// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/1x12-aarch64-neonfma-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm_minmax_ukernel_1x12__aarch64_neonfma_cortex_a53(
#     size_t mr,                (x0) - unused.  mr = 1
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          (x4) - unused
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         (x7) - unused
#     size_t cn_stride,         [sp] -> x14
#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8

# Purpose: computes one row of C (1 x nc floats) as bias + A (1 x kc) * B,
# clamped to [min, max], producing 12 output columns per pass of the outer loop.
#
# Units, as used by the code below:
#   nc is counted in output columns (12 are subtracted per stored tile).
#   kc is counted in bytes of A (32 bytes are treated as 8 floats, and the
#   A pointer is rewound by kc after each tile).
#
# Layout of w, as implied by the access pattern: for each 12-column tile,
# 12 bias floats (48 bytes) followed by 12 weight floats (48 bytes) for every
# float of A. x5 only ever advances, so tiles are read back to back.
#
# x0 is overwritten on entry to each tile with the running k counter, which is
# why kc is listed as x2 / x0 above.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# This kernel only touches x0-x3, x5, x6, x8, x14, v0-v3, v5-v7, v16-v18 and
# v20-v31, so nothing is saved or restored on the stack.

# A pointer
# x3  a0

# C pointer
# x6  c0

# Accumulators
# v16 v17 v18  primary set (initialised with bias; takes A lanes 0 and 2)
# v5  v6  v7   secondary set (initialised with zero; takes A lanes 1 and 3)

# Clamp v2 v3
# v2 holds the lower bound (used with FMAX), v3 the upper bound (used with FMIN).

# A53 based on A57/A75 but with LD64
# NOTE(review): every load in this file is a 128-bit LDR q / LDP q / LD1;
# confirm against the template what the LD64 remark refers to.

BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_1x12__aarch64_neonfma_cortex_a53

        # Load cn_stride, params pointer
        LDP x14, x8, [sp]

        # Load min/max values
        # LD2R reads two consecutive floats from params and broadcasts the
        # first to every lane of v2 and the second to every lane of v3.
        LD2R {v2.4s, v3.4s}, [x8]
0:
        # Outer loop: one iteration per tile of 12 output columns.
        # Load initial bias from w into accumulators
        LD1 {v16.16b, v17.16b, v18.16b}, [x5], 48

        # Zero the secondary accumulators and prefetch the next 256 bytes of w.
        MOVI v5.4s, 0  // second set of C for pipelining FMLA
        PRFM PLDL1KEEP, [x5]
        MOVI v6.4s, 0
        PRFM PLDL1KEEP, [x5, 64]
        MOVI v7.4s, 0
        PRFM PLDL1KEEP, [x5, 128]
        PRFM PLDL1KEEP, [x5, 192]

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32  // k = kc - 32

        # Fewer than 8 floats of A: skip straight to the remainder handling.
        B.LO 3f

        # Prologue
        # Read first block of 1 A and B.
        # B: 12 q registers (192 bytes) = 12 weights for each of 4 floats of A.
        # A: 4 floats in q0.
        LDP q20, q21, [x5], 32
        LDP q22, q23, [x5], 32
        LDP q24, q25, [x5], 32
        LDP q26, q27, [x5], 32
        LDP q28, q29, [x5], 32
        LDP q30, q31, [x5], 32
        LDR q0, [x3], 16

        # Are there at least 32 more bytes (8 floats) of A beyond the 32 already
        # reserved for the epilogue? Yes: run the main loop. No: go to epilogue.
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
        # Each FMLA is followed by a load that refills a register whose old
        # value has already been consumed, so the order here must not change.
1:
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDR q1, [x3], 16
        FMLA v17.4s, v21.4s, v0.s[0]
        LDR q20, [x5], 16
        FMLA v18.4s, v22.4s, v0.s[0]
        LDR q21, [x5], 16
        FMLA v5.4s, v23.4s, v0.s[1]
        LDR q22, [x5], 16
        FMLA v6.4s, v24.4s, v0.s[1]
        LDR q23, [x5], 16
        FMLA v7.4s, v25.4s, v0.s[1]
        LDR q24, [x5], 16
        FMLA v16.4s, v26.4s, v0.s[2]
        LDR q25, [x5], 16
        FMLA v17.4s, v27.4s, v0.s[2]
        LDR q26, [x5], 16
        FMLA v18.4s, v28.4s, v0.s[2]
        LDR q27, [x5], 16
        FMLA v5.4s, v29.4s, v0.s[3]
        LDR q28, [x5], 16
        FMLA v6.4s, v30.4s, v0.s[3]
        LDR q29, [x5], 16
        FMLA v7.4s, v31.4s, v0.s[3]
        LDR q30, [x5], 16
        LDR q31, [x5], 16

        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
        FMLA v16.4s, v20.4s, v1.s[0]
        LDR q0, [x3], 16
        FMLA v17.4s, v21.4s, v1.s[0]
        LDR q20, [x5], 16
        FMLA v18.4s, v22.4s, v1.s[0]
        LDR q21, [x5], 16
        FMLA v5.4s, v23.4s, v1.s[1]
        LDR q22, [x5], 16
        FMLA v6.4s, v24.4s, v1.s[1]
        LDR q23, [x5], 16
        FMLA v7.4s, v25.4s, v1.s[1]
        LDR q24, [x5], 16
        FMLA v16.4s, v26.4s, v1.s[2]
        LDR q25, [x5], 16
        FMLA v17.4s, v27.4s, v1.s[2]
        LDR q26, [x5], 16
        FMLA v18.4s, v28.4s, v1.s[2]
        LDR q27, [x5], 16
        FMLA v5.4s, v29.4s, v1.s[3]
        LDR q28, [x5], 16
        FMLA v6.4s, v30.4s, v1.s[3]
        LDR q29, [x5], 16
        FMLA v7.4s, v31.4s, v1.s[3]
        LDR q30, [x5], 16
        SUBS x0, x0, 32  // k -= 32; flags are consumed by B.HS below
        LDR q31, [x5], 16
        B.HS 1b  // loop while at least 32 more bytes of A remained

2:
        # Epilogue
        # Consumes the last full group of 8 floats of A. The first 4 floats and
        # their B block were already loaded by the prologue or the main loop.

        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDR q1, [x3], 16
        FMLA v17.4s, v21.4s, v0.s[0]
        LDR q20, [x5], 16
        FMLA v18.4s, v22.4s, v0.s[0]
        LDR q21, [x5], 16
        FMLA v5.4s, v23.4s, v0.s[1]
        LDR q22, [x5], 16
        FMLA v6.4s, v24.4s, v0.s[1]
        LDR q23, [x5], 16
        FMLA v7.4s, v25.4s, v0.s[1]
        LDR q24, [x5], 16
        FMLA v16.4s, v26.4s, v0.s[2]
        LDR q25, [x5], 16
        FMLA v17.4s, v27.4s, v0.s[2]
        LDR q26, [x5], 16
        FMLA v18.4s, v28.4s, v0.s[2]
        LDR q27, [x5], 16
        FMLA v5.4s, v29.4s, v0.s[3]
        LDR q28, [x5], 16
        FMLA v6.4s, v30.4s, v0.s[3]
        LDR q29, [x5], 16
        FMLA v7.4s, v31.4s, v0.s[3]
        LDR q30, [x5], 16

        # Second block of 4.  FMA for second 4. The only load is the last B
        # register of this block (q31); nothing is preloaded for a next block.
        FMLA v16.4s, v20.4s, v1.s[0]
        LDR q31, [x5], 16
        FMLA v17.4s, v21.4s, v1.s[0]
        FMLA v18.4s, v22.4s, v1.s[0]
        FMLA v5.4s, v23.4s, v1.s[1]
        FMLA v6.4s, v24.4s, v1.s[1]
        FMLA v7.4s, v25.4s, v1.s[1]
        FMLA v16.4s, v26.4s, v1.s[2]
        FMLA v17.4s, v27.4s, v1.s[2]
        FMLA v18.4s, v28.4s, v1.s[2]
        FMLA v5.4s, v29.4s, v1.s[3]
        FMLA v6.4s, v30.4s, v1.s[3]
        FMLA v7.4s, v31.4s, v1.s[3]

3:
        # Remainder dispatch. x0 is kc minus a multiple of 32, so its bits 2-4
        # still equal those of kc even when x0 has gone negative.
        # Is there a remainder?- 4 floats of A (16 bytes)
        TBNZ x0, 4, 5f
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBNZ x0, 3, 6f
        # Is there a remainder?- 1 float of A (4 bytes)
        TBNZ x0, 2, 8f

4:
        # Fold the secondary accumulators into the primary ones.
        FADD v16.4s, v16.4s, v5.4s
        FADD v17.4s, v17.4s, v6.4s
        FADD v18.4s, v18.4s, v7.4s
        # nc -= 12. The flags set here drive both B.LO and B.HI below; none of
        # the instructions in between write the condition flags.
        SUBS x1, x1, 12

        # Clamp
        FMAX v16.4s, v16.4s, v2.4s
        FMAX v17.4s, v17.4s, v2.4s
        FMAX v18.4s, v18.4s, v2.4s
        FMIN v16.4s, v16.4s, v3.4s
        FMIN v17.4s, v17.4s, v3.4s
        FMIN v18.4s, v18.4s, v3.4s

        # Store full 1 x 12
        # Fewer than 12 columns were left: take the partial store path.
        B.LO 9f

        ST1 {v16.16b, v17.16b, v18.16b}, [x6], x14  // c0 += cn_stride
        SUB x3, x3, x2  // a0 -= kc

        # More columns remain: process the next tile.
        B.HI 0b

        RET

5:
        # Remainder- 4 floats of A (16 bytes)
        # Only the primary accumulators are used in the remainder paths.
        LDR q0, [x3], 16
        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR q22, [x5], 16
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v22.4s, v0.s[0]

        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR q22, [x5], 16
        FMLA v16.4s, v20.4s, v0.s[1]
        FMLA v17.4s, v21.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v0.s[1]

        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR q22, [x5], 16
        FMLA v16.4s, v20.4s, v0.s[2]
        FMLA v17.4s, v21.4s, v0.s[2]
        FMLA v18.4s, v22.4s, v0.s[2]

        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR q22, [x5], 16
        FMLA v16.4s, v20.4s, v0.s[3]
        FMLA v17.4s, v21.4s, v0.s[3]
        FMLA v18.4s, v22.4s, v0.s[3]

        # No 2-float remainder: skip ahead to the 1-float check.
        TBZ x0, 3, 7f
6:
        # Remainder- 2 floats of A (8 bytes)
        LDR d0, [x3], 8
        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR q22, [x5], 16
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v22.4s, v0.s[0]

        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR q22, [x5], 16
        FMLA v16.4s, v20.4s, v0.s[1]
        FMLA v17.4s, v21.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v0.s[1]
7:
        # No 1-float remainder: go finish the tile.
        TBZ x0, 2, 4b
8:
        # Remainder- 1 float of A (4 bytes)
        LDR s0, [x3], 4
        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR q22, [x5], 16
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v22.4s, v0.s[0]
        B 4b

        # Store odd channels
        # Reached only when fewer than 12 columns remain. The count is
        # restored to 1..11 and written out as 8 + 4 + 2 + 1 floats according
        # to its bits, shifting the not yet stored data down into v16.
9:
        ADD x1, x1, 12  // undo the SUBS so x1 is the remaining column count
        TBZ x1, 3, 10f
        STP q16, q17, [x6], 32  // store 8 floats
        MOV v16.16b, v18.16b

10:
        TBZ x1, 2, 11f
        STR q16, [x6], 16  // store 4 floats
        MOV v16.16b, v17.16b

11:
        TBZ x1, 1, 12f
        STR d16, [x6], 8  // store 2 floats
        DUP d16, v16.d[1]  // move the upper 2 floats into the low half

12:
        TBZ x1, 0, 13f
        STR s16, [x6]  // store 1 float
13:
        RET

END_FUNCTION xnn_f32_gemm_minmax_ukernel_1x12__aarch64_neonfma_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif