// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/1x8-aarch64-neonfma-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53(
#     size_t mr,                (x0) - unused.  mr = 1
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          (x4) - unused
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         (x7) - unused
#     size_t cn_stride,         [sp] -> x14
#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8
#
# Purpose: computes one row of C in tiles of 8 floats.  For every tile the
# two accumulators are seeded from 8 floats read from w, then for every
# float of A the next 8 floats of w are multiplied by it and accumulated.
# The result is clamped and stored, and the loop repeats until nc is used up.
#
# kc is a byte count (it is compared against 32 bytes = 8 floats below).
# Only bits 2-4 of kc are examined for the remainder, so kc is assumed to be
# a multiple of 4 bytes - TODO confirm against callers.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# None of those registers are touched by this routine, and the stack is only
# read (two stacked arguments), so there is no prologue/epilogue.

# Register roles:
# x0        k, remaining byte count of A for the current tile (mr is unused)
# x1        nc, remaining output columns
# x2        kc, used to reload k and to rewind a0 after each tile
# x8        params pointer (read once)
# x14       cn_stride, post-increment applied to c0 after a full store
# v0 v1     A values (4 floats each)
# v16 v17   primary accumulators, columns 0-3 and 4-7
# v18 v19   secondary accumulators, summed into v16 v17 at label 4
# v20-v27   B values read from w

# A pointer
# x3 a0

# C pointer
# x6 c0

# Clamp v4 v5
# v4 is the lower bound (applied with FMAX), v5 the upper bound (FMIN).

# A53 based on A57/A75 but with LD64
# NOTE(review): the body below only uses 128-bit LDR q / LDP q loads; the
# LD64 remark comes from the template header - confirm it still applies.

BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53

        # Load cn_stride, params pointer
        LDP x14, x8, [sp]

        # Load min/max values
        # LD2R reads two consecutive floats at x8 and broadcasts the first
        # to every lane of v4 and the second to every lane of v5.
        LD2R {v4.4s, v5.4s}, [x8]
0:
        # Start of one 8-column tile.
        # Load initial bias from w into accumulators
        LDP q16, q17, [x5], 32

        MOVI v18.4s, 0 // second set of C for pipelining FMLA
        PRFM PLDL1KEEP, [x5]
        MOVI v19.4s, 0
        PRFM PLDL1KEEP, [x5, 64]
        PRFM PLDL1KEEP, [x5, 128]
        PRFM PLDL1KEEP, [x5, 192]

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32 // k = kc - 32

        // kc < 32 bytes: skip the pipelined code, handle everything as remainder
        B.LO 3f

        # Prologue - preload the first 4 k-steps.
        # Read first block of 1 A and B: 8 vectors of B (q20-q27) and
        # 4 floats of A (q0).
        LDP q20, q21, [x5], 32
        LDP q22, q23, [x5], 32
        LDP q24, q25, [x5], 32
        LDP q26, q27, [x5], 32
        LDR q0, [x3], 16

        # Are there at least 32 more bytes (8 floats) of A beyond the 8 floats
        # consumed by prologue + epilogue?  If yes do main loop, otherwise go
        # straight to the epilogue.
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
        # Each iteration also consumes 16 vectors of B (256 bytes of w).
        # Every load that overwrites v20-v27 is placed after the last FMLA
        # that reads the previous contents of that register, so the
        # instruction order below must not be changed.
1:
        # First block of 4. FMA for first 4, loads for 2nd block of 4.
        # A lanes 0 and 2 accumulate into v16 v17, lanes 1 and 3 into v18 v19.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDR q1, [x3], 16
        FMLA v17.4s, v21.4s, v0.s[0]
        LDR q20, [x5], 16
        FMLA v18.4s, v22.4s, v0.s[1]
        LDR q21, [x5], 16
        FMLA v19.4s, v23.4s, v0.s[1]
        LDR q22, [x5], 16
        FMLA v16.4s, v24.4s, v0.s[2]
        LDR q23, [x5], 16
        FMLA v17.4s, v25.4s, v0.s[2]
        LDR q24, [x5], 16
        FMLA v18.4s, v26.4s, v0.s[3]
        LDR q25, [x5], 16
        FMLA v19.4s, v27.4s, v0.s[3]
        LDR q26, [x5], 16
        LDR q27, [x5], 16

        # Second block of 4. FMA for second 4, loads for 1st block of 4.
        FMLA v16.4s, v20.4s, v1.s[0]
        LDR q0, [x3], 16
        FMLA v17.4s, v21.4s, v1.s[0]
        LDR q20, [x5], 16
        FMLA v18.4s, v22.4s, v1.s[1]
        LDR q21, [x5], 16
        FMLA v19.4s, v23.4s, v1.s[1]
        LDR q22, [x5], 16
        FMLA v16.4s, v24.4s, v1.s[2]
        LDR q23, [x5], 16
        FMLA v17.4s, v25.4s, v1.s[2]
        LDR q24, [x5], 16
        FMLA v18.4s, v26.4s, v1.s[3]
        LDR q25, [x5], 16
        FMLA v19.4s, v27.4s, v1.s[3]
        LDR q26, [x5], 16
        SUBS x0, x0, 32 // k -= 32; the flags feed B.HS below
        LDR q27, [x5], 16
        B.HS 1b // loop while k did not drop below zero

2:
        # Epilogue
        # Same 8 floats of A as one main loop iteration, but nothing is
        # preloaded for a following iteration.

        # First block of 4. FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDR q1, [x3], 16
        FMLA v17.4s, v21.4s, v0.s[0]
        LDR q20, [x5], 16
        FMLA v18.4s, v22.4s, v0.s[1]
        LDR q21, [x5], 16
        FMLA v19.4s, v23.4s, v0.s[1]
        LDR q22, [x5], 16
        FMLA v16.4s, v24.4s, v0.s[2]
        LDR q23, [x5], 16
        FMLA v17.4s, v25.4s, v0.s[2]
        LDR q24, [x5], 16
        FMLA v18.4s, v26.4s, v0.s[3]
        LDR q25, [x5], 16
        FMLA v19.4s, v27.4s, v0.s[3]
        LDR q26, [x5], 16

        # Second block of 4. No A loads; only the last B vector (q27) of
        # this block is still to be loaded.
        FMLA v16.4s, v20.4s, v1.s[0]
        LDR q27, [x5], 16
        FMLA v17.4s, v21.4s, v1.s[0]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v16.4s, v24.4s, v1.s[2]
        FMLA v17.4s, v25.4s, v1.s[2]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]

3:
        # Remainder dispatch.  Only multiples of 32 have been subtracted from
        # kc to obtain x0, so bits 2-4 of x0 still equal bits 2-4 of kc.
        # Is there a remainder?- 4 floats of A (16 bytes)
        TBNZ x0, 4, 5f
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBNZ x0, 3, 6f
        # Is there a remainder?- 1 floats of A (4 bytes)
        TBNZ x0, 2, 8f

4:
        # Fold the secondary accumulators into the primary ones.
        FADD v16.4s, v16.4s, v18.4s
        FADD v17.4s, v17.4s, v19.4s

        # Clamp
        FMAX v16.4s, v16.4s, v4.4s
        SUBS x1, x1, 8 // nc -= 8; the flags feed B.LO and B.HI below
        FMAX v17.4s, v17.4s, v4.4s
        FMIN v16.4s, v16.4s, v5.4s
        FMIN v17.4s, v17.4s, v5.4s

        # Store full 1 x 8
        # Fewer than 8 columns were left: take the partial store path.
        B.LO 9f

        ST1 {v16.16b, v17.16b}, [x6], x14 // c0 += cn_stride
        SUB x3, x3, x2 // a0 -= kc

        // SUB (not SUBS) leaves the flags of SUBS x1 intact:
        // more columns remain, start the next tile
        B.HI 0b

        RET

5:
        # Remainder- 4 floats of A (16 bytes)
        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR q0, [x3], 16
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDR q22, [x5], 16
        LDR q23, [x5], 16
        LDR q24, [x5], 16
        LDR q25, [x5], 16
        LDR q26, [x5], 16
        LDR q27, [x5], 16
        FMLA v18.4s, v22.4s, v0.s[1]
        FMLA v19.4s, v23.4s, v0.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v26.4s, v0.s[3]
        FMLA v19.4s, v27.4s, v0.s[3]

        // no 2-float remainder: skip to the 1-float test
        TBZ x0, 3, 7f
6:
        # Remainder- 2 floats of A (8 bytes)
        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR d0, [x3], 8
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDR q22, [x5], 16
        LDR q23, [x5], 16
        FMLA v18.4s, v22.4s, v0.s[1]
        FMLA v19.4s, v23.4s, v0.s[1]
7:
        // no 1-float remainder: go clamp and store
        TBZ x0, 2, 4b
8:
        # Remainder- 1 float of A (4 bytes)
        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR s0, [x3], 4
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        B 4b

        # Store odd channels
        # Reached when fewer than 8 columns were left.  x1 holds nc - 8 here;
        # subtracting 8 does not change bits 0-2, so they still encode the
        # number of columns to store as 4 + 2 + 1.  This path returns without
        # looping back to label 0.
9:
        TBZ x1, 2, 10f
        STR q16, [x6], 16 // store 4 floats
        MOV v16.16b, v17.16b // move columns 4-7 down into v16

10:
        TBZ x1, 1, 11f
        STR d16, [x6], 8 // store 2 floats
        DUP d16, v16.d[1] // move the upper 2 floats of v16 into the low half

11:
        TBZ x1, 0, 12f
        STR s16, [x6] // store the last float
12:
        RET

END_FUNCTION xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif