// Auto-generated file. Do not edit!
//   Template: src/f32-igemm/1x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

// NOTE(review): this file is marked as generated; comment changes made here
// presumably also need to be applied to the template named above - confirm.

# void xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a57(
#     size_t mr,                         (x0) - unused.  mr = 1
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const float*restrict w,            x5
#     float*restrict c,                  x6
#     size_t cm_stride,                  (x7) - unused
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_minmax_params params [sp + 24] -> x8

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
// This kernel touches only x0-x6, x8-x12, v0-v1, v16-v27 and v30-v31, so it
// saves and restores nothing and never adjusts sp.

# A pointer
# x8  a0

# C pointer
# x6  c0

// Vector register roles
//   v0, v1     A values (4 floats each)
//   v16, v17   accumulators for the 8 output columns (seeded from w)
//   v18, v19   second accumulator pair (seeded with 0), added into v16/v17
//              after the ks loop
//   v20-v27    values streamed from w, 8 floats per k step
//   v30, v31   clamp bounds: v30 is the FMAX operand (lower bound),
//              v31 is the FMIN operand (upper bound)
//
// x8 first holds the params pointer and is then reused as a0.
// kc is consumed in bytes: the code subtracts 32 per 8 floats of A.

BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a57

        # Load cn_stride, a_offset
        LDP x10, x11, [sp]

        # Load zero, params pointer
        LDP x12, x8, [sp, 16]

        # Load min/max values
        // LD2R reads two consecutive floats at [x8] and replicates the first
        // across v30 and the second across v31.
        LD2R {v30.4s, v31.4s}, [x8]

// nc loop: one pass per group of up to 8 output columns.
0:
        # Load initial bias from w into accumulators
        LDP q16, q17, [x5], 32
        MOVI v18.4s, 0  // second set of C for pipelining FMLA
        MOVI v19.4s, 0

        MOV x9, x3  // p = ks

// ks loop: one pass per A pointer fetched from the indirection array at x4.
1:
        # Load next A pointer
        LDR x8, [x4], 8

        // The compare uses the pointer before a_offset is added; CSEL then
        // picks between the untouched zero pointer and the offset pointer.
        CMP x8, x12            // if a0 == zero
        ADD x8, x8, x11        // a0 += a_offset
        CSEL x8, x12, x8, EQ   //   a0 = zero if it matched, else a0 + a_offset

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32  // k = kc - 32
        B.LO 4f          // kc < 32 bytes: only the remainder paths run

        # Prologue - 4 floats of A (16 bytes)
        # Read first block of A and B.
        LDP q20, q21, [x5], 32
        LDP q22, q23, [x5], 32
        LDP q24, q25, [x5], 32
        LDP q26, q27, [x5], 32
        LDR q0, [x8], 16

        # Are there at least 8 more floats (32 bytes)?  If yes, run the main loop.
        SUBS x0, x0, 32
        B.LO 3f

        # Main loop - 8 floats of A (32 bytes)
        // On entry to each iteration v0 and v20-v27 already hold the data for
        // the first block of 4; the iteration leaves them reloaded for the
        // next iteration or for the epilogue.
2:
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDR q1, [x8], 16
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q20, q21, [x5], 32
        FMLA v18.4s, v22.4s, v0.s[1]
        FMLA v19.4s, v23.4s, v0.s[1]
        LDP q22, q23, [x5], 32
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        LDP q24, q25, [x5], 32
        FMLA v18.4s, v26.4s, v0.s[3]
        FMLA v19.4s, v27.4s, v0.s[3]
        LDP q26, q27, [x5], 32

        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
        FMLA v16.4s, v20.4s, v1.s[0]
        LDR q0, [x8], 16
        FMLA v17.4s, v21.4s, v1.s[0]
        LDP q20, q21, [x5], 32
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        LDP q22, q23, [x5], 32
        FMLA v16.4s, v24.4s, v1.s[2]
        FMLA v17.4s, v25.4s, v1.s[2]
        LDP q24, q25, [x5], 32
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        SUBS x0, x0, 32
        LDP q26, q27, [x5], 32  // LDP leaves the flags from SUBS intact
        B.HS 2b

3:
        # Epilogue
        // Consumes the block already held in v0 / v20-v27, then loads and
        // consumes one final block of 4 without reading any further ahead.

        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDR q1, [x8], 16
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q20, q21, [x5], 32
        FMLA v18.4s, v22.4s, v0.s[1]
        FMLA v19.4s, v23.4s, v0.s[1]
        LDP q22, q23, [x5], 32
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        LDP q24, q25, [x5], 32
        FMLA v18.4s, v26.4s, v0.s[3]
        FMLA v19.4s, v27.4s, v0.s[3]
        LDP q26, q27, [x5], 32

        # Second block of 4.  no loads
        FMLA v16.4s, v20.4s, v1.s[0]
        FMLA v17.4s, v21.4s, v1.s[0]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v16.4s, v24.4s, v1.s[2]
        FMLA v17.4s, v25.4s, v1.s[2]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]

4:
        // Only multiples of 32 have been subtracted from kc to form x0, so
        // bits 4, 3 and 2 of x0 still equal those bits of kc even though x0
        // may have wrapped below zero.
        # Is there a remainder?- 4 floats of A (16 bytes)
        TBNZ x0, 4, 6f
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBNZ x0, 3, 7f
        # Is there a remainder?- 1 float of A (4 bytes)
        TBNZ x0, 2, 9f

5:
        # ks loop
        SUBS x9, x9, 8  // ks -= MR * sizeof(void*)
        B.HI 1b

        // Fold the second accumulator pair into the first.
        FADD v16.4s, v16.4s, v18.4s
        FADD v17.4s, v17.4s, v19.4s

        # Clamp
        FMAX v16.4s, v16.4s, v30.4s
        FMAX v17.4s, v17.4s, v30.4s
        FMIN v16.4s, v16.4s, v31.4s
        FMIN v17.4s, v17.4s, v31.4s

        # Store full 1 x 8
        SUBS x1, x1, 8
        B.LO 10f        // fewer than 8 columns left: partial store

        STP q16, q17, [x6]
        ADD x6, x6, x10 // c0 += cn_stride

        SUB x4, x4, x3  // a -= ks

        # nc loop
        // Still tests the flags from SUBS x1 above: STP, ADD and SUB do not
        // modify the condition flags.
        B.HI 0b

        RET

6:
        # Remainder- 4 floats of A (16 bytes)
        LDP q20, q21, [x5], 32
        LDR q0, [x8], 16
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q22, q23, [x5], 32
        LDP q24, q25, [x5], 32
        LDP q26, q27, [x5], 32
        FMLA v18.4s, v22.4s, v0.s[1]
        FMLA v19.4s, v23.4s, v0.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v26.4s, v0.s[3]
        FMLA v19.4s, v27.4s, v0.s[3]

        TBZ x0, 3, 8f   // no 2-float remainder: skip to the 1-float test
7:
        # Remainder- 2 floats of A (8 bytes)
        LDP q20, q21, [x5], 32
        LDR d0, [x8], 8
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q22, q23, [x5], 32
        FMLA v18.4s, v22.4s, v0.s[1]
        FMLA v19.4s, v23.4s, v0.s[1]
8:
        TBZ x0, 2, 5b   // no 1-float remainder: back to the ks loop test
9:
        # Remainder- 1 float of A (4 bytes)
        LDP q20, q21, [x5], 32
        LDR s0, [x8], 4
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        B 5b

10:
        # Store odd channels
        // x1 went below zero in the SUBS x1, x1, 8 above; subtracting 8 leaves
        // bits 2..0 unchanged, so they still give the leftover column count.
        // After each partial store the unstored lanes are moved down to the
        // low end of v16.
        TBZ x1, 2, 11f
        STR q16, [x6], 16
        MOV v16.16b, v17.16b

11:
        TBZ x1, 1, 12f
        STR d16, [x6], 8
        DUP d16, v16.d[1]

12:
        TBZ x1, 0, 13f
        STR s16, [x6], 4
13:
        RET

END_FUNCTION xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a57

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif