// Auto-generated file. Do not edit!
//   Template: src/f32-igemm/1x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75(
#     size_t mr,                         (x0) - unused.  mr = 1
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const float*restrict w,            x5
#     float*restrict c,                  x6
#     size_t cm_stride,                  (x7) - unused
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_minmax_params params [sp + 24] -> x8

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointer
# x8  a0

# C pointer
# x6  c0

// Register roles, as used by the code below:
//   x0        k  - working copy of kc, counted down in steps of 32 bytes
//   x9        p  - working copy of ks, counted down in steps of 8 bytes
//   x8        params pointer on entry, then reused as the a0 row pointer
//   v0, v1    4 floats of A each
//   v16, v17  accumulators for output floats 0-3 and 4-7, seeded from w
//   v18, v19  second accumulator pair, zeroed, folded into v16/v17 at the end
//   v20-v27   four rows of 8 weights each, loaded from w
//   v30, v31  lower / upper clamp bounds (used by FMAX / FMIN respectively)
// Only x0-x12, v0-v1 and v16-v31 are written, so nothing is spilled.

BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75

        # Load cn_stride, a_offset
        LDP x10, x11, [sp]

        # Load zero, params pointer
        LDP x12, x8, [sp, 16]

        # Load min/max values
        // LD2R reads two consecutive floats and broadcasts the first into
        // every lane of v30 and the second into every lane of v31.
        LD2R {v30.4s, v31.4s}, [x8]

0:
        // nc loop entry: one pass per group of up to 8 output floats.
        # Load initial bias from w into accumulators
        LDP q16, q17, [x5], 32
        MOVI v18.4s, 0  // second set of C for pipelining FMLA
        PRFM PLDL1KEEP, [x5]
        MOVI v19.4s, 0
        PRFM PLDL1KEEP, [x5, 64]
        PRFM PLDL1KEEP, [x5, 128]
        PRFM PLDL1KEEP, [x5, 192]

        MOV x9, x3  // p = ks

1:
        // ks loop entry: one pass per pointer in the indirection buffer a.
        # Load next A pointer
        LDR x8, [x4], 8

        // ADD does not modify the flags, so CSEL still sees the CMP result.
        CMP x8, x12            // if a0 == zero
        ADD x8, x8, x11        // a0 += a_offset
        CSEL x8, x12, x8, EQ   // a0 = (a0 == zero) ? zero : a0 + a_offset

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32  // k = kc - 32
        B.LO 4f

        # Prologue
        # Read first block of A and B.
        // Loads 4 rows of weights (v20-v27) and 4 floats of A (v0) before any
        // FMLA is issued.
        LDP q20, q21, [x5], 32
        LDP q22, q23, [x5], 32
        LDP q24, q25, [x5], 32
        LDP q26, q27, [x5], 32
        LDR q0, [x8], 16

        # Is there at least 8 more floats (32 bytes)? If so, run the main loop
        SUBS x0, x0, 32
        B.LO 3f

        # Main loop - 8 floats of A (32 bytes)
        // Each iteration issues 16 FMLAs on values loaded earlier while
        // loading the operands consumed by the following block.
2:
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDR q1, [x8], 16
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q20, q21, [x5], 32
        FMLA v18.4s, v22.4s, v0.s[1]
        FMLA v19.4s, v23.4s, v0.s[1]
        LDP q22, q23, [x5], 32
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        LDP q24, q25, [x5], 32
        PRFM PLDL1KEEP, [x5, 128]
        FMLA v18.4s, v26.4s, v0.s[3]
        PRFM PLDL1KEEP, [x5, 256]
        FMLA v19.4s, v27.4s, v0.s[3]
        LDP q26, q27, [x5], 32

        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
        FMLA v16.4s, v20.4s, v1.s[0]
        LDR q0, [x8], 16
        FMLA v17.4s, v21.4s, v1.s[0]
        LDP q20, q21, [x5], 32
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        LDP q22, q23, [x5], 32
        FMLA v16.4s, v24.4s, v1.s[2]
        FMLA v17.4s, v25.4s, v1.s[2]
        LDP q24, q25, [x5], 32
        PRFM PLDL1KEEP, [x5, 128]
        FMLA v18.4s, v26.4s, v1.s[3]
        PRFM PLDL1KEEP, [x5, 256]
        FMLA v19.4s, v27.4s, v1.s[3]
        SUBS x0, x0, 32
        // LDP does not modify the flags; B.HS tests the SUBS above.
        LDP q26, q27, [x5], 32
        B.HS 2b

3:
        # Epilogue
        // Same as one main loop iteration, except that the second block does
        // not load anything for a following iteration.

        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDR q1, [x8], 16
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q20, q21, [x5], 32
        FMLA v18.4s, v22.4s, v0.s[1]
        FMLA v19.4s, v23.4s, v0.s[1]
        LDP q22, q23, [x5], 32
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        LDP q24, q25, [x5], 32
        PRFM PLDL1KEEP, [x5, 128]
        FMLA v18.4s, v26.4s, v0.s[3]
        PRFM PLDL1KEEP, [x5, 256]
        FMLA v19.4s, v27.4s, v0.s[3]
        LDP q26, q27, [x5], 32

        # Second block of 4.  No loads
        FMLA v16.4s, v20.4s, v1.s[0]
        FMLA v17.4s, v21.4s, v1.s[0]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v16.4s, v24.4s, v1.s[2]
        FMLA v17.4s, v25.4s, v1.s[2]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]

4:
        // Only multiples of 32 have been subtracted from kc, so bits 4, 3 and
        // 2 of x0 still equal the corresponding bits of kc.
        # Is there a remainder?- 4 floats of A (16 bytes)
        TBNZ x0, 4, 6f
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBNZ x0, 3, 7f
        # Is there a remainder?- 1 floats of A (4 bytes)
        TBNZ x0, 2, 9f

5:
        # ks loop
        SUBS x9, x9, 8  // ks -= MR * sizeof(void*)
        B.HI 1b

        // Fold the second accumulator pair into the first.
        FADD v16.4s, v16.4s, v18.4s
        FADD v17.4s, v17.4s, v19.4s

        # Clamp
        FMAX v16.4s, v16.4s, v30.4s
        FMAX v17.4s, v17.4s, v30.4s
        FMIN v16.4s, v16.4s, v31.4s
        FMIN v17.4s, v17.4s, v31.4s

        # Store full 1 x 8
        SUBS x1, x1, 8
        B.LO 10f

        STP q16, q17, [x6]
        ADD x6, x6, x10  // c0 += cn_stride

        SUB x4, x4, x3  // a -= ks

        # nc loop
        // STP, ADD and SUB leave the flags untouched, so this still tests the
        // SUBS x1, x1, 8 above and loops while output floats remain.
        B.HI 0b

        RET

6:
        # Remainder- 4 floats of A (16 bytes)
        LDP q20, q21, [x5], 32
        LDR q0, [x8], 16
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q22, q23, [x5], 32
        LDP q24, q25, [x5], 32
        LDP q26, q27, [x5], 32
        FMLA v18.4s, v22.4s, v0.s[1]
        FMLA v19.4s, v23.4s, v0.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v26.4s, v0.s[3]
        FMLA v19.4s, v27.4s, v0.s[3]

        // Skip the 2-float step when bit 3 of k is clear.
        TBZ x0, 3, 8f
7:
        # Remainder- 2 floats of A (8 bytes)
        LDP q20, q21, [x5], 32
        LDR d0, [x8], 8
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q22, q23, [x5], 32
        FMLA v18.4s, v22.4s, v0.s[1]
        FMLA v19.4s, v23.4s, v0.s[1]
8:
        // Return to the ks loop when bit 2 of k is clear (no single float left).
        TBZ x0, 2, 5b
9:
        # Remainder- 1 float of A (4 bytes)
        LDP q20, q21, [x5], 32
        LDR s0, [x8], 4
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        B 5b

10:
        # Store odd channels
        // Reached when fewer than 8 output floats remained. Subtracting 8
        // left bits 2..0 of x1 unchanged, so they select stores of 4, 2 and
        // 1 floats. After each store the not-yet-stored data is moved to the
        // low end of v16.
        TBZ x1, 2, 11f
        STR q16, [x6], 16
        MOV v16.16b, v17.16b

11:
        TBZ x1, 1, 12f
        STR d16, [x6], 8
        DUP d16, v16.d[1]

12:
        TBZ x1, 0, 13f
        STR s16, [x6], 4
13:
        RET

END_FUNCTION xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif