// Auto-generated file. Do not edit!
//   Template: src/f32-igemm/1x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75(
#     size_t mr,                         (x0) - unused.  mr = 1
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const float*restrict w,            x5
#     float*restrict c,                  x6
#     size_t cm_stride,                  (x7) - unused
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_output_params params [sp + 24] -> x8

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.
# This kernel only touches x0-x6, x8-x12, v0-v1, v16-v27 and v30-v31,
# so nothing is saved or restored and the stack is only read.

# A pointer
# x8  a0     (x8 first holds the params pointer, then is reused for a0)

# C pointer
# x6  c0

# Working registers
# x0       k, byte counter derived from kc (overwrites the unused mr argument)
# x9       p, byte counter derived from ks
# v0, v1   4 floats of A each
# v16, v17 accumulators, initialised from the bias stored at the start of w
# v18, v19 second accumulator pair, initialised to zero and added in at the end
# v20-v27  four rows of 8 floats of B (weights)
# v30, v31 clamping bounds: v30 is the FMIN operand, v31 is the FMAX operand

BEGIN_FUNCTION xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75

        # Load cn_stride, a_offset
        LDP x10, x11, [sp]

        # Load zero, clamping params pointer
        LDP x12, x8, [sp, 16]

        # Load clamping_params values.
        # LD2R reads two consecutive floats and replicates the first across
        # v30 and the second across v31.
        LD2R {v30.4s, v31.4s}, [x8]

0:
        # Load initial bias from w into accumulators
        LDP q16, q17, [x5], 32
        MOVI v18.4s, 0  // second set of C for pipelining FMLA
        PRFM PLDL1KEEP, [x5]
        MOVI v19.4s, 0
        PRFM PLDL1KEEP, [x5, 64]
        PRFM PLDL1KEEP, [x5, 128]
        PRFM PLDL1KEEP, [x5, 192]

        MOV x9, x3  // p = ks

1:
        # Load next A pointer
        LDR x8, [x4], 8

        # The compare is done on the raw pointer, before a_offset is applied,
        # so that the zero buffer is never offset.
        CMP x8, x12                // if a0 == zero
        ADD x8, x8, x11            // a0 += a_offset
        CSEL x8, x12, x8, EQ       //   a0 = zero, else a0 + a_offset

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32            // k = kc - 32
        B.LO 4f

        # Prologue
        # Read first block of A and B.
        LDP q20, q21, [x5], 32
        LDP q22, q23, [x5], 32
        LDP q24, q25, [x5], 32
        LDP q26, q27, [x5], 32
        LDR q0, [x8], 16

        # Is there at least 8 more floats?  If yes do main loop,
        # otherwise go straight to the epilogue.
        SUBS x0, x0, 32
        B.LO 3f

        # Main loop - 8 floats of A (32 bytes)
2:
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDR q1, [x8], 16
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q20, q21, [x5], 32
        FMLA v18.4s, v22.4s, v0.s[1]
        FMLA v19.4s, v23.4s, v0.s[1]
        LDP q22, q23, [x5], 32
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        LDP q24, q25, [x5], 32
        PRFM PLDL1KEEP, [x5, 128]
        FMLA v18.4s, v26.4s, v0.s[3]
        PRFM PLDL1KEEP, [x5, 256]
        FMLA v19.4s, v27.4s, v0.s[3]
        LDP q26, q27, [x5], 32

        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
        FMLA v16.4s, v20.4s, v1.s[0]
        LDR q0, [x8], 16
        FMLA v17.4s, v21.4s, v1.s[0]
        LDP q20, q21, [x5], 32
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        LDP q22, q23, [x5], 32
        FMLA v16.4s, v24.4s, v1.s[2]
        FMLA v17.4s, v25.4s, v1.s[2]
        LDP q24, q25, [x5], 32
        PRFM PLDL1KEEP, [x5, 128]
        FMLA v18.4s, v26.4s, v1.s[3]
        PRFM PLDL1KEEP, [x5, 256]
        FMLA v19.4s, v27.4s, v1.s[3]
        SUBS x0, x0, 32
        LDP q26, q27, [x5], 32
        B.HS 2b

3:
        # Epilogue
        # Consumes the A and B values already loaded by the prologue or by
        # the last main loop iteration, plus one more block of 4.

        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDR q1, [x8], 16
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q20, q21, [x5], 32
        FMLA v18.4s, v22.4s, v0.s[1]
        FMLA v19.4s, v23.4s, v0.s[1]
        LDP q22, q23, [x5], 32
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        LDP q24, q25, [x5], 32
        PRFM PLDL1KEEP, [x5, 128]
        FMLA v18.4s, v26.4s, v0.s[3]
        PRFM PLDL1KEEP, [x5, 256]
        FMLA v19.4s, v27.4s, v0.s[3]
        LDP q26, q27, [x5], 32

        # Second block of 4.  no loads
        FMLA v16.4s, v20.4s, v1.s[0]
        FMLA v17.4s, v21.4s, v1.s[0]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v16.4s, v24.4s, v1.s[2]
        FMLA v17.4s, v25.4s, v1.s[2]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]

4:
        # Here x0 is kc minus a multiple of 32.  Subtracting multiples of 32
        # leaves bits 4..0 untouched, so bits 4, 3 and 2 of x0 are those of kc.

        # Is there a remainder?- 4 floats of A (16 bytes)
        TBNZ x0, 4, 6f
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBNZ x0, 3, 7f
        # Is there a remainder?- 1 floats of A (4 bytes)
        TBNZ x0, 2, 9f

5:
        # ks loop
        SUBS x9, x9, 8  // ks -= MR * sizeof(void*)
        B.NE 1b

        # Fold the second accumulator pair into the first
        FADD v16.4s, v16.4s, v18.4s
        FADD v17.4s, v17.4s, v19.4s

        # Clamp
        FMIN v16.4s, v16.4s, v30.4s
        FMIN v17.4s, v17.4s, v30.4s
        FMAX v16.4s, v16.4s, v31.4s
        FMAX v17.4s, v17.4s, v31.4s

        # Store full 1 x 8
        SUBS x1, x1, 8
        B.LO 10f

        STP q16, q17, [x6]
        ADD x6, x6, x10

        SUB x4, x4, x3  // a -= ks

        # nc loop
        # STP, ADD and SUB do not write the flags, so B.HI still tests the
        # SUBS above: loop while the remaining nc is greater than zero.
        B.HI 0b

        RET

6:
        # Remainder- 4 floats of A (16 bytes)
        LDP q20, q21, [x5], 32
        LDR q0, [x8], 16
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q22, q23, [x5], 32
        LDP q24, q25, [x5], 32
        LDP q26, q27, [x5], 32
        FMLA v18.4s, v22.4s, v0.s[1]
        FMLA v19.4s, v23.4s, v0.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v26.4s, v0.s[3]
        FMLA v19.4s, v27.4s, v0.s[3]

        # Skip the 2 float remainder when bit 3 is clear
        TBZ x0, 3, 8f
7:
        # Remainder- 2 floats of A (8 bytes)
        LDP q20, q21, [x5], 32
        LDR d0, [x8], 8
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q22, q23, [x5], 32
        FMLA v18.4s, v22.4s, v0.s[1]
        FMLA v19.4s, v23.4s, v0.s[1]
8:
        # Back to the ks loop when there is no single float remainder
        TBZ x0, 2, 5b
9:
        # Remainder- 1 float of A (4 bytes)
        LDP q20, q21, [x5], 32
        LDR s0, [x8], 4
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        B 5b

10:
        # Store odd channels
        # x1 is nc - 8 here; subtracting 8 leaves bits 2..0 untouched, so
        # they still select 4, 2 and 1 trailing floats.
        TBZ x1, 2, 11f
        STR q16, [x6], 16
        MOV v16.16b, v17.16b  // move the upper 4 results down for the next test

11:
        TBZ x1, 1, 12f
        STR d16, [x6], 8
        DUP d16, v16.d[1]  // move the upper 2 results down for the next test

12:
        TBZ x1, 0, 13f
        STR s16, [x6], 4
13:
        RET

END_FUNCTION xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif