// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/1x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75(
#     size_t mr,                (x0) - unused.  mr = 1
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          (x4) - unused
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         (x7) - unused
#     size_t cn_stride,         [sp] -> x14
#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# This kernel touches only x0-x3, x5, x6, x8, x14 and v0, v1, v4, v5, v16-v27,
# so nothing is saved or restored and the stack is only read, never written.

# Overview
#   For every tile of 8 output columns the kernel starts from 8 floats read
#   from w, adds the products of one row of A with 8 floats of w per A
#   element, clamps the 8 sums to [v4, v5] and stores them to c.
#   kc is counted in bytes: 32 bytes are 8 floats of A.

# Register roles
#   x0       k, running byte counter derived from kc (the mr argument is dead)
#   x1       nc, columns still to be produced
#   x2       kc, kept intact so that a0 can be rewound after each tile
#   x3       a0, post-incremented by every load of A
#   x5       w, post-incremented by every load and never rewound
#   x6       c0
#   x8       params pointer
#   x14      cn_stride
#   v0, v1   values of A, used one lane at a time
#   v16, v17 accumulators fed by A lanes 0 and 2 (and by the 1 float remainder)
#   v18, v19 accumulators fed by A lanes 1 and 3, summed into v16, v17 at the end
#   v20-v27  values loaded from w

# A pointer
# x3  a0

# C pointer
# x6  c0

# Clamp v4 v5
#   v4 is the lower bound (operand of FMAX), v5 the upper bound (operand of FMIN).

BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75

        # Load cn_stride, params pointer
        # Both are stack arguments: [sp] -> x14 and [sp + 8] -> x8.
        LDP x14, x8, [sp]

        # Load min/max values
        # LD2R reads two consecutive floats at x8 and replicates the first
        # across v4 and the second across v5.
        LD2R {v4.4s, v5.4s}, [x8]

        # Outer loop - one pass per tile of 8 output columns.
0:
        # Load initial bias from w into accumulators
        LDP q16, q17, [x5], 32

        MOVI v18.4s, 0  // second set of C for pipelining FMLA
        PRFM PLDL1KEEP, [x5]
        MOVI v19.4s, 0
        PRFM PLDL1KEEP, [x5, 64]
        PRFM PLDL1KEEP, [x5, 128]
        PRFM PLDL1KEEP, [x5, 192]

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        # When kc is below 32 bytes go straight to the remainder handling.
        SUBS x0, x2, 32  // k = kc - 32

        B.LO 3f

        # Prologue - preload 4 floats of A (16 bytes) and 32 floats of w
        # (8 floats per A element) for the first block of 4.
        # Read first block of 1 A and B.
        LDP q20, q21, [x5], 32
        LDP q22, q23, [x5], 32
        LDP q24, q25, [x5], 32
        LDP q26, q27, [x5], 32
        LDR q0, [x3], 16

        # Is there at least 32.  yes do main loop
        # Otherwise only the epilogue runs on the values preloaded above.
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
        # Software pipelined: every FMLA consumes registers filled by the
        # previous block while the interleaved loads fetch the next block.
        # One iteration reads 32 bytes of A and 256 bytes of w.
1:
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDR q1, [x3], 16
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q20, q21, [x5], 32
        FMLA v18.4s, v22.4s, v0.s[1]
        PRFM PLDL1KEEP, [x5, 96]  // prefetch w 96 bytes past the read position
        FMLA v19.4s, v23.4s, v0.s[1]
        LDP q22, q23, [x5], 32
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        LDP q24, q25, [x5], 32
        FMLA v18.4s, v26.4s, v0.s[3]
        FMLA v19.4s, v27.4s, v0.s[3]
        LDP q26, q27, [x5], 32

        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
        FMLA v16.4s, v20.4s, v1.s[0]
        LDR q0, [x3], 16
        FMLA v17.4s, v21.4s, v1.s[0]
        LDP q20, q21, [x5], 32
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        LDP q22, q23, [x5], 32
        FMLA v16.4s, v24.4s, v1.s[2]
        FMLA v17.4s, v25.4s, v1.s[2]
        LDP q24, q25, [x5], 32
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        SUBS x0, x0, 32  // k -= 32; the LDP below leaves the flags untouched
        LDP q26, q27, [x5], 32
        B.HS 1b

2:
        # Epilogue
        # Finishes the last 8 floats of A. The first block still loads the
        # operands of the second block; the second block loads nothing, so no
        # data beyond these 8 floats is read here.

        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDR q1, [x3], 16
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q20, q21, [x5], 32
        FMLA v18.4s, v22.4s, v0.s[1]
        FMLA v19.4s, v23.4s, v0.s[1]
        LDP q22, q23, [x5], 32
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        LDP q24, q25, [x5], 32
        FMLA v18.4s, v26.4s, v0.s[3]
        FMLA v19.4s, v27.4s, v0.s[3]
        LDP q26, q27, [x5], 32

        # Second block of 4.  no loads
        FMLA v16.4s, v20.4s, v1.s[0]
        FMLA v17.4s, v21.4s, v1.s[0]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v16.4s, v24.4s, v1.s[2]
        FMLA v17.4s, v25.4s, v1.s[2]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]

3:
        # Remainder dispatch. Only multiples of 32 were subtracted from x0, so
        # bits 4, 3 and 2 of x0 still equal the same bits of kc even though
        # x0 has wrapped below zero at this point.
        # Is there a remainder?- 4 floats of A (16 bytes)
        TBNZ x0, 4, 5f
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBNZ x0, 3, 6f
        # Is there a remainder?- 1 floats of A (4 bytes)
        TBNZ x0, 2, 8f

4:
        # Fold the second accumulator set into the first.
        # The flags set by SUBS are consumed by B.LO and B.HI further down;
        # none of the instructions in between modifies them.
        FADD v16.4s, v16.4s, v18.4s
        SUBS x1, x1, 8  // nc -= 8
        FADD v17.4s, v17.4s, v19.4s

        # Clamp
        FMAX v16.4s, v16.4s, v4.4s
        FMAX v17.4s, v17.4s, v4.4s
        FMIN v16.4s, v16.4s, v5.4s
        FMIN v17.4s, v17.4s, v5.4s

        # Store full 1 x 8
        # Fewer than 8 columns were left: take the partial store path.
        B.LO 9f

        STP q16, q17, [x6]
        ADD x6, x6, x14  // c0 += cn_stride

        SUB x3, x3, x2 // a0 -= kc

        # Loop while columns remain (nc was above 8 before the subtraction).
        B.HI 0b

        RET

5:
        # Remainder- 4 floats of A (16 bytes)
        LDP q20, q21, [x5], 32
        LDR q0, [x3], 16
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q22, q23, [x5], 32
        LDP q24, q25, [x5], 32
        LDP q26, q27, [x5], 32
        FMLA v18.4s, v22.4s, v0.s[1]
        FMLA v19.4s, v23.4s, v0.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v26.4s, v0.s[3]
        FMLA v19.4s, v27.4s, v0.s[3]

        # Skip the 2 float step when bit 3 of k is clear.
        TBZ x0, 3, 7f
6:
        # Remainder- 2 floats of A (8 bytes)
        LDP q20, q21, [x5], 32
        LDR d0, [x3], 8
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q22, q23, [x5], 32
        FMLA v18.4s, v22.4s, v0.s[1]
        FMLA v19.4s, v23.4s, v0.s[1]
7:
        # Done with the remainder when bit 2 of k is clear.
        TBZ x0, 2, 4b
8:
        # Remainder- 1 float of A (4 bytes)
        LDP q20, q21, [x5], 32
        LDR s0, [x3], 4
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        B 4b

        # Store odd channels
        # Reached with x1 = nc - 8 wrapped below zero; its low 3 bits equal
        # those of the remaining column count. Results are written in chunks
        # of 4, 2 and 1 floats, shifting the unwritten lanes down to the
        # bottom of v16 after each chunk.
9:
        TBZ x1, 2, 10f
        STR q16, [x6], 16
        MOV v16.16b, v17.16b  // upper 4 results become the current ones

10:
        TBZ x1, 1, 11f
        STR d16, [x6], 8
        DUP d16, v16.d[1]  // move the upper 2 floats to the low half

11:
        TBZ x1, 0, 12f
        STR s16, [x6]
12:
        RET

END_FUNCTION xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif