// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/1x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57(
#     size_t mr,                (x0) - unused.  mr = 1
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          (x4) - unused
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         (x7) - unused
#     size_t cn_stride,         [sp] -> x14
#     const union xnn_f32_output_params params[restrict static 1])  [sp + 8] -> x8

# Computes one row of C, 8 columns per pass of the outer loop (label 0):
#   c0[0..7] = clamp(bias[0..7] + sum over k of a0[k] * B[k][0..7])
# w (x5) is read strictly sequentially: 8 bias floats, then 8 floats of B per
# element of A.  x5 is never rewound, so each pass consumes the next panel.
# kc (x2) is a byte count: a0 advances by the loaded bytes and is rewound by x2.
# NOTE(review): bits 1..0 of kc are never examined, so kc is presumably a
# multiple of sizeof(float) - confirm against callers.

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.
# (This kernel touches only x0-x8, x14, v0, v1, v4, v5 and v16-v27 and does not
# adjust sp, so nothing is saved or restored.)

# A pointer
# x3  a0

# C pointer
# x6  c0

# Accumulators
# v16 v17  columns 0-3 / 4-7, seeded with the bias
# v18 v19  second set for columns 0-3 / 4-7, seeded with zero, folded in at label 4

# A values   v0 v1
# B values   v20-v27

# Clamp v4 v5
# v4 is applied with FMIN (upper bound), v5 with FMAX (lower bound).

BEGIN_FUNCTION xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57

        # Load cn_stride, params pointer
        LDP x14, x8, [sp]

        # Load clamping_params values: two consecutive floats at [x8], each
        # broadcast to all lanes (first -> v4, second -> v5).
        LD2R {v4.4s, v5.4s}, [x8]

        # Outer loop over blocks of 8 output columns.
0:
        # Load initial bias from w into accumulators
        LDP q16, q17, [x5], 32

        MOVI v18.4s, 0  // second set of C for pipelining FMLA
        MOVI v19.4s, 0

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32  // k = kc - 32

        B.LO 3f          // fewer than 32 bytes of A: remainder handling only

        # Prologue: preload B for 4 elements of A (4 x 8 floats) and the
        # first 4 floats (16 bytes) of A.
        LDP q20, q21, [x5], 32
        LDP q22, q23, [x5], 32
        LDP q24, q25, [x5], 32
        LDP q26, q27, [x5], 32
        LDR q0, [x3], 16

        # Are there at least 32 more bytes of A on top of the 32 bytes that
        # prologue + epilogue consume?  Then run the main loop.
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
        # Software pipelined: each half issues the FMLAs for data that is
        # already in registers while loading the data for the next half.
1:
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDR q1, [x3], 16
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q20, q21, [x5], 32
        FMLA v18.4s, v22.4s, v0.s[1]
        FMLA v19.4s, v23.4s, v0.s[1]
        LDP q22, q23, [x5], 32
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        LDP q24, q25, [x5], 32
        FMLA v18.4s, v26.4s, v0.s[3]
        FMLA v19.4s, v27.4s, v0.s[3]
        LDP q26, q27, [x5], 32

        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
        FMLA v16.4s, v20.4s, v1.s[0]
        LDR q0, [x3], 16
        FMLA v17.4s, v21.4s, v1.s[0]
        LDP q20, q21, [x5], 32
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        LDP q22, q23, [x5], 32
        FMLA v16.4s, v24.4s, v1.s[2]
        FMLA v17.4s, v25.4s, v1.s[2]
        LDP q24, q25, [x5], 32
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        SUBS x0, x0, 32
        LDP q26, q27, [x5], 32  // LDP leaves the flags from SUBS intact
        B.HS 1b

2:
        # Epilogue: drains the pipeline for the final 8 floats of A.

        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDR q1, [x3], 16
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q20, q21, [x5], 32
        FMLA v18.4s, v22.4s, v0.s[1]
        FMLA v19.4s, v23.4s, v0.s[1]
        LDP q22, q23, [x5], 32
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        LDP q24, q25, [x5], 32
        FMLA v18.4s, v26.4s, v0.s[3]
        FMLA v19.4s, v27.4s, v0.s[3]
        LDP q26, q27, [x5], 32

        # Second block of 4.  no loads
        FMLA v16.4s, v20.4s, v1.s[0]
        FMLA v17.4s, v21.4s, v1.s[0]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v16.4s, v24.4s, v1.s[2]
        FMLA v17.4s, v25.4s, v1.s[2]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]

3:
        # x0 is negative here, but only multiples of 32 were subtracted from
        # kc, so bits 4..2 of x0 still equal bits 4..2 of kc.

        # Is there a remainder?- 4 floats of A (16 bytes)
        TBNZ x0, 4, 5f
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBNZ x0, 3, 6f
        # Is there a remainder?- 1 float of A (4 bytes)
        TBNZ x0, 2, 8f

4:
        # Fold the second accumulator set into the first.  The SUBS sets the
        # flags consumed by both B.LO and B.HI below; none of the instructions
        # in between modify them.
        FADD v16.4s, v16.4s, v18.4s
        SUBS x1, x1, 8
        FADD v17.4s, v17.4s, v19.4s

        # Clamp
        FMIN v16.4s, v16.4s, v4.4s
        FMIN v17.4s, v17.4s, v4.4s
        FMAX v16.4s, v16.4s, v5.4s
        FMAX v17.4s, v17.4s, v5.4s

        # Store full 1 x 8 (fewer than 8 columns left: partial store at 9)
        B.LO 9f

        STP q16, q17, [x6]
        ADD x6, x6, x14  // c0 += cn_stride

        SUB x3, x3, x2  // a0 -= kc

        B.HI 0b          // columns remain: next block of 8

        RET

5:
        # Remainder- 4 floats of A (16 bytes)
        LDP q20, q21, [x5], 32
        LDR q0, [x3], 16
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q22, q23, [x5], 32
        LDP q24, q25, [x5], 32
        LDP q26, q27, [x5], 32
        FMLA v18.4s, v22.4s, v0.s[1]
        FMLA v19.4s, v23.4s, v0.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v26.4s, v0.s[3]
        FMLA v19.4s, v27.4s, v0.s[3]

        TBZ x0, 3, 7f    // no 2-float remainder: go test for 1 float
6:
        # Remainder- 2 floats of A (8 bytes)
        LDP q20, q21, [x5], 32
        LDR d0, [x3], 8
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q22, q23, [x5], 32
        FMLA v18.4s, v22.4s, v0.s[1]
        FMLA v19.4s, v23.4s, v0.s[1]
7:
        TBZ x0, 2, 4b    // no 1-float remainder: finish this block
8:
        # Remainder- 1 float of A (4 bytes)
        LDP q20, q21, [x5], 32
        LDR s0, [x3], 4
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        B 4b

        # Store odd channels
        # x1 went below zero by subtracting 8, so its bits 2..0 still equal
        # those of the remaining column count: store 4, 2, then 1 float as
        # flagged, shifting the unstored lanes down after each store.
9:
        TBZ x1, 2, 10f
        STR q16, [x6], 16
        MOV v16.16b, v17.16b

10:
        TBZ x1, 1, 11f
        STR d16, [x6], 8
        DUP d16, v16.d[1]

11:
        TBZ x1, 0, 12f
        STR s16, [x6]
12:
        RET

END_FUNCTION xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif