// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm${"inc" if INC else ""}_ukernel_1x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}(
#     size_t mr,                (x0) - unused.  mr = 1
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          (x4) - unused
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         (x7) - unused
#     size_t cn_stride,         [sp] -> x14
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f32_output_params params[restrict static 1])  [sp + 16] -> x8
$else:
  #     const union xnn_f32_output_params params[restrict static 1])  [sp + 8] -> x8

# Computes one row of 8 output floats per pass over kc, repeating while
# more than 8 columns of nc remain.
#
# Units, as used by the code below:
#   nc        counted in floats (decremented by 8 per pass, low 3 bits
#             select the 4 / 2 / 1 float partial stores).
#   kc        counted in bytes (compared against 32 = 8 floats, and
#             subtracted from the A pointer to rewind it).
#   cn_stride counted in bytes (added directly to the C pointer).

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.
# This kernel touches none of them, so nothing is saved or restored and
# the stack is only read for the incoming arguments.

# A pointer
# x3  a0
# A values are held in v0 and v1 (alternating between unrolled halves).

# B pointer
# x5  w
# B values are held in v20-v27.

# C pointer
# x6  c0
# Accumulators are v16 v17, plus v18 v19 as a second set that is summed
# into v16 v17 before clamping.

# Clamp v4 v5
# v4 = first float at params, applied with FMIN (upper bound).
# v5 = second float at params, applied with FMAX (lower bound).

BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_1x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}

        $if INC:
          # Load cn_stride, acc
          LDP x14, x15, [sp]
          # Load params pointer
          LDR x8, [sp, 16]
        $else:
          # Load cn_stride, params pointer
          LDP x14, x8, [sp]

        # Load clamping_params values
        # LD2R broadcasts the two consecutive floats at [x8] across all
        # lanes of v4 and v5 respectively.
        LD2R {v4.4s, v5.4s}, [x8]
0:
        # Start of one pass producing up to 8 output columns.
        $if INC:
          # Load initial accumulators
          LDP q16, q17, [x15], 32
        $else:
          # Load initial bias from w into accumulators
          LDP q16, q17, [x5], 32

        MOVI v18.4s, 0  // second set of C for pipelining FMLA
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5]
        MOVI v19.4s, 0
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 64]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 128]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 192]

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        # x0 becomes the working k counter. It only ever has multiples of
        # 32 subtracted from it, so bits 4, 3 and 2 always equal those of
        # kc and are tested at label 3 to pick the 4 / 2 / 1 float
        # remainders.
        SUBS x0, x2, 32  // k = kc - 32

        B.LO 3f

        # Prologue - loads only, no FMA.
        # Read first block of 1 A and B: 8 vectors of B (4 k steps x 8
        # columns) and 4 floats of A (16 bytes).
        LDP q20, q21, [x5], 32
        LDP q22, q23, [x5], 32
        LDP q24, q25, [x5], 32
        LDP q26, q27, [x5], 32
        LDR q0, [x3], 16

        # Is there at least another 32 bytes beyond what the epilogue
        # will consume?  Yes: do main loop.  No: go straight to epilogue.
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
        # Each iteration also consumes 16 vectors of B (256 bytes).
        # Loads for the next half are interleaved with the FMAs of the
        # current half, so the instruction order here is deliberate.
1:
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDR q1, [x3], 16
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q20, q21, [x5], 32
        FMLA v18.4s, v22.4s, v0.s[1]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 96]
        FMLA v19.4s, v23.4s, v0.s[1]
        LDP q22, q23, [x5], 32
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        LDP q24, q25, [x5], 32
        FMLA v18.4s, v26.4s, v0.s[3]
        FMLA v19.4s, v27.4s, v0.s[3]
        LDP q26, q27, [x5], 32

        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
        FMLA v16.4s, v20.4s, v1.s[0]
        LDR q0, [x3], 16
        FMLA v17.4s, v21.4s, v1.s[0]
        LDP q20, q21, [x5], 32
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        LDP q22, q23, [x5], 32
        FMLA v16.4s, v24.4s, v1.s[2]
        FMLA v17.4s, v25.4s, v1.s[2]
        LDP q24, q25, [x5], 32
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        # SUBS sets the flags for B.HS; the LDP in between leaves them
        # untouched.
        SUBS x0, x0, 32
        LDP q26, q27, [x5], 32
        B.HS 1b

2:
        # Epilogue
        # Consumes the 8 floats of A reserved by the first SUBS: the 4
        # already in v0 plus 4 more loaded into v1 here.

        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDR q1, [x3], 16
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q20, q21, [x5], 32
        FMLA v18.4s, v22.4s, v0.s[1]
        FMLA v19.4s, v23.4s, v0.s[1]
        LDP q22, q23, [x5], 32
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        LDP q24, q25, [x5], 32
        FMLA v18.4s, v26.4s, v0.s[3]
        FMLA v19.4s, v27.4s, v0.s[3]
        LDP q26, q27, [x5], 32

        # Second block of 4.  no loads
        FMLA v16.4s, v20.4s, v1.s[0]
        FMLA v17.4s, v21.4s, v1.s[0]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v16.4s, v24.4s, v1.s[2]
        FMLA v17.4s, v25.4s, v1.s[2]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]

3:
        # Remainder dispatch on the low bits of the k counter (same as
        # the low bits of kc, see the first SUBS above).
        # Is there a remainder?- 4 floats of A (16 bytes)
        TBNZ x0, 4, 5f
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBNZ x0, 3, 6f
        # Is there a remainder?- 1 float of A (4 bytes)
        TBNZ x0, 2, 8f

4:
        # Fold the second accumulator set into the first.
        # SUBS x1 sets the flags consumed by both B.LO 9f and B.HI 0b
        # below; none of the instructions in between modify flags.
        FADD v16.4s, v16.4s, v18.4s
        SUBS x1, x1, 8
        FADD v17.4s, v17.4s, v19.4s

        # Clamp
        FMIN v16.4s, v16.4s, v4.4s
        FMIN v17.4s, v17.4s, v4.4s
        FMAX v16.4s, v16.4s, v5.4s
        FMAX v17.4s, v17.4s, v5.4s

        # Fewer than 8 columns left: store a partial row at label 9.
        # Otherwise store full 1 x 8.
        B.LO 9f

        STP q16, q17, [x6]
        ADD x6, x6, x14  // c0 += cn_stride

        # Rewind A to the start of the row for the next 8 columns.
        SUB x3, x3, x2 // a0 -= kc

        # Loop while nc was greater than 8, i.e. columns still remain.
        B.HI 0b

        RET

5:
        # Remainder- 4 floats of A (16 bytes)
        LDP q20, q21, [x5], 32
        LDR q0, [x3], 16
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q22, q23, [x5], 32
        LDP q24, q25, [x5], 32
        LDP q26, q27, [x5], 32
        FMLA v18.4s, v22.4s, v0.s[1]
        FMLA v19.4s, v23.4s, v0.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v26.4s, v0.s[3]
        FMLA v19.4s, v27.4s, v0.s[3]

        # No 2 float remainder: skip ahead to the 1 float check.
        TBZ x0, 3, 7f
6:
        # Remainder- 2 floats of A (8 bytes)
        LDP q20, q21, [x5], 32
        LDR d0, [x3], 8
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q22, q23, [x5], 32
        FMLA v18.4s, v22.4s, v0.s[1]
        FMLA v19.4s, v23.4s, v0.s[1]
7:
        # No 1 float remainder: accumulation is complete.
        TBZ x0, 2, 4b
8:
        # Remainder- 1 float of A (4 bytes)
        LDP q20, q21, [x5], 32
        LDR s0, [x3], 4
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        B 4b

        # Store partial row (fewer than 8 columns remain).
        # x1 holds nc - 8 here; subtracting 8 leaves bits 0-2 unchanged,
        # so they still give the number of remaining columns.
9:
        # 4 columns: store v16, then move the upper 4 results down.
        TBZ x1, 2, 10f
        STR q16, [x6], 16
        MOV v16.16b, v17.16b

10:
        # 2 columns: store the low half, then move the high half down.
        TBZ x1, 1, 11f
        STR d16, [x6], 8
        DUP d16, v16.d[1]

11:
        # 1 column.
        TBZ x1, 0, 12f
        STR s16, [x6]
12:
        RET

END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_1x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif