// Auto-generated file. Do not edit!
//   Template: src/f16-gemm/1x16-aarch64-neonfp16arith-ld32.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32(
#     size_t mr,                (x0) - unused.  mr = 1
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          (x4) - unused
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         (x7) - unused
#     size_t cn_stride,         [sp] -> x14
#     const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)

// Overview (derived from the instructions below):
//   For each group of up to 16 output columns the kernel
//     - loads 16 half-precision bias values from w,
//     - accumulates a0[k] * w[k][0..15] over the kc bytes of the A row,
//     - multiplies the sums by v4, then clamps with FMAX against v5 and
//       FMIN against v6,
//     - stores 16 halffloats to c0 (or the low nc halffloats on the tail).
//   kc is consumed in bytes (4 bytes = 2 halffloats per main-loop pass),
//   nc is counted in halffloat elements, and cn_stride is added to c0 as a
//   byte offset.
//   Weights are read strictly sequentially from x5: 32 bytes of bias, then
//   32 bytes (16 halffloats) for every halffloat of A.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
// This kernel touches only x0-x3, x5, x6, x8, x14, v0, v4-v6 and v16-v23,
// so nothing is spilled or restored.

# A pointer
# x3  a0

# C pointer
# x6  c0

# Clamp v4, v5, v6
// v4 = multiplier applied before clamping, v5 = FMAX operand (lower bound),
// v6 = FMIN operand (upper bound).

BEGIN_FUNCTION xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32

        # Load cn_stride, params pointer
        LDP x14, x8, [sp]

        # Load params values
        // LD3R reads three consecutive halffloats at x8 and broadcasts each
        // one across all 8 lanes of v4, v5 and v6 respectively.
        LD3R {v4.8h, v5.8h, v6.8h}, [x8]
0:
        // Outer loop: one pass per group of 16 output columns.
        # Load initial bias from w into accumulators
        LDP q16, q17, [x5], 32

        MOVI v18.8h, 0  // second set of C for pipelining FMLA
        MOVI v19.8h, 0

        # Are there at least 2 halffloats (4 bytes) of A?
        SUBS x0, x2, 4  // k = kc - 4

        // kc < 4: skip the main loop and go straight to the 1-halffloat
        // remainder.
        // NOTE(review): this path runs the remainder unconditionally, so it
        // looks like kc is assumed to be non-zero - confirm against callers.
        B.LO 3f

        # Main loop - 2 halffloats of A (4 bytes)
        // Each pass consumes 64 bytes of weights: q20/q21 pair with A lane 0
        // and accumulate into v16/v17, q22/q23 pair with A lane 1 and
        // accumulate into the second accumulator set v18/v19.
1:
        LDR s0, [x3], 4
        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR q22, [x5], 16
        LDR q23, [x5], 16
        SUBS x0, x0, 4
        FMLA v16.8h, v20.8h, v0.h[0]
        FMLA v17.8h, v21.8h, v0.h[0]
        FMLA v18.8h, v22.8h, v0.h[1]
        FMLA v19.8h, v23.8h, v0.h[1]
        // Flags come from the SUBS above; loop while it did not borrow.
        B.HS 1b

        # Is there a remainder? - 1 halffloat of A (2 bytes)
        // x0 differs from kc by a multiple of 4, so bit 1 of x0 equals bit 1
        // of kc: set when 2 bytes of A are still unprocessed.
        TBNZ x0, 1, 3f

2:
        // Fold the second accumulator set into the first.
        FADD v16.8h, v16.8h, v18.8h
        FADD v17.8h, v17.8h, v19.8h
        // nc -= 16.  The flags set here are consumed by B.LO and B.HI below;
        // none of the instructions in between is a flag-setting form.
        SUBS x1, x1, 16

        # Scale and Clamp

        FMUL v16.8h, v16.8h, v4.8h
        FMUL v17.8h, v17.8h, v4.8h
        FMAX v16.8h, v16.8h, v5.8h
        FMAX v17.8h, v17.8h, v5.8h
        FMIN v16.8h, v16.8h, v6.8h
        FMIN v17.8h, v17.8h, v6.8h

        # Store full 1 x 16
        // Fewer than 16 columns were left: take the partial-store path.
        B.LO 4f

        STP q16, q17, [x6]
        ADD x6, x6, x14         // c0 += cn_stride

        SUB x3, x3, x2 // a0 -= kc

        // More columns remain (nc was > 16 before the SUBS): next group.
        B.HI 0b

        RET

3:
        # Remainder - 1 halffloat of A (2 bytes)
        // Consumes 32 bytes of weights and accumulates into v16/v17 only.
        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR h0, [x3], 2
        FMLA v16.8h, v20.8h, v0.h[0]
        FMLA v17.8h, v21.8h, v0.h[0]
        B 2b

        # Store odd channels
        // x1 now holds (remaining nc - 16); subtracting 16 leaves the low four
        // bits unchanged, so bits 3..0 select stores of 8, 4, 2 and 1
        // halffloats.  After each partial store the not-yet-written lanes are
        // moved down to the bottom of v16.
4:
        TBZ x1, 3, 5f
        STR q16, [x6], 16
        MOV v16.16b, v17.16b

5:
        TBZ x1, 2, 6f
        STR d16, [x6], 8
        DUP d16, v16.d[1]

6:
        TBZ x1, 1, 7f
        STR s16, [x6], 4
        DUP s16, v16.s[1]

7:
        TBZ x1, 0, 8f
        STR h16, [x6]
8:
        RET

END_FUNCTION xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif