// Auto-generated file. Do not edit!
//   Template: src/f16-gemm/1x16-aarch64-neonfp16arith-ld32.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f16_gemminc_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32(
#     size_t mr,                (x0) - unused.  mr = 1
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const void*restrict a,    x3
#     size_t a_stride,          (x4) - unused
#     const void*restrict w,    x5
#     void*restrict c,          x6
#     size_t cm_stride,         (x7) - unused
#     size_t cn_stride,         [sp] -> x14
#     const float*restrict acc,  [sp + 8] -> x15
#     const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> (x8)

# Purpose: half-precision GEMM micro-kernel for one row of A and tiles of
# 16 output columns, starting from caller-supplied accumulators.  Per tile:
#     c0[n] = min(max(acc[n] + sum_k a0[k] * w[k][n], v4), v5),  n = 0..15
#
# Syntax: GNU as, AArch64, preprocessed by cpp.  ABI: AAPCS64, leaf function.
# The last three arguments are read from the caller stack; sp is not modified.
#
# Units: kc is counted in bytes (the loops step it by 4 and 2 while consuming
# 2 and 1 halffloats of A), cn_stride is a byte offset added to c0, and nc is
# counted in halffloat columns.
# NOTE(review): the K handling assumes kc is a nonzero multiple of 2 bytes;
# nothing here checks it, so confirm against the caller.
# NOTE(review): acc is annotated as float* above, yet it is loaded as two
# q registers (16 halffloats, 32 bytes) per tile and used with .8h lanes.
# The annotation looks like a leftover; confirm against the C prototype.
#
# Clobbers: x0, x1, x3, x5, x6, x8, x14, x15, v0, v4, v5, v16-v23, flags.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointer
# x3  a0

# C pointer
# x6  c0

# Accumulators
# v16 v17  columns 0-7 and 8-15, seeded from acc
# v18 v19  second partial sums for the same columns, seeded with zero

# Clamp v4, v5

BEGIN_FUNCTION xnn_f16_gemminc_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32

        # Load cn_stride, acc
        LDP x14, x15, [sp]
        # Load params pointer
        LDR x8, [sp, 16]

        # Load params values
        # LD2R broadcasts the first halffloat at x8 to every lane of v4 and
        # the second to every lane of v5.  v4 is the FMAX operand (lower
        # bound) and v5 the FMIN operand (upper bound) in the clamp below.
        LD2R {v4.8h, v5.8h}, [x8]

        # Outer loop: one iteration per tile of up to 16 output columns
0:
        # Load initial accumulators
        # 16 halffloats; post-increment moves acc to the next tile
        LDP q16, q17, [x15], 32

        MOVI v18.8h, 0  // second set of C for pipelining FMLA
        MOVI v19.8h, 0

        # Is there at least 2 halffloats (4 bytes)
        SUBS x0, x2, 4  // k = kc - 4

        B.LO 3f         // kc < 4: only the single-halffloat remainder runs

        # Main loop - 2 halffloats of A (4 bytes)
        # Each pass consumes 64 bytes of w: 16 columns for the first k step
        # (q20, q21) followed by 16 columns for the second k step (q22, q23).
1:
        LDR s0, [x3], 4
        LDR q20, [x5, 0]
        LDR q21, [x5, 16]
        LDR q22, [x5, 32]
        LDR q23, [x5, 48]
        SUBS x0, x0, 4                  // flags are consumed by B.HS below
        FMLA v16.8h, v20.8h, v0.h[0]
        FMLA v17.8h, v21.8h, v0.h[0]
        FMLA v18.8h, v22.8h, v0.h[1]
        FMLA v19.8h, v23.8h, v0.h[1]
        ADD x5, x5, 64                  // ADD leaves the SUBS flags intact
        B.HS 1b                         // no borrow: 4 more bytes remain

        # Is there a remainder?- 1 halffloat of A (2 bytes)
        # Here x0 = (bytes of K left) - 4, so it is -4 or -2 given the kc
        # assumption above; bit 1 is set only for -2, meaning 2 bytes left.
        TBNZ x0, 1, 3f

2:
        # Fold the second set of partial sums into the primary accumulators
        FADD v16.8h, v16.8h, v18.8h
        FADD v17.8h, v17.8h, v19.8h
        # nc -= 16.  These flags drive both B.LO 4f and B.HI 0b below; no
        # instruction in between writes the condition flags.
        SUBS x1, x1, 16

        # Clamp
        FMAX v16.8h, v16.8h, v4.8h
        FMAX v17.8h, v17.8h, v4.8h
        FMIN v16.8h, v16.8h, v5.8h
        FMIN v17.8h, v17.8h, v5.8h

        # Store full 1 x 16
        B.LO 4f                         // fewer than 16 columns were left

        STP q16, q17, [x6]
        ADD x6, x6, x14                 // c0 += cn_stride

        SUB x3, x3, x2  // a0 -= kc

        B.HI 0b                         // columns remain: next tile

        RET

3:
        # Remainder- 1 halffloat of A (2 bytes)
        # Consumes 32 bytes of w (16 columns for the single k step)
        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR h0, [x3], 2
        FMLA v16.8h, v20.8h, v0.h[0]
        FMLA v17.8h, v21.8h, v0.h[0]
        B 2b

        # Store odd channels
        # x1 now holds (columns left) - 16.  Subtracting 16 does not change
        # the low four bits, so bits 3..0 still encode the 1 to 15 columns
        # to store.  After each partial store the surviving lanes are moved
        # down to the bottom of v16.
4:
        TBZ x1, 3, 5f
        STR q16, [x6], 16               // 8 columns
        MOV v16.16b, v17.16b

5:
        TBZ x1, 2, 6f
        STR d16, [x6], 8                // 4 columns
        DUP d16, v16.d[1]

6:
        TBZ x1, 1, 7f
        STR s16, [x6], 4                // 2 columns
        DUP s16, v16.s[1]

7:
        TBZ x1, 0, 8f
        STR h16, [x6]                   // last column
8:
        RET

END_FUNCTION xnn_f16_gemminc_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif