/*
 * memset - fill memory with a constant byte
 *
 * Copyright (c) 2012, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses
 *
 */

#include "../asmdefs.h"

#define dstin	x0
#define val	x1
#define valw	w1
#define count	x2
#define dst	x3
#define dstend	x4
#define tmp1	x5
#define tmp1w	w5
#define tmp2	x6
#define tmp2w	w6
#define zva_len x7
#define zva_lenw w7

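/* Arguments arrive per the AAPCS64 memset signature: x0 = destination
   (left unmodified, so it is also the return value), w1 = fill value
   (only the low byte is used), x2 = count in bytes.  */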
ENTRY (__memset_aarch64)

	dup	v0.16B, valw
	add	dstend, dstin, count

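	/* v0 now holds the fill byte replicated across all 16 lanes.
	   Dispatch on size: 0..15 bytes inline below, 16..96 bytes via
	   L(set_medium), anything larger via L(set_long).  */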
	cmp	count, 96
	b.hi	L(set_long)
	cmp	count, 16
	b.hs	L(set_medium)
	mov	val, v0.D[0]

	/* Set 0..15 bytes.  */
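	/* Bit 3 of count selects two overlapping 8-byte stores, bit 2 two
	   4-byte stores; otherwise store one byte at dstin and, if bit 1 is
	   set, a halfword at dstend - 2.  Stores from both ends overlap, so
	   no exact tail arithmetic is needed.  */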
	tbz	count, 3, 1f
	str	val, [dstin]
	str	val, [dstend, -8]
	ret
	nop
1:	tbz	count, 2, 2f
	str	valw, [dstin]
	str	valw, [dstend, -4]
	ret
2:	cbz	count, 3f
	strb	valw, [dstin]
	tbz	count, 1, 3f
	strh	valw, [dstend, -2]
3:	ret

	/* Set 16..96 bytes.  */
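	/* 16 bytes are stored at dstin and (for counts below 64) at
	   dstend - 16; bit 6 of count diverts 64..96 byte sizes to L(set96)
	   and bit 5 adds another 32 bytes in the middle when needed.  */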
L(set_medium):
	str	q0, [dstin]
	tbnz	count, 6, L(set96)
	str	q0, [dstend, -16]
	tbz	count, 5, 1f
	str	q0, [dstin, 16]
	str	q0, [dstend, -32]
1:	ret

	.p2align 4
	/* Set 64..96 bytes.  Write 64 bytes from the start and
	   32 bytes from the end.  */
L(set96):
	str	q0, [dstin, 16]
	stp	q0, q0, [dstin, 32]
	stp	q0, q0, [dstend, -32]
	ret

	.p2align 3
	nop
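	/* Sizes above 96 bytes are set here.  DC ZVA writes zeros, so it is
	   only attempted when the fill value is zero and count >= 256; the
	   ccmp forces the branch to fall through when count < 256.  */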
L(set_long):
	and	valw, valw, 255
	bic	dst, dstin, 15
	str	q0, [dstin]
	cmp	count, 256
	ccmp	valw, 0, 0, cs
	b.eq	L(try_zva)
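	/* Store 64 bytes per iteration from the 16-byte aligned dst; the
	   final 64 bytes are always written relative to dstend, so the loop
	   only has to get within 64 bytes of the end.  */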
L(no_zva):
	sub	count, dstend, dst	/* Count is 16 too large.  */
	add	dst, dst, 16
	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
1:	stp	q0, q0, [dst], 64
	stp	q0, q0, [dst, -32]
L(tail64):
	subs	count, count, 64
	b.hi	1b
2:	stp	q0, q0, [dstend, -64]
	stp	q0, q0, [dstend, -32]
	ret

	.p2align 3
L(try_zva):
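	/* DCZID_EL0: bit 4 (DZP) set means DC ZVA is disabled; bits 3:0
	   give log2 of the block size in words, so 4 means 64 bytes and
	   5 means 128 bytes.  */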
	mrs	tmp1, dczid_el0
	tbnz	tmp1w, 4, L(no_zva)
	and	tmp1w, tmp1w, 15
	cmp	tmp1w, 4	/* ZVA size is 64 bytes.  */
	b.ne	L(zva_128)

	/* Write the first and last 64 byte aligned block using stp rather
	   than using DC ZVA.  This is faster on some cores.
	 */
L(zva_64):
	str	q0, [dst, 16]
	stp	q0, q0, [dst, 32]
	bic	dst, dst, 63
	stp	q0, q0, [dst, 64]
	stp	q0, q0, [dst, 96]
	sub	count, dstend, dst	/* Count is now 128 too large.	*/
	sub	count, count, 128+64+64	/* Adjust count and bias for loop.  */
	add	dst, dst, 128
	nop
1:	dc	zva, dst
	add	dst, dst, 64
	subs	count, count, 64
	b.hi	1b
	stp	q0, q0, [dst, 0]
	stp	q0, q0, [dst, 32]
	stp	q0, q0, [dstend, -64]
	stp	q0, q0, [dstend, -32]
	ret

	.p2align 3
L(zva_128):
	cmp	tmp1w, 5	/* ZVA size is 128 bytes.  */
	b.ne	L(zva_other)

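	/* Align up to a 128-byte boundary with stp stores, clear whole
	   128-byte blocks with DC ZVA, then write the last 128 bytes with
	   stp from dstend.  */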
	str	q0, [dst, 16]
	stp	q0, q0, [dst, 32]
	stp	q0, q0, [dst, 64]
	stp	q0, q0, [dst, 96]
	bic	dst, dst, 127
	sub	count, dstend, dst	/* Count is now 128 too large.	*/
	sub	count, count, 128+128	/* Adjust count and bias for loop.  */
	add	dst, dst, 128
1:	dc	zva, dst
	add	dst, dst, 128
	subs	count, count, 128
	b.hi	1b
	stp	q0, q0, [dstend, -128]
	stp	q0, q0, [dstend, -96]
	stp	q0, q0, [dstend, -64]
	stp	q0, q0, [dstend, -32]
	ret
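	/* Handle other ZVA block sizes: zva_len = 4 << DCZID_EL0.BS bytes.
	   Counts below zva_len + 64 fall back to L(no_zva).  Otherwise store
	   64-byte chunks through the first zva_len boundary, DC ZVA whole
	   blocks from there, and finish the remainder through L(tail64).  */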
L(zva_other):
	mov	tmp2w, 4
	lsl	zva_lenw, tmp2w, tmp1w
	add	tmp1, zva_len, 64	/* Max alignment bytes written.	 */
	cmp	count, tmp1
	blo	L(no_zva)

	sub	tmp2, zva_len, 1
	add	tmp1, dst, zva_len
	add	dst, dst, 16
	subs	count, tmp1, dst	/* Actual alignment bytes to write.  */
	bic	tmp1, tmp1, tmp2	/* Aligned dc zva start address.  */
	beq	2f
1:	stp	q0, q0, [dst], 64
	stp	q0, q0, [dst, -32]
	subs	count, count, 64
	b.hi	1b
2:	mov	dst, tmp1
	sub	count, dstend, tmp1	/* Remaining bytes to write.  */
	subs	count, count, zva_len
	b.lo	4f
3:	dc	zva, dst
	add	dst, dst, zva_len
	subs	count, count, zva_len
	b.hs	3b
4:	add	count, count, zva_len
	b	L(tail64)

END (__memset_aarch64)