//
// Copyright (c) 2012 - 2016, Linaro Limited
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above copyright
//       notice, this list of conditions and the following disclaimer in the
//       documentation and/or other materials provided with the distribution.
//     * Neither the name of the Linaro nor the
//       names of its contributors may be used to endorse or promote products
//       derived from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//

//
// Copyright (c) 2015 ARM Ltd
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
// 3. The name of the company may not be used to endorse or promote
//    products derived from this software without specific prior written
//    permission.
//
// THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
// IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//

// Assumptions:
//
// ARMv8-a, AArch64, unaligned accesses
//
//

#define dstin     x0
#define count     x1
#define val       x2
#define valw      w2
#define dst       x3
#define dstend    x4
#define tmp1      x5
#define tmp1w     w5
#define tmp2      x6
#define tmp2w     w6
#define zva_len   x7
#define zva_lenw  w7

#define L(l) .L ## l

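// All five entry points take the destination pointer in x0 (dstin), a
// count in x1 (count) — elements for the typed variants, bytes for
// InternalMemSetMem and InternalMemZeroMem — and the fill value in
// w2/x2 (valw/val).  Each typed variant replicates the value across the
// 128-bit vector v0, scales the element count to a byte count, and
// branches to the shared code at label 0: below.  x0 is never written,
// so the buffer pointer is implicitly returned to the caller.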
ASM_GLOBAL ASM_PFX(InternalMemSetMem16)
ASM_PFX(InternalMemSetMem16):
    dup     v0.8H, valw
    lsl     count, count, #1
    b       0f

ASM_GLOBAL ASM_PFX(InternalMemSetMem32)
ASM_PFX(InternalMemSetMem32):
    dup     v0.4S, valw
    lsl     count, count, #2
    b       0f

ASM_GLOBAL ASM_PFX(InternalMemSetMem64)
ASM_PFX(InternalMemSetMem64):
    dup     v0.2D, val
    lsl     count, count, #3
    b       0f

ASM_GLOBAL ASM_PFX(InternalMemZeroMem)
ASM_PFX(InternalMemZeroMem):
    movi    v0.16B, #0
    b       0f

ASM_GLOBAL ASM_PFX(InternalMemSetMem)
ASM_PFX(InternalMemSetMem):
    dup     v0.16B, valw
0:  add     dstend, dstin, count      // Shared code: dstend = one past the end.
    mov     val, v0.D[0]              // Low 64 bits of the pattern, for GPR stores.

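    // Dispatch on the total byte count: 0..15 bytes are handled inline,
    // 16..96 bytes at L(set_medium), larger buffers at L(set_long).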
    cmp     count, 96
    b.hi    L(set_long)
    cmp     count, 16
    b.hs    L(set_medium)

    // Set 0..15 bytes.  Each power-of-two bit of count is tested in
    // turn; a pair of overlapping stores from both ends covers the range.
    tbz     count, 3, 1f
    str     val, [dstin]
    str     val, [dstend, -8]
    ret
    nop
1:  tbz     count, 2, 2f
    str     valw, [dstin]
    str     valw, [dstend, -4]
    ret
2:  cbz     count, 3f
    strb    valw, [dstin]
    tbz     count, 1, 3f
    strh    valw, [dstend, -2]
3:  ret

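    // 16..96 bytes: a 16-byte store at each end handles 16..31 bytes;
    // bit 5 of count adds another overlapping pair for 32..63 bytes, and
    // bit 6 diverts to L(set96) for 64..96 bytes.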
    // Set 16..96 bytes.
L(set_medium):
    str     q0, [dstin]
    tbnz    count, 6, L(set96)
    str     q0, [dstend, -16]
    tbz     count, 5, 1f
    str     q0, [dstin, 16]
    str     q0, [dstend, -32]
1:  ret

    .p2align 4
    // Set 64..96 bytes.  Write 64 bytes from the start and
    // 32 bytes from the end.
L(set96):
    str     q0, [dstin, 16]
    stp     q0, q0, [dstin, 32]
    stp     q0, q0, [dstend, -32]
    ret

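    // More than 96 bytes: 16 bytes are stored at the (possibly unaligned)
    // start, then the rest is written from a 16-byte-aligned base.  Buffers
    // of 256 bytes or more being set to zero take the DC ZVA path; all
    // others use the 64-byte store loop at 1: below.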
    .p2align 3
    nop
L(set_long):
    bic     dst, dstin, 15
    str     q0, [dstin]
    cmp     count, 256
    ccmp    val, 0, 0, cs             // If count >= 256, compare val with 0;
    b.eq    L(try_zva)                // otherwise force ne, skipping the ZVA path.
L(no_zva):
    sub     count, dstend, dst        // Count is 16 too large.
    add     dst, dst, 16
    sub     count, count, 64 + 16     // Adjust count and bias for loop.
1:  stp     q0, q0, [dst], 64
    stp     q0, q0, [dst, -32]
L(tail64):
    subs    count, count, 64
    b.hi    1b
2:  stp     q0, q0, [dstend, -64]
    stp     q0, q0, [dstend, -32]
    ret

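    // DCZID_EL0: bit 4 set means DC ZVA is prohibited; bits 3:0 hold
    // log2 of the ZVA block size in 4-byte words, so 4 means 64 bytes
    // and 5 means 128 bytes.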
    .p2align 3
L(try_zva):
    mrs     tmp1, dczid_el0
    tbnz    tmp1w, 4, L(no_zva)
    and     tmp1w, tmp1w, 15
    cmp     tmp1w, 4                  // ZVA size is 64 bytes.
    b.ne    L(zva_128)

    // Write the first and last 64-byte aligned blocks using stp rather
    // than DC ZVA.  This is faster on some cores.
L(zva_64):
    str     q0, [dst, 16]
    stp     q0, q0, [dst, 32]
    bic     dst, dst, 63
    stp     q0, q0, [dst, 64]
    stp     q0, q0, [dst, 96]
    sub     count, dstend, dst         // Count is now 128 too large.
    sub     count, count, 128+64+64    // Adjust count and bias for loop.
    add     dst, dst, 128
    nop
1:  dc      zva, dst
    add     dst, dst, 64
    subs    count, count, 64
    b.hi    1b
    stp     q0, q0, [dst, 0]
    stp     q0, q0, [dst, 32]
    stp     q0, q0, [dstend, -64]
    stp     q0, q0, [dstend, -32]
    ret

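    // 128-byte ZVA block: write the unaligned head with stp, zero whole
    // aligned 128-byte blocks with DC ZVA, then write the last 128 bytes
    // relative to dstend.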
    .p2align 3
L(zva_128):
    cmp     tmp1w, 5                    // ZVA size is 128 bytes.
    b.ne    L(zva_other)

    str     q0, [dst, 16]
    stp     q0, q0, [dst, 32]
    stp     q0, q0, [dst, 64]
    stp     q0, q0, [dst, 96]
    bic     dst, dst, 127
    sub     count, dstend, dst          // Count is now 128 too large.
    sub     count, count, 128+128       // Adjust count and bias for loop.
    add     dst, dst, 128
1:  dc      zva, dst
    add     dst, dst, 128
    subs    count, count, 128
    b.hi    1b
    stp     q0, q0, [dstend, -128]
    stp     q0, q0, [dstend, -96]
    stp     q0, q0, [dstend, -64]
    stp     q0, q0, [dstend, -32]
    ret

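    // Any other ZVA block size: compute zva_len = 4 << DCZID_EL0.BS bytes,
    // fall back to plain stores if the buffer cannot cover one aligned
    // block plus the alignment overhead, otherwise align with a 64-byte
    // store loop, clear whole blocks with DC ZVA, and let L(tail64)
    // finish the remainder.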
L(zva_other):
    mov     tmp2w, 4
    lsl     zva_lenw, tmp2w, tmp1w
    add     tmp1, zva_len, 64           // Max alignment bytes written.
    cmp     count, tmp1
    blo     L(no_zva)

    sub     tmp2, zva_len, 1
    add     tmp1, dst, zva_len
    add     dst, dst, 16
    subs    count, tmp1, dst            // Actual alignment bytes to write.
    bic     tmp1, tmp1, tmp2            // Aligned dc zva start address.
    beq     2f
1:  stp     q0, q0, [dst], 64
    stp     q0, q0, [dst, -32]
    subs    count, count, 64
    b.hi    1b
2:  mov     dst, tmp1
    sub     count, dstend, tmp1         // Remaining bytes to write.
    subs    count, count, zva_len
    b.lo    4f
3:  dc      zva, dst
    add     dst, dst, zva_len
    subs    count, count, zva_len
    b.hs    3b
4:  add     count, count, zva_len
    b       L(tail64)