/*	$NetBSD: memcpy_arm.S,v 1.1 2003/10/14 07:51:45 scw Exp $	*/

/*-
 * Copyright (c) 1997 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Neil A. Carson and Mark Brinicombe
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "asm.h"

// We do not need to check whether the addresses are in the
// kernel or virtual address spaces, since we only access them
// using user privileges.

.syntax unified
.arm

// size_t _arm_user_copy(void *dst, const void *src, size_t len)
FUNCTION(_arm_user_copy)
	/* save leaf functions having to store this away */
	stmdb	sp!, {r0, r1, r2, lr}	/* saved so the fault fixups below
					 * can compute the uncopied length */

	subs	r2, r2, #4
	blt	.Lmemcpy_l4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	.Lmemcpy_destul		/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	.Lmemcpy_srcul		/* oh unaligned source addr */

.Lmemcpy_t8:
	/* We have aligned source and destination */
	subs	r2, r2, #8
	blt	.Lmemcpy_l12		/* less than 12 bytes (4 from above) */
	subs	r2, r2, #0x14
	blt	.Lmemcpy_l32		/* less than 32 bytes (12 from above) */
	stmdb	sp!, {r4}		/* borrow r4 */

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
.Lmemcpy_loop32:
0:	ldmia	r1!, {r3, r4, r12, lr}
1:	stmia	r0!, {r3, r4, r12, lr}
2:	ldmia	r1!, {r3, r4, r12, lr}
3:	stmia	r0!, {r3, r4, r12, lr}
	subs	r2, r2, #0x20
	bge	.Lmemcpy_loop32

	cmn	r2, #0x10
4:	ldmiage	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
5:	stmiage	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	ldmia	sp!, {r4}		/* return r4 */

.Lmemcpy_l32:
	adds	r2, r2, #0x14

	/* blat 12 bytes at a time */
.Lmemcpy_loop12:
6:	ldmiage	r1!, {r3, r12, lr}
7:	stmiage	r0!, {r3, r12, lr}
	subsge	r2, r2, #0x0c
	bge	.Lmemcpy_loop12

.Lmemcpy_l12:
	adds	r2, r2, #8
	blt	.Lmemcpy_l4

	subs	r2, r2, #4
8:	ldrlt	r3, [r1], #4
9:	strlt	r3, [r0], #4
10:	ldmiage	r1!, {r3, r12}
11:	stmiage	r0!, {r3, r12}
	subge	r2, r2, #4

.Lmemcpy_l4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4
	beq	.Lmemcpy_return

	/* copy the crud byte at a time */
	cmp	r2, #2
12:	ldrb	r3, [r1], #1
13:	strb	r3, [r0], #1
14:	ldrbge	r3, [r1], #1
15:	strbge	r3, [r0], #1
16:	ldrbgt	r3, [r1], #1
17:	strbgt	r3, [r0], #1

.Lmemcpy_return:
	ldmia	sp!, {r0, r1, r2, lr}
	mov	r0, #0			/* all bytes copied */
	bx	lr
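
/*
 * The paths below handle misaligned pointers.  The destination is first
 * word-aligned with one to three byte copies; if the source then turns
 * out to be word-aligned as well, control rejoins the fast path above.
 * Otherwise the source is read in aligned words and the destination
 * words are reassembled with shift/or sequences, one loop variant per
 * source misalignment of 1, 2 or 3 bytes.
 */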
	/* erg - unaligned destination */
.Lmemcpy_destul:
	rsb	r12, r12, #4
	cmp	r12, #2

	/* align destination with byte copies */
18:	ldrb	r3, [r1], #1
19:	strb	r3, [r0], #1
20:	ldrbge	r3, [r1], #1
21:	strbge	r3, [r0], #1
22:	ldrbgt	r3, [r1], #1
23:	strbgt	r3, [r0], #1
	subs	r2, r2, r12
	blt	.Lmemcpy_l4		/* less than 4 bytes */
	ands	r12, r1, #3
	beq	.Lmemcpy_t8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
.Lmemcpy_srcul:
	bic	r1, r1, #3
24:	ldr	lr, [r1], #4
	cmp	r12, #2
	bgt	.Lmemcpy_srcul3
	beq	.Lmemcpy_srcul2
	cmp	r2, #0x0c
	blt	.Lmemcpy_srcul1loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

.Lmemcpy_srcul1loop16:
	mov	r3, lr, lsr #8
25:	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #24
	mov	r4, r4, lsr #8
	orr	r4, r4, r5, lsl #24
	mov	r5, r5, lsr #8
	orr	r5, r5, r12, lsl #24
	mov	r12, r12, lsr #8
	orr	r12, r12, lr, lsl #24
26:	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_srcul1loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemcpy_srcul1l4

.Lmemcpy_srcul1loop4:
	mov	r12, lr, lsr #8
27:	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #24
28:	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemcpy_srcul1loop4

.Lmemcpy_srcul1l4:
	sub	r1, r1, #3
	b	.Lmemcpy_l4

.Lmemcpy_srcul2:
	cmp	r2, #0x0c
	blt	.Lmemcpy_srcul2loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

.Lmemcpy_srcul2loop16:
	mov	r3, lr, lsr #16
29:	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #16
	mov	r4, r4, lsr #16
	orr	r4, r4, r5, lsl #16
	mov	r5, r5, lsr #16
	orr	r5, r5, r12, lsl #16
	mov	r12, r12, lsr #16
	orr	r12, r12, lr, lsl #16
30:	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_srcul2loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemcpy_srcul2l4

.Lmemcpy_srcul2loop4:
	mov	r12, lr, lsr #16
31:	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #16
32:	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemcpy_srcul2loop4

.Lmemcpy_srcul2l4:
	sub	r1, r1, #2
	b	.Lmemcpy_l4

.Lmemcpy_srcul3:
	cmp	r2, #0x0c
	blt	.Lmemcpy_srcul3loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

.Lmemcpy_srcul3loop16:
	mov	r3, lr, lsr #24
33:	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #8
	mov	r4, r4, lsr #24
	orr	r4, r4, r5, lsl #8
	mov	r5, r5, lsr #24
	orr	r5, r5, r12, lsl #8
	mov	r12, r12, lsr #24
	orr	r12, r12, lr, lsl #8
34:	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_srcul3loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemcpy_srcul3l4

.Lmemcpy_srcul3loop4:
	mov	r12, lr, lsr #24
35:	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #8
36:	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemcpy_srcul3loop4

.Lmemcpy_srcul3l4:
	sub	r1, r1, #1
	b	.Lmemcpy_l4

.Lfix_return1:
	ldmia	sp!, {r4}
.Lfix_return2:
	ldmia	sp!, {r0, r1}
	cmp	r0, r1
	bhs	.Lfix_return
	sub	r0, r2, r0
	b	.Lfix_return_done
.Lfix_return:
	sub	r0, r2, r1
.Lfix_return_done:
	ldmia	sp!, {r2, lr}
	sub	r0, r2, r0
	bx	lr

.Lfix_return3:
	ldmia	sp!, {r4, r5}
	b	.Lfix_return2
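
/*
 * Fault fixup table.  Every numbered local label above marks a load or
 * store that may fault while touching a user address.  Each entry pairs
 * such an instruction with the fixup that knows how much state is on
 * the stack at that point: .Lfix_return1 first pops the borrowed r4,
 * .Lfix_return3 first pops r4 and r5, and .Lfix_return2 pops the saved
 * dst/src/len/lr and builds the return value.  Assuming the exception
 * handler enters the fixup with the faulting address in r2 (implied by
 * the arithmetic above), progress is estimated as that address minus
 * the lower of the two start addresses, and the function returns len
 * minus that estimate, i.e. the number of bytes left uncopied.
 */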
.pushsection __exc_table, "a"
	.long	0b, .Lfix_return1
	.long	1b, .Lfix_return1
	.long	2b, .Lfix_return1
	.long	3b, .Lfix_return1
	.long	4b, .Lfix_return1
	.long	5b, .Lfix_return1
	.long	6b, .Lfix_return2
	.long	7b, .Lfix_return2
	.long	8b, .Lfix_return2
	.long	9b, .Lfix_return2
	.long	10b, .Lfix_return2
	.long	11b, .Lfix_return2
	.long	12b, .Lfix_return2
	.long	13b, .Lfix_return2
	.long	14b, .Lfix_return2
	.long	15b, .Lfix_return2
	.long	16b, .Lfix_return2
	.long	17b, .Lfix_return2
	.long	18b, .Lfix_return2
	.long	19b, .Lfix_return2
	.long	20b, .Lfix_return2
	.long	21b, .Lfix_return2
	.long	22b, .Lfix_return2
	.long	23b, .Lfix_return2
	.long	24b, .Lfix_return2
	.long	25b, .Lfix_return3
	.long	26b, .Lfix_return3
	.long	27b, .Lfix_return2
	.long	28b, .Lfix_return2
	.long	29b, .Lfix_return3
	.long	30b, .Lfix_return3
	.long	31b, .Lfix_return2
	.long	32b, .Lfix_return2
	.long	33b, .Lfix_return3
	.long	34b, .Lfix_return3
	.long	35b, .Lfix_return2
	.long	36b, .Lfix_return2
.popsection
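
/*
 * Caller's view, as a sketch in C (kernel_buf, user_ptr and len are
 * hypothetical names; the real declaration accompanies the FUNCTION()
 * definition in asm.h):
 *
 *	size_t remaining = _arm_user_copy(kernel_buf, user_ptr, len);
 *	if (remaining != 0) {
 *		// A fault occurred on the user address; `remaining`
 *		// of the len bytes were not copied.
 *	}
 *
 * A return value of 0 means all len bytes were copied.
 */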