/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include <private/bionic_asm.h>

#include "cache.h"

#ifndef L
# define L(label) .L##label
#endif

#ifndef ALIGN
# define ALIGN(n) .p2align n
#endif

    .section .text.avx2,"ax",@progbits

ENTRY(__memset_chk_avx2)
    # %rdi = dst, %rsi = byte, %rdx = n, %rcx = dst_len
    cmp %rcx, %rdx
    ja __memset_chk_fail
    // Fall through to memset...
END(__memset_chk_avx2)

ENTRY(memset_avx2)
    # %rdi = dst, %rsi = fill byte, %rdx = n; memset returns dst.
    movq %rdi, %rax
    # Replicate the fill byte across all 8 bytes of %rcx.
    and $0xff, %rsi
    mov $0x0101010101010101, %rcx
    imul %rsi, %rcx
    cmpq $16, %rdx
    jae L(16bytesormore)
    # n < 16: pick a size class from the bits of n; each class is
    # handled with at most two (possibly overlapping) stores.
    testb $8, %dl
    jnz L(8_15bytes)
    testb $4, %dl
    jnz L(4_7bytes)
    testb $2, %dl
    jnz L(2_3bytes)
    testb $1, %dl
    jz 1f
    movb %cl, (%rdi)
1:  ret

L(8_15bytes):
    movq %rcx, (%rdi)
    movq %rcx, -8(%rdi, %rdx)
    ret

L(4_7bytes):
    movl %ecx, (%rdi)
    movl %ecx, -4(%rdi, %rdx)
    ret

L(2_3bytes):
    movw %cx, (%rdi)
    movw %cx, -2(%rdi, %rdx)
    ret

    ALIGN (4)
L(16bytesormore):
    # Broadcast the replicated fill pattern into all 16 bytes of %xmm0,
    # then cover the buffer with unaligned 16-byte stores from both ends,
    # bailing out once a size check shows everything has been written.
    movd %rcx, %xmm0
    pshufd $0, %xmm0, %xmm0
    movdqu %xmm0, (%rdi)
    movdqu %xmm0, -16(%rdi, %rdx)
    cmpq $32, %rdx
    jbe L(done)
    movdqu %xmm0, 16(%rdi)
    movdqu %xmm0, -32(%rdi, %rdx)
    cmpq $64, %rdx
    jbe L(done)
    movdqu %xmm0, 32(%rdi)
    movdqu %xmm0, 48(%rdi)
    movdqu %xmm0, -64(%rdi, %rdx)
    movdqu %xmm0, -48(%rdi, %rdx)
    cmpq $128, %rdx
    jbe L(done)
    # Switch to 32-byte AVX2 stores for the 128..256 byte range.
    vpbroadcastb %xmm0, %ymm0
    vmovdqu %ymm0, 64(%rdi)
    vmovdqu %ymm0, 96(%rdi)
    vmovdqu %ymm0, -128(%rdi, %rdx)
    vmovdqu %ymm0, -96(%rdi, %rdx)
    cmpq $256, %rdx
    jbe L(done)

    ALIGN (4)
    # More than 256 bytes: the first and last 128 bytes are already set,
    # so fill the 128-byte-aligned middle [%rcx, %rdx) with an aligned loop.
    leaq 128(%rdi), %rcx
    andq $-128, %rcx
    movq %rdx, %r8
    addq %rdi, %rdx
    andq $-128, %rdx
    cmpq %rcx, %rdx
    je L(done)

    # Use non-temporal stores when n exceeds the shared cache size,
    # so a huge memset does not evict useful cache lines.
#ifdef SHARED_CACHE_SIZE
    cmp $SHARED_CACHE_SIZE, %r8
#else
    cmp __x86_64_shared_cache_size(%rip), %r8
#endif
    ja L(non_temporal_loop)

    ALIGN (4)
L(normal_loop):
    vmovdqa %ymm0, (%rcx)
    vmovdqa %ymm0, 32(%rcx)
    vmovdqa %ymm0, 64(%rcx)
    vmovdqa %ymm0, 96(%rcx)
    addq $128, %rcx
    cmpq %rcx, %rdx
    jne L(normal_loop)
    jmp L(done)

    ALIGN (4)
L(non_temporal_loop):
    movntdq %xmm0, (%rcx)
    movntdq %xmm0, 16(%rcx)
    movntdq %xmm0, 32(%rcx)
    movntdq %xmm0, 48(%rcx)
    movntdq %xmm0, 64(%rcx)
    movntdq %xmm0, 80(%rcx)
    movntdq %xmm0, 96(%rcx)
    movntdq %xmm0, 112(%rcx)
    leaq 128(%rcx), %rcx
    cmpq %rcx, %rdx
    jne L(non_temporal_loop)
    # We used non-temporal stores, so we need a fence here.
    sfence

L(done):
    # We used the ymm registers, and that can break SSE2 performance
    # unless you do this.
    vzeroupper
    ret

END(memset_avx2)