/* Copyright 2002 Andi Kleen, SuSE Labs */

#include <linux/linkage.h>
#include <asm/cpufeatures.h>
#include <asm/alternative-asm.h>

.weak memset

/*
 * ISO C memset - set a memory block to a byte value. This function uses fast
 * string instructions to get better performance than the original function.
 * The code is simpler and shorter than the original function as well.
 *
 * rdi   destination
 * rsi   value (char)
 * rdx   count (bytes)
 *
 * rax   original destination
 */
ENTRY(memset)
ENTRY(__memset)
	/*
	 * Some CPUs support the enhanced REP MOVSB/STOSB feature. It is
	 * recommended to use it when possible. If enhanced REP MOVSB/STOSB
	 * is not available, use fast string instructions.
	 *
	 * Otherwise, use the original memset function.
	 */
	ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \
		      "jmp memset_erms", X86_FEATURE_ERMS

	movq %rdi,%r9			/* save original destination for the return value */
	movq %rdx,%rcx
	andl $7,%edx			/* edx = trailing byte count (count mod 8) */
	shrq $3,%rcx			/* rcx = number of quadwords to store */
	/* expand byte value */
	movzbl %sil,%esi
	movabs $0x0101010101010101,%rax
	imulq %rsi,%rax			/* replicate the byte into all eight byte lanes */
	rep stosq			/* store rcx quadwords */
	movl %edx,%ecx
	rep stosb			/* store the remaining 0-7 bytes */
	movq %r9,%rax
	ret
ENDPROC(memset)
ENDPROC(__memset)

/*
 * ISO C memset - set a memory block to a byte value. This function uses
 * enhanced rep stosb to override the fast string function.
 * The code is simpler and shorter than the fast string function as well.
 *
 * rdi   destination
 * rsi   value (char)
 * rdx   count (bytes)
 *
 * rax   original destination
 */
ENTRY(memset_erms)
	movq %rdi,%r9			/* save original destination for the return value */
	movb %sil,%al			/* fill byte goes in %al for rep stosb */
	movq %rdx,%rcx
	rep stosb
	movq %r9,%rax
	ret
ENDPROC(memset_erms)

ENTRY(memset_orig)
	movq %rdi,%r10

	/* expand byte value */
	movzbl %sil,%ecx
	movabs $0x0101010101010101,%rax
	imulq %rcx,%rax

	/* align dst */
	movl %edi,%r9d
	andl $7,%r9d
	jnz .Lbad_alignment
.Lafter_bad_alignment:

	movq %rdx,%rcx
	shrq $6,%rcx			/* rcx = number of 64-byte chunks */
	jz .Lhandle_tail

	.p2align 4
.Lloop_64:
	decq %rcx
	movq %rax,(%rdi)
	movq %rax,8(%rdi)
	movq %rax,16(%rdi)
	movq %rax,24(%rdi)
	movq %rax,32(%rdi)
	movq %rax,40(%rdi)
	movq %rax,48(%rdi)
	movq %rax,56(%rdi)
	leaq 64(%rdi),%rdi
	jnz .Lloop_64

	/*
	 * Handle the tail in loops. The loops should be faster than
	 * hard-to-predict jump tables.
	 */
	.p2align 4
.Lhandle_tail:
	movl %edx,%ecx
	andl $63&(~7),%ecx
	jz .Lhandle_7
	shrl $3,%ecx
	.p2align 4
.Lloop_8:
	decl %ecx
	movq %rax,(%rdi)
	leaq 8(%rdi),%rdi
	jnz .Lloop_8

.Lhandle_7:
	andl $7,%edx
	jz .Lende
	.p2align 4
.Lloop_1:
	decl %edx
	movb %al,(%rdi)
	leaq 1(%rdi),%rdi
	jnz .Lloop_1

.Lende:
	movq %r10,%rax
	ret

.Lbad_alignment:
	cmpq $7,%rdx
	jbe .Lhandle_7
	movq %rax,(%rdi)		/* unaligned store of the first 8 bytes */
	movq $8,%r8
	subq %r9,%r8
	addq %r8,%rdi
	subq %r8,%rdx
	jmp .Lafter_bad_alignment
.Lfinal:
ENDPROC(memset_orig)
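
/*
 * Worked example (illustrative only, not assembled; 0xab is an arbitrary
 * fill byte): how the "expand byte value" multiply used by both
 * memset/__memset and memset_orig above replicates the byte across a
 * quadword:
 *
 *	movzbl %sil,%esi			# %rsi = 0x00000000000000ab
 *	movabs $0x0101010101010101,%rax
 *	imulq  %rsi,%rax			# %rax = 0xabababababababab
 *
 * Each 0x01 byte in the constant contributes one copy of the fill byte to a
 * different byte lane, and because the fill byte is below 0x100 no carries
 * cross lanes, so the low 64 bits of the product are the byte repeated
 * eight times.
 */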
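
/*
 * Worked example (illustrative, using an arbitrary count of 100 bytes): how
 * the fast-string path in memset/__memset splits the count. "shrq $3" leaves
 * %rcx = 12, so "rep stosq" stores 12 quadwords (96 bytes), and "andl $7"
 * leaves %edx = 4, so the trailing "rep stosb" stores the last 4 bytes.
 */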
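
/*
 * Worked example (illustrative, using an arbitrary 100-byte, 8-byte-aligned
 * request): how memset_orig carves up the count. "shrq $6" gives one pass
 * through .Lloop_64 (64 bytes); masking the count with $63&(~7) gives 32, so
 * .Lloop_8 runs four times (32 bytes); "andl $7" gives 4, so .Lloop_1 stores
 * the final 4 bytes: 64 + 32 + 4 = 100.
 */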