;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
; STACK_FRAME_CREATE_X3
;
; Emits the function prologue and binds the symbolic operand names
; (src_ptr, src_stride, ref_ptr, ref_stride, end_ptr, ret_var,
; result_ptr, max_sad, height) to registers or stack slots for the
; ABI selected at assembly time.
;
; All three ABIs leave src_stride / ref_stride as sign-extended,
; pointer-width values so they can be used directly in address
; arithmetic.  The 32-bit branch does this while loading them from the
; stack; the 64-bit branches do it in place, because neither 64-bit
; calling convention guarantees the upper 32 bits of a register that
; carries an `int` argument.
;
; result_ptr, max_sad and height all alias the fifth argument.
; NOTE(review): ret_var, result_ptr and max_sad are not referenced by
; the code visible in this file; they are kept so that any other user
; of the macro keeps assembling.
;-----------------------------------------------------------------------
%macro STACK_FRAME_CREATE_X3 0
%if ABI_IS_32BIT
  %define     src_ptr       rsi
  %define     src_stride    rax
  %define     ref_ptr       rdi
  %define     ref_stride    rdx
  %define     end_ptr       rcx
  %define     ret_var       rbx
  %define     result_ptr    arg(4)
  %define     max_sad       arg(4)
  %define     height        dword ptr arg(4)
    push        rbp
    mov         rbp,        rsp
    push        rsi
    push        rdi
    push        rbx

    mov         rsi,        arg(0)              ; src_ptr
    mov         rdi,        arg(2)              ; ref_ptr

    movsxd      rax,        dword ptr arg(1)    ; src_stride
    movsxd      rdx,        dword ptr arg(3)    ; ref_stride
%else
  %if LIBVPX_YASM_WIN64
    ; SAVE_XMM comes from the include above.  xmm6/xmm7 are callee-saved
    ; on Win64 and the copy routine in this file uses xmm0-xmm7.
    SAVE_XMM 7, u
    %define     src_ptr     rcx
    %define     src_stride  rdx
    %define     ref_ptr     r8
    %define     ref_stride  r9
    %define     end_ptr     r10
    %define     ret_var     r11
    ; Fifth argument lives on the stack: skip the XMM save area
    ; (xmm_stack_space, presumably set by SAVE_XMM), the return address
    ; (8) and the four shadow-space slots (4*8).
    %define     result_ptr  [rsp+xmm_stack_space+8+4*8]
    %define     max_sad     [rsp+xmm_stack_space+8+4*8]
    %define     height      dword ptr [rsp+xmm_stack_space+8+4*8]

    ; The strides are `int` arguments: widen them before they are used
    ; as 64-bit index registers.
    movsxd      rdx,        edx                 ; src_stride
    movsxd      r9,         r9d                 ; ref_stride
  %else
    %define     src_ptr     rdi
    %define     src_stride  rsi
    %define     ref_ptr     rdx
    %define     ref_stride  rcx
    %define     end_ptr     r9
    %define     ret_var     r10
    %define     result_ptr  r8
    %define     max_sad     r8
    %define     height      r8

    ; The strides and the row count are `int` arguments: widen them
    ; before they are used as 64-bit index / counter registers.
    movsxd      rsi,        esi                 ; src_stride
    movsxd      rcx,        ecx                 ; ref_stride
    movsxd      r8,         r8d                 ; height
  %endif
%endif

%endmacro

;-----------------------------------------------------------------------
; STACK_FRAME_DESTROY_X3
;
; Counterpart of STACK_FRAME_CREATE_X3: clears the symbolic operand
; names, undoes the prologue for the selected ABI and returns to the
; caller (the `ret` is part of this macro).
;-----------------------------------------------------------------------
%macro STACK_FRAME_DESTROY_X3 0
  %define     src_ptr
  %define     src_stride
  %define     ref_ptr
  %define     ref_stride
  %define     end_ptr
  %define     ret_var
  %define     result_ptr
  %define     max_sad
  %define     height

%if ABI_IS_32BIT
    ; Pop in the reverse order of the pushes in the prologue.
    pop         rbx
    pop         rdi
    pop         rsi
    pop         rbp
%else
  %if LIBVPX_YASM_WIN64
    RESTORE_XMM
  %endif
%endif
    ret
%endmacro

SECTION .text

;void vp8_copy32xn_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *dst_ptr,
;    int  dst_stride,
;    int height);
;
; Copies `height` rows of 32 bytes each from src_ptr to dst_ptr.
;
; Operand names (bound by STACK_FRAME_CREATE_X3):
;   src_ptr / src_stride  - source row pointer and row pitch
;   ref_ptr / ref_stride  - destination row pointer and row pitch
;                           (dst_ptr / dst_stride in the C prototype)
;   end_ptr               - scratch: address of row 2 of the current
;                           group of four rows
;   height                - rows still to be copied
;
; Source rows are read with unaligned loads (movdqu).  Destination rows
; are written with aligned stores (movdqa), so every destination row
; must start on a 16-byte boundary.
;
; Rows are copied four at a time while at least four remain, then one
; at a time.  A height that is zero or negative copies nothing.
; Uses xmm0-xmm7 and modifies the flags.
global sym(vp8_copy32xn_sse3) PRIVATE
sym(vp8_copy32xn_sse3):

    STACK_FRAME_CREATE_X3

        ; Fewer than four rows requested: skip the unrolled loop so it
        ; never reads or writes rows beyond `height`.
        cmp             height,     4
        jl              .block_copy_sse3_tail

.block_copy_sse3_loopx4:
        ; Invariant: height >= 4 here.
        lea             end_ptr,    [src_ptr+src_stride*2]

        ; Load four source rows (2 x 16 bytes each).
        movdqu          xmm0,       XMMWORD PTR [src_ptr]
        movdqu          xmm1,       XMMWORD PTR [src_ptr + 16]
        movdqu          xmm2,       XMMWORD PTR [src_ptr + src_stride]
        movdqu          xmm3,       XMMWORD PTR [src_ptr + src_stride + 16]
        movdqu          xmm4,       XMMWORD PTR [end_ptr]
        movdqu          xmm5,       XMMWORD PTR [end_ptr + 16]
        movdqu          xmm6,       XMMWORD PTR [end_ptr + src_stride]
        movdqu          xmm7,       XMMWORD PTR [end_ptr + src_stride + 16]

        lea             src_ptr,    [src_ptr+src_stride*4]

        ; end_ptr is reused for the destination side.
        lea             end_ptr,    [ref_ptr+ref_stride*2]

        ; Store the four rows to the destination.
        movdqa          XMMWORD PTR [ref_ptr], xmm0
        movdqa          XMMWORD PTR [ref_ptr + 16], xmm1
        movdqa          XMMWORD PTR [ref_ptr + ref_stride], xmm2
        movdqa          XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3
        movdqa          XMMWORD PTR [end_ptr], xmm4
        movdqa          XMMWORD PTR [end_ptr + 16], xmm5
        movdqa          XMMWORD PTR [end_ptr + ref_stride], xmm6
        movdqa          XMMWORD PTR [end_ptr + ref_stride + 16], xmm7

        lea             ref_ptr,    [ref_ptr+ref_stride*4]

        sub             height,     4
        cmp             height,     4
        jge             .block_copy_sse3_loopx4

.block_copy_sse3_tail:
        ; Check whether any rows remain to be copied.  The signed test
        ; also ends the function for a zero or negative row count.
        cmp             height,     0
        jle             .copy_is_done

.block_copy_sse3_loop:
        ; Invariant: 1 <= height <= 3 here; copy one row per iteration.
        movdqu          xmm0,       XMMWORD PTR [src_ptr]
        movdqu          xmm1,       XMMWORD PTR [src_ptr + 16]
        lea             src_ptr,    [src_ptr+src_stride]

        movdqa          XMMWORD PTR [ref_ptr], xmm0
        movdqa          XMMWORD PTR [ref_ptr + 16], xmm1
        lea             ref_ptr,    [ref_ptr+ref_stride]

        sub             height,     1
        jne             .block_copy_sse3_loop

.copy_is_done:
    STACK_FRAME_DESTROY_X3