;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

; 4-way SAD (sum of absolute differences) kernels: each routine compares one
; NxN source block against four reference blocks in parallel and writes the
; four 32-bit SAD totals to res[0..3].
;
; Register roles shared by the PROCESS_* macros below:
;   srcq, ref1q..ref4q  - current source / reference row pointers
;   m0..m3              - scratch for loaded pixel rows
;   m4..m7              - running SAD accumulators, one per reference
;                         (4-px-wide case: m6 = ref1/ref2, m7 = ref3/ref4,
;                         packed one reference per qword lane)
; Each macro invocation processes TWO rows:
;   %1 (first)          - 1 initializes the accumulators, 0 adds into them
;   %2/%4               - byte offsets of the first/second source row
;   %3/%5               - byte offsets of the first/second reference row
;   %6 (advance_at_end) - 1 advances all five pointers by two strides

; PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end
;
; 4-px rows are 32-bit loads; two rows are packed into one qword with
; punpckldq, and the 8 source bytes are duplicated into both qword halves
; (movlhps m0, m0) so a single psadbw compares them against two references
; packed into one register.
%macro PROCESS_4x2x4  5-6 0
  movd                  m0, [srcq +%2]
%if %1 == 1
  movd                  m6, [ref1q+%3]
  movd                  m4, [ref2q+%3]
  movd                  m7, [ref3q+%3]
  movd                  m5, [ref4q+%3]
  movd                  m1, [srcq +%4]
  movd                  m2, [ref1q+%5]
  punpckldq             m0, m1                 ; m0 lo qword = src row0|row1
  punpckldq             m6, m2                 ; m6 lo qword = ref1 row0|row1
  movd                  m1, [ref2q+%5]
  movd                  m2, [ref3q+%5]
  movd                  m3, [ref4q+%5]
  punpckldq             m4, m1                 ; ref2 row0|row1
  punpckldq             m7, m2                 ; ref3 row0|row1
  punpckldq             m5, m3                 ; ref4 row0|row1
  movlhps               m0, m0                 ; duplicate src into both qwords
  movlhps               m6, m4                 ; m6 = ref1 (lo) | ref2 (hi)
  movlhps               m7, m5                 ; m7 = ref3 (lo) | ref4 (hi)
  psadbw                m6, m0                 ; per-qword SADs: ref1, ref2
  psadbw                m7, m0                 ; per-qword SADs: ref3, ref4
%else
  movd                  m1, [ref1q+%3]
  movd                  m5, [ref1q+%5]
  movd                  m2, [ref2q+%3]
  movd                  m4, [ref2q+%5]
  punpckldq             m1, m5                 ; ref1 row0|row1
  punpckldq             m2, m4                 ; ref2 row0|row1
  movd                  m3, [ref3q+%3]
  movd                  m5, [ref3q+%5]
  punpckldq             m3, m5                 ; ref3 row0|row1
  movd                  m4, [ref4q+%3]
  movd                  m5, [ref4q+%5]
  punpckldq             m4, m5                 ; ref4 row0|row1
  movd                  m5, [srcq +%4]
  punpckldq             m0, m5                 ; src row0|row1
  movlhps               m0, m0                 ; duplicate src into both qwords
  movlhps               m1, m2                 ; m1 = ref1 (lo) | ref2 (hi)
  movlhps               m3, m4                 ; m3 = ref3 (lo) | ref4 (hi)
  psadbw                m1, m0
  psadbw                m3, m0
  paddd                 m6, m1                 ; accumulate ref1/ref2
  paddd                 m7, m3                 ; accumulate ref3/ref4
%endif
%if %6 == 1
  lea                 srcq, [srcq +src_strideq*2]
  lea                ref1q, [ref1q+ref_strideq*2]
  lea                ref2q, [ref2q+ref_strideq*2]
  lea                ref3q, [ref3q+ref_strideq*2]
  lea                ref4q, [ref4q+ref_strideq*2]
%endif
%endmacro

; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end
;
; 8-px rows: the first row is loaded into the low qword (movh) and the
; second into the high qword (movhps) of each register, so one psadbw
; produces both row SADs for a reference (one per qword lane).
; Accumulators: m4 = ref1, m5 = ref2, m6 = ref3, m7 = ref4.
%macro PROCESS_8x2x4  5-6 0
  movh                  m0, [srcq +%2]
%if %1 == 1
  movh                  m4, [ref1q+%3]
  movh                  m5, [ref2q+%3]
  movh                  m6, [ref3q+%3]
  movh                  m7, [ref4q+%3]
  movhps                m0, [srcq +%4]
  movhps                m4, [ref1q+%5]
  movhps                m5, [ref2q+%5]
  movhps                m6, [ref3q+%5]
  movhps                m7, [ref4q+%5]
  psadbw                m4, m0                 ; initialize accumulators
  psadbw                m5, m0
  psadbw                m6, m0
  psadbw                m7, m0
%else
  movh                  m1, [ref1q+%3]
  movh                  m2, [ref2q+%3]
  movh                  m3, [ref3q+%3]
  movhps                m0, [srcq +%4]
  movhps                m1, [ref1q+%5]
  movhps                m2, [ref2q+%5]
  movhps                m3, [ref3q+%5]
  psadbw                m1, m0
  psadbw                m2, m0
  psadbw                m3, m0
  paddd                 m4, m1
  movh                  m1, [ref4q+%3]         ; ref4 reuses m1 as scratch
  movhps                m1, [ref4q+%5]
  paddd                 m5, m2
  paddd                 m6, m3
  psadbw                m1, m0
  paddd                 m7, m1
%endif
%if %6 == 1
  lea                 srcq, [srcq +src_strideq*2]
  lea                ref1q, [ref1q+ref_strideq*2]
  lea                ref2q, [ref2q+ref_strideq*2]
  lea                ref3q, [ref3q+ref_strideq*2]
  lea                ref4q, [ref4q+ref_strideq*2]
%endif
%endmacro

; PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end
;
; 16-px rows, processed one full row per psadbw. Source rows use aligned
; loads (mova); reference rows may be unaligned (movu).
; Accumulators: m4 = ref1, m5 = ref2, m6 = ref3, m7 = ref4.
%macro PROCESS_16x2x4 5-6 0
  ; 1st 16 px
  mova                  m0, [srcq +%2]
%if %1 == 1
  movu                  m4, [ref1q+%3]
  movu                  m5, [ref2q+%3]
  movu                  m6, [ref3q+%3]
  movu                  m7, [ref4q+%3]
  psadbw                m4, m0
  psadbw                m5, m0
  psadbw                m6, m0
  psadbw                m7, m0
%else
  movu                  m1, [ref1q+%3]
  movu                  m2, [ref2q+%3]
  movu                  m3, [ref3q+%3]
  psadbw                m1, m0
  psadbw                m2, m0
  psadbw                m3, m0
  paddd                 m4, m1
  movu                  m1, [ref4q+%3]         ; ref4 reuses m1 as scratch
  paddd                 m5, m2
  paddd                 m6, m3
  psadbw                m1, m0
  paddd                 m7, m1
%endif

  ; 2nd 16 px
  mova                  m0, [srcq +%4]
  movu                  m1, [ref1q+%5]
  movu                  m2, [ref2q+%5]
  movu                  m3, [ref3q+%5]
  psadbw                m1, m0
  psadbw                m2, m0
  psadbw                m3, m0
  paddd                 m4, m1
  movu                  m1, [ref4q+%5]
%if %6 == 1
  ; Pointers are advanced here, before the last psadbw/paddd: all remaining
  ; operations use data already loaded into registers, so the offsets %4/%5
  ; are no longer needed.
  paddd                 m5, m2
  paddd                 m6, m3
  lea                 srcq, [srcq +src_strideq*2]
  lea                ref1q, [ref1q+ref_strideq*2]
  lea                ref2q, [ref2q+ref_strideq*2]
  lea                ref3q, [ref3q+ref_strideq*2]
  lea                ref4q, [ref4q+ref_strideq*2]
%else
  paddd                 m5, m2
  paddd                 m6, m3
%endif
  psadbw                m1, m0
  paddd                 m7, m1
%endmacro

; PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end
;
; A 32-px row pair is two 16-px row pairs at offsets +0 and +16; only the
; second sub-call may advance the pointers.
%macro PROCESS_32x2x4 5-6 0
  PROCESS_16x2x4 %1, %2, %3, %2 + 16, %3 + 16
  PROCESS_16x2x4  0, %4, %5, %4 + 16, %5 + 16, %6
%endmacro

; PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end
;
; A 64-px row pair is two 32-px row pairs at offsets +0 and +32.
%macro PROCESS_64x2x4 5-6 0
  PROCESS_32x2x4 %1, %2, %3, %2 + 32, %3 + 32
  PROCESS_32x2x4  0, %4, %5, %4 + 32, %5 + 32, %6
%endmacro

; void vpx_sadNxNx4d_sse2(uint8_t *src, int src_stride,
;                         uint8_t *ref[4], int ref_stride,
;                         uint32_t res[4]);
; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16, 8x8, 8x4, 4x8 and 4x4
;
; SADNXN4D width, height - emit one complete function per block size.
; cglobal args: (num args, num gp regs, num xmm regs, named registers).
; On UNIX64 all five arguments arrive in registers; elsewhere res is
; fetched from the stack at the end via r4mp.
%macro SADNXN4D 2
%if UNIX64
cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
                              res, ref2, ref3, ref4
%else
cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
                              ref2, ref3, ref4
%endif
  ; sign-extend the 32-bit int stride args to pointer width where needed
  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided
  ; unpack the ref[4] pointer array into four registers
  ; (ref1q initially holds the array base, so it is overwritten last)
  mov                ref2q, [ref1q+gprsize*1]
  mov                ref3q, [ref1q+gprsize*2]
  mov                ref4q, [ref1q+gprsize*3]
  mov                ref1q, [ref1q+gprsize*0]

  ; rows processed: 2 (init) + (%2-4) (%rep body) + 2 (tail, no advance) = %2
  PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
%rep (%2-4)/2
  PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
%endrep
  PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0

%if %1 > 4
  ; Each accumulator holds two partial sums, one per psadbw qword lane
  ; (dwords [lo, 0, hi, 0]). Merge ref pairs by shifting/or-ing, split the
  ; lanes with punpck{l,h}qdq, and add, yielding m4 = {ref1,ref2,ref3,ref4}.
  pslldq                m5, 4                  ; m5 = [0, r2.lo, 0, r2.hi]
  pslldq                m7, 4
  por                   m4, m5                 ; m4 = [r1.lo, r2.lo, r1.hi, r2.hi]
  por                   m6, m7                 ; m6 = [r3.lo, r4.lo, r3.hi, r4.hi]
  mova                  m5, m4
  mova                  m7, m6
  punpcklqdq            m4, m6                 ; low partials of all four refs
  punpckhqdq            m5, m7                 ; high partials of all four refs
  movifnidn             r4, r4mp               ; load res pointer (5th arg)
  paddd                 m4, m5                 ; final four 32-bit SADs
  movu                [r4], m4                 ; res alignment unknown -> movu
  RET
%else
  ; 4-px wide: m6 = {ref1, ref2}, m7 = {ref3, ref4}, one SAD per qword lane.
  ; pshufd 0x08 packs dwords 0 and 2 into the low qword for the movq stores.
  movifnidn             r4, r4mp               ; load res pointer (5th arg)
  pshufd                m6, m6, 0x08
  pshufd                m7, m7, 0x08
  movq               [r4+0], m6                ; res[0], res[1]
  movq               [r4+8], m7                ; res[2], res[3]
  RET
%endif
%endmacro

INIT_XMM sse2
SADNXN4D 64, 64
SADNXN4D 64, 32
SADNXN4D 32, 64
SADNXN4D 32, 32
SADNXN4D 32, 16
SADNXN4D 16, 32
SADNXN4D 16, 16
SADNXN4D 16,  8
SADNXN4D  8, 16
SADNXN4D  8,  8
SADNXN4D  8,  4
SADNXN4D  4,  8
SADNXN4D  4,  4