;
;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

;-----------------------------------------------------------------------------
; Register roles shared by every HIGH_PROCESS_* macro below:
;   srcq, ref1q..ref4q   byte addresses of the current source row and of the
;                        four reference rows (samples are 16 bits wide)
;   src_strideq,
;   ref_strideq          row strides counted in 16-bit samples
;   m1                   constant: 0x0001 in every 16-bit lane (set up by
;                        HIGH_SADNXN4D); pmaddwd against it adds adjacent
;                        word pairs into dwords
;   m4, m5, m6, m7       dword partial sums for ref1, ref2, ref3, ref4
;   m0, m2, m3           scratch (clobbered)
;
; Macro arguments (same meaning for every HIGH_PROCESS_* macro):
;   %1  first           1 = initialise m4..m7, 0 = accumulate into them
;   %2  off_first_src   offset of the first row in src, in samples
;   %3  off_first_ref   offset of the first row in each ref, in samples
;   %4  off_second_src  offset of the second row in src, in samples
;   %5  off_second_ref  offset of the second row in each ref, in samples
;   %6  advance_at_end  1 = step all five pointers down two rows (default 0)
;
; |a - b| on unsigned words is formed as (a -sat b) | (b -sat a): one of the
; two saturating differences is always zero.
;-----------------------------------------------------------------------------

; HIGH_PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end
; Two rows of 4 samples: row one sits in the low half of each register,
; row two in the high half.
%macro HIGH_PROCESS_4x2x4 5-6 0
  movh                  m0, [srcq +%2*2]
%if %1 == 1
  ; first call: the results land directly in the accumulators m4..m7
  movu                  m4, [ref1q+%3*2]
  movu                  m5, [ref2q+%3*2]
  movu                  m6, [ref3q+%3*2]
  movu                  m7, [ref4q+%3*2]
  movhps                m0, [srcq +%4*2]
  movhps                m4, [ref1q+%5*2]
  movhps                m5, [ref2q+%5*2]
  movhps                m6, [ref3q+%5*2]
  movhps                m7, [ref4q+%5*2]
  mova                  m3, m0
  mova                  m2, m0
  psubusw               m3, m4
  psubusw               m2, m5
  psubusw               m4, m0
  psubusw               m5, m0
  por                   m4, m3
  por                   m5, m2
  pmaddwd               m4, m1
  pmaddwd               m5, m1
  mova                  m3, m0
  mova                  m2, m0
  psubusw               m3, m6
  psubusw               m2, m7
  psubusw               m6, m0
  psubusw               m7, m0
  por                   m6, m3
  por                   m7, m2
  pmaddwd               m6, m1
  pmaddwd               m7, m1
%else
  ; later calls: compute in m2/m3, then add into the accumulators
  movu                  m2, [ref1q+%3*2]
  movhps                m0, [srcq +%4*2]
  movhps                m2, [ref1q+%5*2]
  mova                  m3, m0
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  pmaddwd               m2, m1
  paddd                 m4, m2

  movu                  m2, [ref2q+%3*2]
  mova                  m3, m0
  movhps                m2, [ref2q+%5*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  pmaddwd               m2, m1
  paddd                 m5, m2

  movu                  m2, [ref3q+%3*2]
  mova                  m3, m0
  movhps                m2, [ref3q+%5*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  pmaddwd               m2, m1
  paddd                 m6, m2

  movu                  m2, [ref4q+%3*2]
  mova                  m3, m0
  movhps                m2, [ref4q+%5*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  pmaddwd               m2, m1
  paddd                 m7, m2
%endif
%if %6 == 1
  ; two rows of 16-bit samples = stride * 4 bytes
  lea                 srcq, [srcq +src_strideq*4]
  lea                ref1q, [ref1q+ref_strideq*4]
  lea                ref2q, [ref2q+ref_strideq*4]
  lea                ref3q, [ref3q+ref_strideq*4]
  lea                ref4q, [ref4q+ref_strideq*4]
%endif
%endmacro

; HIGH_PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end
; Two groups of 8 samples. Source groups are read with mova (so they are
; presumably 16-byte aligned - confirm against callers); reference groups
; are read with movu.
%macro HIGH_PROCESS_8x2x4 5-6 0
  ; 1st 8 px
  mova                  m0, [srcq +%2*2]
%if %1 == 1
  ; first call: the results land directly in the accumulators m4..m7
  movu                  m4, [ref1q+%3*2]
  movu                  m5, [ref2q+%3*2]
  movu                  m6, [ref3q+%3*2]
  movu                  m7, [ref4q+%3*2]
  mova                  m3, m0
  mova                  m2, m0
  psubusw               m3, m4
  psubusw               m2, m5
  psubusw               m4, m0
  psubusw               m5, m0
  por                   m4, m3
  por                   m5, m2
  pmaddwd               m4, m1
  pmaddwd               m5, m1
  mova                  m3, m0
  mova                  m2, m0
  psubusw               m3, m6
  psubusw               m2, m7
  psubusw               m6, m0
  psubusw               m7, m0
  por                   m6, m3
  por                   m7, m2
  pmaddwd               m6, m1
  pmaddwd               m7, m1
%else
  mova                  m3, m0
  movu                  m2, [ref1q+%3*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  mova                  m3, m0
  pmaddwd               m2, m1
  paddd                 m4, m2
  movu                  m2, [ref2q+%3*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  mova                  m3, m0
  pmaddwd               m2, m1
  paddd                 m5, m2
  movu                  m2, [ref3q+%3*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  mova                  m3, m0
  pmaddwd               m2, m1
  paddd                 m6, m2
  movu                  m2, [ref4q+%3*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  pmaddwd               m2, m1
  paddd                 m7, m2
%endif

  ; 2nd 8 px (always accumulates; the 1st group has set up m4..m7 by now)
  mova                  m0, [srcq +(%4)*2]
  mova                  m3, m0
  movu                  m2, [ref1q+(%5)*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  mova                  m3, m0
  pmaddwd               m2, m1
  paddd                 m4, m2
  movu                  m2, [ref2q+(%5)*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  mova                  m3, m0
  pmaddwd               m2, m1
  paddd                 m5, m2
  movu                  m2, [ref3q+(%5)*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  mova                  m3, m0
  pmaddwd               m2, m1
  paddd                 m6, m2
  movu                  m2, [ref4q+(%5)*2]
  psubusw               m3, m2
  psubusw               m2, m0
%if %6 == 1
  ; All loads are done, so the pointers can move on while the last
  ; difference is still being reduced. Two rows = stride * 4 bytes.
  lea                 srcq, [srcq +src_strideq*4]
  lea                ref1q, [ref1q+ref_strideq*4]
  lea                ref2q, [ref2q+ref_strideq*4]
  lea                ref3q, [ref3q+ref_strideq*4]
  lea                ref4q, [ref4q+ref_strideq*4]
%endif
  por                   m2, m3
  pmaddwd               m2, m1
  paddd                 m7, m2
%endmacro

; HIGH_PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end
; Each 16-sample row is handled as two 8-sample groups by one 8x2x4 call;
; only the call for the second row may advance the pointers.
%macro HIGH_PROCESS_16x2x4 5-6 0
  HIGH_PROCESS_8x2x4 %1, %2, %3, (%2 +  8), (%3 +  8)
  HIGH_PROCESS_8x2x4  0, %4, %5, (%4 +  8), (%5 +  8), %6
%endmacro

; HIGH_PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end
; Each 32-sample row is handled as two 16-sample halves by one 16x2x4 call.
%macro HIGH_PROCESS_32x2x4 5-6 0
  HIGH_PROCESS_16x2x4 %1, %2, %3, (%2 + 16), (%3 + 16)
  HIGH_PROCESS_16x2x4  0, %4, %5, (%4 + 16), (%5 + 16), %6
%endmacro

; HIGH_PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end
; Each 64-sample row is handled as two 32-sample halves by one 32x2x4 call.
%macro HIGH_PROCESS_64x2x4 5-6 0
  HIGH_PROCESS_32x2x4 %1, %2, %3, (%2 + 32), (%3 + 32)
  HIGH_PROCESS_32x2x4  0, %4, %5, (%4 + 32), (%5 + 32), %6
%endmacro

;-----------------------------------------------------------------------------
; void vpx_highbd_sadWxHx4d_sse2(uint8_t *src,    int src_stride,
;                                uint8_t *ref[4], int ref_stride,
;                                uint32_t res[4]);
; where WxH = 64x64, 64x32, 32x64, 32x32, 32x16, 16x32, 16x16, 16x8,
;             8x16, 8x8, 8x4, 4x8 or 4x4
;
; HIGH_SADNXN4D width, height
;   Emits one function that writes to res[i] the sum of absolute differences
;   between the WxH source block and the block at ref[i], for i = 0..3.
;
; Uses m0..m7 and up to eight general-purpose registers (see cglobal below).
;-----------------------------------------------------------------------------
%macro HIGH_SADNXN4D 2
%if UNIX64
cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
                              res, ref2, ref3, ref4
%else
; Only the first four arguments are loaded here; res is fetched from its
; argument slot (r4mp) just before the final store, once r4 (ref2) is free.
cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
                              ref2, ref3, ref4
%endif

; set m1 = 0x0001 in every word: all-ones, then shift each word right by 15.
; Built in the vector unit so no general-purpose register or stack slot
; is needed for the constant.
  pcmpeqw               m1, m1
  psrlw                 m1, 15

  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided
; fetch the four reference pointers; ref1q holds the array address and is
; therefore overwritten last
  mov                ref2q, [ref1q+gprsize*1]
  mov                ref3q, [ref1q+gprsize*2]
  mov                ref4q, [ref1q+gprsize*3]
  mov                ref1q, [ref1q+gprsize*0]

; convert byte pointers to short pointers
; NOTE(review): presumably the caller passes each sample address shifted
; right by one; doubling it here recovers the real address - confirm.
  shl                 srcq, 1
  shl                ref2q, 1
  shl                ref3q, 1
  shl                ref4q, 1
  shl                ref1q, 1

; two rows per step: the first step initialises the sums, the last one does
; not advance the pointers, and (height-4)/2 steps run in between
  HIGH_PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
%rep (%2-4)/2
  HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
%endrep
  HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0
  ; N.B. HIGH_PROCESS outputs dwords (32 bits)
  ; so in high bit depth even the smallest width (4) needs 128bits i.e. XMM

  ; fold the upper two dwords of each accumulator onto the lower two
  movhlps               m0, m4
  movhlps               m1, m5
  movhlps               m2, m6
  movhlps               m3, m7
  paddd                 m4, m0
  paddd                 m5, m1
  paddd                 m6, m2
  paddd                 m7, m3
  ; interleave so that one more fold finishes two sums at a time
  punpckldq             m4, m5
  punpckldq             m6, m7
  movhlps               m0, m4
  movhlps               m1, m6
  paddd                 m4, m0
  paddd                 m6, m1
  ; m4 = { sad(ref1), sad(ref2), sad(ref3), sad(ref4) }
  punpcklqdq            m4, m6
  movifnidn             r4, r4mp
  movu                [r4], m4
  RET
%endmacro


INIT_XMM sse2
HIGH_SADNXN4D 64, 64
HIGH_SADNXN4D 64, 32
HIGH_SADNXN4D 32, 64
HIGH_SADNXN4D 32, 32
HIGH_SADNXN4D 32, 16
HIGH_SADNXN4D 16, 32
HIGH_SADNXN4D 16, 16
HIGH_SADNXN4D 16,  8
HIGH_SADNXN4D  8, 16
HIGH_SADNXN4D  8,  8
HIGH_SADNXN4D  8,  4
HIGH_SADNXN4D  4,  8
HIGH_SADNXN4D  4,  4