;
; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

%macro HIGH_SAD_FN 4
%if %4 == 0
%if %3 == 5
cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \
                            src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7
%else ; avg
%if %3 == 5
cglobal highbd_sad%1x%2_avg, 5, 1 + %3, 7, src, src_stride, ref, ref_stride, \
                                    second_pred, n_rows
%else ; %3 == 7
cglobal highbd_sad%1x%2_avg, 5, ARCH_X86_64 + %3, 7, src, src_stride, \
                                              ref, ref_stride, \
                                              second_pred, \
                                              src_stride3, ref_stride3
%if ARCH_X86_64
%define n_rowsd r7d
%else ; x86-32
%define n_rowsd dword r0m
%endif ; x86-32/64
%endif ; %3 == 5/7
%endif ; avg/sad
  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided
%if %3 == 7
  lea         src_stride3q, [src_strideq*3]
  lea         ref_stride3q, [ref_strideq*3]
%endif ; %3 == 7
; convert src, ref & second_pred to short ptrs (from byte ptrs)
  shl                 srcq, 1
  shl                 refq, 1
%if %4 == 1
  shl         second_predq, 1
%endif
%endmacro

; unsigned int vpx_highbd_sad64x{32,64}_sse2(uint8_t *src, int src_stride,
;                                            uint8_t *ref, int ref_stride);
%macro HIGH_SAD64XN 1-2 0
  HIGH_SAD_FN 64, %1, 5, %2
  mov              n_rowsd, %1        ; one 64-sample row per iteration
  pxor                  m0, m0        ; running dword SAD accumulator
  pxor                  m6, m6        ; all-zero register used for widening

.loop:
  ; first half of each row
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+32]
  movu                  m4, [refq+48]
%if %2 == 1
  pavgw                 m1, [second_predq+mmsize*0]
  pavgw                 m2, [second_predq+mmsize*1]
  pavgw                 m3, [second_predq+mmsize*2]
  pavgw                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  ; per-word absolute difference: subtract in both directions with
  ; unsigned saturation; one result is zero, so OR-ing yields |src - ref|
  mova                  m5, [srcq]
  psubusw               m5, m1
  psubusw               m1, [srcq]
  por                   m1, m5
  mova                  m5, [srcq+16]
  psubusw               m5, m2
  psubusw               m2, [srcq+16]
  por                   m2, m5
  mova                  m5, [srcq+32]
  psubusw               m5, m3
  psubusw               m3, [srcq+32]
  por                   m3, m5
  mova                  m5, [srcq+48]
  psubusw               m5, m4
  psubusw               m4, [srcq+48]
  por                   m4, m5
  ; fold the word differences, then widen to dwords before accumulating
  paddw                 m1, m2
  paddw                 m3, m4
  movhlps               m2, m1
  movhlps               m4, m3
  paddw                 m1, m2
  paddw                 m3, m4
  punpcklwd             m1, m6
  punpcklwd             m3, m6
  paddd                 m0, m1
  paddd                 m0, m3
  ; second half of each row
  movu                  m1, [refq+64]
  movu                  m2, [refq+80]
  movu                  m3, [refq+96]
  movu                  m4, [refq+112]
%if %2 == 1
  pavgw                 m1, [second_predq+mmsize*0]
  pavgw                 m2, [second_predq+mmsize*1]
  pavgw                 m3, [second_predq+mmsize*2]
  pavgw                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  mova                  m5, [srcq+64]
  psubusw               m5, m1
  psubusw               m1, [srcq+64]
  por                   m1, m5
  mova                  m5, [srcq+80]
  psubusw               m5, m2
  psubusw               m2, [srcq+80]
  por                   m2, m5
  mova                  m5, [srcq+96]
  psubusw               m5, m3
  psubusw               m3, [srcq+96]
  por                   m3, m5
  mova                  m5, [srcq+112]
  psubusw               m5, m4
  psubusw               m4, [srcq+112]
  por                   m4, m5
  paddw                 m1, m2
  paddw                 m3, m4
  movhlps               m2, m1
  movhlps               m4, m3
  paddw                 m1, m2
  paddw                 m3, m4
  punpcklwd             m1, m6
  punpcklwd             m3, m6
  lea                 refq, [refq+ref_strideq*2]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*2]
  paddd                 m0, m3

  dec              n_rowsd
  jg .loop

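  ; reduce the four dword partial sums in m0 to one scalar: fold the high
  ; quadword onto the low one, spread the two surviving dwords with zeros
  ; from m6, and fold once more so the total lands in the low lane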
  movhlps               m1, m0
  paddd                 m0, m1
  punpckldq             m0, m6
  movhlps               m1, m0
  paddd                 m0, m1
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
HIGH_SAD64XN 64 ; highbd_sad64x64_sse2
HIGH_SAD64XN 32 ; highbd_sad64x32_sse2
HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2
HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2


; unsigned int vpx_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride,
;                                               uint8_t *ref, int ref_stride);
%macro HIGH_SAD32XN 1-2 0
  HIGH_SAD_FN 32, %1, 5, %2
  mov              n_rowsd, %1        ; one 32-sample row per iteration
  pxor                  m0, m0
  pxor                  m6, m6

.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+32]
  movu                  m4, [refq+48]
%if %2 == 1
  pavgw                 m1, [second_predq+mmsize*0]
  pavgw                 m2, [second_predq+mmsize*1]
  pavgw                 m3, [second_predq+mmsize*2]
  pavgw                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  mova                  m5, [srcq]
  psubusw               m5, m1
  psubusw               m1, [srcq]
  por                   m1, m5
  mova                  m5, [srcq+16]
  psubusw               m5, m2
  psubusw               m2, [srcq+16]
  por                   m2, m5
  mova                  m5, [srcq+32]
  psubusw               m5, m3
  psubusw               m3, [srcq+32]
  por                   m3, m5
  mova                  m5, [srcq+48]
  psubusw               m5, m4
  psubusw               m4, [srcq+48]
  por                   m4, m5
  paddw                 m1, m2
  paddw                 m3, m4
  movhlps               m2, m1
  movhlps               m4, m3
  paddw                 m1, m2
  paddw                 m3, m4
  punpcklwd             m1, m6
  punpcklwd             m3, m6
  lea                 refq, [refq+ref_strideq*2]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*2]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
  punpckldq             m0, m6
  movhlps               m1, m0
  paddd                 m0, m1
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
HIGH_SAD32XN 64 ; highbd_sad32x64_sse2
HIGH_SAD32XN 32 ; highbd_sad32x32_sse2
HIGH_SAD32XN 16 ; highbd_sad32x16_sse2
HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2
HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2
HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2

; unsigned int vpx_highbd_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride,
;                                              uint8_t *ref, int ref_stride);
%macro HIGH_SAD16XN 1-2 0
  HIGH_SAD_FN 16, %1, 5, %2
  mov              n_rowsd, %1/2      ; two rows per iteration
  pxor                  m0, m0
  pxor                  m6, m6

.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+ref_strideq*2]
  movu                  m4, [refq+ref_strideq*2+16]
%if %2 == 1
  pavgw                 m1, [second_predq+mmsize*0]
  pavgw                 m2, [second_predq+16]
  pavgw                 m3, [second_predq+mmsize*2]
  pavgw                 m4, [second_predq+mmsize*2+16]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  mova                  m5, [srcq]
  psubusw               m5, m1
  psubusw               m1, [srcq]
  por                   m1, m5
  mova                  m5, [srcq+16]
  psubusw               m5, m2
  psubusw               m2, [srcq+16]
  por                   m2, m5
  mova                  m5, [srcq+src_strideq*2]
  psubusw               m5, m3
  psubusw               m3, [srcq+src_strideq*2]
  por                   m3, m5
  mova                  m5, [srcq+src_strideq*2+16]
  psubusw               m5, m4
  psubusw               m4, [srcq+src_strideq*2+16]
  por                   m4, m5
  paddw                 m1, m2
  paddw                 m3, m4
  movhlps               m2, m1
  movhlps               m4, m3
  paddw                 m1, m2
  paddw                 m3, m4
  punpcklwd             m1, m6
  punpcklwd             m3, m6
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
  punpckldq             m0, m6
  movhlps               m1, m0
  paddd                 m0, m1
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
HIGH_SAD16XN 32 ; highbd_sad16x32_sse2
HIGH_SAD16XN 16 ; highbd_sad16x16_sse2
HIGH_SAD16XN 8 ; highbd_sad16x8_sse2
HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2
HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2
HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2
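
; For reference, each function generated from these macros computes the
; following (a C-style sketch; the sample buffers actually hold uint16_t
; values, and the _avg variants first replace ref with the rounding average
; ((ref + second_pred + 1) >> 1) that pavgw computes):
;
;   sad = 0;
;   for (r = 0; r < height; r++)
;     for (c = 0; c < width; c++)
;       sad += abs(src[r * src_stride + c] - ref[r * ref_stride + c]);
;   return sad;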

; unsigned int vpx_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride,
;                                            uint8_t *ref, int ref_stride);
%macro HIGH_SAD8XN 1-2 0
  HIGH_SAD_FN 8, %1, 7, %2
  mov              n_rowsd, %1/4      ; four rows per iteration
  pxor                  m0, m0
  pxor                  m6, m6

.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+ref_strideq*2]
  movu                  m3, [refq+ref_strideq*4]
  movu                  m4, [refq+ref_stride3q*2]
%if %2 == 1
  pavgw                 m1, [second_predq+mmsize*0]
  pavgw                 m2, [second_predq+mmsize*1]
  pavgw                 m3, [second_predq+mmsize*2]
  pavgw                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  mova                  m5, [srcq]
  psubusw               m5, m1
  psubusw               m1, [srcq]
  por                   m1, m5
  mova                  m5, [srcq+src_strideq*2]
  psubusw               m5, m2
  psubusw               m2, [srcq+src_strideq*2]
  por                   m2, m5
  mova                  m5, [srcq+src_strideq*4]
  psubusw               m5, m3
  psubusw               m3, [srcq+src_strideq*4]
  por                   m3, m5
  mova                  m5, [srcq+src_stride3q*2]
  psubusw               m5, m4
  psubusw               m4, [srcq+src_stride3q*2]
  por                   m4, m5
  paddw                 m1, m2
  paddw                 m3, m4
  movhlps               m2, m1
  movhlps               m4, m3
  paddw                 m1, m2
  paddw                 m3, m4
  punpcklwd             m1, m6
  punpcklwd             m3, m6
  lea                 refq, [refq+ref_strideq*8]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*8]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
  punpckldq             m0, m6
  movhlps               m1, m0
  paddd                 m0, m1
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
HIGH_SAD8XN 16 ; highbd_sad8x16_sse2
HIGH_SAD8XN 8 ; highbd_sad8x8_sse2
HIGH_SAD8XN 4 ; highbd_sad8x4_sse2
HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2
HIGH_SAD8XN 8, 1 ; highbd_sad8x8_avg_sse2
HIGH_SAD8XN 4, 1 ; highbd_sad8x4_avg_sse2
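
; Usage note (an assumption based on libvpx's high-bitdepth convention, not
; stated in this file): callers keep samples in uint16_t buffers and pass
; CONVERT_TO_BYTEPTR(buf), i.e. the address shifted right by one, as the
; uint8_t * arguments; the shl by 1 in HIGH_SAD_FN undoes that encoding
; before any loads.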