;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

;-----------------------------------------------------------------------------
; int64_t vp9_block_error(int16_t *coeff, int16_t *dqcoeff,
;                         intptr_t block_size, int64_t *ssz)
;
; Returns the sum over the block of (dqcoeff[i] - coeff[i])^2 and stores the
; sum of coeff[i]^2 through ssz.
;
; In:    uqc  = coeff, dqc = dqcoeff, size = number of 16-bit coefficients,
;        ssz  = destination of the 64-bit coeff^2 sum.  cglobal is asked to
;               load only the first three arguments; ssz is fetched from its
;               stack slot (sszm) on x86-32 and referenced as sszq on x86-64.
; Out:   x86-64: rax = error sum.  x86-32: edx:eax = error sum.
;
; Register roles inside the loop:
;        m0/m1 = dqcoeff, then difference, then squared-difference pair sums
;        m2/m3 = coeff, then squared-coeff pair sums
;        m4    = 64-bit error accumulator (two lanes)
;        m5    = all-zero register used to widen dwords to qwords
;        m6    = 64-bit ssz accumulator (two lanes)
;        m7    = scratch
;
; Notes: * The loop body runs before the exit test and consumes mmsize
;          coefficients per pass, so block_size is presumably always a
;          positive multiple of mmsize -- verify against callers.
;        * Both buffers are read with mova, so they are assumed to be
;          suitably aligned for full-register loads -- verify against callers.
;        * Differences are formed with wrapping 16-bit subtraction (psubw);
;          see the range comment in the loop.
;-----------------------------------------------------------------------------

INIT_XMM sse2
cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
    pxor        m5, m5                      ; constant zero for the unpacks
    pxor        m4, m4                      ; error (sse) accumulator
    pxor        m6, m6                      ; ssz accumulator

    ; Point both pointers at the end of their buffers and walk a negative
    ; index up towards zero, so one register is both offset and loop counter.
    lea         dqcq, [dqcq+sizeq*2]
    lea         uqcq, [uqcq+sizeq*2]
    neg         sizeq

.loop:
    ; Fetch two registers' worth of coefficients from each buffer.
    mova        m2, [uqcq+sizeq*2]
    mova        m3, [uqcq+sizeq*2+mmsize]
    mova        m0, [dqcq+sizeq*2]
    mova        m1, [dqcq+sizeq*2+mmsize]

    ; m0/m1 = dqcoeff - coeff
    psubw       m0, m2
    psubw       m1, m3

    ; Square every word and add neighbouring squares into dwords.
    ; A single error is at most 15 bits plus sign, so its square needs
    ; 30 bits and the sum of two squares fits in 31 bits, leaving the
    ; dword's sign bit unused.
    pmaddwd     m0, m0
    pmaddwd     m1, m1
    pmaddwd     m2, m2
    pmaddwd     m3, m3

    ; Widen each dword sum to a qword by interleaving with zero (m5) and
    ; add it to the matching 64-bit accumulator: m0/m1 feed m4, m2/m3 feed
    ; m6.  Unpacks and adds are interleaved as in the original sequence
    ; (presumably for instruction scheduling).
    punpckldq   m7, m0, m5
    punpckhdq   m0, m5
    paddq       m4, m7
    punpckldq   m7, m1, m5
    paddq       m4, m0
    punpckhdq   m1, m5
    paddq       m4, m7
    punpckldq   m7, m2, m5
    paddq       m4, m1
    punpckhdq   m2, m5
    paddq       m6, m7
    punpckldq   m7, m3, m5
    paddq       m6, m2
    punpckhdq   m3, m5
    paddq       m6, m7
    paddq       m6, m3

    add         sizeq, mmsize               ; advance by the coefficients just consumed
    jl          .loop                       ; index still negative: more input left

    ; Horizontal reduction: fold the upper qword of each accumulator onto
    ; its lower qword.  m5 and m7 are free to be reused as temporaries here.
    movhlps     m5, m4
    paddq       m4, m5                      ; low qword of m4 = total error
    movhlps     m7, m6
    paddq       m6, m7                      ; low qword of m6 = total ssz

%if ARCH_X86_64
    movq        [sszq], m6                  ; *ssz = sum of coeff^2
    movq        rax, m4                     ; return value
%else
    pshufd      m5, m4, 0x1                 ; dword 0 of m5 = high half of the error sum
    mov         eax, sszm                   ; ssz pointer comes from the stack
    movq        [eax], m6                   ; *ssz = sum of coeff^2
    movd        eax, m4                     ; return value, low 32 bits
    movd        edx, m5                     ; return value, high 32 bits
%endif
    RET