;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

;-----------------------------------------------------------------------------
; void aom_subtract_block(int rows, int cols,
;                         int16_t *diff, ptrdiff_t diff_stride,
;                         const uint8_t *src, ptrdiff_t src_stride,
;                         const uint8_t *pred, ptrdiff_t pred_stride)
;
; Computes diff[y][x] = (int16_t)src[y][x] - (int16_t)pred[y][x] for a
; rows x cols block. Dispatches on cols (4/8/16/32/64; anything else falls
; through to the 128-wide loop). All strides are in elements of their
; respective arrays; src/pred are expected to be mmsize-aligned for the
; `mova` loads (TODO confirm against callers).
;-----------------------------------------------------------------------------

INIT_XMM sse2
cglobal subtract_block, 7, 7, 8, \
                        rows, cols, diff, diff_stride, src, src_stride, \
                        pred, pred_stride
; cols is fully consumed by the dispatch compares below, so its register is
; reused afterwards to hold pred_stride (the 8th arg, which lives on the
; stack since only 7 registers were requested — hence the "mp" loads).
%define pred_str colsq
  pxor                  m7, m7         ; dedicated zero register
  cmp                 colsd, 4
  je .case_4
  cmp                 colsd, 8
  je .case_8
  cmp                 colsd, 16
  je .case_16
  cmp                 colsd, 32
  je .case_32
  cmp                 colsd, 64
  je .case_64

; loop16 src_off1, src_off2, pred_off1, pred_off2, diff_off1, diff_off2
; Subtracts two mmsize-wide chunks of pixels: unpacks u8 against the zero
; register m7 to widen to 16 bits, then psubw. diff offsets are in bytes
; (int16 output is twice as wide as the u8 input). Clobbers m0-m5.
%macro loop16 6
  mova                  m0, [srcq+%1]
  mova                  m4, [srcq+%2]
  mova                  m1, [predq+%3]
  mova                  m5, [predq+%4]
  punpckhbw             m2, m0, m7     ; high half of src chunk 1 -> words
  punpckhbw             m3, m1, m7     ; high half of pred chunk 1 -> words
  punpcklbw             m0, m7
  punpcklbw             m1, m7
  psubw                 m2, m3
  psubw                 m0, m1
  punpckhbw             m1, m4, m7
  punpckhbw             m3, m5, m7
  punpcklbw             m4, m7
  punpcklbw             m5, m7
  psubw                 m1, m3
  psubw                 m4, m5
  mova [diffq+mmsize*0+%5], m0
  mova [diffq+mmsize*1+%5], m2
  mova [diffq+mmsize*0+%6], m4
  mova [diffq+mmsize*1+%6], m1
%endmacro

  ; Fall-through case: 128 columns, one row per iteration.
  mov             pred_str, pred_stridemp
.loop_128:
  loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize
  loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize
  loop16 4*mmsize, 5*mmsize, 4*mmsize, 5*mmsize, 8*mmsize, 10*mmsize
  loop16 6*mmsize, 7*mmsize, 6*mmsize, 7*mmsize, 12*mmsize, 14*mmsize
  lea                diffq, [diffq+diff_strideq*2]   ; stride is in int16s
  add                predq, pred_str
  add                 srcq, src_strideq
  sub                rowsd, 1
  jnz .loop_128
  RET

.case_64:                              ; 64 columns, one row per iteration
  mov             pred_str, pred_stridemp
.loop_64:
  loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize
  loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize
  lea                diffq, [diffq+diff_strideq*2]
  add                predq, pred_str
  add                 srcq, src_strideq
  dec                rowsd
  jg .loop_64
  RET

.case_32:                              ; 32 columns, one row per iteration
  mov             pred_str, pred_stridemp
.loop_32:
  loop16 0, mmsize, 0, mmsize, 0, 2*mmsize
  lea                diffq, [diffq+diff_strideq*2]
  add                predq, pred_str
  add                 srcq, src_strideq
  dec                rowsd
  jg .loop_32
  RET

.case_16:                              ; 16 columns, two rows per iteration
  mov             pred_str, pred_stridemp
.loop_16:
  ; Second chunk of each loop16 call comes from the NEXT row (offsets are
  ; the strides themselves), so one call handles a 16x2 tile.
  loop16 0, src_strideq, 0, pred_str, 0, diff_strideq*2
  lea                diffq, [diffq+diff_strideq*4]
  lea                predq, [predq+pred_str*2]
  lea                 srcq, [srcq+src_strideq*2]
  sub                rowsd, 2
  jg .loop_16
  RET

; loop_h: subtracts a (mmsize/2)-pixel-wide, 2-row tile. Widths follow the
; active register size: 8 pixels under XMM, 4 under MMX. Clobbers m0-m3.
%macro loop_h 0
  movh                  m0, [srcq]
  movh                  m2, [srcq+src_strideq]
  movh                  m1, [predq]
  movh                  m3, [predq+pred_str]
  punpcklbw             m0, m7
  punpcklbw             m1, m7
  punpcklbw             m2, m7
  punpcklbw             m3, m7
  psubw                 m0, m1
  psubw                 m2, m3
  mova               [diffq], m0
  mova [diffq+diff_strideq*2], m2
%endmacro

.case_8:                               ; 8 columns, two rows per iteration
  mov             pred_str, pred_stridemp
.loop_8:
  loop_h
  lea                diffq, [diffq+diff_strideq*4]
  lea                 srcq, [srcq+src_strideq*2]
  lea                predq, [predq+pred_str*2]
  sub                rowsd, 2
  jg .loop_8
  RET

INIT_MMX                               ; drop to 64-bit regs so loop_h moves 4 pixels
.case_4:                               ; 4 columns, two rows per iteration
  mov             pred_str, pred_stridemp
.loop_4:
  loop_h
  lea                diffq, [diffq+diff_strideq*4]
  lea                 srcq, [srcq+src_strideq*2]
  lea                predq, [predq+pred_str*2]
  sub                rowsd, 2
  jg .loop_4
  RET