;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved.
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

; void aom_subtract_block(int rows, int cols,
;                         int16_t *diff, ptrdiff_t diff_stride,
;                         const uint8_t *src, ptrdiff_t src_stride,
;                         const uint8_t *pred, ptrdiff_t pred_stride)
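;
; For reference, a minimal C sketch of the behaviour implemented below (this
; assumes the usual aom_subtract_block semantics of diff = src - pred and is
; not part of the build):
;
;   for (int r = 0; r < rows; ++r) {
;     for (int c = 0; c < cols; ++c) {
;       diff[r * diff_stride + c] = (int16_t)src[r * src_stride + c] -
;                                   (int16_t)pred[r * pred_stride + c];
;     }
;   }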

INIT_XMM sse2
cglobal subtract_block, 7, 7, 8, \
                        rows, cols, diff, diff_stride, src, src_stride, \
                        pred, pred_stride
%define pred_str colsq
  pxor                  m7, m7         ; dedicated zero register
  cmp                colsd, 4
  je .case_4
  cmp                colsd, 8
  je .case_8
  cmp                colsd, 16
  je .case_16
  cmp                colsd, 32
  je .case_32
  cmp                colsd, 64
  je .case_64

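; loop16: subtract two 16-byte chunks of one (or two) rows. Loads src bytes at
; offsets %1/%2 and pred bytes at offsets %3/%4, widens them to words against
; the zero register m7, and stores the 32 resulting int16 differences at diff
; byte offsets %5 and %6.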
%macro loop16 6
  mova                  m0, [srcq+%1]
  mova                  m4, [srcq+%2]
  mova                  m1, [predq+%3]
  mova                  m5, [predq+%4]
  punpckhbw             m2, m0, m7
  punpckhbw             m3, m1, m7
  punpcklbw             m0, m7
  punpcklbw             m1, m7
  psubw                 m2, m3
  psubw                 m0, m1
  punpckhbw             m1, m4, m7
  punpckhbw             m3, m5, m7
  punpcklbw             m4, m7
  punpcklbw             m5, m7
  psubw                 m1, m3
  psubw                 m4, m5
  mova [diffq+mmsize*0+%5], m0
  mova [diffq+mmsize*1+%5], m2
  mova [diffq+mmsize*0+%6], m4
  mova [diffq+mmsize*1+%6], m1
%endmacro

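; no case matched: cols == 128, four loop16 calls cover one full row.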
  mov             pred_str, pred_stridemp
.loop_128:
  loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize,  0*mmsize,  2*mmsize
  loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize,  4*mmsize,  6*mmsize
  loop16 4*mmsize, 5*mmsize, 4*mmsize, 5*mmsize,  8*mmsize, 10*mmsize
  loop16 6*mmsize, 7*mmsize, 6*mmsize, 7*mmsize, 12*mmsize, 14*mmsize
  lea                diffq, [diffq+diff_strideq*2]
  add                predq, pred_str
  add                 srcq, src_strideq
  sub                rowsd, 1
  jnz .loop_128
  RET

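; cols == 64: two loop16 calls per row.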
.case_64:
  mov             pred_str, pred_stridemp
.loop_64:
  loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize
  loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize
  lea                diffq, [diffq+diff_strideq*2]
  add                predq, pred_str
  add                 srcq, src_strideq
  dec                rowsd
  jg .loop_64
  RET

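; cols == 32: one loop16 call per row.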
.case_32:
  mov             pred_str, pred_stridemp
.loop_32:
  loop16 0, mmsize, 0, mmsize, 0, 2*mmsize
  lea                diffq, [diffq+diff_strideq*2]
  add                predq, pred_str
  add                 srcq, src_strideq
  dec                rowsd
  jg .loop_32
  RET

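; cols == 16: a single loop16 call covers two rows at once; the second set of
; offsets points one stride further into src, pred and diff.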
.case_16:
  mov             pred_str, pred_stridemp
.loop_16:
  loop16 0, src_strideq, 0, pred_str, 0, diff_strideq*2
  lea                diffq, [diffq+diff_strideq*4]
  lea                predq, [predq+pred_str*2]
  lea                 srcq, [srcq+src_strideq*2]
  sub                rowsd, 2
  jg .loop_16
  RET

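; loop_h: narrow-block helper. Loads one movh-sized chunk from two consecutive
; rows of src and pred, widens to words against m7, subtracts, and stores the
; two rows of int16 differences.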
%macro loop_h 0
  movh                  m0, [srcq]
  movh                  m2, [srcq+src_strideq]
  movh                  m1, [predq]
  movh                  m3, [predq+pred_str]
  punpcklbw             m0, m7
  punpcklbw             m1, m7
  punpcklbw             m2, m7
  punpcklbw             m3, m7
  psubw                 m0, m1
  psubw                 m2, m3
  mova             [diffq], m0
  mova [diffq+diff_strideq*2], m2
%endmacro

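; cols == 8: movh is an 8-byte load here (XMM), two rows per iteration.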
.case_8:
  mov             pred_str, pred_stridemp
.loop_8:
  loop_h
  lea                diffq, [diffq+diff_strideq*4]
  lea                 srcq, [srcq+src_strideq*2]
  lea                predq, [predq+pred_str*2]
  sub                rowsd, 2
  jg .loop_8
  RET

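; cols == 4: switch to 8-byte MMX registers, so movh becomes a 4-byte load and
; mova an 8-byte store. Note the pxor above cleared xmm7 rather than mm7, but
; whatever m7 interleaves into the src and pred words cancels in the psubw, so
; the stored result is still src - pred.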
INIT_MMX
.case_4:
  mov             pred_str, pred_stridemp
.loop_4:
  loop_h
  lea                diffq, [diffq+diff_strideq*4]
  lea                 srcq, [srcq+src_strideq*2]
  lea                predq, [predq+pred_str*2]
  sub                rowsd, 2
  jg .loop_4
  RET
