• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
10
11
12%include "vpx_ports/x86_abi_support.asm"
13
;void vp8_short_inv_walsh4x4_1_mmx(short *input, short *output)
;
; Computes (input[0] + 3) >> 3 (arithmetic shift) and stores that one
; 16-bit value into all sixteen 16-bit slots of output (32 bytes).
;
; In:    arg(0) = input  (only input[0] is read)
;        arg(1) = output (32 bytes are written)
; Clobb: rax, mm0, flags; rsi and rdi are saved and restored here.
; Notes: The sum is kept to 16 bits, so input[0] + 3 wraps on overflow.
;        No emms is executed in this routine; presumably the caller is
;        responsible for clearing MMX state -- confirm against callers.
global sym(vp8_short_inv_walsh4x4_1_mmx)
sym(vp8_short_inv_walsh4x4_1_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 2
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)             ; rsi = input
    mov         rdi, arg(1)             ; rdi = output

    ; Load exactly one 16-bit element. A register-width load would also
    ; pull in input[1..] only to discard it when the word is broadcast.
    movsx       eax, word [rsi]         ; eax = input[0]
    add         eax, 3                  ; input[0] + 3

    movd        mm0, eax                ; low word of mm0 = input[0] + 3
    punpcklwd   mm0, mm0                ; x   x   val val
    punpckldq   mm0, mm0                ; val val val val
    psraw       mm0, 3                  ; (input[0] + 3) >> 3 in every lane

    movq        [rdi + 0],  mm0         ; output[0..3]
    movq        [rdi + 8],  mm0         ; output[4..7]
    movq        [rdi + 16], mm0         ; output[8..11]
    movq        [rdi + 24], mm0         ; output[12..15]

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
49
;void vp8_short_inv_walsh4x4_mmx(short *input, short *output)
;
; Reads sixteen 16-bit values from input (four quadwords), applies the
; same four-point butterfly twice with a 4x4 word transpose after each
; application, then stores (x + 3) >> 3 (arithmetic shift) of every
; value as sixteen 16-bit results to output.
;
; One butterfly pass, applied lane-wise to quadwords q0..q3:
;   a1 = q0 + q3        b1 = q1 + q2
;   d1 = q0 - q3        c1 = q1 - q2
;   r0 = a1 + b1        r1 = d1 + c1
;   r2 = a1 - b1        r3 = d1 - c1
; The transpose that follows turns r0..r3 into their columns, so the
; second pass runs the butterfly along the other dimension and the
; second transpose restores the original orientation.
;
; In:    arg(0) = input  (32 bytes are read)
;        arg(1) = output (32 bytes are written)
; Clobb: rax, mm0-mm7, flags; rsi and rdi are saved and restored here.
; Notes: All sums use paddw/psubw, i.e. they wrap at 16 bits.
;        No emms is executed in this routine; presumably the caller is
;        responsible for clearing MMX state -- confirm against callers.
global sym(vp8_short_inv_walsh4x4_mmx)
sym(vp8_short_inv_walsh4x4_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 2
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)             ; rsi = input
    mov         rdi, arg(1)             ; rdi = output

    ; mm7 = 0003000300030003h, the rounding term added before the >> 3.
    ; Only 32 bits of constant are needed, so it goes through eax/movd,
    ; which assembles the same way for 32-bit and 64-bit targets (a
    ; movq from a general register requires a 64-bit register).
    mov         eax, 00030003h
    movd        mm7, eax                ; 0000000000030003h
    punpcklwd   mm7, mm7                ; 0003000300030003h

    movq        mm0, [rsi + 0]          ; q0 = ip[0..3]
    movq        mm1, [rsi + 8]          ; q1 = ip[4..7]
    movq        mm2, [rsi + 16]         ; q2 = ip[8..11]
    movq        mm3, [rsi + 24]         ; q3 = ip[12..15]

    ; ---- first butterfly pass ----
    movq        mm4, mm0
    movq        mm5, mm1

    paddw       mm4, mm3                ; a1 = q0 + q3
    paddw       mm5, mm2                ; b1 = q1 + q2

    movq        mm6, mm4                ; keep a copy of a1

    paddw       mm4, mm5                ; r0 = a1 + b1
    psubw       mm6, mm5                ; r2 = a1 - b1

    psubw       mm0, mm3                ; d1 = q0 - q3
    psubw       mm1, mm2                ; c1 = q1 - q2

    movq        mm5, mm0                ; keep a copy of d1

    paddw       mm0, mm1                ; r1 = d1 + c1
    psubw       mm5, mm1                ; r3 = d1 - c1

    ; ---- first 4x4 word transpose ----
    ; Rows on entry (word index written high .. low):
    ;   mm4 = r0: 03 02 01 00
    ;   mm0 = r1: 13 12 11 10
    ;   mm6 = r2: 23 22 21 20
    ;   mm5 = r3: 33 32 31 30

    movq        mm3, mm4                ; 03 02 01 00
    punpcklwd   mm4, mm0                ; 11 01 10 00
    punpckhwd   mm3, mm0                ; 13 03 12 02

    movq        mm1, mm6                ; 23 22 21 20
    punpcklwd   mm6, mm5                ; 31 21 30 20
    punpckhwd   mm1, mm5                ; 33 23 32 22

    movq        mm0, mm4                ; 11 01 10 00
    movq        mm2, mm3                ; 13 03 12 02

    punpckldq   mm0, mm6                ; 30 20 10 00 -> q0 of pass two
    punpckhdq   mm4, mm6                ; 31 21 11 01 -> q1 of pass two

    punpckldq   mm2, mm1                ; 32 22 12 02 -> q2 of pass two
    punpckhdq   mm3, mm1                ; 33 23 13 03 -> q3 of pass two

    ; ---- second butterfly pass (q0 = mm0, q1 = mm4, q2 = mm2, q3 = mm3) ----
    movq        mm1, mm0
    movq        mm5, mm4

    paddw       mm1, mm3                ; a1 = q0 + q3
    paddw       mm5, mm2                ; b1 = q1 + q2

    movq        mm6, mm1                ; keep a copy of a1

    paddw       mm1, mm5                ; r0 = a1 + b1
    psubw       mm6, mm5                ; r2 = a1 - b1

    psubw       mm0, mm3                ; d1 = q0 - q3
    psubw       mm4, mm2                ; c1 = q1 - q2

    movq        mm5, mm0                ; keep a copy of d1

    paddw       mm0, mm4                ; r1 = d1 + c1
    psubw       mm5, mm4                ; r3 = d1 - c1

    ; ---- second 4x4 word transpose ----
    ; Rows on entry: mm1 = r0, mm0 = r1, mm6 = r2, mm5 = r3.

    movq        mm3, mm1                ; 03 02 01 00
    punpcklwd   mm1, mm0                ; 11 01 10 00
    punpckhwd   mm3, mm0                ; 13 03 12 02

    movq        mm4, mm6                ; 23 22 21 20
    punpcklwd   mm6, mm5                ; 31 21 30 20
    punpckhwd   mm4, mm5                ; 33 23 32 22

    movq        mm0, mm1                ; 11 01 10 00
    movq        mm2, mm3                ; 13 03 12 02

    punpckldq   mm0, mm6                ; 30 20 10 00 -> output[0..3]
    punpckhdq   mm1, mm6                ; 31 21 11 01 -> output[4..7]

    punpckldq   mm2, mm4                ; 32 22 12 02 -> output[8..11]
    punpckhdq   mm3, mm4                ; 33 23 13 03 -> output[12..15]

    ; ---- round and scale: (x + 3) >> 3 ----
    paddw       mm0, mm7
    paddw       mm1, mm7
    paddw       mm2, mm7
    paddw       mm3, mm7

    psraw       mm0, 3
    psraw       mm1, 3
    psraw       mm2, 3
    psraw       mm3, 3

    movq        [rdi + 0],  mm0
    movq        [rdi + 8],  mm1
    movq        [rdi + 16], mm2
    movq        [rdi + 24], mm3

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
173
174