;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


; ABI helper macros.  The names sym, arg, SHADOW_ARGS_TO_STACK, UNSHADOW_ARGS,
; GET_GOT, RESTORE_GOT, GLOBAL and SECTION_RODATA are used below but are not
; defined in this file -- presumably they all come from this include (confirm).
%include "vpx_ports/x86_abi_support.asm"

; /****************************************************************************
; * Notes:
; *
; * This implementation makes use of 16 bit fixed point versions of two
; * multiply constants:
; *        1.   sqrt(2) * cos (pi/8)
; *        2.   sqrt(2) * sin (pi/8)
; * Because the first constant is bigger than 1, to maintain the same 16 bit
; * fixed point precision as the second one, we use a trick of
; *        x * a = x + x*(a-1)
; * so
; *        x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
; *
; * For the second constant, because its 16 bit version is 35468, which
; * is bigger than 32768, in a signed 16 bit multiply it becomes a negative
; * number.
; *        (x * (unsigned)35468 >> 16) = x * (signed)35468 >> 16 + x
; *
; **************************************************************************/


;-----------------------------------------------------------------------------
; void vp8_short_idct4x4llm_mmx(short *input, short *output, int pitch)
;
; 4x4 inverse transform on 16-bit coefficients, MMX.
;
;   input  : 16 words, read as four quadwords at input+0, +8, +16, +24
;   output : four quadwords; vector k is written to output + k*pitch
;   pitch  : byte distance between output vectors (it is added directly to
;            the output address); sign-extended from 32 bits
;
; Structure: two identical butterfly passes, each followed by a 4x4 word
; transpose.  The second pass additionally adds 4 to the even-part terms and
; shifts every result right arithmetically by 3.
;
; Shorthand used in the comments below, with v0..v3 the four vectors entering
; a pass (mm0..mm3):
;   S(x) = x + pmulhw(x, x_s1sqr2)        ; x * sqrt(2)*sin(pi/8), see Notes
;   C(x) = x + pmulhw(x, x_c1sqr2less1)   ; x * sqrt(2)*cos(pi/8), see Notes
;   "XY" = word Y of result vector X, listed from the high word to the low
;
; Registers written: rax, rdx, mm0-mm7.  rbx is passed to GET_GOT/RESTORE_GOT.
; NOTE(review): no emms is executed before ret -- presumably the caller is
; responsible for clearing MMX state; confirm.
;-----------------------------------------------------------------------------
global sym(vp8_short_idct4x4llm_mmx)
sym(vp8_short_idct4x4llm_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    GET_GOT     rbx
    ; end prolog

        mov         rax,            arg(0)          ; rax = input
        mov         rdx,            arg(1)          ; rdx = output

        movq        mm0,            [rax   ]        ; v0
        movq        mm1,            [rax+ 8]        ; v1

        movq        mm2,            [rax+16]        ; v2
        movq        mm3,            [rax+24]        ; v3

        ; input pointer is no longer needed; rax is reused for the pitch
        movsxd      rax,            dword ptr arg(2) ; rax = pitch (bytes)

        ; ---- pass 1: even part -------------------------------------------
        psubw       mm0,            mm2             ; b1 = v0 - v2
        paddw       mm2,            mm2             ; 2*v2

        movq        mm5,            mm1
        paddw       mm2,            mm0             ; a1 = v0 + v2  (2*v2 + b1)

        ; ---- pass 1: odd part --------------------------------------------
        pmulhw      mm5,            [GLOBAL(x_s1sqr2)]
        paddw       mm5,            mm1             ; mm5 = S(v1)

        movq        mm7,            mm3
        pmulhw      mm7,            [GLOBAL(x_c1sqr2less1)]

        paddw       mm7,            mm3             ; mm7 = C(v3)
        psubw       mm7,            mm5             ; mm7 = C(v3) - S(v1)

        movq        mm5,            mm1
        movq        mm4,            mm3             ; keep v3 for the add below

        pmulhw      mm5,            [GLOBAL(x_c1sqr2less1)]
        paddw       mm5,            mm1             ; mm5 = C(v1)

        pmulhw      mm3,            [GLOBAL(x_s1sqr2)]
        paddw       mm3,            mm4             ; mm3 = S(v3)

        paddw       mm3,            mm5             ; d1 = S(v3) + C(v1)
        movq        mm6,            mm2             ; a1

        ; ---- pass 1: combine ---------------------------------------------
        ; Result numbering follows how the transpose below consumes the
        ; registers: mm2 = 0, mm0 = 1, mm4 = 2, mm6 = 3.
        movq        mm4,            mm0             ; b1
        paddw       mm2,            mm3             ; result 0 = a1 + d1

        paddw       mm4,            mm7             ; result 2 = b1 + (C(v3) - S(v1))
        psubw       mm0,            mm7             ; result 1 = b1 - (C(v3) - S(v1))

        psubw       mm6,            mm3             ; result 3 = a1 - d1

        ; ---- transpose 4x4 words -----------------------------------------
        movq        mm1,            mm2             ; 03 02 01 00
        movq        mm3,            mm4             ; 23 22 21 20

        punpcklwd   mm1,            mm0             ; 11 01 10 00
        punpckhwd   mm2,            mm0             ; 13 03 12 02

        punpcklwd   mm3,            mm6             ; 31 21 30 20
        punpckhwd   mm4,            mm6             ; 33 23 32 22

        movq        mm0,            mm1             ; 11 01 10 00
        movq        mm5,            mm2             ; 13 03 12 02

        punpckldq   mm0,            mm3             ; 30 20 10 00
        punpckhdq   mm1,            mm3             ; 31 21 11 01

        punpckldq   mm2,            mm4             ; 32 22 12 02
        punpckhdq   mm5,            mm4             ; 33 23 13 03

        ; put the fourth vector back in mm3 so pass 2 sees v0..v3 in mm0..mm3
        movq        mm3,            mm5             ; 33 23 13 03

        ; ---- pass 2: even part -------------------------------------------
        psubw       mm0,            mm2             ; b1 = v0 - v2
        paddw       mm2,            mm2             ; 2*v2

        movq        mm5,            mm1
        paddw       mm2,            mm0             ; a1 = v0 + v2  (2*v2 + b1)

        ; ---- pass 2: odd part --------------------------------------------
        pmulhw      mm5,            [GLOBAL(x_s1sqr2)]
        paddw       mm5,            mm1             ; mm5 = S(v1)

        movq        mm7,            mm3
        pmulhw      mm7,            [GLOBAL(x_c1sqr2less1)]

        paddw       mm7,            mm3             ; mm7 = C(v3)
        psubw       mm7,            mm5             ; mm7 = C(v3) - S(v1)

        movq        mm5,            mm1
        movq        mm4,            mm3             ; keep v3 for the add below

        pmulhw      mm5,            [GLOBAL(x_c1sqr2less1)]
        paddw       mm5,            mm1             ; mm5 = C(v1)

        pmulhw      mm3,            [GLOBAL(x_s1sqr2)]
        paddw       mm3,            mm4             ; mm3 = S(v3)

        paddw       mm3,            mm5             ; d1 = S(v3) + C(v1)

        ; Rounding bias for the >>3 below.  Adding it to a1 and b1 once puts
        ; it into all four results, since each result is a1 +/- x or b1 +/- x.
        paddw       mm0,            [GLOBAL(fours)] ; b1 += 4

        paddw       mm2,            [GLOBAL(fours)] ; a1 += 4
        movq        mm6,            mm2             ; a1

        ; ---- pass 2: combine and scale -----------------------------------
        movq        mm4,            mm0             ; b1
        paddw       mm2,            mm3             ; result 0 = a1 + d1

        paddw       mm4,            mm7             ; result 2 = b1 + (C(v3) - S(v1))
        psubw       mm0,            mm7             ; result 1 = b1 - (C(v3) - S(v1))

        psubw       mm6,            mm3             ; result 3 = a1 - d1
        psraw       mm2,            3

        psraw       mm0,            3
        psraw       mm4,            3

        psraw       mm6,            3

        ; ---- transpose 4x4 words -----------------------------------------
        movq        mm1,            mm2             ; 03 02 01 00
        movq        mm3,            mm4             ; 23 22 21 20

        punpcklwd   mm1,            mm0             ; 11 01 10 00
        punpckhwd   mm2,            mm0             ; 13 03 12 02

        punpcklwd   mm3,            mm6             ; 31 21 30 20
        punpckhwd   mm4,            mm6             ; 33 23 32 22

        movq        mm0,            mm1             ; 11 01 10 00
        movq        mm5,            mm2             ; 13 03 12 02

        punpckldq   mm0,            mm3             ; 30 20 10 00
        punpckhdq   mm1,            mm3             ; 31 21 11 01

        punpckldq   mm2,            mm4             ; 32 22 12 02
        punpckhdq   mm5,            mm4             ; 33 23 13 03

        ; ---- store four vectors, pitch bytes apart -------------------------
        movq        [rdx],          mm0             ; output + 0*pitch

        movq        [rdx+rax],      mm1             ; output + 1*pitch
        movq        [rdx+rax*2],    mm2             ; output + 2*pitch

        ; there is no *3 scale in the addressing mode: advance the base by
        ; one pitch so that base + 2*pitch lands on output + 3*pitch
        add         rdx,            rax
        movq        [rdx+rax*2],    mm5             ; output + 3*pitch

    ; begin epilog
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------------
; void vp8_short_idct4x4llm_1_mmx(short *input, short *output, int pitch)
;
; Single-coefficient variant: computes (input[0] + 4) >> 3 (arithmetic shift)
; and writes that one 16-bit value to all four words of four output vectors.
;
;   input  : only the first dword is read; the word at input[1] is loaded too
;            but is discarded by the broadcast below
;   output : vector k (four words) is written to output + k*pitch
;   pitch  : byte distance between output vectors; sign-extended from 32 bits
;
; Registers written: rax, rdx, mm0.  rbx is passed to GET_GOT/RESTORE_GOT.
; NOTE(review): no emms is executed before ret -- presumably the caller is
; responsible for clearing MMX state; confirm.
;-----------------------------------------------------------------------------
global sym(vp8_short_idct4x4llm_1_mmx)
sym(vp8_short_idct4x4llm_1_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    GET_GOT     rbx
    ; end prolog

        mov         rax,            arg(0)          ; rax = input
        movd        mm0,            [rax]           ; low dword: input[1] input[0]

        paddw       mm0,            [GLOBAL(fours)] ; rounding bias for >>3
        mov         rdx,            arg(1)          ; rdx = output

        psraw       mm0,            3               ; word 0 = (input[0] + 4) >> 3
        movsxd      rax,            dword ptr arg(2) ; rax = pitch (bytes)

        ; broadcast word 0 to all four lanes
        punpcklwd   mm0,            mm0             ; w1 w1 w0 w0
        punpckldq   mm0,            mm0             ; w0 w0 w0 w0

        movq        [rdx],          mm0             ; output + 0*pitch
        movq        [rdx+rax],      mm0             ; output + 1*pitch

        movq        [rdx+rax*2],    mm0             ; output + 2*pitch
        add         rdx,            rax             ; base += pitch, so that ...

        movq        [rdx+rax*2],    mm0             ; ... this is output + 3*pitch


    ; begin epilog
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
; void vp8_dc_only_idct_add_mmx(short input_dc, unsigned char *pred_ptr,
;                               unsigned char *dst_ptr, int pitch, int stride)
;
; Adds the value (input_dc + 4) >> 3 (arithmetic shift) to a 4x4 block of
; unsigned bytes and stores the result clamped to 0..255.
;
;   input_dc : 16-bit value; read from its argument slot as a dword, of which
;              only the low word survives the broadcast below
;   pred_ptr : source block; row k is the 4 bytes at pred_ptr + k*pitch
;   dst_ptr  : destination block; row k is written at dst_ptr + k*stride
;   pitch    : byte distance between source rows (sign-extended from 32 bits)
;   stride   : byte distance between destination rows (sign-extended likewise)
;
; Registers written: rax, rdx, mm0-mm5.  rsi and rdi are saved and restored
; around the body; rbx is passed to GET_GOT/RESTORE_GOT.
; NOTE(review): no emms is executed before ret -- presumably the caller is
; responsible for clearing MMX state; confirm.
;-----------------------------------------------------------------------------
global sym(vp8_dc_only_idct_add_mmx)
sym(vp8_dc_only_idct_add_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        mov         rsi,            arg(1)          ; rsi = pred_ptr
        mov         rdi,            arg(2)          ; rdi = dst_ptr
        movsxd      rax,            dword ptr arg(4) ; rax = stride (dst)
        movsxd      rdx,            dword ptr arg(3) ; rdx = pitch  (pred)
        pxor        mm0,            mm0             ; zero, used to widen and pack

        movd        mm5,            arg(0)          ; word 0 = input_dc

        paddw       mm5,            [GLOBAL(fours)] ; rounding bias for >>3

        psraw       mm5,            3               ; word 0 = (input_dc + 4) >> 3

        ; broadcast word 0 to all four lanes
        punpcklwd   mm5,            mm5             ; w1 w1 w0 w0
        punpckldq   mm5,            mm5             ; w0 w0 w0 w0

        ; Each row: widen 4 bytes to words, signed-saturating add of the DC
        ; term, then pack back with unsigned saturation (clamps to 0..255).

        ; row 0
        movd        mm1,            [rsi]
        punpcklbw   mm1,            mm0             ; bytes -> zero-extended words
        paddsw      mm1,            mm5
        packuswb    mm1,            mm0             ; words -> saturated bytes
        movd        [rdi],          mm1

        ; row 1
        movd        mm2,            [rsi+rdx]
        punpcklbw   mm2,            mm0
        paddsw      mm2,            mm5
        packuswb    mm2,            mm0
        movd        [rdi+rax],      mm2

        ; row 2
        movd        mm3,            [rsi+2*rdx]
        punpcklbw   mm3,            mm0
        paddsw      mm3,            mm5
        packuswb    mm3,            mm0
        movd        [rdi+2*rax],    mm3

        ; row 3: advance both bases by one row so base + 2*step addresses row 3
        add         rdi,            rax
        add         rsi,            rdx
        movd        mm4,            [rsi+2*rdx]
        punpcklbw   mm4,            mm0
        paddsw      mm4,            mm5
        packuswb    mm4,            mm0
        movd        [rdi+2*rax],    mm4

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

; Read-only constants, each replicated across the four word lanes of a
; quadword so they can be used directly as packed-word memory operands.
SECTION_RODATA
align 16
; 0x8A8C = 35468, the 16-bit fixed point sqrt(2)*sin(pi/8) from the Notes.
; As a signed word it is 35468 - 65536, so users add x back after pmulhw.
x_s1sqr2:
    times 4 dw 0x8A8C
align 16
; 0x4E7B = 20091, the 16-bit fixed point sqrt(2)*cos(pi/8) - 1 from the Notes.
; Users add x back after pmulhw to get the full product.
x_c1sqr2less1:
    times 4 dw 0x4E7B
align 16
; Rounding bias added before the arithmetic shift right by 3.
fours:
    times 4 dw 0x0004
