;*!
;* \copy
;*     Copyright (c)  2004-2013, Cisco Systems
;*     All rights reserved.
;*
;*     Redistribution and use in source and binary forms, with or without
;*     modification, are permitted provided that the following conditions
;*     are met:
;*
;*        * Redistributions of source code must retain the above copyright
;*          notice, this list of conditions and the following disclaimer.
;*
;*        * Redistributions in binary form must reproduce the above copyright
;*          notice, this list of conditions and the following disclaimer in
;*          the documentation and/or other materials provided with the
;*          distribution.
;*
;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;*     POSSIBILITY OF SUCH DAMAGE.
;*
;*
;*  mc_chroma.asm
;*
;*  Abstract
;*      MMX/SSE2/SSSE3 motion compensation for chroma
;*
;*  History
;*      10/13/2004 Created
;*
;*
;*************************************************************************/
%include "asm_inc.asm"

;***********************************************************************
; Local Data (Read Only)
;***********************************************************************

SECTION .rodata align=16

;***********************************************************************
; Rounding constants for the chroma interpolation routines in this file.
; Every word holds 32 (0x20); the routines add it to the weighted sum
; immediately before the logical right shift by 6.
;***********************************************************************

ALIGN 16
h264_d0x20_sse2:                    ; eight words of 32 (one XMM register)
    dw 32,32,32,32,32,32,32,32
ALIGN 16
h264_d0x20_mmx:                     ; four words of 32 (one MMX register)
    dw 32,32,32,32


;=============================================================================
; Code
;=============================================================================

SECTION .text

;*******************************************************************************
; void McChromaWidthEq4_mmx( const uint8_t *src,
;                           int32_t iSrcStride,
;                           uint8_t *pDst,
;                           int32_t iDstStride,
;                           const uint8_t *pABCD,
;                           int32_t iHeight );
;
; Weighted 2x2 chroma interpolation for a block 4 pixels wide (MMX).
; For each output row and x = 0..3:
;   pDst[x] = ( A*cur[x] + B*cur[x+1] + C*next[x] + D*next[x+1] + 32 ) >> 6
; where cur/next are two consecutive source rows and A,B,C,D = pABCD[0..3].
;
; Register roles as used below (r0..r5 are provided by LOAD_6_PARA from
; asm_inc.asm; presumably they hold the six arguments in order -- the usage
; in this routine is consistent with that, confirm in asm_inc.asm):
;   r0 = src            r1 = iSrcStride
;   r2 = pDst           r3 = iDstStride
;   r4 = pABCD, then re-used as pointer to the "next" source row
;   r5 = rows remaining
;   mm3/mm5/mm4/mm6 = A/B/C/D, each replicated into 4 words
;   mm7 = zero for byte->word unpacking (briefly used as scratch in the loop)
;
; Notes:
;   * 5 bytes are touched in each of iHeight+1 source rows.
;   * The loop is dec/jnz with the test at the bottom, so iHeight must be >= 1.
;   * Sums are formed with 16-bit pmullw/paddw and shifted with psrlw
;     (logical); NOTE(review): this presumes the weights keep the sum within
;     16 bits (e.g. A+B+C+D == 64) -- confirm against the caller.
;*******************************************************************************
WELS_EXTERN McChromaWidthEq4_mmx
    %assign  push_num 0
    LOAD_6_PARA
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r5, r5d

    ; ---- replicate the four weight bytes into word vectors -----------------
    movd      mm3, [r4]             ; low dword = bytes A B C D
    WELS_Zero mm7                   ; mm7 serves as the zero source for unpacking
    punpcklbw mm3, mm3              ; A A B B C C D D
    movq      mm4, mm3
    punpcklwd mm3, mm3              ; A A A A B B B B
    punpckhwd mm4, mm4              ; C C C C D D D D

    movq      mm5, mm3
    punpcklbw mm3, mm7              ; mm3 = A in 4 words
    punpckhbw mm5, mm7              ; mm5 = B in 4 words

    movq      mm6, mm4
    punpcklbw mm4, mm7              ; mm4 = C in 4 words
    punpckhbw mm6, mm7              ; mm6 = D in 4 words

    ; ---- prime the pipeline with the first source row ----------------------
    lea       r4, [r0 + r1]         ; weights are in registers now; r4 -> next row
    movd      mm0, [r0]
    movd      mm1, [r0+1]
    punpcklbw mm0, mm7              ; mm0 = cur[x]   as words
    punpcklbw mm1, mm7              ; mm1 = cur[x+1] as words

.row_loop:
    ; top row: A*cur[x] + B*cur[x+1]
    pmullw    mm0, mm3
    pmullw    mm1, mm5
    paddw     mm0, mm1

    ; bottom-left: + C*next[x]; unscaled copy kept in mm2 for the next row
    movd      mm1, [r4]
    punpcklbw mm1, mm7
    movq      mm2, mm1
    pmullw    mm1, mm4
    paddw     mm0, mm1

    ; bottom-right: + D*next[x+1]; mm7 temporarily holds the unscaled copy
    movd      mm1, [r4+1]
    punpcklbw mm1, mm7
    movq      mm7, mm1
    pmullw    mm1, mm6
    paddw     mm0, mm1
    movq      mm1, mm7              ; mm1 = next[x+1] -> cur[x+1] of next pass

    ; rounding: add 32 to every word
%ifdef X86_32_PICASM
    ; build the constant in a register instead of referencing memory
    pcmpeqw   mm7, mm7              ; every word = 0xFFFF
    psrlw     mm7, 15               ; every word = 1
    psllw     mm7, 5                ; every word = 32
    paddw     mm0, mm7
%else
    paddw     mm0, [h264_d0x20_mmx]
%endif
    psrlw     mm0, 6

    WELS_Zero mm7                   ; mm7 was used as scratch above; clear it again
    packuswb  mm0, mm7              ; words -> bytes with unsigned saturation
    movd      [r2], mm0             ; store 4 output pixels

    movq      mm0, mm2              ; mm0 = next[x] -> cur[x] of next pass

    lea       r2, [r2 + r3]         ; advance destination by one row
    lea       r4, [r4 + r1]         ; advance "next row" pointer by one row

    dec       r5
    jnz near  .row_loop
    WELSEMMS                        ; macro from asm_inc.asm (presumably emits emms)
    LOAD_6_PARA_POP
    ret


;*******************************************************************************
; void McChromaWidthEq8_sse2( const uint8_t *pSrc,
;                       int32_t iSrcStride,
;                       uint8_t *pDst,
;                       int32_t iDstStride,
;                       const uint8_t *pABCD,
;                       int32_t iHeight );
;
; Weighted 2x2 chroma interpolation for a block 8 pixels wide (SSE2).
; For each output row and x = 0..7:
;   pDst[x] = ( A*cur[x] + B*cur[x+1] + C*next[x] + D*next[x+1] + 32 ) >> 6
; where cur/next are two consecutive source rows and A,B,C,D = pABCD[0..3].
;
; Register roles as used below (r0..r5 are provided by LOAD_6_PARA from
; asm_inc.asm; presumably they hold the six arguments in order -- the usage
; in this routine is consistent with that, confirm in asm_inc.asm):
;   r0 = pSrc           r1 = iSrcStride
;   r2 = pDst           r3 = iDstStride
;   r4 = pABCD, then re-used as pointer to the "next" source row
;   r5 = rows remaining
;   xmm3/xmm5/xmm4/xmm6 = A/B/C/D, each replicated into 8 words
;   xmm7 = zero for byte->word unpacking (briefly used as scratch in the loop)
;
; Notes:
;   * 9 bytes are touched in each of iHeight+1 source rows.
;   * The loop is dec/jnz with the test at the bottom, so iHeight must be >= 1.
;   * PUSH_XMM 8 / POP_XMM are macros from asm_inc.asm; presumably they
;     save/restore the XMM registers the target ABI treats as callee-saved
;     -- confirm there.
;   * NOTE(review): 16-bit pmullw/paddw presume the weights keep the sum
;     within 16 bits (e.g. A+B+C+D == 64) -- confirm against the caller.
;*******************************************************************************
WELS_EXTERN McChromaWidthEq8_sse2
    %assign  push_num 0
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r5, r5d

    ; ---- replicate the four weight bytes into word vectors -----------------
    movd       xmm3, [r4]           ; low dword = bytes A B C D
    WELS_Zero  xmm7                 ; xmm7 serves as the zero source for unpacking
    punpcklbw  xmm3, xmm3           ; A A B B C C D D (low 8 bytes)
    punpcklwd  xmm3, xmm3           ; A x4, B x4, C x4, D x4

    movdqa     xmm4, xmm3
    punpckldq  xmm3, xmm3           ; A x8, B x8
    punpckhdq  xmm4, xmm4           ; C x8, D x8
    movdqa     xmm5, xmm3
    movdqa     xmm6, xmm4

    punpcklbw  xmm3, xmm7           ; xmm3 = A in 8 words
    punpckhbw  xmm5, xmm7           ; xmm5 = B in 8 words
    punpcklbw  xmm4, xmm7           ; xmm4 = C in 8 words
    punpckhbw  xmm6, xmm7           ; xmm6 = D in 8 words

    ; ---- prime the pipeline with the first source row ----------------------
    lea        r4, [r0 + r1]        ; weights are in registers now; r4 -> next row
    movq       xmm0, [r0]
    movq       xmm1, [r0+1]
    punpcklbw  xmm0, xmm7           ; xmm0 = cur[x]   as words
    punpcklbw  xmm1, xmm7           ; xmm1 = cur[x+1] as words

.row_loop:
    ; top row: A*cur[x] + B*cur[x+1]
    pmullw     xmm0, xmm3
    pmullw     xmm1, xmm5
    paddw      xmm0, xmm1

    ; bottom-left: + C*next[x]; unscaled copy kept in xmm2 for the next row
    movq       xmm1, [r4]
    punpcklbw  xmm1, xmm7
    movdqa     xmm2, xmm1
    pmullw     xmm1, xmm4
    paddw      xmm0, xmm1

    ; bottom-right: + D*next[x+1]; xmm7 temporarily holds the unscaled copy
    movq       xmm1, [r4+1]
    punpcklbw  xmm1, xmm7
    movdqa     xmm7, xmm1
    pmullw     xmm1, xmm6
    paddw      xmm0, xmm1
    movdqa     xmm1, xmm7           ; xmm1 = next[x+1] -> cur[x+1] of next pass

    ; rounding: add 32 to every word
%ifdef X86_32_PICASM
    ; build the constant in a register instead of referencing memory
    pcmpeqw    xmm7, xmm7           ; every word = 0xFFFF
    psrlw      xmm7, 15             ; every word = 1
    psllw      xmm7, 5              ; every word = 32
    paddw      xmm0, xmm7
%else
    paddw      xmm0, [h264_d0x20_sse2]
%endif
    psrlw      xmm0, 6

    WELS_Zero  xmm7                 ; xmm7 was used as scratch above; clear it again
    packuswb   xmm0, xmm7           ; words -> bytes with unsigned saturation
    movq       [r2], xmm0           ; store 8 output pixels

    movdqa     xmm0, xmm2           ; xmm0 = next[x] -> cur[x] of next pass

    lea        r2, [r2 + r3]        ; advance destination by one row
    lea        r4, [r4 + r1]        ; advance "next row" pointer by one row

    dec        r5
    jnz near   .row_loop

    POP_XMM
    LOAD_6_PARA_POP

    ret




;***********************************************************************
; void McChromaWidthEq8_ssse3( const uint8_t *pSrc,
;                        int32_t iSrcStride,
;                        uint8_t *pDst,
;                        int32_t iDstStride,
;                        const uint8_t *pABCD,
;                        int32_t iHeight);
;
; Weighted 2x2 chroma interpolation for a block 8 pixels wide (SSSE3).
; For each output row and x = 0..7:
;   pDst[x] = ( A*cur[x] + B*cur[x+1] + C*next[x] + D*next[x+1] + 32 ) >> 6
; where cur/next are two consecutive source rows and A,B,C,D = pABCD[0..3].
; Horizontal neighbours are interleaved byte-wise so that one pmaddubsw
; yields A*s[x] + B*s[x+1] (or C/D for the lower row) per word lane.
;
; Register roles as used below (r0..r5 are provided by LOAD_6_PARA from
; asm_inc.asm; presumably they hold the six arguments in order -- the usage
; in this routine is consistent with that, confirm in asm_inc.asm):
;   r0 = pSrc (advanced two rows per iteration)   r1 = iSrcStride
;   r2 = pDst (advanced two rows per iteration)   r3 = iDstStride
;   r4 = pABCD                                    r5 = rows remaining
;   xmm5 = byte pair (A,B) x 8, xmm6 = byte pair (C,D) x 8
;   xmm7 = rounding constant, 32 in every word
;   xmm0 = interleaved "cur" row carried across iterations
;
; Notes:
;   * Two output rows are produced per iteration and the counter is tested
;     with sub 2 / jnz, so iHeight must be a positive even number.
;   * Each source row is fetched with a 16-byte unaligned load although only
;     bytes 0..8 contribute to the result; iHeight+1 rows are read.
;   * pmaddubsw treats the pixel operand as unsigned bytes and the weight
;     operand as signed bytes; NOTE(review): this presumes every weight is
;     <= 127 -- confirm against the caller.
;   * PUSH_XMM 8 / POP_XMM are macros from asm_inc.asm; presumably they
;     save/restore the XMM registers the target ABI treats as callee-saved
;     -- confirm there.
;***********************************************************************
WELS_EXTERN McChromaWidthEq8_ssse3
    %assign  push_num 0
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r5, r5d

    ; ---- replicate the weight byte pairs ------------------------------------
    movd       xmm5, [r4]           ; low dword = bytes A B C D
    punpcklwd  xmm5, xmm5           ; words: AB AB CD CD
    punpckldq  xmm5, xmm5           ; words: AB AB AB AB CD CD CD CD
    movdqa     xmm6, xmm5
    punpcklqdq xmm5, xmm5           ; xmm5 = AB in all 8 words
    punpckhqdq xmm6, xmm6           ; xmm6 = CD in all 8 words

    ; Pre-bias the destination by two rows: the loop starts by adding 2*stride.
    sub        r2, r3
    sub        r2, r3

    ; xmm7 = 32 in every word (rounding term), loaded once outside the loop
%ifdef X86_32_PICASM
    ; build the constant in a register instead of referencing memory
    pcmpeqw    xmm7, xmm7           ; every word = 0xFFFF
    psrlw      xmm7, 15             ; every word = 1
    psllw      xmm7, 5              ; every word = 32
%else
    movdqa     xmm7, [h264_d0x20_sse2]
%endif

    ; ---- prime the pipeline with the first source row ----------------------
    movdqu     xmm0, [r0]
    movdqa     xmm1, xmm0
    psrldq     xmm1, 1              ; same row shifted by one pixel
    punpcklbw  xmm0, xmm1           ; s0 s1 | s1 s2 | ... | s7 s8

.hloop_chroma:
    lea        r2, [r2+2*r3]        ; destination for this pair of rows

    ; ---- first output row: rows (r0) and (r0 + stride) ----------------------
    movdqu     xmm2, [r0+r1]
    movdqa     xmm3, xmm2
    psrldq     xmm3, 1
    punpcklbw  xmm2, xmm3           ; interleaved lower row
    movdqa     xmm4, xmm2           ; keep a copy: upper row of the second output

    pmaddubsw  xmm0, xmm5           ; A*cur[x]  + B*cur[x+1]
    pmaddubsw  xmm2, xmm6           ; C*next[x] + D*next[x+1]
    paddw      xmm0, xmm2
    paddw      xmm0, xmm7           ; + 32
    psrlw      xmm0, 6
    packuswb   xmm0, xmm0           ; words -> bytes with unsigned saturation
    movq       [r2], xmm0           ; store 8 output pixels

    ; ---- second output row: rows (r0 + stride) and (r0 + 2*stride) ----------
    lea        r0, [r0+2*r1]        ; advance source by two rows
    movdqu     xmm2, [r0]
    movdqa     xmm3, xmm2
    psrldq     xmm3, 1
    punpcklbw  xmm2, xmm3           ; interleaved lower row
    movdqa     xmm0, xmm2           ; becomes the upper row of the next iteration

    pmaddubsw  xmm4, xmm5
    pmaddubsw  xmm2, xmm6
    paddw      xmm4, xmm2
    paddw      xmm4, xmm7           ; + 32
    psrlw      xmm4, 6
    packuswb   xmm4, xmm4
    movq       [r2+r3], xmm4        ; store 8 output pixels, one row further down

    sub        r5, 2
    jnz        .hloop_chroma

    POP_XMM
    LOAD_6_PARA_POP

    ret


